Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ https://github.com/KoljaB/RealtimeTTS/assets/7604638/87dcd9a5-3a4e-4f57-be45-837
- **High-Quality Audio**
- generates clear and natural-sounding speech
- **Multiple TTS Engine Support**
- supports OpenAI TTS, Elevenlabs, Azure Speech Services, Coqui TTS, StyleTTS2, Piper, gTTS, Edge TTS, Parler TTS, Kokoro, Cartesia, Faster Qwen 3, NeuTTS, PocketTTS, Modelslab, CAMB AI, MiniMax and System TTS
- supports OpenAI TTS, Elevenlabs, Azure Speech Services, Coqui TTS, StyleTTS2, Piper, gTTS, Edge TTS, Parler TTS, Kokoro, Cartesia, Faster Qwen 3, NeuTTS, PocketTTS, Modelslab, CAMB AI, MiniMax, Typecast and System TTS
- **Multilingual**
- **Robust and Reliable**:
- ensures continuous operation through a fallback mechanism
Expand Down Expand Up @@ -140,6 +140,7 @@ This library uses:
- **OrpheusEngine** 🏠: Llama‑powered TTS with emotion tags
- **CambEngine** 🌐: CAMB AI MARS models with 140+ languages
- **MiniMaxEngine** 🌐: MiniMax Cloud TTS with 12 voice presets
- **TypecastEngine** 🌐: Typecast Cloud TTS with emotion and prosody control
- **ZipVoiceEngine** 🏠: 123M zero‑shot model, state‑of‑the‑art quality
- **PocketTTSEngine** 🏠: Kyutai Labs 100M model, CPU-optimized with voice cloning
- **NeuTTSEngine** 🏠: Voice cloning with 3-second reference audio
Expand Down Expand Up @@ -183,6 +184,7 @@ Install only required dependencies using these options:
- **coqui**: Coqui TTS engine
- **camb**: CAMB AI MARS TTS
- **minimax**: MiniMax Cloud TTS
- **typecast**: Typecast Cloud TTS
- **minimal**: Core package only (for custom engine development)

Example: `pip install realtimetts[all]`, `pip install realtimetts[azure]`, `pip install realtimetts[azure,elevenlabs,openai]`
Expand Down Expand Up @@ -235,6 +237,13 @@ To use the `MiniMaxEngine`, you need:
- Available models: `speech-2.8-hd` (high quality), `speech-2.8-turbo` (fast)
- 12 voice presets including English and multilingual options

### TypecastEngine
To use the `TypecastEngine`, you need:
- Typecast API key (provided via TypecastEngine constructor parameter "api_key" or in the environment variable TYPECAST_API_KEY)
- Voice ID (provided via TypecastEngine constructor parameter "voice_id" or in the environment variable TYPECAST_VOICE_ID)
- Available models: `ssfm-v30` (default, supports smart emotion), `ssfm-v21`
- Emotion presets (ssfm-v30): `normal`, `happy`, `sad`, `angry`, `whisper`, `toneup`, `tonedown`

### ElevenlabsEngine
For the `ElevenlabsEngine`, you need:
- Elevenlabs API key (provided via ElevenlabsEngine constructor parameter "api_key" or in the environment variable ELEVENLABS_API_KEY)
Expand Down
16 changes: 16 additions & 0 deletions RealtimeTTS/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"MiniMaxEngine", "MiniMaxVoice",
"CartesiaEngine", "CartesiaVoice",
"FasterQwenEngine", "FasterQwenVoice",
"TypecastEngine", "TypecastVoice",
]


Expand Down Expand Up @@ -276,6 +277,19 @@ def _load_fasterqwen_engine():
return FasterQwenEngine


def _load_typecast_engine():
try:
from .engines.typecast_engine import TypecastEngine, TypecastVoice
except ImportError as e:
raise ImportError(
"Failed to load TypecastEngine and TypecastVoice. "
"Make sure the `cast` CLI is installed: https://typecast.ai"
) from e
globals()["TypecastEngine"] = TypecastEngine
globals()["TypecastVoice"] = TypecastVoice
return TypecastEngine


# Mapping names to their lazy loader functions.
_lazy_imports = {
"SystemEngine": _load_system_engine,
Expand Down Expand Up @@ -316,6 +330,8 @@ def _load_fasterqwen_engine():
"CartesiaVoice": _load_cartesia_engine,
"FasterQwenEngine": _load_fasterqwen_engine,
"FasterQwenVoice": _load_fasterqwen_engine,
"TypecastEngine": _load_typecast_engine,
"TypecastVoice": _load_typecast_engine,
}


Expand Down
12 changes: 11 additions & 1 deletion RealtimeTTS/engines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
"ModelsLabEngine", "ModelsLabVoice",
"MiniMaxEngine", "MiniMaxVoice",
"CartesiaEngine", "CartesiaVoice",
"FasterQwenEngine", "FasterQwenVoice"
"FasterQwenEngine", "FasterQwenVoice",
"TypecastEngine", "TypecastVoice"
]


Expand Down Expand Up @@ -166,6 +167,13 @@ def _load_fasterqwen_engine():
return FasterQwenEngine


def _load_typecast_engine():
from .typecast_engine import TypecastEngine, TypecastVoice
globals()["TypecastEngine"] = TypecastEngine
globals()["TypecastVoice"] = TypecastVoice
return TypecastEngine


# Map attribute names to lazy loader functions.
_lazy_imports = {
"AzureEngine": _load_azure_engine,
Expand Down Expand Up @@ -208,6 +216,8 @@ def _load_fasterqwen_engine():
"CartesiaVoice": _load_cartesia_engine,
"FasterQwenEngine": _load_fasterqwen_engine,
"FasterQwenVoice": _load_fasterqwen_engine,
"TypecastEngine": _load_typecast_engine,
"TypecastVoice": _load_typecast_engine,
}


Expand Down
236 changes: 236 additions & 0 deletions RealtimeTTS/engines/typecast_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
from .base_engine import BaseEngine
from typing import Union
import os
import io
import wave
import pyaudio


class TypecastVoice:
    """Lightweight record describing a single Typecast voice."""

    def __init__(self, name: str, voice_id: str, gender: str = "", age: str = "",
                 use_cases: list = None, models: list = None):
        # None (the safe non-mutable default) is normalized to an empty list.
        self.name = name
        self.voice_id = voice_id
        self.gender = gender
        self.age = age
        self.use_cases = use_cases if use_cases else []
        self.models = models if models else []

    def __repr__(self):
        # Human-readable form used in voice listings: "<name> (<voice_id>)".
        return "{} ({})".format(self.name, self.voice_id)


class TypecastEngine(BaseEngine):
    """
    RealtimeTTS engine for the Typecast Cloud TTS API, backed by the official
    typecast-python SDK. Synthesized WAV audio is decoded and streamed to the
    engine's queue as raw PCM chunks.
    """

    def __init__(
        self,
        voice_id: str = None,
        model: str = "ssfm-v30",
        tempo: float = None,
        pitch: int = None,
        volume: int = None,
        language: str = None,
        emotion_type: str = "preset",
        emotion_preset: str = "normal",
        emotion_intensity: float = None,
        seed: int = None,
        api_key: str = None,
        host: str = None,
        debug: bool = False,
        chunk_size: int = 4096,
    ):
        """
        Initializes a Typecast TTS engine using the official typecast-python SDK.

        Args:
            voice_id (str): Typecast voice ID. If unspecified, uses TYPECAST_VOICE_ID env var.
            model (str): Model version. "ssfm-v30" or "ssfm-v21". Defaults to "ssfm-v30".
            tempo (float): Speech speed multiplier (0.5–2.0).
            pitch (int): Pitch in semitones (-12 to +12).
            volume (int): Volume (0–200).
            language (str): Language code (ISO 639-3). Auto-detected if None.
            emotion_type (str): Emotion mode. "preset" or "smart" (ssfm-v30 only).
                "smart" infers emotion from context automatically.
            emotion_preset (str): Emotion preset when emotion_type is "preset".
                ssfm-v30: normal, happy, sad, angry, whisper, toneup, tonedown.
                ssfm-v21: normal, happy, sad, angry.
            emotion_intensity (float): Emotion intensity (0.0–2.0). Defaults to 1.0.
            seed (int): Random seed for reproducible output.
            api_key (str): Typecast API key. If unspecified, uses TYPECAST_API_KEY env var.
            host (str): API host URL override. If unspecified, the SDK uses
                TYPECAST_API_HOST env var or the default endpoint.
            debug (bool): Print debug information.
            chunk_size (int): Audio chunk size in bytes for streaming.

        Raises:
            ValueError: If no API key is supplied via parameter or environment.
        """
        self.api_key = api_key or os.environ.get("TYPECAST_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Typecast API key is required. Provide it via api_key parameter "
                "or TYPECAST_API_KEY environment variable."
            )

        self.voice_id = voice_id or os.environ.get("TYPECAST_VOICE_ID")
        self.model = model
        self.tempo = tempo
        self.pitch = pitch
        self.volume = volume
        self.language = language
        self.emotion_type = emotion_type
        self.emotion_preset = emotion_preset
        self.emotion_intensity = emotion_intensity
        self.seed = seed
        self.host = host  # If None, SDK falls back to TYPECAST_API_HOST env var or default
        self.debug = debug
        self.chunk_size = chunk_size
        self._client = None  # Lazily created SDK client; see _get_client()

    def post_init(self):
        # Called by BaseEngine after construction; identifies this engine.
        self.engine_name = "typecast"

    def _get_client(self):
        """Returns the cached Typecast SDK client, creating it on first use.

        Raises:
            ImportError: If the typecast-python package is not installed.
        """
        if self._client is None:
            try:
                from typecast import Typecast
            except ImportError as e:
                raise ImportError(
                    "typecast-python package is required. Install with:\n"
                    "pip install typecast-python"
                ) from e
            kwargs = {"api_key": self.api_key}
            if self.host:
                kwargs["host"] = self.host
            self._client = Typecast(**kwargs)
        return self._client

    def _build_prompt(self):
        """Builds the appropriate prompt object based on emotion_type and model.

        Returns:
            A Prompt (ssfm-v21), SmartPrompt or PresetPrompt (ssfm-v30)
            instance, or None when ssfm-v21 has no emotion settings to send.
        """
        from typecast.models import Prompt, PresetPrompt, SmartPrompt

        if self.model == "ssfm-v21":
            # ssfm-v21 only understands the generic Prompt; "smart" emotion
            # is not available for this model.
            kwargs = {}
            if self.emotion_preset is not None:
                kwargs["emotion_preset"] = self.emotion_preset
            if self.emotion_intensity is not None:
                kwargs["emotion_intensity"] = self.emotion_intensity
            return Prompt(**kwargs) if kwargs else None

        # ssfm-v30
        if self.emotion_type == "smart":
            # Let the model infer emotion from the textual context.
            return SmartPrompt()
        else:
            kwargs = {}
            if self.emotion_preset is not None:
                kwargs["emotion_preset"] = self.emotion_preset
            if self.emotion_intensity is not None:
                kwargs["emotion_intensity"] = self.emotion_intensity
            return PresetPrompt(**kwargs)

    def get_stream_info(self):
        """
        Returns PyAudio stream configuration for Typecast (WAV: mono, 44100Hz, 16-bit).

        NOTE(review): the format is hard-coded; this assumes every Typecast WAV
        response is 16-bit / mono / 44.1 kHz — confirm against the API's actual
        output, since synthesize() streams raw frames without resampling.
        """
        return pyaudio.paInt16, 1, 44100

    def synthesize(self, text: str) -> bool:
        """
        Synthesizes text to audio using the typecast-python SDK and streams to queue.

        Args:
            text (str): Text to synthesize.

        Returns:
            bool: True on success, False on failure.
        """
        if not self.voice_id:
            print("[TypecastEngine] Error: voice_id is not set. "
                  "Pass voice_id to the constructor or set TYPECAST_VOICE_ID.")
            return False

        if self.debug:
            print(f"[TypecastEngine] Synthesizing: \"{text}\"")

        try:
            # Imported inside the try so a missing SDK is reported through the
            # normal failure path (print + return False) instead of raising an
            # uncaught ModuleNotFoundError to the caller.
            from typecast.models import TTSRequest, TTSModel, Output

            output_kwargs = {"audio_format": "wav"}
            if self.tempo is not None:
                output_kwargs["audio_tempo"] = self.tempo
            if self.pitch is not None:
                output_kwargs["audio_pitch"] = self.pitch
            if self.volume is not None:
                output_kwargs["volume"] = self.volume

            request = TTSRequest(
                text=text,
                voice_id=self.voice_id,
                model=TTSModel(self.model),
                language=self.language,
                prompt=self._build_prompt(),
                seed=self.seed,
                output=Output(**output_kwargs),
            )

            client = self._get_client()
            response = client.text_to_speech(request)

            with wave.open(io.BytesIO(response.audio_data), "rb") as wf:
                # Read at least one frame per iteration: a chunk_size smaller
                # than one frame would make readframes(0) return b"" and
                # silently drop the whole clip.
                frames_per_chunk = max(
                    1, self.chunk_size // (wf.getsampwidth() * wf.getnchannels())
                )
                while not self.stop_synthesis_event.is_set():
                    data = wf.readframes(frames_per_chunk)
                    if not data:
                        break
                    self.queue.put(data)

        except Exception as e:
            print(f"[TypecastEngine] Error during synthesis: {e}")
            return False

        return True

    def get_voices(self) -> list:
        """
        Returns available Typecast voices via the SDK.

        Returns:
            list[TypecastVoice]: Available voices; empty list on any error.
        """
        try:
            voices = self._get_client().voices_v2()
            return [
                TypecastVoice(
                    name=v.voice_name,
                    voice_id=v.voice_id,
                    gender=v.gender.value if v.gender else "",
                    age=v.age.value if v.age else "",
                    use_cases=v.use_cases or [],
                    models=[m.version for m in v.models] if v.models else [],
                )
                for v in voices
            ]
        except Exception as e:
            print(f"[TypecastEngine] Error fetching voices: {e}")
            return []

    def set_voice(self, voice: Union[str, TypecastVoice]):
        """
        Sets the voice for synthesis.

        Args:
            voice: Voice name/ID string or TypecastVoice object. A string is
                matched case-insensitively against voice names (substring) or
                exactly against voice IDs; if nothing matches, it is stored
                verbatim as the voice_id.
        """
        if isinstance(voice, TypecastVoice):
            self.voice_id = voice.voice_id
            return
        for v in self.get_voices():
            if voice.lower() in v.name.lower() or voice == v.voice_id:
                self.voice_id = v.voice_id
                return
        # Fallback: treat as raw voice_id
        self.voice_id = voice

    def set_voice_parameters(self, **voice_parameters):
        """
        Sets voice parameters.

        Supported keys: tempo, pitch, volume, language, seed, model,
        emotion_type, emotion_preset, emotion_intensity

        Unknown keys (no matching attribute) are silently ignored.
        """
        for key, value in voice_parameters.items():
            if hasattr(self, key):
                setattr(self, key, value)
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
- **minimax**: MiniMax Cloud TTS
- **cartesia** Cartesia API integration
- **qwen** Faster Qwen3 TTS integration
- **typecast**: Typecast Cloud TTS
- **minimal**: Core package only (for custom engine development)

You can install multiple engines by separating them with commas. For example:
Expand Down Expand Up @@ -94,6 +95,7 @@ def parse_requirements(filename):
"minimax": base_requirements + [requirements.get("requests", "requests")],
"orpheus": base_requirements + [requirements.get("snac", "snac")],
"qwen": base_requirements + [requirements.get("faster-qwen3-tts", "faster-qwen3-tts")],
"typecast": base_requirements + ["typecast-python"],
"jp": base_requirements +["mecab-python3>=1.0.6", "unidic-lite>=1.0.8", "cutlet", "fugashi>=1.4.0", "jaconv>=0.4.0", "mojimoji>=0.0.13", "pyopenjtalk>=0.4.0"],
"zh": base_requirements +["pypinyin>=0.53.0", "ordered_set>=4.1.0", "jieba>=0.42.1", "cn2an>=0.5.23"],
"ko": base_requirements +["hangul_romanize"],
Expand Down
Loading