diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 2b0cf254..d00df528 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -62,6 +62,7 @@ jobs: pip install pyinstaller pip install -r backend/requirements.txt pip install --no-deps chatterbox-tts + pip install --no-deps hume-tada - name: Install MLX dependencies (Apple Silicon only) if: matrix.backend == 'mlx' @@ -188,6 +189,7 @@ jobs: pip install pyinstaller pip install -r backend/requirements.txt pip install --no-deps chatterbox-tts + pip install --no-deps hume-tada - name: Install PyTorch with CUDA 12.6 run: | diff --git a/Dockerfile b/Dockerfile index 20da9e1a..1ad85e53 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,6 +35,8 @@ RUN pip install --no-cache-dir --upgrade pip COPY backend/requirements.txt . RUN pip install --no-cache-dir --prefix=/install -r requirements.txt +RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts +RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada RUN pip install --no-cache-dir --prefix=/install \ git+https://github.com/QwenLM/Qwen3-TTS.git diff --git a/README.md b/README.md index 614e4f21..1acc9a4f 100644 --- a/README.md +++ b/README.md @@ -59,10 +59,10 @@ ## What is Voicebox? -Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. +Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. 
- **Complete privacy** — models and voice data stay on your machine -- **4 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo +- **5 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA - **23 languages** — from English to Arabic, Japanese, Hindi, Swahili, and more - **Post-processing effects** — pitch shift, reverb, delay, chorus, compression, and filters - **Expressive speech** — paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo @@ -93,7 +93,7 @@ Voicebox is a **local-first voice cloning studio** — a free and open-source al ### Multi-Engine Voice Cloning -Four TTS engines with different strengths, switchable per-generation: +Five TTS engines with different strengths, switchable per-generation: | Engine | Languages | Strengths | | --------------------------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------- | @@ -101,6 +101,7 @@ Four TTS engines with different strengths, switchable per-generation: | **LuxTTS** | English | Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU | | **Chatterbox Multilingual** | 23 | Broadest language coverage — Arabic, Danish, Finnish, Greek, Hebrew, Hindi, Malay, Norwegian, Polish, Swahili, Swedish, Turkish and more | | **Chatterbox Turbo** | English | Fast 350M model with paralinguistic emotion/sound tags | +| **TADA** (1B / 3B) | 10 | HumeAI speech-language model — 700s+ coherent audio, text-acoustic dual alignment | ### Emotions & Paralinguistic Tags @@ -230,7 +231,7 @@ Full API documentation available at `http://localhost:17493/docs`. 
| Frontend | React, TypeScript, Tailwind CSS | | State | Zustand, React Query | | Backend | FastAPI (Python) | -| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo | +| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo, TADA | | Effects | Pedalboard (Spotify) | | Transcription | Whisper / Whisper Turbo (PyTorch or MLX) | | Inference | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) | @@ -245,7 +246,7 @@ Full API documentation available at `http://localhost:17493/docs`. | ----------------------- | ---------------------------------------------- | | **Real-time Streaming** | Stream audio as it generates, word by word | | **Voice Design** | Create new voices from text descriptions | -| **More Models** | XTTS, Bark, and other open-source voice models | +| **More Models** | XTTS, Bark, and other open-source voice models | | **Plugin Architecture** | Extend with custom models and effects | | **Mobile Companion** | Control Voicebox from your phone | diff --git a/app/src/components/Generation/EngineModelSelector.tsx b/app/src/components/Generation/EngineModelSelector.tsx index 77dac03f..4382d3f7 100644 --- a/app/src/components/Generation/EngineModelSelector.tsx +++ b/app/src/components/Generation/EngineModelSelector.tsx @@ -20,6 +20,8 @@ const ENGINE_OPTIONS = [ { value: 'luxtts', label: 'LuxTTS' }, { value: 'chatterbox', label: 'Chatterbox' }, { value: 'chatterbox_turbo', label: 'Chatterbox Turbo' }, + { value: 'tada:1B', label: 'TADA 1B' }, + { value: 'tada:3B', label: 'TADA 3B Multilingual' }, ] as const; const ENGINE_DESCRIPTIONS: Record = { @@ -27,6 +29,7 @@ const ENGINE_DESCRIPTIONS: Record = { luxtts: 'Fast, English-focused', chatterbox: '23 languages, incl. Hebrew', chatterbox_turbo: 'English, [laugh] [cough] tags', + tada: 'HumeAI, 700s+ coherent audio', }; /** Engines that only support English and should force language to 'en' on select. 
/** Engines that only support English and should force language to 'en' on select. */
const ENGLISH_ONLY_ENGINES = new Set(['luxtts', 'chatterbox_turbo']);

/**
 * Engines that expose several model sizes in the selector, with the sizes
 * each one actually offers and the size to fall back to.
 */
const SIZED_ENGINES = new Map<string, { sizes: readonly string[]; fallback: string }>([
  ['qwen', { sizes: ['1.7B', '0.6B'], fallback: '1.7B' }],
  ['tada', { sizes: ['1B', '3B'], fallback: '1B' }],
]);

/**
 * Build the `<Select>` value for the current engine / model-size pair.
 *
 * Engines with several model sizes are encoded as `engine:size`
 * (e.g. `qwen:1.7B`, `tada:3B`); every other engine is returned as-is.
 *
 * The form's `modelSize` field is shared by all sized engines, so it can hold
 * a size that belongs to a different engine (e.g. engine `tada` with a
 * leftover `1.7B`). Such a pair would produce a value that matches no option,
 * so a missing or foreign size falls back to the engine's default size.
 *
 * @param engine - Engine identifier from the form.
 * @param modelSize - Currently selected model size, if any.
 * @returns The option value to mark as selected.
 */
function getSelectValue(engine: string, modelSize?: string): string {
  const sized = SIZED_ENGINES.get(engine);
  if (sized === undefined) {
    return engine;
  }
  const size =
    modelSize !== undefined && sized.sizes.includes(modelSize) ? modelSize : sized.fallback;
  return `${engine}:${size}`;
}
Generates 700s+ of coherent audio with synchronized text-acoustic alignment.', + 'tada-3b-ml': + 'HumeAI TADA 3B Multilingual — built on Llama 3.2 3B. Supports 10 languages with high-fidelity voice cloning via text-acoustic dual alignment.', 'whisper-base': 'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.', 'whisper-small': @@ -391,7 +395,8 @@ export function ModelManagement() { (m) => m.model_name.startsWith('qwen-tts') || m.model_name.startsWith('luxtts') || - m.model_name.startsWith('chatterbox'), + m.model_name.startsWith('chatterbox') || + m.model_name.startsWith('tada'), ) ?? []; const whisperModels = modelStatus?.models.filter((m) => m.model_name.startsWith('whisper')) ?? []; diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts index daae2a95..aa85d001 100644 --- a/app/src/lib/api/types.ts +++ b/app/src/lib/api/types.ts @@ -42,8 +42,8 @@ export interface GenerationRequest { text: string; language: LanguageCode; seed?: number; - model_size?: '1.7B' | '0.6B'; - engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo'; + model_size?: '1.7B' | '0.6B' | '1B' | '3B'; + engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada'; instruct?: string; max_chunk_chars?: number; crossfade_ms?: number; diff --git a/app/src/lib/constants/languages.ts b/app/src/lib/constants/languages.ts index 19d6bca6..a0d233a5 100644 --- a/app/src/lib/constants/languages.ts +++ b/app/src/lib/constants/languages.ts @@ -66,6 +66,7 @@ export const ENGINE_LANGUAGES: Record = { 'zh', ], chatterbox_turbo: ['en'], + tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'], } as const; /** Helper: get language options for a given engine. 
*/ diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts index 74f9a94c..8e73ce07 100644 --- a/app/src/lib/hooks/useGenerationForm.ts +++ b/app/src/lib/hooks/useGenerationForm.ts @@ -15,9 +15,9 @@ const generationSchema = z.object({ text: z.string().min(1, '').max(50000), language: z.enum(LANGUAGE_CODES as [LanguageCode, ...LanguageCode[]]), seed: z.number().int().optional(), - modelSize: z.enum(['1.7B', '0.6B']).optional(), + modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(), instruct: z.string().max(500).optional(), - engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo']).optional(), + engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']).optional(), }); export type GenerationFormValues = z.infer; @@ -79,7 +79,11 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { ? 'chatterbox-tts' : engine === 'chatterbox_turbo' ? 'chatterbox-turbo' - : `qwen-tts-${data.modelSize}`; + : engine === 'tada' + ? data.modelSize === '3B' + ? 'tada-3b-ml' + : 'tada-1b' + : `qwen-tts-${data.modelSize}`; const displayName = engine === 'luxtts' ? 'LuxTTS' @@ -87,9 +91,13 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { ? 'Chatterbox TTS' : engine === 'chatterbox_turbo' ? 'Chatterbox Turbo' - : data.modelSize === '1.7B' - ? 'Qwen TTS 1.7B' - : 'Qwen TTS 0.6B'; + : engine === 'tada' + ? data.modelSize === '3B' + ? 'TADA 3B Multilingual' + : 'TADA 1B' + : data.modelSize === '1.7B' + ? 
'Qwen TTS 1.7B' + : 'Qwen TTS 0.6B'; // Check if model needs downloading try { @@ -104,7 +112,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { console.error('Failed to check model status:', error); } - const isQwen = engine === 'qwen'; + const hasModelSizes = engine === 'qwen' || engine === 'tada'; const effectsChain = options.getEffectsChain?.(); // This now returns immediately with status="generating" const result = await generation.mutateAsync({ @@ -112,9 +120,9 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { text: data.text, language: data.language, seed: data.seed, - model_size: isQwen ? data.modelSize : undefined, + model_size: hasModelSizes ? data.modelSize : undefined, engine, - instruct: isQwen ? data.instruct || undefined : undefined, + instruct: engine === 'qwen' ? data.instruct || undefined : undefined, max_chunk_chars: maxChunkChars, crossfade_ms: crossfadeMs, normalize: normalizeAudio, diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py index 6f20f3de..a4f5113a 100644 --- a/backend/backends/__init__.py +++ b/backend/backends/__init__.py @@ -166,6 +166,7 @@ def is_loaded(self) -> bool: "luxtts": "LuxTTS", "chatterbox": "Chatterbox TTS", "chatterbox_turbo": "Chatterbox Turbo", + "tada": "TADA", } @@ -259,6 +260,24 @@ def _get_non_qwen_tts_configs() -> list[ModelConfig]: needs_trim=True, languages=["en"], ), + ModelConfig( + model_name="tada-1b", + display_name="TADA 1B (English)", + engine="tada", + hf_repo_id="HumeAI/tada-1b", + model_size="1B", + size_mb=4000, + languages=["en"], + ), + ModelConfig( + model_name="tada-3b-ml", + display_name="TADA 3B Multilingual", + engine="tada", + hf_repo_id="HumeAI/tada-3b-ml", + model_size="3B", + size_mb=8000, + languages=["en", "ar", "zh", "de", "es", "fr", "it", "ja", "pl", "pt"], + ), ] @@ -339,10 +358,12 @@ def engine_has_model_sizes(engine: str) -> bool: async def load_engine_model(engine: str, model_size: str = "default") -> 
None: - """Load a model for the given engine, handling the Qwen model_size special case.""" + """Load a model for the given engine, handling engines with multiple model sizes.""" backend = get_tts_backend_for_engine(engine) if engine == "qwen": await backend.load_model_async(model_size) + elif engine == "tada": + await backend.load_model(model_size) else: await backend.load_model() @@ -358,7 +379,7 @@ async def ensure_model_cached_or_raise(engine: str, model_size: str = "default") cfg = c break - if engine == "qwen": + if engine in ("qwen", "tada"): if not backend._is_model_cached(model_size): raise HTTPException( status_code=400, @@ -490,6 +511,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend: from .chatterbox_turbo_backend import ChatterboxTurboTTSBackend backend = ChatterboxTurboTTSBackend() + elif engine == "tada": + from .hume_backend import HumeTadaBackend + + backend = HumeTadaBackend() else: raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}") diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py new file mode 100644 index 00000000..456fd46e --- /dev/null +++ b/backend/backends/hume_backend.py @@ -0,0 +1,347 @@ +""" +HumeAI TADA TTS backend implementation. + +Wraps HumeAI's TADA (Text-Acoustic Dual Alignment) model for +high-quality voice cloning. Two model variants: + - tada-1b: English-only, ~2B params (Llama 3.2 1B base) + - tada-3b-ml: Multilingual, ~4B params (Llama 3.2 3B base) + +Both use a shared encoder/codec (HumeAI/tada-codec). The encoder +produces 1:1 aligned token embeddings from reference audio, and the +causal LM generates speech via flow-matching diffusion. + +24kHz output, bf16 inference on CUDA, fp32 on CPU. +""" + +import asyncio +import logging +import threading +from typing import ClassVar, List, Optional, Tuple + +import numpy as np + +from . 
logger = logging.getLogger(__name__)

# HuggingFace repos
TADA_CODEC_REPO = "HumeAI/tada-codec"
TADA_1B_REPO = "HumeAI/tada-1b"
TADA_3B_ML_REPO = "HumeAI/tada-3b-ml"

TADA_MODEL_REPOS = {
    "1B": TADA_1B_REPO,
    "3B": TADA_3B_ML_REPO,
}

# Size used whenever a caller passes a size TADA does not ship
# (e.g. "default", or another engine's size such as "1.7B").
_TADA_DEFAULT_SIZE = "1B"

# Model names used for load-progress reporting. They mirror the model_name
# values of the TADA ModelConfig entries ("tada-1b" / "tada-3b-ml"); note the
# 3B variant carries an "-ml" suffix, so the name cannot be derived from the
# size string alone.
_TADA_MODEL_NAMES = {
    "1B": "tada-1b",
    "3B": "tada-3b-ml",
}

# Key weight files for cache detection
_TADA_MODEL_WEIGHT_FILES = [
    "model.safetensors",
]

_TADA_CODEC_WEIGHT_FILES = [
    "encoder/model.safetensors",
]


class HumeTadaBackend:
    """HumeAI TADA TTS backend for high-quality voice cloning.

    Holds one causal LM (``self.model``) and one encoder (``self.encoder``)
    at a time. Project helpers and heavy third-party packages are imported
    inside the methods that need them, so importing this module stays cheap.
    """

    _load_lock: ClassVar[threading.Lock] = threading.Lock()

    def __init__(self):
        self.model = None
        self.encoder = None
        self.model_size = _TADA_DEFAULT_SIZE  # default to 1B
        self._device = None
        # Serializes load_model() calls so two coroutines never load at once.
        self._model_load_lock = asyncio.Lock()

    @staticmethod
    def _normalize_model_size(model_size: str) -> str:
        """Return ``model_size`` if TADA ships it, otherwise the default size."""
        return model_size if model_size in TADA_MODEL_REPOS else _TADA_DEFAULT_SIZE

    @staticmethod
    def _registry_name(model_size: str) -> str:
        """Return the model name ("tada-1b" / "tada-3b-ml") for a model size."""
        return _TADA_MODEL_NAMES[HumeTadaBackend._normalize_model_size(model_size)]

    def _get_device(self) -> str:
        """Pick the torch device string for inference."""
        from .base import get_torch_device

        # Force CPU on macOS — MPS has issues with flow matching
        # and large vocab lm_head (>65536 output channels)
        return get_torch_device(force_cpu_on_mac=True)

    def is_loaded(self) -> bool:
        """Return True once the causal LM has been loaded."""
        return self.model is not None

    def _get_model_path(self, model_size: str = "1B") -> str:
        """Return the HuggingFace repo id for ``model_size`` (1B repo if unknown)."""
        return TADA_MODEL_REPOS[self._normalize_model_size(model_size)]

    def _is_model_cached(self, model_size: str = "1B") -> bool:
        """Return True when both the model weights and the codec are cached."""
        from .base import is_model_cached

        repo = self._get_model_path(model_size)
        model_cached = is_model_cached(repo, required_files=_TADA_MODEL_WEIGHT_FILES)
        codec_cached = is_model_cached(TADA_CODEC_REPO, required_files=_TADA_CODEC_WEIGHT_FILES)
        return model_cached and codec_cached

    async def load_model(self, model_size: str = "1B") -> None:
        """Load the TADA model and encoder.

        No-op when the requested size is already loaded. A different size
        unloads the current model first. Sizes TADA does not ship are mapped
        to the default size before the comparison, so such a request neither
        triggers a reload of an already-loaded default model nor records a
        bogus ``self.model_size``.
        """
        size = self._normalize_model_size(model_size)
        if size != model_size:
            logger.warning(
                "[TADA] Unknown model size %r, falling back to %s", model_size, size
            )

        if self.model is not None and self.model_size == size:
            return
        async with self._model_load_lock:
            # Re-check: another coroutine may have finished loading while
            # this one was waiting for the lock.
            if self.model is not None and self.model_size == size:
                return
            # Unload existing model if switching sizes
            if self.model is not None:
                self.unload_model()
            self.model_size = size
            await asyncio.to_thread(self._load_model_sync, size)

    def _load_model_sync(self, model_size: str = "1B"):
        """Synchronous model loading with progress tracking.

        Downloads (or reuses from the HuggingFace cache) the codec, the model
        weights and an ungated copy of the Llama tokenizer, then builds the
        encoder and the causal LM. ``self.encoder`` / ``self.model`` are only
        assigned after both were built, so a failure part-way through never
        leaves a half-loaded backend behind.
        """
        from .base import model_load_progress

        model_size = self._normalize_model_size(model_size)
        model_name = self._registry_name(model_size)
        is_cached = self._is_model_cached(model_size)
        repo = TADA_MODEL_REPOS[model_size]

        with model_load_progress(model_name, is_cached):
            # Install DAC shim before importing tada — tada's encoder/decoder
            # import dac.nn.layers.Snake1d which requires the descript-audio-codec
            # package. The real package pulls in onnx/tensorboard/matplotlib via
            # descript-audiotools, so we use a lightweight shim instead.
            from ..utils.dac_shim import install_dac_shim
            install_dac_shim()

            import torch
            from huggingface_hub import snapshot_download

            device = self._get_device()
            logger.info(f"Loading HumeAI TADA {model_size} on {device}...")

            # Download codec (encoder + decoder) if not cached
            logger.info("Downloading TADA codec...")
            snapshot_download(
                repo_id=TADA_CODEC_REPO,
                token=None,
                allow_patterns=["*.safetensors", "*.json", "*.txt", "*.bin"],
            )

            # Download model weights if not cached
            logger.info(f"Downloading TADA {model_size} model...")
            snapshot_download(
                repo_id=repo,
                token=None,
                allow_patterns=["*.safetensors", "*.json", "*.txt", "*.bin", "*.model"],
            )

            # TADA hardcodes "meta-llama/Llama-3.2-1B" as the tokenizer
            # source in its Aligner and TadaForCausalLM.from_pretrained().
            # That repo is gated (requires Meta license acceptance).
            # Download the tokenizer from an ungated mirror and get its
            # local cache path so we can point TADA at it directly.
            logger.info("Downloading Llama tokenizer (ungated mirror)...")
            tokenizer_path = snapshot_download(
                repo_id="unsloth/Llama-3.2-1B",
                token=None,
                allow_patterns=["tokenizer*", "special_tokens*"],
            )

            # Determine dtype — use bf16 on CUDA for ~50% memory savings
            if device == "cuda" and torch.cuda.is_bf16_supported():
                model_dtype = torch.bfloat16
            else:
                model_dtype = torch.float32

            # Patch the Aligner config class to use the local tokenizer
            # path instead of the gated "meta-llama/Llama-3.2-1B" default.
            # This avoids monkey-patching AutoTokenizer.from_pretrained
            # which corrupts the classmethod descriptor for other engines.
            from tada.modules.aligner import AlignerConfig
            AlignerConfig.tokenizer_name = tokenizer_path

            # Load encoder (only needed for voice prompt encoding)
            from tada.modules.encoder import Encoder
            logger.info("Loading TADA encoder...")
            encoder = Encoder.from_pretrained(
                TADA_CODEC_REPO, subfolder="encoder"
            ).to(device)
            encoder.eval()

            # Load the causal LM (includes decoder for wav generation).
            # TadaForCausalLM.from_pretrained() calls
            # getattr(config, "tokenizer_name", "meta-llama/Llama-3.2-1B")
            # which hits the gated repo. Pre-load the config from HF,
            # inject the local tokenizer path, then pass it in.
            from tada.modules.tada import TadaForCausalLM, TadaConfig
            logger.info(f"Loading TADA {model_size} model...")
            config = TadaConfig.from_pretrained(repo)
            config.tokenizer_name = tokenizer_path
            model = TadaForCausalLM.from_pretrained(
                repo, config=config, torch_dtype=model_dtype
            ).to(device)
            model.eval()

            # Publish everything together, only after every step succeeded.
            self._device = device
            self.encoder = encoder
            self.model = model

            logger.info(f"HumeAI TADA {model_size} loaded successfully on {device}")

    def unload_model(self) -> None:
        """Unload model and encoder to free memory."""
        # Dropping the references is what releases the weights.
        self.model = None
        self.encoder = None
        self._device = None

        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        logger.info("HumeAI TADA unloaded")

    async def create_voice_prompt(
        self,
        audio_path: str,
        reference_text: str,
        use_cache: bool = True,
    ) -> Tuple[dict, bool]:
        """
        Create voice prompt from reference audio using TADA's encoder.

        TADA's encoder performs forced alignment between audio and text tokens,
        producing an EncoderOutput with 1:1 token-audio alignment. If no
        reference_text is provided, the encoder uses built-in ASR (English only).

        We serialize the EncoderOutput to a dict for caching.

        Args:
            audio_path: Path of the reference audio file (read with soundfile).
            reference_text: Transcript of the reference audio; a falsy value
                passes ``text=None`` to the encoder.
            use_cache: When True, look the prompt up in / store it to the
                voice-prompt cache under a "tada_"-prefixed key.

        Returns:
            Tuple of (prompt dict, True if it came from the cache).
        """
        from ..utils.cache import get_cache_key, get_cached_voice_prompt, cache_voice_prompt

        await self.load_model(self.model_size)

        cache_key = (
            "tada_" + get_cache_key(audio_path, reference_text)
        ) if use_cache else None

        if cache_key:
            cached = get_cached_voice_prompt(cache_key)
            # Anything that is not a dict is treated as a cache miss.
            if isinstance(cached, dict):
                return cached, True

        def _encode_sync():
            import torch
            import soundfile as sf

            device = self._device

            # Load audio with soundfile (torchaudio 2.10+ requires torchcodec)
            audio_np, sr = sf.read(str(audio_path), dtype="float32")
            audio = torch.from_numpy(audio_np).float()
            if audio.ndim == 1:
                audio = audio.unsqueeze(0)  # (samples,) -> (1, samples)
            else:
                # NOTE(review): multi-channel files are passed through as
                # (channels, samples) — confirm the encoder accepts that.
                audio = audio.T  # (samples, channels) -> (channels, samples)
            audio = audio.to(device)

            # Encode with forced alignment
            text_arg = [reference_text] if reference_text else None
            prompt = self.encoder(
                audio, text=text_arg, sample_rate=sr
            )

            # Serialize EncoderOutput to a dict for caching: tensors are
            # detached and moved to CPU, every other field is stored as-is.
            prompt_dict = {}
            for field_name in prompt.__dataclass_fields__:
                val = getattr(prompt, field_name)
                if isinstance(val, torch.Tensor):
                    val = val.detach().cpu()
                prompt_dict[field_name] = val
            return prompt_dict

        encoded = await asyncio.to_thread(_encode_sync)

        if cache_key:
            cache_voice_prompt(cache_key, encoded)

        return encoded, False

    async def combine_voice_prompts(
        self,
        audio_paths: List[str],
        reference_texts: List[str],
    ) -> Tuple[np.ndarray, str]:
        """Delegate to the shared combine helper at TADA's 24 kHz sample rate."""
        from .base import combine_voice_prompts as _combine_voice_prompts

        return await _combine_voice_prompts(audio_paths, reference_texts, sample_rate=24000)

    async def generate(
        self,
        text: str,
        voice_prompt: dict,
        language: str = "en",
        seed: Optional[int] = None,
        instruct: Optional[str] = None,
    ) -> Tuple[np.ndarray, int]:
        """
        Generate audio from text using HumeAI TADA.

        Args:
            text: Text to synthesize
            voice_prompt: Serialized EncoderOutput dict from create_voice_prompt()
            language: Language code (en, ar, de, es, fr, it, ja, pl, pt, zh);
                only used for logging here
            seed: Random seed for reproducibility
            instruct: Not supported by TADA (ignored)

        Returns:
            Tuple of (audio_array, sample_rate=24000). If the model produces
            no audio, one second of silence is returned and a warning logged.
        """
        await self.load_model(self.model_size)

        def _generate_sync():
            import torch
            from tada.modules.encoder import EncoderOutput

            if seed is not None:
                torch.manual_seed(seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(seed)

            device = self._device
            # Floating-point prompt tensors must match the model's dtype
            # (the model may have been loaded in bf16).
            model_dtype = next(self.model.parameters()).dtype

            # Reconstruct EncoderOutput from the cached dict
            restored = {}
            for k, v in voice_prompt.items():
                if isinstance(v, torch.Tensor):
                    if v.is_floating_point():
                        restored[k] = v.to(device=device, dtype=model_dtype)
                    else:
                        restored[k] = v.to(device=device)
                else:
                    restored[k] = v

            prompt = EncoderOutput(**restored)

            # For non-English with the 3B-ML model, we could reload the
            # encoder with the language-specific aligner. However, the
            # generation itself is language-agnostic — only the encoder's
            # aligner changes. Since we encode at create_voice_prompt time,
            # the language is already baked in. For simplicity, we don't
            # reload the encoder here.

            logger.info(f"[TADA] Generating ({language}), text length: {len(text)}")

            output = self.model.generate(
                prompt=prompt,
                text=text,
            )

            # output.audio is a list of tensors (one per batch item)
            if output.audio and output.audio[0] is not None:
                audio_tensor = output.audio[0]
                audio = audio_tensor.detach().cpu().numpy().squeeze().astype(np.float32)
            else:
                logger.warning("[TADA] Generation produced no audio")
                audio = np.zeros(24000, dtype=np.float32)

            return audio, 24000

        return await asyncio.to_thread(_generate_sync)
+ "--hidden-import", + "backend.utils.dac_shim", + "--hidden-import", + "torchaudio", + "--collect-submodules", + "tada", ] ) diff --git a/backend/models.py b/backend/models.py index 3308b3bc..4dd2b368 100644 --- a/backend/models.py +++ b/backend/models.py @@ -66,9 +66,9 @@ class GenerationRequest(BaseModel): text: str = Field(..., min_length=1, max_length=50000) language: str = Field(default="en", pattern="^(zh|en|ja|ko|de|fr|ru|pt|es|it|he|ar|da|el|fi|hi|ms|nl|no|pl|sv|sw|tr)$") seed: Optional[int] = Field(None, ge=0) - model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B)$") + model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B)$") instruct: Optional[str] = Field(None, max_length=500) - engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo)$") + engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada)$") max_chunk_chars: int = Field( default=800, ge=100, le=5000, description="Max characters per chunk for long text splitting" ) diff --git a/backend/requirements.txt b/backend/requirements.txt index 0d927975..d77f97be 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -33,6 +33,13 @@ s3tokenizer spacy-pkuseg pyloudnorm +# HumeAI TADA sub-dependencies (hume-tada itself is installed +# --no-deps in the setup script because it pins torch>=2.7,<2.8. +# descript-audio-codec is NOT installed — it pulls onnx/tensorboard +# via descript-audiotools. A lightweight shim in utils/dac_shim.py +# provides the only class TADA uses: Snake1d.) +torchaudio + # Audio processing librosa>=0.10.0 soundfile>=0.12.0 diff --git a/backend/utils/dac_shim.py b/backend/utils/dac_shim.py new file mode 100644 index 00000000..89968c18 --- /dev/null +++ b/backend/utils/dac_shim.py @@ -0,0 +1,95 @@ +""" +Minimal shim for descript-audio-codec (DAC). + +TADA only imports Snake1d from dac.nn.layers and dac.model.dac. 
# Minimal stand-in for descript-audio-codec (DAC): provides Snake1d under the
# module names dac.nn.layers and dac.model.dac, without the real package's
# descript-audiotools dependency chain.

import importlib.machinery
import sys
import types

import torch
import torch.nn as nn


# ── Snake activation (from dac/nn/layers.py) ────────────────────────

# NOTE: The original DAC code uses @torch.jit.script here for a 1.4x
# speedup. We omit it because TorchScript calls inspect.getsource()
# which fails inside a PyInstaller frozen binary (no .py source files).
def snake(x: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
    """Apply the Snake activation ``x + sin(alpha * x)**2 / alpha``.

    ``x`` is flattened to three dimensions (its first two dimensions are kept,
    the rest are merged) for the computation and reshaped back afterwards, so
    the result has the same shape as the input. A small epsilon is added to
    ``alpha`` before taking the reciprocal to avoid division by zero.
    """
    shape = x.shape
    x = x.reshape(shape[0], shape[1], -1)
    x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
    x = x.reshape(shape)
    return x


class Snake1d(nn.Module):
    """Snake activation with one learnable ``alpha`` per channel."""

    def __init__(self, channels: int):
        super().__init__()
        # Shape (1, channels, 1), initialised to ones.
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return snake(x, self.alpha)


# ── Register as dac.nn.layers and dac.model.dac ─────────────────────

def _new_shim_module(name: str, is_package: bool) -> types.ModuleType:
    """Create an empty in-memory module that looks importable to importlib.

    A bare ``types.ModuleType`` has ``__spec__ = None``; once such a module
    sits in ``sys.modules``, ``importlib.util.find_spec(name)`` raises
    ``ValueError`` instead of answering. Attaching a ``ModuleSpec`` keeps
    availability probes for ``dac`` working.
    """
    module = types.ModuleType(name)
    module.__spec__ = importlib.machinery.ModuleSpec(
        name, loader=None, is_package=is_package
    )
    if is_package:
        module.__path__ = []  # make it a package
        module.__package__ = name
    else:
        module.__package__ = name.rpartition(".")[0]
    return module


def install_dac_shim() -> None:
    """Register fake dac package modules in sys.modules.

    Only installs the shim if 'dac' is not already importable
    (i.e. the real descript-audio-codec is not installed). Calling it again
    after the shim is installed is a no-op, because ``import dac`` then
    succeeds from ``sys.modules``.
    """
    try:
        import dac  # noqa: F401 — real package (or shim) exists, do nothing
        return
    except ImportError:
        pass

    # Create the module tree: dac -> dac.nn -> dac.nn.layers
    #                             -> dac.model -> dac.model.dac
    dac_pkg = _new_shim_module("dac", is_package=True)
    dac_nn = _new_shim_module("dac.nn", is_package=True)
    dac_nn_layers = _new_shim_module("dac.nn.layers", is_package=False)
    dac_model = _new_shim_module("dac.model", is_package=True)
    dac_model_dac = _new_shim_module("dac.model.dac", is_package=False)

    dac_nn_layers.Snake1d = Snake1d
    dac_nn_layers.snake = snake
    dac_model_dac.Snake1d = Snake1d

    # Wire up submodules
    dac_pkg.nn = dac_nn
    dac_pkg.model = dac_model
    dac_nn.layers = dac_nn_layers
    dac_model.dac = dac_model_dac

    # Register in sys.modules
    for module in (dac_pkg, dac_nn, dac_nn_layers, dac_model, dac_model_dac):
        sys.modules[module.__name__] = module
multi-speaker dialogue | Needs vetting | | **Pocket TTS** | English | ~100M | CPU-first, >1× realtime | Needs vetting | diff --git a/docs/content/docs/index.mdx b/docs/content/docs/index.mdx index cbe120c3..9a7da078 100644 --- a/docs/content/docs/index.mdx +++ b/docs/content/docs/index.mdx @@ -3,12 +3,12 @@ title: "Voicebox Documentation" description: "Voicebox is a local-first voice cloning studio -- a free and open-source alternative to ElevenLabs." --- -Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. +Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. ![Voicebox App Screenshot](/images/app-screenshot-1.webp) - **Complete privacy** -- models and voice data stay on your machine -- **4 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo +- **5 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA - **23 languages** -- from English to Arabic, Japanese, Hindi, Swahili, and more - **Post-processing effects** -- pitch shift, reverb, delay, chorus, compression, and filters - **Expressive speech** -- paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo diff --git a/docs/content/docs/overview/introduction.mdx b/docs/content/docs/overview/introduction.mdx index 134f1d71..d740a61e 100644 --- a/docs/content/docs/overview/introduction.mdx +++ b/docs/content/docs/overview/introduction.mdx @@ -5,10 +5,10 @@ description: "Voicebox is a local-first voice cloning studio -- a free and open- ## What is Voicebox? 
-Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. +Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. - **Complete privacy** -- models and voice data stay on your machine -- **4 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo +- **5 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA - **23 languages** -- from English to Arabic, Japanese, Hindi, Swahili, and more - **Post-processing effects** -- pitch shift, reverb, delay, chorus, compression, and filters - **Expressive speech** -- paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo @@ -20,7 +20,7 @@ Voicebox is a **local-first voice cloning studio** -- a free and open-source alt ## TTS Engines -Four engines with different strengths, switchable per-generation: +Five engines with different strengths, switchable per-generation: | Engine | Languages | Strengths | |--------|-----------|-----------| @@ -28,6 +28,7 @@ Four engines with different strengths, switchable per-generation: | **LuxTTS** | English | Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU | | **Chatterbox Multilingual** | 23 | Broadest language coverage | | **Chatterbox Turbo** | English | Fast 350M model with paralinguistic emotion/sound tags | +| **TADA** (1B / 3B) | 10 | HumeAI speech-language model -- 700s+ coherent audio | ## GPU Support @@ -56,7 +57,7 @@ Four engines with different strengths, switchable per-generation: | Frontend | React, TypeScript, Tailwind CSS | | 
State | Zustand, React Query | | Backend | FastAPI (Python) | -| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo | +| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo, TADA | | Effects | Pedalboard (Spotify) | | Transcription | Whisper / Whisper Turbo (PyTorch or MLX) | | Inference | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) | diff --git a/docs/notes/PROJECT_STATUS.md b/docs/notes/PROJECT_STATUS.md index 628dfa0a..71c06453 100644 --- a/docs/notes/PROJECT_STATUS.md +++ b/docs/notes/PROJECT_STATUS.md @@ -36,6 +36,10 @@ │ │ │ │ Qwen3-TTS│ │LuxTTS │ │Chatterbox │ │ │ │ │ │ │ │(Py/MLX) │ │ │ │(MTL+Turbo)│ │ │ │ │ │ │ └──────────┘ └───────┘ └───────────┘ │ │ │ +│ │ │ ┌──────────┐ │ │ │ +│ │ │ │ TADA │ │ │ │ +│ │ │ │(1B / 3B) │ │ │ │ +│ │ │ └──────────┘ │ │ │ │ │ └─────────────────────────────────────────┘ │ │ │ │ ┌───────────┐ ┌─────────┐ │ │ │ │ │ STTBackend│ │ Profiles│ │ │ @@ -59,6 +63,7 @@ | LuxTTS | `backend/backends/luxtts_backend.py` | LuxTTS — fast, CPU-friendly | | Chatterbox MTL | `backend/backends/chatterbox_backend.py` | Chatterbox Multilingual — 23 languages | | Chatterbox Turbo | `backend/backends/chatterbox_turbo_backend.py` | Chatterbox Turbo — English, paralinguistic tags | +| TADA | `backend/backends/hume_backend.py` | HumeAI TADA — 1B English + 3B Multilingual | | Platform detect | `backend/platform_detect.py` | Apple Silicon → MLX, else → PyTorch | | API types | `backend/models.py` | Pydantic request/response models | | HF progress | `backend/utils/hf_progress.py` | HFProgressTracker (tqdm patching for download progress) | @@ -78,7 +83,7 @@ ``` POST /generate 1. Look up voice profile from DB - 2. Resolve engine from request (qwen | luxtts | chatterbox | chatterbox_turbo) + 2. Resolve engine from request (qwen | luxtts | chatterbox | chatterbox_turbo | tada) 3. Get backend: get_tts_backend_for_engine(engine) # thread-safe singleton per engine 4. 
Check model cache → if missing, trigger background download, return HTTP 202 5. Load model (lazy): tts_backend.load_model(model_size) @@ -104,7 +109,8 @@ POST /generate - LuxTTS integration — fast, CPU-friendly English TTS (PR #254) - Chatterbox Multilingual TTS — 23 languages including Hebrew (PR #257) - Instruct parameter UI exists but is non-functional across all backends (see #224, Known Limitations) -- Single flat model dropdown (Qwen 1.7B, Qwen 0.6B, LuxTTS, Chatterbox, Chatterbox Turbo) +- HumeAI TADA integration — 1B English + 3B Multilingual speech-language model (PR #296) +- Single flat model dropdown (Qwen 1.7B, Qwen 0.6B, LuxTTS, Chatterbox, Chatterbox Turbo, TADA 1B, TADA 3B) - Centralized model config registry (`ModelConfig` dataclass) — no per-engine dispatch maps in `main.py` - Shared `EngineModelSelector` component — engine/model dropdown defined once, used in both generation forms @@ -136,6 +142,8 @@ POST /generate | LuxTTS | `luxtts` | English | ~300 MB | CPU-friendly, 48 kHz, fast | None | | Chatterbox | `chatterbox-tts` | 23 (incl. Hebrew, Arabic, Hindi, etc.) 
| ~3.2 GB | Zero-shot cloning, multilingual | Partial — `exaggeration` float (0-1) for expressiveness | | Chatterbox Turbo | `chatterbox-turbo` | English | ~1.5 GB | Paralinguistic tags ([laugh], [cough]), 350M params, low latency | Partial — inline tags only, no separate instruct param | +| TADA 1B | `tada-1b` | English | ~4 GB | HumeAI speech-language model, 700s+ coherent audio | None | +| TADA 3B Multilingual | `tada-3b-ml` | 10 (en, ar, zh, de, es, fr, it, ja, pl, pt) | ~8 GB | Multilingual, text-acoustic dual alignment | None | ### Multi-Engine Architecture (Shipped) @@ -143,7 +151,7 @@ The singleton TTS backend blocker described in the previous version of this doc - **Thread-safe backend registry** (`_tts_backends` dict + `_tts_backends_lock`) with double-checked locking - **Per-engine backend instances** — each engine gets its own singleton, loaded lazily -- **Engine field on GenerationRequest** — frontend sends `engine: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo'` +- **Engine field on GenerationRequest** — frontend sends `engine: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada'` - **Per-engine language filtering** — `ENGINE_LANGUAGES` map in frontend, backend regex accepts all languages - **Per-engine voice prompts** — `create_voice_prompt_for_profile()` dispatches to the correct backend - **Trim post-processing** — `trim_tts_output()` for Chatterbox engines (cuts trailing silence/hallucination) @@ -337,7 +345,7 @@ Notable requests: | **CosyVoice2-0.5B** | 3-10s zero-shot | Very fast | 24 kHz | Multilingual | Low | **Yes** — `inference_instruct2()`, works with cloning | Ready | Best instruct candidate | | **Fish Speech** | 10-30s few-shot | Real-time | 24-44 kHz | 50+ | Medium | **Yes** — inline text descriptions, word-level control | Ready | Multi-engine arch in place | | **MOSS-TTS Family** | Zero-shot | — | — | Multilingual | Medium | **Yes** — text prompts for style + timbre design | Needs vetting | Apache 2.0, multi-speaker 
dialogue | -| **HumeAI TADA 1B/3B** | Zero-shot | 5× faster than LLM-TTS | — | EN (1B), Multilingual (3B) | Medium | Partial — automatic prosody from text context | Needs vetting | MIT, 700s+ coherent, synced transcript output | +| **HumeAI TADA 1B/3B** | Zero-shot | 5× faster than LLM-TTS | 24 kHz | EN (1B), Multilingual (3B) | Medium | Partial — automatic prosody from text context | **Shipped** | PR #296, MIT, 700s+ coherent | | **VoxCPM 1.5** | Zero-shot (seconds) | ~0.15 RTF streaming | — | Bilingual (EN/ZH) | Medium | Partial — automatic context-aware prosody | Needs vetting | Apache 2.0, tokenizer-free continuous diffusion | | **Kokoro-82M** | 3s instant | CPU realtime | 24 kHz | English | Tiny (82M) | Partial — automatic style inference | Ready | Apache 2.0, multi-engine arch in place | | **XTTS-v2** | 6s zero-shot | Mid-GPU | 24 kHz | 17+ | Medium | Partial — style transfer from ref audio only | Ready | Multi-engine arch in place | @@ -475,7 +483,7 @@ The generation form now uses a flat model dropdown with engine-based routing. 
Pe | `/history/{id}/export` | GET | Export generation ZIP | | `/history/{id}/export-audio` | GET | Export audio only | | `/transcribe` | POST | Transcribe audio (Whisper) | -| `/models/status` | GET | All model statuses (Qwen, LuxTTS, Chatterbox, Chatterbox Turbo, Whisper) | +| `/models/status` | GET | All model statuses (Qwen, LuxTTS, Chatterbox, Chatterbox Turbo, TADA, Whisper) | | `/models/download` | POST | Trigger model download | | `/models/download/cancel` | POST | Cancel/dismiss download | | `/models/{name}` | DELETE | Delete downloaded model | diff --git a/justfile b/justfile index 796e3ddd..fd8bf962 100644 --- a/justfile +++ b/justfile @@ -46,6 +46,8 @@ setup-python: {{ pip }} install -r {{ backend_dir }}/requirements.txt # Chatterbox pins numpy<1.26 / torch==2.6 which break on Python 3.12+ {{ pip }} install --no-deps chatterbox-tts + # HumeAI TADA pins torch>=2.7,<2.8 which conflicts with our torch>=2.1 + {{ pip }} install --no-deps hume-tada # Apple Silicon: install MLX backend if [ "$(uname -m)" = "arm64" ] && [ "$(uname)" = "Darwin" ]; then echo "Detected Apple Silicon — installing MLX dependencies..." @@ -74,6 +76,7 @@ setup-python: } & "{{ pip }}" install -r {{ backend_dir }}/requirements.txt & "{{ pip }}" install --no-deps chatterbox-tts + & "{{ pip }}" install --no-deps hume-tada & "{{ pip }}" install git+https://github.com/QwenLM/Qwen3-TTS.git & "{{ pip }}" install pyinstaller ruff pytest pytest-asyncio -q Write-Host "Python environment ready."