From 4e7772a21df5ebfc9dbb54f35ee3772276db3054 Mon Sep 17 00:00:00 2001 From: James Pine Date: Tue, 17 Mar 2026 01:55:15 -0700 Subject: [PATCH 1/7] add HumeAI TADA TTS engine (1B English + 3B Multilingual) Integrates HumeAI's TADA (Text-Acoustic Dual Alignment) speech-language model as a new TTS engine. TADA uses a novel 1:1 token-audio alignment that produces coherent speech over long sequences (700s+). Two model variants: - tada-1b: English-only, ~4GB, built on Llama 3.2 1B - tada-3b-ml: 10 languages, ~8GB, built on Llama 3.2 3B Backend uses the Encoder for voice prompt encoding with caching, and TadaForCausalLM with flow-matching diffusion for generation. Supports bf16 inference on CUDA, forces CPU on macOS (MPS compatibility). Installed with --no-deps due to torch>=2.7 pin conflict; descript-audio-codec and torchaudio added as explicit sub-dependencies. --- .github/workflows/release.yml | 2 + Dockerfile | 2 + .../Generation/EngineModelSelector.tsx | 18 + .../ServerSettings/ModelManagement.tsx | 7 +- app/src/lib/api/types.ts | 4 +- app/src/lib/constants/languages.ts | 1 + app/src/lib/hooks/useGenerationForm.ts | 24 +- backend/backends/__init__.py | 29 +- backend/backends/hume_backend.py | 310 ++++++++++++++++++ backend/build_binary.py | 44 +++ backend/models.py | 4 +- backend/requirements.txt | 5 + justfile | 3 + 13 files changed, 437 insertions(+), 16 deletions(-) create mode 100644 backend/backends/hume_backend.py diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 2b0cf254..d00df528 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -62,6 +62,7 @@ jobs: pip install pyinstaller pip install -r backend/requirements.txt pip install --no-deps chatterbox-tts + pip install --no-deps hume-tada - name: Install MLX dependencies (Apple Silicon only) if: matrix.backend == 'mlx' @@ -188,6 +189,7 @@ jobs: pip install pyinstaller pip install -r backend/requirements.txt pip install --no-deps chatterbox-tts + pip 
install --no-deps hume-tada - name: Install PyTorch with CUDA 12.6 run: | diff --git a/Dockerfile b/Dockerfile index 20da9e1a..1ad85e53 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,6 +35,8 @@ RUN pip install --no-cache-dir --upgrade pip COPY backend/requirements.txt . RUN pip install --no-cache-dir --prefix=/install -r requirements.txt +RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts +RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada RUN pip install --no-cache-dir --prefix=/install \ git+https://github.com/QwenLM/Qwen3-TTS.git diff --git a/app/src/components/Generation/EngineModelSelector.tsx b/app/src/components/Generation/EngineModelSelector.tsx index 77dac03f..4382d3f7 100644 --- a/app/src/components/Generation/EngineModelSelector.tsx +++ b/app/src/components/Generation/EngineModelSelector.tsx @@ -20,6 +20,8 @@ const ENGINE_OPTIONS = [ { value: 'luxtts', label: 'LuxTTS' }, { value: 'chatterbox', label: 'Chatterbox' }, { value: 'chatterbox_turbo', label: 'Chatterbox Turbo' }, + { value: 'tada:1B', label: 'TADA 1B' }, + { value: 'tada:3B', label: 'TADA 3B Multilingual' }, ] as const; const ENGINE_DESCRIPTIONS: Record = { @@ -27,6 +29,7 @@ const ENGINE_DESCRIPTIONS: Record = { luxtts: 'Fast, English-focused', chatterbox: '23 languages, incl. Hebrew', chatterbox_turbo: 'English, [laugh] [cough] tags', + tada: 'HumeAI, 700s+ coherent audio', }; /** Engines that only support English and should force language to 'en' on select. */ @@ -34,6 +37,7 @@ const ENGLISH_ONLY_ENGINES = new Set(['luxtts', 'chatterbox_turbo']); function getSelectValue(engine: string, modelSize?: string): string { if (engine === 'qwen') return `qwen:${modelSize || '1.7B'}`; + if (engine === 'tada') return `tada:${modelSize || '1B'}`; return engine; } @@ -48,6 +52,20 @@ function handleEngineChange(form: UseFormReturn, value: st if (!available.some((l) => l.value === currentLang)) { form.setValue('language', available[0]?.value ?? 
'en'); } + } else if (value.startsWith('tada:')) { + const [, modelSize] = value.split(':'); + form.setValue('engine', 'tada'); + form.setValue('modelSize', modelSize as '1B' | '3B'); + // TADA 1B is English-only; 3B is multilingual + if (modelSize === '1B') { + form.setValue('language', 'en'); + } else { + const currentLang = form.getValues('language'); + const available = getLanguageOptionsForEngine('tada'); + if (!available.some((l) => l.value === currentLang)) { + form.setValue('language', available[0]?.value ?? 'en'); + } + } } else { form.setValue('engine', value as GenerationFormValues['engine']); form.setValue('modelSize', undefined as unknown as '1.7B' | '0.6B'); diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx index 5d113d89..c415306d 100644 --- a/app/src/components/ServerSettings/ModelManagement.tsx +++ b/app/src/components/ServerSettings/ModelManagement.tsx @@ -62,6 +62,10 @@ const MODEL_DESCRIPTIONS: Record = { 'Production-grade open source TTS by Resemble AI. Supports 23 languages with voice cloning and emotion exaggeration control.', 'chatterbox-turbo': 'Streamlined 350M parameter TTS by Resemble AI. High-quality English speech with less compute and VRAM than larger models.', + 'tada-1b': + 'HumeAI TADA 1B — English speech-language model built on Llama 3.2 1B. Generates 700s+ of coherent audio with synchronized text-acoustic alignment.', + 'tada-3b-ml': + 'HumeAI TADA 3B Multilingual — built on Llama 3.2 3B. Supports 10 languages with high-fidelity voice cloning via text-acoustic dual alignment.', 'whisper-base': 'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.', 'whisper-small': @@ -391,7 +395,8 @@ export function ModelManagement() { (m) => m.model_name.startsWith('qwen-tts') || m.model_name.startsWith('luxtts') || - m.model_name.startsWith('chatterbox'), + m.model_name.startsWith('chatterbox') || + m.model_name.startsWith('tada'), ) ?? 
[]; const whisperModels = modelStatus?.models.filter((m) => m.model_name.startsWith('whisper')) ?? []; diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts index daae2a95..aa85d001 100644 --- a/app/src/lib/api/types.ts +++ b/app/src/lib/api/types.ts @@ -42,8 +42,8 @@ export interface GenerationRequest { text: string; language: LanguageCode; seed?: number; - model_size?: '1.7B' | '0.6B'; - engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo'; + model_size?: '1.7B' | '0.6B' | '1B' | '3B'; + engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada'; instruct?: string; max_chunk_chars?: number; crossfade_ms?: number; diff --git a/app/src/lib/constants/languages.ts b/app/src/lib/constants/languages.ts index 19d6bca6..a0d233a5 100644 --- a/app/src/lib/constants/languages.ts +++ b/app/src/lib/constants/languages.ts @@ -66,6 +66,7 @@ export const ENGINE_LANGUAGES: Record = { 'zh', ], chatterbox_turbo: ['en'], + tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'], } as const; /** Helper: get language options for a given engine. */ diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts index 74f9a94c..d174bad0 100644 --- a/app/src/lib/hooks/useGenerationForm.ts +++ b/app/src/lib/hooks/useGenerationForm.ts @@ -15,9 +15,9 @@ const generationSchema = z.object({ text: z.string().min(1, '').max(50000), language: z.enum(LANGUAGE_CODES as [LanguageCode, ...LanguageCode[]]), seed: z.number().int().optional(), - modelSize: z.enum(['1.7B', '0.6B']).optional(), + modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(), instruct: z.string().max(500).optional(), - engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo']).optional(), + engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']).optional(), }); export type GenerationFormValues = z.infer; @@ -79,7 +79,9 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { ? 
'chatterbox-tts' : engine === 'chatterbox_turbo' ? 'chatterbox-turbo' - : `qwen-tts-${data.modelSize}`; + : engine === 'tada' + ? `tada-${(data.modelSize || '1B').toLowerCase()}` + : `qwen-tts-${data.modelSize}`; const displayName = engine === 'luxtts' ? 'LuxTTS' @@ -87,9 +89,13 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { ? 'Chatterbox TTS' : engine === 'chatterbox_turbo' ? 'Chatterbox Turbo' - : data.modelSize === '1.7B' - ? 'Qwen TTS 1.7B' - : 'Qwen TTS 0.6B'; + : engine === 'tada' + ? data.modelSize === '3B' + ? 'TADA 3B Multilingual' + : 'TADA 1B' + : data.modelSize === '1.7B' + ? 'Qwen TTS 1.7B' + : 'Qwen TTS 0.6B'; // Check if model needs downloading try { @@ -104,7 +110,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { console.error('Failed to check model status:', error); } - const isQwen = engine === 'qwen'; + const hasModelSizes = engine === 'qwen' || engine === 'tada'; const effectsChain = options.getEffectsChain?.(); // This now returns immediately with status="generating" const result = await generation.mutateAsync({ @@ -112,9 +118,9 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { text: data.text, language: data.language, seed: data.seed, - model_size: isQwen ? data.modelSize : undefined, + model_size: hasModelSizes ? data.modelSize : undefined, engine, - instruct: isQwen ? data.instruct || undefined : undefined, + instruct: engine === 'qwen' ? 
data.instruct || undefined : undefined, max_chunk_chars: maxChunkChars, crossfade_ms: crossfadeMs, normalize: normalizeAudio, diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py index 6f20f3de..a4f5113a 100644 --- a/backend/backends/__init__.py +++ b/backend/backends/__init__.py @@ -166,6 +166,7 @@ def is_loaded(self) -> bool: "luxtts": "LuxTTS", "chatterbox": "Chatterbox TTS", "chatterbox_turbo": "Chatterbox Turbo", + "tada": "TADA", } @@ -259,6 +260,24 @@ def _get_non_qwen_tts_configs() -> list[ModelConfig]: needs_trim=True, languages=["en"], ), + ModelConfig( + model_name="tada-1b", + display_name="TADA 1B (English)", + engine="tada", + hf_repo_id="HumeAI/tada-1b", + model_size="1B", + size_mb=4000, + languages=["en"], + ), + ModelConfig( + model_name="tada-3b-ml", + display_name="TADA 3B Multilingual", + engine="tada", + hf_repo_id="HumeAI/tada-3b-ml", + model_size="3B", + size_mb=8000, + languages=["en", "ar", "zh", "de", "es", "fr", "it", "ja", "pl", "pt"], + ), ] @@ -339,10 +358,12 @@ def engine_has_model_sizes(engine: str) -> bool: async def load_engine_model(engine: str, model_size: str = "default") -> None: - """Load a model for the given engine, handling the Qwen model_size special case.""" + """Load a model for the given engine, handling engines with multiple model sizes.""" backend = get_tts_backend_for_engine(engine) if engine == "qwen": await backend.load_model_async(model_size) + elif engine == "tada": + await backend.load_model(model_size) else: await backend.load_model() @@ -358,7 +379,7 @@ async def ensure_model_cached_or_raise(engine: str, model_size: str = "default") cfg = c break - if engine == "qwen": + if engine in ("qwen", "tada"): if not backend._is_model_cached(model_size): raise HTTPException( status_code=400, @@ -490,6 +511,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend: from .chatterbox_turbo_backend import ChatterboxTurboTTSBackend backend = ChatterboxTurboTTSBackend() + elif engine == "tada": + 
from .hume_backend import HumeTadaBackend + + backend = HumeTadaBackend() else: raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}") diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py new file mode 100644 index 00000000..1d4a4600 --- /dev/null +++ b/backend/backends/hume_backend.py @@ -0,0 +1,310 @@ +""" +HumeAI TADA TTS backend implementation. + +Wraps HumeAI's TADA (Text-Acoustic Dual Alignment) model for +high-quality voice cloning. Two model variants: + - tada-1b: English-only, ~2B params (Llama 3.2 1B base) + - tada-3b-ml: Multilingual, ~4B params (Llama 3.2 3B base) + +Both use a shared encoder/codec (HumeAI/tada-codec). The encoder +produces 1:1 aligned token embeddings from reference audio, and the +causal LM generates speech via flow-matching diffusion. + +24kHz output, bf16 inference on CUDA, fp32 on CPU. +""" + +import asyncio +import logging +import threading +from typing import ClassVar, List, Optional, Tuple + +import numpy as np + +from . 
import TTSBackend +from .base import ( + is_model_cached, + get_torch_device, + combine_voice_prompts as _combine_voice_prompts, + model_load_progress, +) +from ..utils.cache import get_cache_key, get_cached_voice_prompt, cache_voice_prompt + +logger = logging.getLogger(__name__) + +# HuggingFace repos +TADA_CODEC_REPO = "HumeAI/tada-codec" +TADA_1B_REPO = "HumeAI/tada-1b" +TADA_3B_ML_REPO = "HumeAI/tada-3b-ml" + +TADA_MODEL_REPOS = { + "1B": TADA_1B_REPO, + "3B": TADA_3B_ML_REPO, +} + +# Key weight files for cache detection +_TADA_MODEL_WEIGHT_FILES = [ + "model.safetensors", +] + +_TADA_CODEC_WEIGHT_FILES = [ + "encoder/model.safetensors", +] + + +class HumeTadaBackend: + """HumeAI TADA TTS backend for high-quality voice cloning.""" + + _load_lock: ClassVar[threading.Lock] = threading.Lock() + + def __init__(self): + self.model = None + self.encoder = None + self.model_size = "1B" # default to 1B + self._device = None + self._model_load_lock = asyncio.Lock() + + def _get_device(self) -> str: + # Force CPU on macOS — MPS has issues with flow matching + # and large vocab lm_head (>65536 output channels) + return get_torch_device(force_cpu_on_mac=True) + + def is_loaded(self) -> bool: + return self.model is not None + + def _get_model_path(self, model_size: str = "1B") -> str: + return TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO) + + def _is_model_cached(self, model_size: str = "1B") -> bool: + repo = TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO) + model_cached = is_model_cached(repo, required_files=_TADA_MODEL_WEIGHT_FILES) + codec_cached = is_model_cached(TADA_CODEC_REPO, required_files=_TADA_CODEC_WEIGHT_FILES) + return model_cached and codec_cached + + async def load_model(self, model_size: str = "1B") -> None: + """Load the TADA model and encoder.""" + if self.model is not None and self.model_size == model_size: + return + async with self._model_load_lock: + if self.model is not None and self.model_size == model_size: + return + # Unload existing model if 
switching sizes + if self.model is not None: + self.unload_model() + self.model_size = model_size + await asyncio.to_thread(self._load_model_sync, model_size) + + def _load_model_sync(self, model_size: str = "1B"): + """Synchronous model loading with progress tracking.""" + model_name = f"tada-{model_size.lower()}" + is_cached = self._is_model_cached(model_size) + repo = TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO) + + with model_load_progress(model_name, is_cached): + import torch + from huggingface_hub import snapshot_download + + device = self._get_device() + self._device = device + logger.info(f"Loading HumeAI TADA {model_size} on {device}...") + + # Download codec (encoder + decoder) if not cached + logger.info("Downloading TADA codec...") + snapshot_download( + repo_id=TADA_CODEC_REPO, + token=None, + allow_patterns=["*.safetensors", "*.json", "*.txt", "*.bin"], + ) + + # Download model weights if not cached + logger.info(f"Downloading TADA {model_size} model...") + snapshot_download( + repo_id=repo, + token=None, + allow_patterns=["*.safetensors", "*.json", "*.txt", "*.bin", "*.model"], + ) + + # Determine dtype — use bf16 on CUDA for ~50% memory savings + if device == "cuda" and torch.cuda.is_bf16_supported(): + model_dtype = torch.bfloat16 + else: + model_dtype = torch.float32 + + # Load encoder (only needed for voice prompt encoding) + from tada.modules.encoder import Encoder + logger.info("Loading TADA encoder...") + self.encoder = Encoder.from_pretrained( + TADA_CODEC_REPO, subfolder="encoder" + ).to(device) + self.encoder.eval() + + # Load the causal LM (includes decoder for wav generation) + from tada.modules.tada import TadaForCausalLM + logger.info(f"Loading TADA {model_size} model...") + self.model = TadaForCausalLM.from_pretrained( + repo, torch_dtype=model_dtype + ).to(device) + self.model.eval() + + logger.info(f"HumeAI TADA {model_size} loaded successfully on {device}") + + def unload_model(self) -> None: + """Unload model and encoder to free 
memory.""" + if self.model is not None: + del self.model + self.model = None + if self.encoder is not None: + del self.encoder + self.encoder = None + + self._device = None + + import torch + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + logger.info("HumeAI TADA unloaded") + + async def create_voice_prompt( + self, + audio_path: str, + reference_text: str, + use_cache: bool = True, + ) -> Tuple[dict, bool]: + """ + Create voice prompt from reference audio using TADA's encoder. + + TADA's encoder performs forced alignment between audio and text tokens, + producing an EncoderOutput with 1:1 token-audio alignment. If no + reference_text is provided, the encoder uses built-in ASR (English only). + + We serialize the EncoderOutput to a dict for caching. + """ + await self.load_model(self.model_size) + + cache_key = ( + "tada_" + get_cache_key(audio_path, reference_text) + ) if use_cache else None + + if cache_key: + cached = get_cached_voice_prompt(cache_key) + if cached is not None and isinstance(cached, dict): + return cached, True + + def _encode_sync(): + import torch + import torchaudio + + device = self._device + + # Load and prepare audio + audio, sr = torchaudio.load(str(audio_path)) + audio = audio.to(device) + + # Encode with forced alignment + text_arg = [reference_text] if reference_text else None + prompt = self.encoder( + audio, text=text_arg, sample_rate=sr + ) + + # Serialize EncoderOutput to a dict of CPU tensors for caching + prompt_dict = {} + for field_name in prompt.__dataclass_fields__: + val = getattr(prompt, field_name) + if isinstance(val, torch.Tensor): + prompt_dict[field_name] = val.detach().cpu() + elif isinstance(val, list): + prompt_dict[field_name] = val + elif isinstance(val, (int, float)): + prompt_dict[field_name] = val + else: + prompt_dict[field_name] = val + return prompt_dict + + encoded = await asyncio.to_thread(_encode_sync) + + if cache_key: + cache_voice_prompt(cache_key, encoded) + + return encoded, False + + 
async def combine_voice_prompts( + self, + audio_paths: List[str], + reference_texts: List[str], + ) -> Tuple[np.ndarray, str]: + return await _combine_voice_prompts(audio_paths, reference_texts, sample_rate=24000) + + async def generate( + self, + text: str, + voice_prompt: dict, + language: str = "en", + seed: Optional[int] = None, + instruct: Optional[str] = None, + ) -> Tuple[np.ndarray, int]: + """ + Generate audio from text using HumeAI TADA. + + Args: + text: Text to synthesize + voice_prompt: Serialized EncoderOutput dict from create_voice_prompt() + language: Language code (en, ar, de, es, fr, it, ja, pl, pt, zh) + seed: Random seed for reproducibility + instruct: Not supported by TADA (ignored) + + Returns: + Tuple of (audio_array, sample_rate=24000) + """ + await self.load_model(self.model_size) + + def _generate_sync(): + import torch + from tada.modules.encoder import EncoderOutput + + if seed is not None: + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + device = self._device + + # Reconstruct EncoderOutput from the cached dict + restored = {} + for k, v in voice_prompt.items(): + if isinstance(v, torch.Tensor): + # Move to device and match model dtype for float tensors + if v.is_floating_point(): + model_dtype = next(self.model.parameters()).dtype + restored[k] = v.to(device=device, dtype=model_dtype) + else: + restored[k] = v.to(device=device) + else: + restored[k] = v + + prompt = EncoderOutput(**restored) + + # For non-English with the 3B-ML model, we could reload the + # encoder with the language-specific aligner. However, the + # generation itself is language-agnostic — only the encoder's + # aligner changes. Since we encode at create_voice_prompt time, + # the language is already baked in. For simplicity, we don't + # reload the encoder here. 
+ + logger.info(f"[TADA] Generating ({language}), text length: {len(text)}") + + output = self.model.generate( + prompt=prompt, + text=text, + ) + + # output.audio is a list of tensors (one per batch item) + if output.audio and output.audio[0] is not None: + audio_tensor = output.audio[0] + audio = audio_tensor.detach().cpu().numpy().squeeze().astype(np.float32) + else: + logger.warning("[TADA] Generation produced no audio") + audio = np.zeros(24000, dtype=np.float32) + + return audio, 24000 + + return await asyncio.to_thread(_generate_sync) diff --git a/backend/build_binary.py b/backend/build_binary.py index f9cdb1b7..0c6dfebd 100644 --- a/backend/build_binary.py +++ b/backend/build_binary.py @@ -186,6 +186,50 @@ def build_server(cuda=False): # needed by LuxTTS for text-to-phoneme conversion "--collect-all", "piper_phonemize", + # HumeAI TADA — speech-language model using Llama + flow matching + "--hidden-import", + "backend.backends.hume_backend", + "--hidden-import", + "tada", + "--hidden-import", + "tada.modules", + "--hidden-import", + "tada.modules.tada", + "--hidden-import", + "tada.modules.encoder", + "--hidden-import", + "tada.modules.decoder", + "--hidden-import", + "tada.modules.aligner", + "--hidden-import", + "tada.modules.acoustic_spkr_verf", + "--hidden-import", + "tada.nn", + "--hidden-import", + "tada.nn.vibevoice", + "--hidden-import", + "tada.utils", + "--hidden-import", + "tada.utils.gray_code", + "--hidden-import", + "tada.utils.text", + # descript-audio-codec (DAC) — used by TADA for Snake1d layers + "--hidden-import", + "dac", + "--hidden-import", + "dac.nn", + "--hidden-import", + "dac.nn.layers", + "--hidden-import", + "dac.model", + "--hidden-import", + "dac.model.dac", + "--collect-all", + "dac", + "--hidden-import", + "torchaudio", + "--collect-submodules", + "tada", ] ) diff --git a/backend/models.py b/backend/models.py index 3308b3bc..4dd2b368 100644 --- a/backend/models.py +++ b/backend/models.py @@ -66,9 +66,9 @@ class 
GenerationRequest(BaseModel): text: str = Field(..., min_length=1, max_length=50000) language: str = Field(default="en", pattern="^(zh|en|ja|ko|de|fr|ru|pt|es|it|he|ar|da|el|fi|hi|ms|nl|no|pl|sv|sw|tr)$") seed: Optional[int] = Field(None, ge=0) - model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B)$") + model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B)$") instruct: Optional[str] = Field(None, max_length=500) - engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo)$") + engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada)$") max_chunk_chars: int = Field( default=800, ge=100, le=5000, description="Max characters per chunk for long text splitting" ) diff --git a/backend/requirements.txt b/backend/requirements.txt index 0d927975..2bba9d70 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -33,6 +33,11 @@ s3tokenizer spacy-pkuseg pyloudnorm +# HumeAI TADA sub-dependencies (hume-tada itself is installed +# --no-deps in the setup script because it pins torch>=2.7,<2.8) +descript-audio-codec>=1.0.0 +torchaudio + # Audio processing librosa>=0.10.0 soundfile>=0.12.0 diff --git a/justfile b/justfile index 796e3ddd..fd8bf962 100644 --- a/justfile +++ b/justfile @@ -46,6 +46,8 @@ setup-python: {{ pip }} install -r {{ backend_dir }}/requirements.txt # Chatterbox pins numpy<1.26 / torch==2.6 which break on Python 3.12+ {{ pip }} install --no-deps chatterbox-tts + # HumeAI TADA pins torch>=2.7,<2.8 which conflicts with our torch>=2.1 + {{ pip }} install --no-deps hume-tada # Apple Silicon: install MLX backend if [ "$(uname -m)" = "arm64" ] && [ "$(uname)" = "Darwin" ]; then echo "Detected Apple Silicon — installing MLX dependencies..." 
@@ -74,6 +76,7 @@ setup-python: } & "{{ pip }}" install -r {{ backend_dir }}/requirements.txt & "{{ pip }}" install --no-deps chatterbox-tts + & "{{ pip }}" install --no-deps hume-tada & "{{ pip }}" install git+https://github.com/QwenLM/Qwen3-TTS.git & "{{ pip }}" install pyinstaller ruff pytest pytest-asyncio -q Write-Host "Python environment ready." From b02ce8e2f31f59c6803b6b83040ca6577a38ad45 Mon Sep 17 00:00:00 2001 From: James Pine Date: Tue, 17 Mar 2026 02:16:33 -0700 Subject: [PATCH 2/7] replace descript-audio-codec with lightweight DAC shim The real descript-audio-codec package pulls in descript-audiotools, which transitively requires onnx, tensorboard, protobuf, matplotlib, pystoi, and other heavy dependencies. onnx fails to build from source on macOS due to CMake version incompatibility. TADA only uses Snake1d (a 7-line PyTorch module) from DAC. This commit adds a shim in backend/utils/dac_shim.py that registers fake dac.* modules in sys.modules with just the Snake1d class, completely eliminating the DAC/audiotools dependency chain. --- backend/backends/hume_backend.py | 7 +++ backend/build_binary.py | 16 ++---- backend/requirements.txt | 6 ++- backend/utils/dac_shim.py | 93 ++++++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 14 deletions(-) create mode 100644 backend/utils/dac_shim.py diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py index 1d4a4600..b3beab25 100644 --- a/backend/backends/hume_backend.py +++ b/backend/backends/hume_backend.py @@ -100,6 +100,13 @@ def _load_model_sync(self, model_size: str = "1B"): repo = TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO) with model_load_progress(model_name, is_cached): + # Install DAC shim before importing tada — tada's encoder/decoder + # import dac.nn.layers.Snake1d which requires the descript-audio-codec + # package. The real package pulls in onnx/tensorboard/matplotlib via + # descript-audiotools, so we use a lightweight shim instead. 
+ from ..utils.dac_shim import install_dac_shim + install_dac_shim() + import torch from huggingface_hub import snapshot_download diff --git a/backend/build_binary.py b/backend/build_binary.py index 0c6dfebd..0574894f 100644 --- a/backend/build_binary.py +++ b/backend/build_binary.py @@ -213,19 +213,11 @@ def build_server(cuda=False): "tada.utils.gray_code", "--hidden-import", "tada.utils.text", - # descript-audio-codec (DAC) — used by TADA for Snake1d layers + # DAC shim — provides dac.nn.layers.Snake1d without the real + # descript-audio-codec package (which pulls onnx/tensorboard via + # descript-audiotools). The shim is in backend/utils/dac_shim.py. "--hidden-import", - "dac", - "--hidden-import", - "dac.nn", - "--hidden-import", - "dac.nn.layers", - "--hidden-import", - "dac.model", - "--hidden-import", - "dac.model.dac", - "--collect-all", - "dac", + "backend.utils.dac_shim", "--hidden-import", "torchaudio", "--collect-submodules", diff --git a/backend/requirements.txt b/backend/requirements.txt index 2bba9d70..d77f97be 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -34,8 +34,10 @@ spacy-pkuseg pyloudnorm # HumeAI TADA sub-dependencies (hume-tada itself is installed -# --no-deps in the setup script because it pins torch>=2.7,<2.8) -descript-audio-codec>=1.0.0 +# --no-deps in the setup script because it pins torch>=2.7,<2.8. +# descript-audio-codec is NOT installed — it pulls onnx/tensorboard +# via descript-audiotools. A lightweight shim in utils/dac_shim.py +# provides the only class TADA uses: Snake1d.) torchaudio # Audio processing diff --git a/backend/utils/dac_shim.py b/backend/utils/dac_shim.py new file mode 100644 index 00000000..ea1294fc --- /dev/null +++ b/backend/utils/dac_shim.py @@ -0,0 +1,93 @@ +""" +Minimal shim for descript-audio-codec (DAC). + +TADA only imports Snake1d from dac.nn.layers and dac.model.dac. 
+The real DAC package pulls in descript-audiotools which depends on +onnx, tensorboard, protobuf, matplotlib, pystoi, etc. — none of +which are needed for TADA's runtime use of Snake1d. + +This shim provides the exact Snake1d implementation (MIT-licensed, +from https://github.com/descriptinc/descript-audio-codec) so we can +avoid the entire audiotools dependency chain. + +If the real DAC package is installed, this module is never used — +Python's import system will find the site-packages version first. +Install this shim only when descript-audio-codec is NOT installed. +""" + +import sys +import types + +import torch +import torch.nn as nn + + +# ── Snake activation (from dac/nn/layers.py) ──────────────────────── + +@torch.jit.script +def snake(x: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor: + shape = x.shape + x = x.reshape(shape[0], shape[1], -1) + x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) + x = x.reshape(shape) + return x + + +class Snake1d(nn.Module): + def __init__(self, channels: int): + super().__init__() + self.alpha = nn.Parameter(torch.ones(1, channels, 1)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return snake(x, self.alpha) + + +# ── Register as dac.nn.layers and dac.model.dac ───────────────────── + +def install_dac_shim() -> None: + """Register fake dac package modules in sys.modules. + + Only installs the shim if 'dac' is not already importable + (i.e. the real descript-audio-codec is not installed). 
+ """ + try: + import dac # noqa: F401 — real package exists, do nothing + return + except ImportError: + pass + + # Create the module tree: dac -> dac.nn -> dac.nn.layers + # -> dac.model -> dac.model.dac + dac_pkg = types.ModuleType("dac") + dac_pkg.__path__ = [] # make it a package + dac_pkg.__package__ = "dac" + + dac_nn = types.ModuleType("dac.nn") + dac_nn.__path__ = [] + dac_nn.__package__ = "dac.nn" + + dac_nn_layers = types.ModuleType("dac.nn.layers") + dac_nn_layers.__package__ = "dac.nn" + dac_nn_layers.Snake1d = Snake1d + dac_nn_layers.snake = snake + + dac_model = types.ModuleType("dac.model") + dac_model.__path__ = [] + dac_model.__package__ = "dac.model" + + dac_model_dac = types.ModuleType("dac.model.dac") + dac_model_dac.__package__ = "dac.model" + dac_model_dac.Snake1d = Snake1d + + # Wire up submodules + dac_pkg.nn = dac_nn + dac_pkg.model = dac_model + dac_nn.layers = dac_nn_layers + dac_model.dac = dac_model_dac + + # Register in sys.modules + sys.modules["dac"] = dac_pkg + sys.modules["dac.nn"] = dac_nn + sys.modules["dac.nn.layers"] = dac_nn_layers + sys.modules["dac.model"] = dac_model + sys.modules["dac.model.dac"] = dac_model_dac From 7a90290a763bfdc0a1b00825d9fe3382914506dd Mon Sep 17 00:00:00 2001 From: James Pine Date: Tue, 17 Mar 2026 02:22:26 -0700 Subject: [PATCH 3/7] fix gated Llama tokenizer error by redirecting to ungated mirror TADA hardcodes 'meta-llama/Llama-3.2-1B' as its tokenizer source in both the Aligner and TadaForCausalLM.from_pretrained(). That repo is gated and requires accepting Meta's license on HuggingFace. Monkey-patch AutoTokenizer.from_pretrained during model loading to redirect Llama tokenizer requests to 'unsloth/Llama-3.2-1B', an ungated mirror with identical tokenizer files. The patch is scoped to model loading only and restored immediately after. 
--- backend/backends/hume_backend.py | 62 ++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py index b3beab25..804ce936 100644 --- a/backend/backends/hume_backend.py +++ b/backend/backends/hume_backend.py @@ -130,27 +130,59 @@ def _load_model_sync(self, model_size: str = "1B"): allow_patterns=["*.safetensors", "*.json", "*.txt", "*.bin", "*.model"], ) + # Pre-download the Llama tokenizer from an ungated mirror. + # TADA hardcodes "meta-llama/Llama-3.2-1B" which is gated; + # we redirect to unsloth's ungated copy at load time. + logger.info("Downloading Llama tokenizer (ungated mirror)...") + snapshot_download( + repo_id="unsloth/Llama-3.2-1B", + token=None, + allow_patterns=["tokenizer*", "special_tokens*"], + ) + # Determine dtype — use bf16 on CUDA for ~50% memory savings if device == "cuda" and torch.cuda.is_bf16_supported(): model_dtype = torch.bfloat16 else: model_dtype = torch.float32 - # Load encoder (only needed for voice prompt encoding) - from tada.modules.encoder import Encoder - logger.info("Loading TADA encoder...") - self.encoder = Encoder.from_pretrained( - TADA_CODEC_REPO, subfolder="encoder" - ).to(device) - self.encoder.eval() - - # Load the causal LM (includes decoder for wav generation) - from tada.modules.tada import TadaForCausalLM - logger.info(f"Loading TADA {model_size} model...") - self.model = TadaForCausalLM.from_pretrained( - repo, torch_dtype=model_dtype - ).to(device) - self.model.eval() + # TADA hardcodes "meta-llama/Llama-3.2-1B" as the tokenizer + # source in its Aligner and TadaForCausalLM.from_pretrained(). + # That repo is gated (requires Meta license acceptance on HF). + # Monkey-patch AutoTokenizer.from_pretrained to redirect to an + # ungated mirror that ships the identical tokenizer files. 
+ from transformers import AutoTokenizer + _orig_from_pretrained = AutoTokenizer.from_pretrained.__func__ + + @classmethod # type: ignore[misc] + def _patched_from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + if "meta-llama/Llama-3.2" in str(pretrained_model_name_or_path): + pretrained_model_name_or_path = "unsloth/Llama-3.2-1B" + kwargs.setdefault("token", None) + logger.info("Redirecting Llama tokenizer to ungated mirror: unsloth/Llama-3.2-1B") + return _orig_from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs) + + AutoTokenizer.from_pretrained = _patched_from_pretrained + + try: + # Load encoder (only needed for voice prompt encoding) + from tada.modules.encoder import Encoder + logger.info("Loading TADA encoder...") + self.encoder = Encoder.from_pretrained( + TADA_CODEC_REPO, subfolder="encoder" + ).to(device) + self.encoder.eval() + + # Load the causal LM (includes decoder for wav generation) + from tada.modules.tada import TadaForCausalLM + logger.info(f"Loading TADA {model_size} model...") + self.model = TadaForCausalLM.from_pretrained( + repo, torch_dtype=model_dtype + ).to(device) + self.model.eval() + finally: + # Restore original to avoid affecting other code + AutoTokenizer.from_pretrained = _orig_from_pretrained logger.info(f"HumeAI TADA {model_size} loaded successfully on {device}") From 12cda2e0905aa80066e77ee01a540d8b66765cac Mon Sep 17 00:00:00 2001 From: James Pine Date: Tue, 17 Mar 2026 02:25:05 -0700 Subject: [PATCH 4/7] fix torchcodec error by using soundfile instead of torchaudio.load torchaudio 2.10+ switched its default audio loading backend to torchcodec, which isn't installed. Replace torchaudio.load() with soundfile.read() in create_voice_prompt(). TADA's internal use of torchaudio.functional.resample() is unaffected (pure PyTorch math, no torchcodec dependency). 
--- backend/backends/hume_backend.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py index 804ce936..d82afb4e 100644 --- a/backend/backends/hume_backend.py +++ b/backend/backends/hume_backend.py @@ -231,12 +231,17 @@ async def create_voice_prompt( def _encode_sync(): import torch - import torchaudio + import soundfile as sf device = self._device - # Load and prepare audio - audio, sr = torchaudio.load(str(audio_path)) + # Load audio with soundfile (torchaudio 2.10+ requires torchcodec) + audio_np, sr = sf.read(str(audio_path), dtype="float32") + audio = torch.from_numpy(audio_np).float() + if audio.ndim == 1: + audio = audio.unsqueeze(0) # (samples,) -> (1, samples) + else: + audio = audio.T # (samples, channels) -> (channels, samples) audio = audio.to(device) # Encode with forced alignment From 6bf40bd2d07fbdc84a3c220cf021a732eae2414c Mon Sep 17 00:00:00 2001 From: James Pine Date: Tue, 17 Mar 2026 03:15:57 -0700 Subject: [PATCH 5/7] fix tokenizer patch corrupting AutoTokenizer for other engines Replace the monkey-patch on AutoTokenizer.from_pretrained (which broke the classmethod descriptor and caused 'Tokenizer not loaded' errors when loading Qwen after TADA) with two targeted config patches: - Set AlignerConfig.tokenizer_name to the local ungated tokenizer path - Pre-load TadaConfig, inject tokenizer_name, pass config= to from_pretrained No transformers-level global state is modified (only TADA's own AlignerConfig class attribute is overridden); other engines are unaffected.
--- backend/backends/hume_backend.py | 75 +++++++++++++++----------------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py index d82afb4e..456fd46e 100644 --- a/backend/backends/hume_backend.py +++ b/backend/backends/hume_backend.py @@ -130,11 +130,13 @@ def _load_model_sync(self, model_size: str = "1B"): allow_patterns=["*.safetensors", "*.json", "*.txt", "*.bin", "*.model"], ) - # Pre-download the Llama tokenizer from an ungated mirror. - # TADA hardcodes "meta-llama/Llama-3.2-1B" which is gated; - # we redirect to unsloth's ungated copy at load time. + # TADA hardcodes "meta-llama/Llama-3.2-1B" as the tokenizer + # source in its Aligner and TadaForCausalLM.from_pretrained(). + # That repo is gated (requires Meta license acceptance). + # Download the tokenizer from an ungated mirror and get its + # local cache path so we can point TADA at it directly. logger.info("Downloading Llama tokenizer (ungated mirror)...") - snapshot_download( + tokenizer_path = snapshot_download( repo_id="unsloth/Llama-3.2-1B", token=None, allow_patterns=["tokenizer*", "special_tokens*"], @@ -146,43 +148,34 @@ def _load_model_sync(self, model_size: str = "1B"): else: model_dtype = torch.float32 - # TADA hardcodes "meta-llama/Llama-3.2-1B" as the tokenizer - # source in its Aligner and TadaForCausalLM.from_pretrained(). - # That repo is gated (requires Meta license acceptance on HF). - # Monkey-patch AutoTokenizer.from_pretrained to redirect to an - # ungated mirror that ships the identical tokenizer files. 
- from transformers import AutoTokenizer - _orig_from_pretrained = AutoTokenizer.from_pretrained.__func__ - - @classmethod # type: ignore[misc] - def _patched_from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): - if "meta-llama/Llama-3.2" in str(pretrained_model_name_or_path): - pretrained_model_name_or_path = "unsloth/Llama-3.2-1B" - kwargs.setdefault("token", None) - logger.info("Redirecting Llama tokenizer to ungated mirror: unsloth/Llama-3.2-1B") - return _orig_from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs) - - AutoTokenizer.from_pretrained = _patched_from_pretrained - - try: - # Load encoder (only needed for voice prompt encoding) - from tada.modules.encoder import Encoder - logger.info("Loading TADA encoder...") - self.encoder = Encoder.from_pretrained( - TADA_CODEC_REPO, subfolder="encoder" - ).to(device) - self.encoder.eval() - - # Load the causal LM (includes decoder for wav generation) - from tada.modules.tada import TadaForCausalLM - logger.info(f"Loading TADA {model_size} model...") - self.model = TadaForCausalLM.from_pretrained( - repo, torch_dtype=model_dtype - ).to(device) - self.model.eval() - finally: - # Restore original to avoid affecting other code - AutoTokenizer.from_pretrained = _orig_from_pretrained + # Patch the Aligner config class to use the local tokenizer + # path instead of the gated "meta-llama/Llama-3.2-1B" default. + # This avoids monkey-patching AutoTokenizer.from_pretrained + # which corrupts the classmethod descriptor for other engines. + from tada.modules.aligner import AlignerConfig + AlignerConfig.tokenizer_name = tokenizer_path + + # Load encoder (only needed for voice prompt encoding) + from tada.modules.encoder import Encoder + logger.info("Loading TADA encoder...") + self.encoder = Encoder.from_pretrained( + TADA_CODEC_REPO, subfolder="encoder" + ).to(device) + self.encoder.eval() + + # Load the causal LM (includes decoder for wav generation). 
+ # TadaForCausalLM.from_pretrained() calls + # getattr(config, "tokenizer_name", "meta-llama/Llama-3.2-1B") + # which hits the gated repo. Pre-load the config from HF, + # inject the local tokenizer path, then pass it in. + from tada.modules.tada import TadaForCausalLM, TadaConfig + logger.info(f"Loading TADA {model_size} model...") + config = TadaConfig.from_pretrained(repo) + config.tokenizer_name = tokenizer_path + self.model = TadaForCausalLM.from_pretrained( + repo, config=config, torch_dtype=model_dtype + ).to(device) + self.model.eval() logger.info(f"HumeAI TADA {model_size} loaded successfully on {device}") From 5774a168a9874c27acd4854f7194a52eb135318d Mon Sep 17 00:00:00 2001 From: James Pine Date: Tue, 17 Mar 2026 03:17:53 -0700 Subject: [PATCH 6/7] fix TADA 3B model name: tada-3b -> tada-3b-ml --- app/src/lib/hooks/useGenerationForm.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts index d174bad0..8e73ce07 100644 --- a/app/src/lib/hooks/useGenerationForm.ts +++ b/app/src/lib/hooks/useGenerationForm.ts @@ -80,7 +80,9 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { : engine === 'chatterbox_turbo' ? 'chatterbox-turbo' : engine === 'tada' - ? `tada-${(data.modelSize || '1B').toLowerCase()}` + ? data.modelSize === '3B' + ? 'tada-3b-ml' + : 'tada-1b' : `qwen-tts-${data.modelSize}`; const displayName = engine === 'luxtts' From 273483ffcfab2056804ce02559c5d1129d67477c Mon Sep 17 00:00:00 2001 From: James Pine Date: Tue, 17 Mar 2026 03:28:58 -0700 Subject: [PATCH 7/7] fix TorchScript error in frozen builds and update docs for TADA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove @torch.jit.script from the DAC shim's snake() function — TorchScript calls inspect.getsource() which fails in PyInstaller binaries (no .py source files). 
Update all user-facing docs: 4 → 5 TTS engines, add TADA row to every engine comparison table, mark TADA as Shipped in the upcoming engines list, update architecture diagrams and tech stack tables. --- README.md | 11 ++++++----- backend/utils/dac_shim.py | 4 +++- docs/content/docs/developer/tts-engines.mdx | 2 +- docs/content/docs/index.mdx | 4 ++-- docs/content/docs/overview/introduction.mdx | 9 +++++---- docs/notes/PROJECT_STATUS.md | 18 +++++++++++++----- 6 files changed, 30 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 614e4f21..1acc9a4f 100644 --- a/README.md +++ b/README.md @@ -59,10 +59,10 @@ ## What is Voicebox? -Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. +Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. 
- **Complete privacy** — models and voice data stay on your machine -- **4 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo +- **5 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA - **23 languages** — from English to Arabic, Japanese, Hindi, Swahili, and more - **Post-processing effects** — pitch shift, reverb, delay, chorus, compression, and filters - **Expressive speech** — paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo @@ -93,7 +93,7 @@ Voicebox is a **local-first voice cloning studio** — a free and open-source al ### Multi-Engine Voice Cloning -Four TTS engines with different strengths, switchable per-generation: +Five TTS engines with different strengths, switchable per-generation: | Engine | Languages | Strengths | | --------------------------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------- | @@ -101,6 +101,7 @@ Four TTS engines with different strengths, switchable per-generation: | **LuxTTS** | English | Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU | | **Chatterbox Multilingual** | 23 | Broadest language coverage — Arabic, Danish, Finnish, Greek, Hebrew, Hindi, Malay, Norwegian, Polish, Swahili, Swedish, Turkish and more | | **Chatterbox Turbo** | English | Fast 350M model with paralinguistic emotion/sound tags | +| **TADA** (1B / 3B) | 10 | HumeAI speech-language model — 700s+ coherent audio, text-acoustic dual alignment | ### Emotions & Paralinguistic Tags @@ -230,7 +231,7 @@ Full API documentation available at `http://localhost:17493/docs`. 
| Frontend | React, TypeScript, Tailwind CSS | | State | Zustand, React Query | | Backend | FastAPI (Python) | -| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo | +| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo, TADA | | Effects | Pedalboard (Spotify) | | Transcription | Whisper / Whisper Turbo (PyTorch or MLX) | | Inference | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) | @@ -245,7 +246,7 @@ Full API documentation available at `http://localhost:17493/docs`. | ----------------------- | ---------------------------------------------- | | **Real-time Streaming** | Stream audio as it generates, word by word | | **Voice Design** | Create new voices from text descriptions | -| **More Models** | XTTS, Bark, and other open-source voice models | +| **More Models** | XTTS, Bark, and other open-source voice models | | **Plugin Architecture** | Extend with custom models and effects | | **Mobile Companion** | Control Voicebox from your phone | diff --git a/backend/utils/dac_shim.py b/backend/utils/dac_shim.py index ea1294fc..89968c18 100644 --- a/backend/utils/dac_shim.py +++ b/backend/utils/dac_shim.py @@ -24,7 +24,9 @@ # ── Snake activation (from dac/nn/layers.py) ──────────────────────── -@torch.jit.script +# NOTE: The original DAC code uses @torch.jit.script here for a 1.4x +# speedup. We omit it because TorchScript calls inspect.getsource() +# which fails inside a PyInstaller frozen binary (no .py source files). 
def snake(x: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor: shape = x.shape x = x.reshape(shape[0], shape[1], -1) diff --git a/docs/content/docs/developer/tts-engines.mdx b/docs/content/docs/developer/tts-engines.mdx index f95e7a71..90135a37 100644 --- a/docs/content/docs/developer/tts-engines.mdx +++ b/docs/content/docs/developer/tts-engines.mdx @@ -490,7 +490,7 @@ Based on the current model landscape, these are candidates for future integratio | **Fish Speech** | 50+ | Medium | Word-level control via inline text | Ready | | **Kokoro-82M** | English | 82M | CPU realtime, Apache 2.0 | Ready | | **XTTS-v2** | 17+ | Medium | Zero-shot cloning | Ready | -| **HumeAI TADA** | EN (1B), Multi (3B) | Medium | 700s+ coherent audio, synced transcripts | Needs vetting | +| **HumeAI TADA** | EN (1B), Multi (3B) | Medium | 700s+ coherent audio, synced transcripts | Shipped | | **MOSS-TTS** | Multilingual | Medium | Text-to-voice design, multi-speaker dialogue | Needs vetting | | **Pocket TTS** | English | ~100M | CPU-first, >1× realtime | Needs vetting | diff --git a/docs/content/docs/index.mdx b/docs/content/docs/index.mdx index cbe120c3..9a7da078 100644 --- a/docs/content/docs/index.mdx +++ b/docs/content/docs/index.mdx @@ -3,12 +3,12 @@ title: "Voicebox Documentation" description: "Voicebox is a local-first voice cloning studio -- a free and open-source alternative to ElevenLabs." --- -Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. +Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. 
![Voicebox App Screenshot](/images/app-screenshot-1.webp) - **Complete privacy** -- models and voice data stay on your machine -- **4 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo +- **5 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA - **23 languages** -- from English to Arabic, Japanese, Hindi, Swahili, and more - **Post-processing effects** -- pitch shift, reverb, delay, chorus, compression, and filters - **Expressive speech** -- paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo diff --git a/docs/content/docs/overview/introduction.mdx b/docs/content/docs/overview/introduction.mdx index 134f1d71..d740a61e 100644 --- a/docs/content/docs/overview/introduction.mdx +++ b/docs/content/docs/overview/introduction.mdx @@ -5,10 +5,10 @@ description: "Voicebox is a local-first voice cloning studio -- a free and open- ## What is Voicebox? -Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. +Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. 
- **Complete privacy** -- models and voice data stay on your machine -- **4 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo +- **5 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA - **23 languages** -- from English to Arabic, Japanese, Hindi, Swahili, and more - **Post-processing effects** -- pitch shift, reverb, delay, chorus, compression, and filters - **Expressive speech** -- paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo @@ -20,7 +20,7 @@ Voicebox is a **local-first voice cloning studio** -- a free and open-source alt ## TTS Engines -Four engines with different strengths, switchable per-generation: +Five engines with different strengths, switchable per-generation: | Engine | Languages | Strengths | |--------|-----------|-----------| @@ -28,6 +28,7 @@ Four engines with different strengths, switchable per-generation: | **LuxTTS** | English | Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU | | **Chatterbox Multilingual** | 23 | Broadest language coverage | | **Chatterbox Turbo** | English | Fast 350M model with paralinguistic emotion/sound tags | +| **TADA** (1B / 3B) | 10 | HumeAI speech-language model -- 700s+ coherent audio | ## GPU Support @@ -56,7 +57,7 @@ Four engines with different strengths, switchable per-generation: | Frontend | React, TypeScript, Tailwind CSS | | State | Zustand, React Query | | Backend | FastAPI (Python) | -| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo | +| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo, TADA | | Effects | Pedalboard (Spotify) | | Transcription | Whisper / Whisper Turbo (PyTorch or MLX) | | Inference | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) | diff --git a/docs/notes/PROJECT_STATUS.md b/docs/notes/PROJECT_STATUS.md index 628dfa0a..71c06453 100644 --- a/docs/notes/PROJECT_STATUS.md +++ b/docs/notes/PROJECT_STATUS.md @@ -36,6 +36,10 @@ │ │ │ │ Qwen3-TTS│ 
│LuxTTS │ │Chatterbox │ │ │ │ │ │ │ │(Py/MLX) │ │ │ │(MTL+Turbo)│ │ │ │ │ │ │ └──────────┘ └───────┘ └───────────┘ │ │ │ +│ │ │ ┌──────────┐ │ │ │ +│ │ │ │ TADA │ │ │ │ +│ │ │ │(1B / 3B) │ │ │ │ +│ │ │ └──────────┘ │ │ │ │ │ └─────────────────────────────────────────┘ │ │ │ │ ┌───────────┐ ┌─────────┐ │ │ │ │ │ STTBackend│ │ Profiles│ │ │ @@ -59,6 +63,7 @@ | LuxTTS | `backend/backends/luxtts_backend.py` | LuxTTS — fast, CPU-friendly | | Chatterbox MTL | `backend/backends/chatterbox_backend.py` | Chatterbox Multilingual — 23 languages | | Chatterbox Turbo | `backend/backends/chatterbox_turbo_backend.py` | Chatterbox Turbo — English, paralinguistic tags | +| TADA | `backend/backends/hume_backend.py` | HumeAI TADA — 1B English + 3B Multilingual | | Platform detect | `backend/platform_detect.py` | Apple Silicon → MLX, else → PyTorch | | API types | `backend/models.py` | Pydantic request/response models | | HF progress | `backend/utils/hf_progress.py` | HFProgressTracker (tqdm patching for download progress) | @@ -78,7 +83,7 @@ ``` POST /generate 1. Look up voice profile from DB - 2. Resolve engine from request (qwen | luxtts | chatterbox | chatterbox_turbo) + 2. Resolve engine from request (qwen | luxtts | chatterbox | chatterbox_turbo | tada) 3. Get backend: get_tts_backend_for_engine(engine) # thread-safe singleton per engine 4. Check model cache → if missing, trigger background download, return HTTP 202 5. 
Load model (lazy): tts_backend.load_model(model_size) @@ -104,7 +109,8 @@ POST /generate - LuxTTS integration — fast, CPU-friendly English TTS (PR #254) - Chatterbox Multilingual TTS — 23 languages including Hebrew (PR #257) - Instruct parameter UI exists but is non-functional across all backends (see #224, Known Limitations) -- Single flat model dropdown (Qwen 1.7B, Qwen 0.6B, LuxTTS, Chatterbox, Chatterbox Turbo) +- HumeAI TADA integration — 1B English + 3B Multilingual speech-language model (PR #296) +- Single flat model dropdown (Qwen 1.7B, Qwen 0.6B, LuxTTS, Chatterbox, Chatterbox Turbo, TADA 1B, TADA 3B) - Centralized model config registry (`ModelConfig` dataclass) — no per-engine dispatch maps in `main.py` - Shared `EngineModelSelector` component — engine/model dropdown defined once, used in both generation forms @@ -136,6 +142,8 @@ POST /generate | LuxTTS | `luxtts` | English | ~300 MB | CPU-friendly, 48 kHz, fast | None | | Chatterbox | `chatterbox-tts` | 23 (incl. Hebrew, Arabic, Hindi, etc.) 
| ~3.2 GB | Zero-shot cloning, multilingual | Partial — `exaggeration` float (0-1) for expressiveness | | Chatterbox Turbo | `chatterbox-turbo` | English | ~1.5 GB | Paralinguistic tags ([laugh], [cough]), 350M params, low latency | Partial — inline tags only, no separate instruct param | +| TADA 1B | `tada-1b` | English | ~4 GB | HumeAI speech-language model, 700s+ coherent audio | None | +| TADA 3B Multilingual | `tada-3b-ml` | 10 (en, ar, zh, de, es, fr, it, ja, pl, pt) | ~8 GB | Multilingual, text-acoustic dual alignment | None | ### Multi-Engine Architecture (Shipped) @@ -143,7 +151,7 @@ The singleton TTS backend blocker described in the previous version of this doc - **Thread-safe backend registry** (`_tts_backends` dict + `_tts_backends_lock`) with double-checked locking - **Per-engine backend instances** — each engine gets its own singleton, loaded lazily -- **Engine field on GenerationRequest** — frontend sends `engine: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo'` +- **Engine field on GenerationRequest** — frontend sends `engine: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada'` - **Per-engine language filtering** — `ENGINE_LANGUAGES` map in frontend, backend regex accepts all languages - **Per-engine voice prompts** — `create_voice_prompt_for_profile()` dispatches to the correct backend - **Trim post-processing** — `trim_tts_output()` for Chatterbox engines (cuts trailing silence/hallucination) @@ -337,7 +345,7 @@ Notable requests: | **CosyVoice2-0.5B** | 3-10s zero-shot | Very fast | 24 kHz | Multilingual | Low | **Yes** — `inference_instruct2()`, works with cloning | Ready | Best instruct candidate | | **Fish Speech** | 10-30s few-shot | Real-time | 24-44 kHz | 50+ | Medium | **Yes** — inline text descriptions, word-level control | Ready | Multi-engine arch in place | | **MOSS-TTS Family** | Zero-shot | — | — | Multilingual | Medium | **Yes** — text prompts for style + timbre design | Needs vetting | Apache 2.0, multi-speaker 
dialogue | -| **HumeAI TADA 1B/3B** | Zero-shot | 5× faster than LLM-TTS | — | EN (1B), Multilingual (3B) | Medium | Partial — automatic prosody from text context | Needs vetting | MIT, 700s+ coherent, synced transcript output | +| **HumeAI TADA 1B/3B** | Zero-shot | 5× faster than LLM-TTS | 24 kHz | EN (1B), Multilingual (3B) | Medium | Partial — automatic prosody from text context | **Shipped** | PR #296, MIT, 700s+ coherent | | **VoxCPM 1.5** | Zero-shot (seconds) | ~0.15 RTF streaming | — | Bilingual (EN/ZH) | Medium | Partial — automatic context-aware prosody | Needs vetting | Apache 2.0, tokenizer-free continuous diffusion | | **Kokoro-82M** | 3s instant | CPU realtime | 24 kHz | English | Tiny (82M) | Partial — automatic style inference | Ready | Apache 2.0, multi-engine arch in place | | **XTTS-v2** | 6s zero-shot | Mid-GPU | 24 kHz | 17+ | Medium | Partial — style transfer from ref audio only | Ready | Multi-engine arch in place | @@ -475,7 +483,7 @@ The generation form now uses a flat model dropdown with engine-based routing. Pe | `/history/{id}/export` | GET | Export generation ZIP | | `/history/{id}/export-audio` | GET | Export audio only | | `/transcribe` | POST | Transcribe audio (Whisper) | -| `/models/status` | GET | All model statuses (Qwen, LuxTTS, Chatterbox, Chatterbox Turbo, Whisper) | +| `/models/status` | GET | All model statuses (Qwen, LuxTTS, Chatterbox, Chatterbox Turbo, TADA, Whisper) | | `/models/download` | POST | Trigger model download | | `/models/download/cancel` | POST | Cancel/dismiss download | | `/models/{name}` | DELETE | Delete downloaded model |