jamiepine · jamiepine · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -62,6 +62,7 @@ jobs:
           pip install pyinstaller
           pip install -r backend/requirements.txt
           pip install --no-deps chatterbox-tts
+          pip install --no-deps hume-tada
 
       - name: Install MLX dependencies (Apple Silicon only)
         if: matrix.backend == 'mlx'
@@ -188,6 +189,7 @@ jobs:
           pip install pyinstaller
           pip install -r backend/requirements.txt
           pip install --no-deps chatterbox-tts
+          pip install --no-deps hume-tada
 
       - name: Install PyTorch with CUDA 12.6
         run: |

diff --git a/Dockerfile b/Dockerfile
@@ -35,6 +35,8 @@ RUN pip install --no-cache-dir --upgrade pip
 
 COPY backend/requirements.txt .
 RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
+RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts
+RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada
 RUN pip install --no-cache-dir --prefix=/install \
     git+https://github.com/QwenLM/Qwen3-TTS.git
 

diff --git a/README.md b/README.md
@@ -59,10 +59,10 @@
 
 ## What is Voicebox?
 
-Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor.
+Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor.
 
 - **Complete privacy** — models and voice data stay on your machine
-- **4 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo
+- **5 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA
 - **23 languages** — from English to Arabic, Japanese, Hindi, Swahili, and more
 - **Post-processing effects** — pitch shift, reverb, delay, chorus, compression, and filters
 - **Expressive speech** — paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo
@@ -93,14 +93,15 @@ Voicebox is a **local-first voice cloning studio** — a free and open-source al
 
 ### Multi-Engine Voice Cloning
 
-Four TTS engines with different strengths, switchable per-generation:
+Five TTS engines with different strengths, switchable per-generation:
 
 | Engine                      | Languages | Strengths                                                                                                                                |
 | --------------------------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
 | **Qwen3-TTS** (0.6B / 1.7B) | 10        | High-quality multilingual cloning, delivery instructions ("speak slowly", "whisper")                                                     |
 | **LuxTTS**                  | English   | Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU                                                                              |
 | **Chatterbox Multilingual** | 23        | Broadest language coverage — Arabic, Danish, Finnish, Greek, Hebrew, Hindi, Malay, Norwegian, Polish, Swahili, Swedish, Turkish and more |
 | **Chatterbox Turbo**        | English   | Fast 350M model with paralinguistic emotion/sound tags                                                                                   |
+| **TADA** (1B / 3B)          | 10        | HumeAI speech-language model — 700s+ coherent audio, text-acoustic dual alignment (1B: English only; 3B: 10 languages)                   |
 
 ### Emotions & Paralinguistic Tags
 
@@ -230,7 +231,7 @@ Full API documentation available at `http://localhost:17493/docs`.
 | Frontend      | React, TypeScript, Tailwind CSS                   |
 | State         | Zustand, React Query                              |
 | Backend       | FastAPI (Python)                                  |
-| TTS Engines   | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo   |
+| TTS Engines   | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo, TADA |
 | Effects       | Pedalboard (Spotify)                              |
 | Transcription | Whisper / Whisper Turbo (PyTorch or MLX)          |
 | Inference     | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) |
@@ -245,7 +246,7 @@ Full API documentation available at `http://localhost:17493/docs`.
 | ----------------------- | ---------------------------------------------- |
 | **Real-time Streaming** | Stream audio as it generates, word by word     |
 | **Voice Design**        | Create new voices from text descriptions       |
-| **More Models**         | XTTS, Bark, and other open-source voice models |
+| **More Models**         | XTTS, Bark, and other open-source voice models |
 | **Plugin Architecture** | Extend with custom models and effects          |
 | **Mobile Companion**    | Control Voicebox from your phone               |
 

diff --git a/app/src/components/Generation/EngineModelSelector.tsx b/app/src/components/Generation/EngineModelSelector.tsx
@@ -20,20 +20,24 @@ const ENGINE_OPTIONS = [
   { value: 'luxtts', label: 'LuxTTS' },
   { value: 'chatterbox', label: 'Chatterbox' },
   { value: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
+  { value: 'tada:1B', label: 'TADA 1B' },
+  { value: 'tada:3B', label: 'TADA 3B Multilingual' },
 ] as const;
 
 const ENGINE_DESCRIPTIONS: Record<string, string> = {
   qwen: 'Multi-language, two sizes',
   luxtts: 'Fast, English-focused',
   chatterbox: '23 languages, incl. Hebrew',
   chatterbox_turbo: 'English, [laugh] [cough] tags',
+  tada: 'HumeAI, 700s+ coherent audio',
 };
 
 /** Engines that only support English and should force language to 'en' on select. */
 const ENGLISH_ONLY_ENGINES = new Set(['luxtts', 'chatterbox_turbo']);
 
 function getSelectValue(engine: string, modelSize?: string): string {
   if (engine === 'qwen') return `qwen:${modelSize || '1.7B'}`;
+  if (engine === 'tada') return `tada:${modelSize || '1B'}`; // matches the 'tada:1B' / 'tada:3B' option values; size falls back to 1B when unset
   return engine;
 }
 
@@ -48,6 +52,20 @@ function handleEngineChange(form: UseFormReturn<GenerationFormValues>, value: st
     if (!available.some((l) => l.value === currentLang)) {
       form.setValue('language', available[0]?.value ?? 'en');
     }
+  } else if (value.startsWith('tada:')) {
+    const [, modelSize] = value.split(':');
+    form.setValue('engine', 'tada');
+    form.setValue('modelSize', modelSize as '1B' | '3B');
+    // TADA 1B is English-only; 3B is multilingual
+    if (modelSize === '1B') {
+      form.setValue('language', 'en');
+    } else {
+      const currentLang = form.getValues('language');
+      const available = getLanguageOptionsForEngine('tada');
+      if (!available.some((l) => l.value === currentLang)) {
+        form.setValue('language', available[0]?.value ?? 'en');
+      }
+    }
   } else {
     form.setValue('engine', value as GenerationFormValues['engine']);
     form.setValue('modelSize', undefined as unknown as '1.7B' | '0.6B');

diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx
@@ -62,6 +62,10 @@ const MODEL_DESCRIPTIONS: Record<string, string> = {
     'Production-grade open source TTS by Resemble AI. Supports 23 languages with voice cloning and emotion exaggeration control.',
   'chatterbox-turbo':
     'Streamlined 350M parameter TTS by Resemble AI. High-quality English speech with less compute and VRAM than larger models.',
+  'tada-1b':
+    'HumeAI TADA 1B — English speech-language model built on Llama 3.2 1B. Generates 700s+ of coherent audio with synchronized text-acoustic alignment.',
+  'tada-3b-ml':
+    'HumeAI TADA 3B Multilingual — built on Llama 3.2 3B. Supports 10 languages with high-fidelity voice cloning via text-acoustic dual alignment.',
   'whisper-base':
     'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.',
   'whisper-small':
@@ -391,7 +395,8 @@ export function ModelManagement() {
       (m) =>
         m.model_name.startsWith('qwen-tts') ||
         m.model_name.startsWith('luxtts') ||
-        m.model_name.startsWith('chatterbox'),
+        m.model_name.startsWith('chatterbox') ||
+        m.model_name.startsWith('tada'),
     ) ?? [];
   const whisperModels = modelStatus?.models.filter((m) => m.model_name.startsWith('whisper')) ?? [];
 

diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts
@@ -42,8 +42,8 @@ export interface GenerationRequest {
   text: string;
   language: LanguageCode;
   seed?: number;
-  model_size?: '1.7B' | '0.6B';
-  engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo';
+  model_size?: '1.7B' | '0.6B' | '1B' | '3B';
+  engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada';
   instruct?: string;
   max_chunk_chars?: number;
   crossfade_ms?: number;

diff --git a/app/src/lib/constants/languages.ts b/app/src/lib/constants/languages.ts
@@ -66,6 +66,7 @@ export const ENGINE_LANGUAGES: Record<string, readonly LanguageCode[]> = {
     'zh',
   ],
   chatterbox_turbo: ['en'],
+  tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'],
 } as const;
 
 /** Helper: get language options for a given engine. */

diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts
@@ -15,9 +15,9 @@ const generationSchema = z.object({
   text: z.string().min(1, '').max(50000),
   language: z.enum(LANGUAGE_CODES as [LanguageCode, ...LanguageCode[]]),
   seed: z.number().int().optional(),
-  modelSize: z.enum(['1.7B', '0.6B']).optional(),
+  modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(),
   instruct: z.string().max(500).optional(),
-  engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo']).optional(),
+  engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']).optional(),
 });
 
 export type GenerationFormValues = z.infer<typeof generationSchema>;
@@ -79,17 +79,25 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
             ? 'chatterbox-tts'
             : engine === 'chatterbox_turbo'
               ? 'chatterbox-turbo'
-              : `qwen-tts-${data.modelSize}`;
+              : engine === 'tada'
+                ? data.modelSize === '3B'
+                  ? 'tada-3b-ml'
+                  : 'tada-1b'
+                : `qwen-tts-${data.modelSize}`;
       const displayName =
         engine === 'luxtts'
           ? 'LuxTTS'
           : engine === 'chatterbox'
             ? 'Chatterbox TTS'
             : engine === 'chatterbox_turbo'
               ? 'Chatterbox Turbo'
-              : data.modelSize === '1.7B'
-                ? 'Qwen TTS 1.7B'
-                : 'Qwen TTS 0.6B';
+              : engine === 'tada'
+                ? data.modelSize === '3B'
+                  ? 'TADA 3B Multilingual'
+                  : 'TADA 1B'
+                : data.modelSize === '1.7B'
+                  ? 'Qwen TTS 1.7B'
+                  : 'Qwen TTS 0.6B';
 
       // Check if model needs downloading
       try {
@@ -104,17 +112,17 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
         console.error('Failed to check model status:', error);
       }
 
-      const isQwen = engine === 'qwen';
+      const hasModelSizes = engine === 'qwen' || engine === 'tada';
       const effectsChain = options.getEffectsChain?.();
       // This now returns immediately with status="generating"
       const result = await generation.mutateAsync({
         profile_id: selectedProfileId,
         text: data.text,
         language: data.language,
         seed: data.seed,
-        model_size: isQwen ? data.modelSize : undefined,
+        model_size: hasModelSizes ? data.modelSize : undefined,
         engine,
-        instruct: isQwen ? data.instruct || undefined : undefined,
+        instruct: engine === 'qwen' ? data.instruct || undefined : undefined,
         max_chunk_chars: maxChunkChars,
         crossfade_ms: crossfadeMs,
         normalize: normalizeAudio,

diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py
@@ -166,6 +166,7 @@ def is_loaded(self) -> bool:
     "luxtts": "LuxTTS",
     "chatterbox": "Chatterbox TTS",
     "chatterbox_turbo": "Chatterbox Turbo",
+    "tada": "TADA",
 }
 
 
@@ -259,6 +260,24 @@ def _get_non_qwen_tts_configs() -> list[ModelConfig]:
             needs_trim=True,
             languages=["en"],
         ),
+        ModelConfig(
+            model_name="tada-1b",
+            display_name="TADA 1B (English)",
+            engine="tada",
+            hf_repo_id="HumeAI/tada-1b",
+            model_size="1B",
+            size_mb=4000,
+            languages=["en"],
+        ),
+        ModelConfig(
+            model_name="tada-3b-ml",
+            display_name="TADA 3B Multilingual",
+            engine="tada",
+            hf_repo_id="HumeAI/tada-3b-ml",
+            model_size="3B",
+            size_mb=8000,
+            languages=["en", "ar", "zh", "de", "es", "fr", "it", "ja", "pl", "pt"],
+        ),
     ]
 
 
@@ -339,10 +358,12 @@ def engine_has_model_sizes(engine: str) -> bool:
 
 
 async def load_engine_model(engine: str, model_size: str = "default") -> None:
-    """Load a model for the given engine, handling the Qwen model_size special case."""
+    """Load the model for ``engine``; Qwen and TADA are passed ``model_size``, every other engine loads without arguments."""
     backend = get_tts_backend_for_engine(engine)
     if engine == "qwen":
         await backend.load_model_async(model_size)
+    elif engine == "tada":
+        await backend.load_model(model_size)
     else:
         await backend.load_model()
 
@@ -358,7 +379,7 @@ async def ensure_model_cached_or_raise(engine: str, model_size: str = "default")
             cfg = c
             break
 
-    if engine == "qwen":
+    if engine in ("qwen", "tada"):
         if not backend._is_model_cached(model_size):
             raise HTTPException(
                 status_code=400,
@@ -490,6 +511,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend:
             from .chatterbox_turbo_backend import ChatterboxTurboTTSBackend
 
             backend = ChatterboxTurboTTSBackend()
+        elif engine == "tada":
+            from .hume_backend import HumeTadaBackend
+
+            backend = HumeTadaBackend()
         else:
             raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}")