From 4e7772a21df5ebfc9dbb54f35ee3772276db3054 Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Tue, 17 Mar 2026 01:55:15 -0700
Subject: [PATCH 1/7] add HumeAI TADA TTS engine (1B English + 3B Multilingual)

Integrates HumeAI's TADA (Text-Acoustic Dual Alignment) speech-language
model as a new TTS engine. TADA uses a novel 1:1 token-audio alignment
that produces coherent speech over long sequences (700s+).

Two model variants:
- tada-1b: English-only, ~4GB, built on Llama 3.2 1B
- tada-3b-ml: 10 languages, ~8GB, built on Llama 3.2 3B

Backend uses the Encoder for voice prompt encoding with caching, and
TadaForCausalLM with flow-matching diffusion for generation. Uses bf16
inference on CUDA when the GPU supports it (fp32 otherwise) and forces
CPU on macOS because of MPS compatibility issues.

hume-tada is installed with --no-deps because it pins torch>=2.7,<2.8,
which conflicts with our torch requirement; descript-audio-codec and
torchaudio are added as explicit sub-dependencies.
---
 .github/workflows/release.yml                 |   2 +
 Dockerfile                                    |   2 +
 .../Generation/EngineModelSelector.tsx        |  18 +
 .../ServerSettings/ModelManagement.tsx        |   7 +-
 app/src/lib/api/types.ts                      |   4 +-
 app/src/lib/constants/languages.ts            |   1 +
 app/src/lib/hooks/useGenerationForm.ts        |  24 +-
 backend/backends/__init__.py                  |  29 +-
 backend/backends/hume_backend.py              | 310 ++++++++++++++++++
 backend/build_binary.py                       |  44 +++
 backend/models.py                             |   4 +-
 backend/requirements.txt                      |   5 +
 justfile                                      |   3 +
 13 files changed, 437 insertions(+), 16 deletions(-)
 create mode 100644 backend/backends/hume_backend.py

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 2b0cf254..d00df528 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -62,6 +62,7 @@ jobs:
           pip install pyinstaller
           pip install -r backend/requirements.txt
           pip install --no-deps chatterbox-tts
+          pip install --no-deps hume-tada
 
       - name: Install MLX dependencies (Apple Silicon only)
         if: matrix.backend == 'mlx'
@@ -188,6 +189,7 @@ jobs:
           pip install pyinstaller
           pip install -r backend/requirements.txt
           pip install --no-deps chatterbox-tts
+          pip install --no-deps hume-tada
 
       - name: Install PyTorch with CUDA 12.6
         run: |
diff --git a/Dockerfile b/Dockerfile
index 20da9e1a..1ad85e53 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -35,6 +35,8 @@ RUN pip install --no-cache-dir --upgrade pip
 
 COPY backend/requirements.txt .
 RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
+RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts
+RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada
 RUN pip install --no-cache-dir --prefix=/install \
     git+https://github.com/QwenLM/Qwen3-TTS.git
 
diff --git a/app/src/components/Generation/EngineModelSelector.tsx b/app/src/components/Generation/EngineModelSelector.tsx
index 77dac03f..4382d3f7 100644
--- a/app/src/components/Generation/EngineModelSelector.tsx
+++ b/app/src/components/Generation/EngineModelSelector.tsx
@@ -20,6 +20,8 @@ const ENGINE_OPTIONS = [
   { value: 'luxtts', label: 'LuxTTS' },
   { value: 'chatterbox', label: 'Chatterbox' },
   { value: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
+  { value: 'tada:1B', label: 'TADA 1B' },
+  { value: 'tada:3B', label: 'TADA 3B Multilingual' },
 ] as const;
 
 const ENGINE_DESCRIPTIONS: Record<string, string> = {
@@ -27,6 +29,7 @@ const ENGINE_DESCRIPTIONS: Record<string, string> = {
   luxtts: 'Fast, English-focused',
   chatterbox: '23 languages, incl. Hebrew',
   chatterbox_turbo: 'English, [laugh] [cough] tags',
+  tada: 'HumeAI, 700s+ coherent audio',
 };
 
 /** Engines that only support English and should force language to 'en' on select. */
@@ -34,6 +37,7 @@ const ENGLISH_ONLY_ENGINES = new Set(['luxtts', 'chatterbox_turbo']);
 
 function getSelectValue(engine: string, modelSize?: string): string {
   if (engine === 'qwen') return `qwen:${modelSize || '1.7B'}`;
+  if (engine === 'tada') return `tada:${modelSize || '1B'}`;
   return engine;
 }
 
@@ -48,6 +52,20 @@ function handleEngineChange(form: UseFormReturn<GenerationFormValues>, value: st
     if (!available.some((l) => l.value === currentLang)) {
       form.setValue('language', available[0]?.value ?? 'en');
     }
+  } else if (value.startsWith('tada:')) {
+    const [, modelSize] = value.split(':');
+    form.setValue('engine', 'tada');
+    form.setValue('modelSize', modelSize as '1B' | '3B');
+    // TADA 1B is English-only; 3B is multilingual
+    if (modelSize === '1B') {
+      form.setValue('language', 'en');
+    } else {
+      const currentLang = form.getValues('language');
+      const available = getLanguageOptionsForEngine('tada');
+      if (!available.some((l) => l.value === currentLang)) {
+        form.setValue('language', available[0]?.value ?? 'en');
+      }
+    }
   } else {
     form.setValue('engine', value as GenerationFormValues['engine']);
     form.setValue('modelSize', undefined as unknown as '1.7B' | '0.6B');
diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx
index 5d113d89..c415306d 100644
--- a/app/src/components/ServerSettings/ModelManagement.tsx
+++ b/app/src/components/ServerSettings/ModelManagement.tsx
@@ -62,6 +62,10 @@ const MODEL_DESCRIPTIONS: Record<string, string> = {
     'Production-grade open source TTS by Resemble AI. Supports 23 languages with voice cloning and emotion exaggeration control.',
   'chatterbox-turbo':
     'Streamlined 350M parameter TTS by Resemble AI. High-quality English speech with less compute and VRAM than larger models.',
+  'tada-1b':
+    'HumeAI TADA 1B — English speech-language model built on Llama 3.2 1B. Generates 700s+ of coherent audio with synchronized text-acoustic alignment.',
+  'tada-3b-ml':
+    'HumeAI TADA 3B Multilingual — built on Llama 3.2 3B. Supports 10 languages with high-fidelity voice cloning via text-acoustic dual alignment.',
   'whisper-base':
     'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.',
   'whisper-small':
@@ -391,7 +395,8 @@ export function ModelManagement() {
       (m) =>
         m.model_name.startsWith('qwen-tts') ||
         m.model_name.startsWith('luxtts') ||
-        m.model_name.startsWith('chatterbox'),
+        m.model_name.startsWith('chatterbox') ||
+        m.model_name.startsWith('tada'),
     ) ?? [];
   const whisperModels = modelStatus?.models.filter((m) => m.model_name.startsWith('whisper')) ?? [];
 
diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts
index daae2a95..aa85d001 100644
--- a/app/src/lib/api/types.ts
+++ b/app/src/lib/api/types.ts
@@ -42,8 +42,8 @@ export interface GenerationRequest {
   text: string;
   language: LanguageCode;
   seed?: number;
-  model_size?: '1.7B' | '0.6B';
-  engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo';
+  model_size?: '1.7B' | '0.6B' | '1B' | '3B';
+  engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada';
   instruct?: string;
   max_chunk_chars?: number;
   crossfade_ms?: number;
diff --git a/app/src/lib/constants/languages.ts b/app/src/lib/constants/languages.ts
index 19d6bca6..a0d233a5 100644
--- a/app/src/lib/constants/languages.ts
+++ b/app/src/lib/constants/languages.ts
@@ -66,6 +66,7 @@ export const ENGINE_LANGUAGES: Record<string, readonly LanguageCode[]> = {
     'zh',
   ],
   chatterbox_turbo: ['en'],
+  tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'],
 } as const;
 
 /** Helper: get language options for a given engine. */
diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts
index 74f9a94c..d174bad0 100644
--- a/app/src/lib/hooks/useGenerationForm.ts
+++ b/app/src/lib/hooks/useGenerationForm.ts
@@ -15,9 +15,9 @@ const generationSchema = z.object({
   text: z.string().min(1, '').max(50000),
   language: z.enum(LANGUAGE_CODES as [LanguageCode, ...LanguageCode[]]),
   seed: z.number().int().optional(),
-  modelSize: z.enum(['1.7B', '0.6B']).optional(),
+  modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(),
   instruct: z.string().max(500).optional(),
-  engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo']).optional(),
+  engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']).optional(),
 });
 
 export type GenerationFormValues = z.infer<typeof generationSchema>;
@@ -79,7 +79,9 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
             ? 'chatterbox-tts'
             : engine === 'chatterbox_turbo'
               ? 'chatterbox-turbo'
-              : `qwen-tts-${data.modelSize}`;
+              : engine === 'tada'
+                ? (data.modelSize === '3B' ? 'tada-3b-ml' : 'tada-1b') // backend registry names
+                : `qwen-tts-${data.modelSize}`;
       const displayName =
         engine === 'luxtts'
           ? 'LuxTTS'
@@ -87,9 +89,13 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
             ? 'Chatterbox TTS'
             : engine === 'chatterbox_turbo'
               ? 'Chatterbox Turbo'
-              : data.modelSize === '1.7B'
-                ? 'Qwen TTS 1.7B'
-                : 'Qwen TTS 0.6B';
+              : engine === 'tada'
+                ? data.modelSize === '3B'
+                  ? 'TADA 3B Multilingual'
+                  : 'TADA 1B'
+                : data.modelSize === '1.7B'
+                  ? 'Qwen TTS 1.7B'
+                  : 'Qwen TTS 0.6B';
 
       // Check if model needs downloading
       try {
@@ -104,7 +110,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
         console.error('Failed to check model status:', error);
       }
 
-      const isQwen = engine === 'qwen';
+      const hasModelSizes = engine === 'qwen' || engine === 'tada';
       const effectsChain = options.getEffectsChain?.();
       // This now returns immediately with status="generating"
       const result = await generation.mutateAsync({
@@ -112,9 +118,9 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
         text: data.text,
         language: data.language,
         seed: data.seed,
-        model_size: isQwen ? data.modelSize : undefined,
+        model_size: hasModelSizes ? data.modelSize : undefined,
         engine,
-        instruct: isQwen ? data.instruct || undefined : undefined,
+        instruct: engine === 'qwen' ? data.instruct || undefined : undefined,
         max_chunk_chars: maxChunkChars,
         crossfade_ms: crossfadeMs,
         normalize: normalizeAudio,
diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py
index 6f20f3de..a4f5113a 100644
--- a/backend/backends/__init__.py
+++ b/backend/backends/__init__.py
@@ -166,6 +166,7 @@ def is_loaded(self) -> bool:
     "luxtts": "LuxTTS",
     "chatterbox": "Chatterbox TTS",
     "chatterbox_turbo": "Chatterbox Turbo",
+    "tada": "TADA",
 }
 
 
@@ -259,6 +260,24 @@ def _get_non_qwen_tts_configs() -> list[ModelConfig]:
             needs_trim=True,
             languages=["en"],
         ),
+        ModelConfig(
+            model_name="tada-1b",
+            display_name="TADA 1B (English)",
+            engine="tada",
+            hf_repo_id="HumeAI/tada-1b",
+            model_size="1B",
+            size_mb=4000,
+            languages=["en"],
+        ),
+        ModelConfig(
+            model_name="tada-3b-ml",
+            display_name="TADA 3B Multilingual",
+            engine="tada",
+            hf_repo_id="HumeAI/tada-3b-ml",
+            model_size="3B",
+            size_mb=8000,
+            languages=["en", "ar", "zh", "de", "es", "fr", "it", "ja", "pl", "pt"],
+        ),
     ]
 
 
@@ -339,10 +358,12 @@ def engine_has_model_sizes(engine: str) -> bool:
 
 
 async def load_engine_model(engine: str, model_size: str = "default") -> None:
-    """Load a model for the given engine, handling the Qwen model_size special case."""
+    """Load a model for the given engine, handling engines with multiple model sizes."""
     backend = get_tts_backend_for_engine(engine)
     if engine == "qwen":
         await backend.load_model_async(model_size)
+    elif engine == "tada":
+        await backend.load_model(model_size)
     else:
         await backend.load_model()
 
@@ -358,7 +379,7 @@ async def ensure_model_cached_or_raise(engine: str, model_size: str = "default")
             cfg = c
             break
 
-    if engine == "qwen":
+    if engine in ("qwen", "tada"):
         if not backend._is_model_cached(model_size):
             raise HTTPException(
                 status_code=400,
@@ -490,6 +511,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend:
             from .chatterbox_turbo_backend import ChatterboxTurboTTSBackend
 
             backend = ChatterboxTurboTTSBackend()
+        elif engine == "tada":
+            from .hume_backend import HumeTadaBackend
+
+            backend = HumeTadaBackend()
         else:
             raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}")
 
diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py
new file mode 100644
index 00000000..1d4a4600
--- /dev/null
+++ b/backend/backends/hume_backend.py
@@ -0,0 +1,310 @@
+"""
+HumeAI TADA TTS backend implementation.
+
+Wraps HumeAI's TADA (Text-Acoustic Dual Alignment) model for
+high-quality voice cloning. Two model variants:
+  - tada-1b: English-only, ~2B params (Llama 3.2 1B base)
+  - tada-3b-ml: Multilingual, ~4B params (Llama 3.2 3B base)
+
+Both use a shared encoder/codec (HumeAI/tada-codec). The encoder
+produces 1:1 aligned token embeddings from reference audio, and the
+causal LM generates speech via flow-matching diffusion.
+
+24kHz output, bf16 inference on CUDA, fp32 on CPU.
+"""
+
+import asyncio
+import logging
+import threading
+from typing import ClassVar, List, Optional, Tuple
+
+import numpy as np
+
+from . import TTSBackend
+from .base import (
+    is_model_cached,
+    get_torch_device,
+    combine_voice_prompts as _combine_voice_prompts,
+    model_load_progress,
+)
+from ..utils.cache import get_cache_key, get_cached_voice_prompt, cache_voice_prompt
+
+logger = logging.getLogger(__name__)
+
+# HuggingFace repos
+TADA_CODEC_REPO = "HumeAI/tada-codec"
+TADA_1B_REPO = "HumeAI/tada-1b"
+TADA_3B_ML_REPO = "HumeAI/tada-3b-ml"
+
+TADA_MODEL_REPOS = {
+    "1B": TADA_1B_REPO,
+    "3B": TADA_3B_ML_REPO,
+}
+
+# Key weight files for cache detection
+_TADA_MODEL_WEIGHT_FILES = [
+    "model.safetensors",
+]
+
+_TADA_CODEC_WEIGHT_FILES = [
+    "encoder/model.safetensors",
+]
+
+
+class HumeTadaBackend:
+    """HumeAI TADA TTS backend for high-quality voice cloning."""
+
+    _load_lock: ClassVar[threading.Lock] = threading.Lock()
+
+    def __init__(self):
+        self.model = None
+        self.encoder = None
+        self.model_size = "1B"  # default to 1B
+        self._device = None
+        self._model_load_lock = asyncio.Lock()
+
+    def _get_device(self) -> str:
+        # Force CPU on macOS — MPS has issues with flow matching
+        # and large vocab lm_head (>65536 output channels)
+        return get_torch_device(force_cpu_on_mac=True)
+
+    def is_loaded(self) -> bool:
+        return self.model is not None
+
+    def _get_model_path(self, model_size: str = "1B") -> str:
+        return TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO)
+
+    def _is_model_cached(self, model_size: str = "1B") -> bool:
+        repo = TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO)
+        model_cached = is_model_cached(repo, required_files=_TADA_MODEL_WEIGHT_FILES)
+        codec_cached = is_model_cached(TADA_CODEC_REPO, required_files=_TADA_CODEC_WEIGHT_FILES)
+        return model_cached and codec_cached
+
+    async def load_model(self, model_size: str = "1B") -> None:
+        """Load the TADA model and encoder."""
+        if self.model is not None and self.model_size == model_size:
+            return
+        async with self._model_load_lock:
+            if self.model is not None and self.model_size == model_size:
+                return
+            # Unload existing model if switching sizes
+            if self.model is not None:
+                self.unload_model()
+            self.model_size = model_size
+            await asyncio.to_thread(self._load_model_sync, model_size)
+
+    def _load_model_sync(self, model_size: str = "1B"):
+        """Synchronous model loading with progress tracking."""
+        model_name = "tada-3b-ml" if model_size == "3B" else "tada-1b"  # registry names; mirrors repo fallback
+        is_cached = self._is_model_cached(model_size)
+        repo = TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO)
+
+        with model_load_progress(model_name, is_cached):
+            import torch
+            from huggingface_hub import snapshot_download
+
+            device = self._get_device()
+            self._device = device
+            logger.info(f"Loading HumeAI TADA {model_size} on {device}...")
+
+            # Download codec (encoder + decoder) if not cached
+            logger.info("Downloading TADA codec...")
+            snapshot_download(
+                repo_id=TADA_CODEC_REPO,
+                token=None,
+                allow_patterns=["*.safetensors", "*.json", "*.txt", "*.bin"],
+            )
+
+            # Download model weights if not cached
+            logger.info(f"Downloading TADA {model_size} model...")
+            snapshot_download(
+                repo_id=repo,
+                token=None,
+                allow_patterns=["*.safetensors", "*.json", "*.txt", "*.bin", "*.model"],
+            )
+
+            # Determine dtype — use bf16 on CUDA for ~50% memory savings
+            if device == "cuda" and torch.cuda.is_bf16_supported():
+                model_dtype = torch.bfloat16
+            else:
+                model_dtype = torch.float32
+
+            # Load encoder (only needed for voice prompt encoding)
+            from tada.modules.encoder import Encoder
+            logger.info("Loading TADA encoder...")
+            self.encoder = Encoder.from_pretrained(
+                TADA_CODEC_REPO, subfolder="encoder"
+            ).to(device)
+            self.encoder.eval()
+
+            # Load the causal LM (includes decoder for wav generation)
+            from tada.modules.tada import TadaForCausalLM
+            logger.info(f"Loading TADA {model_size} model...")
+            self.model = TadaForCausalLM.from_pretrained(
+                repo, torch_dtype=model_dtype
+            ).to(device)
+            self.model.eval()
+
+        logger.info(f"HumeAI TADA {model_size} loaded successfully on {device}")
+
+    def unload_model(self) -> None:
+        """Unload model and encoder to free memory."""
+        if self.model is not None:
+            del self.model
+            self.model = None
+        if self.encoder is not None:
+            del self.encoder
+            self.encoder = None
+
+        self._device = None
+
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        logger.info("HumeAI TADA unloaded")
+
+    async def create_voice_prompt(
+        self,
+        audio_path: str,
+        reference_text: str,
+        use_cache: bool = True,
+    ) -> Tuple[dict, bool]:
+        """
+        Create voice prompt from reference audio using TADA's encoder.
+
+        TADA's encoder performs forced alignment between audio and text tokens,
+        producing an EncoderOutput with 1:1 token-audio alignment. If no
+        reference_text is provided, the encoder uses built-in ASR (English only).
+
+        We serialize the EncoderOutput to a dict for caching.
+        """
+        await self.load_model(self.model_size)
+
+        cache_key = (
+            "tada_" + get_cache_key(audio_path, reference_text)
+        ) if use_cache else None
+
+        if cache_key:
+            cached = get_cached_voice_prompt(cache_key)
+            if cached is not None and isinstance(cached, dict):
+                return cached, True
+
+        def _encode_sync():
+            import torch
+            import torchaudio
+
+            device = self._device
+
+            # Load and prepare audio
+            audio, sr = torchaudio.load(str(audio_path))
+            audio = audio.to(device)
+
+            # Encode with forced alignment
+            text_arg = [reference_text] if reference_text else None
+            prompt = self.encoder(
+                audio, text=text_arg, sample_rate=sr
+            )
+
+            # Serialize EncoderOutput to a dict of CPU tensors for caching
+            prompt_dict = {}
+            for field_name in prompt.__dataclass_fields__:
+                val = getattr(prompt, field_name)
+                if isinstance(val, torch.Tensor):
+                    prompt_dict[field_name] = val.detach().cpu()
+                elif isinstance(val, list):
+                    prompt_dict[field_name] = val
+                elif isinstance(val, (int, float)):
+                    prompt_dict[field_name] = val
+                else:
+                    prompt_dict[field_name] = val
+            return prompt_dict
+
+        encoded = await asyncio.to_thread(_encode_sync)
+
+        if cache_key:
+            cache_voice_prompt(cache_key, encoded)
+
+        return encoded, False
+
+    async def combine_voice_prompts(
+        self,
+        audio_paths: List[str],
+        reference_texts: List[str],
+    ) -> Tuple[np.ndarray, str]:
+        return await _combine_voice_prompts(audio_paths, reference_texts, sample_rate=24000)
+
+    async def generate(
+        self,
+        text: str,
+        voice_prompt: dict,
+        language: str = "en",
+        seed: Optional[int] = None,
+        instruct: Optional[str] = None,
+    ) -> Tuple[np.ndarray, int]:
+        """
+        Generate audio from text using HumeAI TADA.
+
+        Args:
+            text: Text to synthesize
+            voice_prompt: Serialized EncoderOutput dict from create_voice_prompt()
+            language: Language code (en, ar, de, es, fr, it, ja, pl, pt, zh)
+            seed: Random seed for reproducibility
+            instruct: Not supported by TADA (ignored)
+
+        Returns:
+            Tuple of (audio_array, sample_rate=24000)
+        """
+        await self.load_model(self.model_size)
+
+        def _generate_sync():
+            import torch
+            from tada.modules.encoder import EncoderOutput
+
+            if seed is not None:
+                torch.manual_seed(seed)
+                if torch.cuda.is_available():
+                    torch.cuda.manual_seed(seed)
+
+            device = self._device
+
+            # Reconstruct EncoderOutput from the cached dict
+            restored = {}
+            for k, v in voice_prompt.items():
+                if isinstance(v, torch.Tensor):
+                    # Move to device and match model dtype for float tensors
+                    if v.is_floating_point():
+                        model_dtype = next(self.model.parameters()).dtype
+                        restored[k] = v.to(device=device, dtype=model_dtype)
+                    else:
+                        restored[k] = v.to(device=device)
+                else:
+                    restored[k] = v
+
+            prompt = EncoderOutput(**restored)
+
+            # For non-English with the 3B-ML model, we could reload the
+            # encoder with the language-specific aligner. However, the
+            # generation itself is language-agnostic — only the encoder's
+            # aligner changes. Since we encode at create_voice_prompt time,
+            # the language is already baked in. For simplicity, we don't
+            # reload the encoder here.
+
+            logger.info(f"[TADA] Generating ({language}), text length: {len(text)}")
+
+            output = self.model.generate(
+                prompt=prompt,
+                text=text,
+            )
+
+            # output.audio is a list of tensors (one per batch item)
+            if output.audio and output.audio[0] is not None:
+                audio_tensor = output.audio[0]
+                audio = audio_tensor.detach().cpu().numpy().squeeze().astype(np.float32)
+            else:
+                logger.warning("[TADA] Generation produced no audio")
+                audio = np.zeros(24000, dtype=np.float32)
+
+            return audio, 24000
+
+        return await asyncio.to_thread(_generate_sync)
diff --git a/backend/build_binary.py b/backend/build_binary.py
index f9cdb1b7..0c6dfebd 100644
--- a/backend/build_binary.py
+++ b/backend/build_binary.py
@@ -186,6 +186,50 @@ def build_server(cuda=False):
             # needed by LuxTTS for text-to-phoneme conversion
             "--collect-all",
             "piper_phonemize",
+            # HumeAI TADA — speech-language model using Llama + flow matching
+            "--hidden-import",
+            "backend.backends.hume_backend",
+            "--hidden-import",
+            "tada",
+            "--hidden-import",
+            "tada.modules",
+            "--hidden-import",
+            "tada.modules.tada",
+            "--hidden-import",
+            "tada.modules.encoder",
+            "--hidden-import",
+            "tada.modules.decoder",
+            "--hidden-import",
+            "tada.modules.aligner",
+            "--hidden-import",
+            "tada.modules.acoustic_spkr_verf",
+            "--hidden-import",
+            "tada.nn",
+            "--hidden-import",
+            "tada.nn.vibevoice",
+            "--hidden-import",
+            "tada.utils",
+            "--hidden-import",
+            "tada.utils.gray_code",
+            "--hidden-import",
+            "tada.utils.text",
+            # descript-audio-codec (DAC) — used by TADA for Snake1d layers
+            "--hidden-import",
+            "dac",
+            "--hidden-import",
+            "dac.nn",
+            "--hidden-import",
+            "dac.nn.layers",
+            "--hidden-import",
+            "dac.model",
+            "--hidden-import",
+            "dac.model.dac",
+            "--collect-all",
+            "dac",
+            "--hidden-import",
+            "torchaudio",
+            "--collect-submodules",
+            "tada",
         ]
     )
 
diff --git a/backend/models.py b/backend/models.py
index 3308b3bc..4dd2b368 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -66,9 +66,9 @@ class GenerationRequest(BaseModel):
     text: str = Field(..., min_length=1, max_length=50000)
     language: str = Field(default="en", pattern="^(zh|en|ja|ko|de|fr|ru|pt|es|it|he|ar|da|el|fi|hi|ms|nl|no|pl|sv|sw|tr)$")
     seed: Optional[int] = Field(None, ge=0)
-    model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B)$")
+    model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B)$")
     instruct: Optional[str] = Field(None, max_length=500)
-    engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo)$")
+    engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada)$")
     max_chunk_chars: int = Field(
         default=800, ge=100, le=5000, description="Max characters per chunk for long text splitting"
     )
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 0d927975..2bba9d70 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -33,6 +33,11 @@ s3tokenizer
 spacy-pkuseg
 pyloudnorm
 
+# HumeAI TADA sub-dependencies (hume-tada itself is installed
+# --no-deps in the setup script because it pins torch>=2.7,<2.8)
+descript-audio-codec>=1.0.0
+torchaudio
+
 # Audio processing
 librosa>=0.10.0
 soundfile>=0.12.0
diff --git a/justfile b/justfile
index 796e3ddd..fd8bf962 100644
--- a/justfile
+++ b/justfile
@@ -46,6 +46,8 @@ setup-python:
     {{ pip }} install -r {{ backend_dir }}/requirements.txt
     # Chatterbox pins numpy<1.26 / torch==2.6 which break on Python 3.12+
     {{ pip }} install --no-deps chatterbox-tts
+    # HumeAI TADA pins torch>=2.7,<2.8 which conflicts with our torch>=2.1
+    {{ pip }} install --no-deps hume-tada
     # Apple Silicon: install MLX backend
     if [ "$(uname -m)" = "arm64" ] && [ "$(uname)" = "Darwin" ]; then
         echo "Detected Apple Silicon — installing MLX dependencies..."
@@ -74,6 +76,7 @@ setup-python:
     }
     & "{{ pip }}" install -r {{ backend_dir }}/requirements.txt
     & "{{ pip }}" install --no-deps chatterbox-tts
+    & "{{ pip }}" install --no-deps hume-tada
     & "{{ pip }}" install git+https://github.com/QwenLM/Qwen3-TTS.git
     & "{{ pip }}" install pyinstaller ruff pytest pytest-asyncio -q
     Write-Host "Python environment ready."

From b02ce8e2f31f59c6803b6b83040ca6577a38ad45 Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Tue, 17 Mar 2026 02:16:33 -0700
Subject: [PATCH 2/7] replace descript-audio-codec with lightweight DAC shim

The real descript-audio-codec package pulls in descript-audiotools,
which transitively requires onnx, tensorboard, protobuf, matplotlib,
pystoi, and other heavy dependencies. onnx fails to build from source
on macOS due to CMake version incompatibility.

TADA only uses Snake1d (a 7-line PyTorch module) from DAC. This commit
adds a shim in backend/utils/dac_shim.py that registers fake dac.*
modules in sys.modules with just the Snake1d class, completely
eliminating the DAC/audiotools dependency chain.
---
 backend/backends/hume_backend.py |  7 +++
 backend/build_binary.py          | 16 ++----
 backend/requirements.txt         |  6 ++-
 backend/utils/dac_shim.py        | 93 ++++++++++++++++++++++++++++++++
 4 files changed, 108 insertions(+), 14 deletions(-)
 create mode 100644 backend/utils/dac_shim.py

diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py
index 1d4a4600..b3beab25 100644
--- a/backend/backends/hume_backend.py
+++ b/backend/backends/hume_backend.py
@@ -100,6 +100,13 @@ def _load_model_sync(self, model_size: str = "1B"):
         repo = TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO)
 
         with model_load_progress(model_name, is_cached):
+            # Install DAC shim before importing tada — tada's encoder/decoder
+            # import dac.nn.layers.Snake1d which requires the descript-audio-codec
+            # package.  The real package pulls in onnx/tensorboard/matplotlib via
+            # descript-audiotools, so we use a lightweight shim instead.
+            from ..utils.dac_shim import install_dac_shim
+            install_dac_shim()
+
             import torch
             from huggingface_hub import snapshot_download
 
diff --git a/backend/build_binary.py b/backend/build_binary.py
index 0c6dfebd..0574894f 100644
--- a/backend/build_binary.py
+++ b/backend/build_binary.py
@@ -213,19 +213,11 @@ def build_server(cuda=False):
             "tada.utils.gray_code",
             "--hidden-import",
             "tada.utils.text",
-            # descript-audio-codec (DAC) — used by TADA for Snake1d layers
+            # DAC shim — provides dac.nn.layers.Snake1d without the real
+            # descript-audio-codec package (which pulls onnx/tensorboard via
+            # descript-audiotools). The shim is in backend/utils/dac_shim.py.
             "--hidden-import",
-            "dac",
-            "--hidden-import",
-            "dac.nn",
-            "--hidden-import",
-            "dac.nn.layers",
-            "--hidden-import",
-            "dac.model",
-            "--hidden-import",
-            "dac.model.dac",
-            "--collect-all",
-            "dac",
+            "backend.utils.dac_shim",
             "--hidden-import",
             "torchaudio",
             "--collect-submodules",
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 2bba9d70..d77f97be 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -34,8 +34,10 @@ spacy-pkuseg
 pyloudnorm
 
 # HumeAI TADA sub-dependencies (hume-tada itself is installed
-# --no-deps in the setup script because it pins torch>=2.7,<2.8)
-descript-audio-codec>=1.0.0
+# --no-deps in the setup script because it pins torch>=2.7,<2.8.
+# descript-audio-codec is NOT installed — it pulls onnx/tensorboard
+# via descript-audiotools.  A lightweight shim in utils/dac_shim.py
+# provides the only class TADA uses: Snake1d.)
 torchaudio
 
 # Audio processing
diff --git a/backend/utils/dac_shim.py b/backend/utils/dac_shim.py
new file mode 100644
index 00000000..ea1294fc
--- /dev/null
+++ b/backend/utils/dac_shim.py
@@ -0,0 +1,93 @@
+"""
+Minimal shim for descript-audio-codec (DAC).
+
+TADA only imports Snake1d from dac.nn.layers and dac.model.dac.
+The real DAC package pulls in descript-audiotools which depends on
+onnx, tensorboard, protobuf, matplotlib, pystoi, etc. — none of
+which are needed for TADA's runtime use of Snake1d.
+
+This shim provides the exact Snake1d implementation (MIT-licensed,
+from https://github.com/descriptinc/descript-audio-codec) so we can
+avoid the entire audiotools dependency chain.
+
+If the real DAC package is installed, install_dac_shim() is a no-op:
+it tries `import dac` first and leaves the real package in place.
+The fake modules are registered only when that import fails.
+"""
+
+import sys
+import types
+
+import torch
+import torch.nn as nn
+
+
+# ── Snake activation (from dac/nn/layers.py) ────────────────────────
+
+@torch.jit.script
+def snake(x: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
+    shape = x.shape
+    x = x.reshape(shape[0], shape[1], -1)
+    x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
+    x = x.reshape(shape)
+    return x
+
+
+class Snake1d(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.alpha = nn.Parameter(torch.ones(1, channels, 1))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return snake(x, self.alpha)
+
+
+# ── Register as dac.nn.layers and dac.model.dac ─────────────────────
+
+def install_dac_shim() -> None:
+    """Register fake dac package modules in sys.modules.
+
+    Only installs the shim if 'dac' is not already importable
+    (i.e. the real descript-audio-codec is not installed).
+    """
+    try:
+        import dac  # noqa: F401  — real package exists, do nothing
+        return
+    except ImportError:
+        pass
+
+    # Create the module tree: dac -> dac.nn -> dac.nn.layers
+    #                              -> dac.model -> dac.model.dac
+    dac_pkg = types.ModuleType("dac")
+    dac_pkg.__path__ = []  # make it a package
+    dac_pkg.__package__ = "dac"
+
+    dac_nn = types.ModuleType("dac.nn")
+    dac_nn.__path__ = []
+    dac_nn.__package__ = "dac.nn"
+
+    dac_nn_layers = types.ModuleType("dac.nn.layers")
+    dac_nn_layers.__package__ = "dac.nn"
+    dac_nn_layers.Snake1d = Snake1d
+    dac_nn_layers.snake = snake
+
+    dac_model = types.ModuleType("dac.model")
+    dac_model.__path__ = []
+    dac_model.__package__ = "dac.model"
+
+    dac_model_dac = types.ModuleType("dac.model.dac")
+    dac_model_dac.__package__ = "dac.model"
+    dac_model_dac.Snake1d = Snake1d
+
+    # Wire up submodules
+    dac_pkg.nn = dac_nn
+    dac_pkg.model = dac_model
+    dac_nn.layers = dac_nn_layers
+    dac_model.dac = dac_model_dac
+
+    # Register in sys.modules
+    sys.modules["dac"] = dac_pkg
+    sys.modules["dac.nn"] = dac_nn
+    sys.modules["dac.nn.layers"] = dac_nn_layers
+    sys.modules["dac.model"] = dac_model
+    sys.modules["dac.model.dac"] = dac_model_dac

From 7a90290a763bfdc0a1b00825d9fe3382914506dd Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Tue, 17 Mar 2026 02:22:26 -0700
Subject: [PATCH 3/7] fix gated Llama tokenizer error by redirecting to ungated
 mirror

TADA hardcodes 'meta-llama/Llama-3.2-1B' as its tokenizer source in
both the Aligner and TadaForCausalLM.from_pretrained(). That repo is
gated and requires accepting Meta's license on HuggingFace.

Monkey-patch AutoTokenizer.from_pretrained during model loading to
redirect Llama tokenizer requests to 'unsloth/Llama-3.2-1B', an
ungated mirror with identical tokenizer files. The patch is scoped
to model loading only and restored immediately after.
---
 backend/backends/hume_backend.py | 62 ++++++++++++++++++++++++--------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py
index b3beab25..804ce936 100644
--- a/backend/backends/hume_backend.py
+++ b/backend/backends/hume_backend.py
@@ -130,27 +130,59 @@ def _load_model_sync(self, model_size: str = "1B"):
                 allow_patterns=["*.safetensors", "*.json", "*.txt", "*.bin", "*.model"],
             )
 
+            # Pre-download the Llama tokenizer from an ungated mirror.
+            # TADA hardcodes "meta-llama/Llama-3.2-1B" which is gated;
+            # we redirect to unsloth's ungated copy at load time.
+            logger.info("Downloading Llama tokenizer (ungated mirror)...")
+            snapshot_download(
+                repo_id="unsloth/Llama-3.2-1B",
+                token=None,
+                allow_patterns=["tokenizer*", "special_tokens*"],
+            )
+
             # Determine dtype — use bf16 on CUDA for ~50% memory savings
             if device == "cuda" and torch.cuda.is_bf16_supported():
                 model_dtype = torch.bfloat16
             else:
                 model_dtype = torch.float32
 
-            # Load encoder (only needed for voice prompt encoding)
-            from tada.modules.encoder import Encoder
-            logger.info("Loading TADA encoder...")
-            self.encoder = Encoder.from_pretrained(
-                TADA_CODEC_REPO, subfolder="encoder"
-            ).to(device)
-            self.encoder.eval()
-
-            # Load the causal LM (includes decoder for wav generation)
-            from tada.modules.tada import TadaForCausalLM
-            logger.info(f"Loading TADA {model_size} model...")
-            self.model = TadaForCausalLM.from_pretrained(
-                repo, torch_dtype=model_dtype
-            ).to(device)
-            self.model.eval()
+            # TADA hardcodes "meta-llama/Llama-3.2-1B" as the tokenizer
+            # source in its Aligner and TadaForCausalLM.from_pretrained().
+            # That repo is gated (requires Meta license acceptance on HF).
+            # Monkey-patch AutoTokenizer.from_pretrained to redirect to an
+            # ungated mirror that ships the identical tokenizer files.
+            from transformers import AutoTokenizer
+            _orig_from_pretrained = AutoTokenizer.from_pretrained.__func__
+
+            @classmethod  # type: ignore[misc]
+            def _patched_from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+                if "meta-llama/Llama-3.2" in str(pretrained_model_name_or_path):
+                    pretrained_model_name_or_path = "unsloth/Llama-3.2-1B"
+                    kwargs.setdefault("token", None)
+                    logger.info("Redirecting Llama tokenizer to ungated mirror: unsloth/Llama-3.2-1B")
+                return _orig_from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs)
+
+            AutoTokenizer.from_pretrained = _patched_from_pretrained
+
+            try:
+                # Load encoder (only needed for voice prompt encoding)
+                from tada.modules.encoder import Encoder
+                logger.info("Loading TADA encoder...")
+                self.encoder = Encoder.from_pretrained(
+                    TADA_CODEC_REPO, subfolder="encoder"
+                ).to(device)
+                self.encoder.eval()
+
+                # Load the causal LM (includes decoder for wav generation)
+                from tada.modules.tada import TadaForCausalLM
+                logger.info(f"Loading TADA {model_size} model...")
+                self.model = TadaForCausalLM.from_pretrained(
+                    repo, torch_dtype=model_dtype
+                ).to(device)
+                self.model.eval()
+            finally:
+                # Restore original to avoid affecting other code
+                AutoTokenizer.from_pretrained = _orig_from_pretrained
 
         logger.info(f"HumeAI TADA {model_size} loaded successfully on {device}")
 

From 12cda2e0905aa80066e77ee01a540d8b66765cac Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Tue, 17 Mar 2026 02:25:05 -0700
Subject: [PATCH 4/7] fix torchcodec error by using soundfile instead of
 torchaudio.load

torchaudio 2.10+ switched its default audio loading backend to
torchcodec, which isn't installed. Replace torchaudio.load() with
soundfile.read() in create_voice_prompt(). TADA's internal use of
torchaudio.functional.resample() is unaffected (pure PyTorch math,
no torchcodec dependency).
---
 backend/backends/hume_backend.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py
index 804ce936..d82afb4e 100644
--- a/backend/backends/hume_backend.py
+++ b/backend/backends/hume_backend.py
@@ -231,12 +231,17 @@ async def create_voice_prompt(
 
         def _encode_sync():
             import torch
-            import torchaudio
+            import soundfile as sf
 
             device = self._device
 
-            # Load and prepare audio
-            audio, sr = torchaudio.load(str(audio_path))
+            # Load audio with soundfile (torchaudio 2.10+ requires torchcodec)
+            audio_np, sr = sf.read(str(audio_path), dtype="float32")
+            audio = torch.from_numpy(audio_np).float()
+            if audio.ndim == 1:
+                audio = audio.unsqueeze(0)  # (samples,) -> (1, samples)
+            else:
+                audio = audio.T  # (samples, channels) -> (channels, samples)
             audio = audio.to(device)
 
             # Encode with forced alignment

From 6bf40bd2d07fbdc84a3c220cf021a732eae2414c Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Tue, 17 Mar 2026 03:15:57 -0700
Subject: [PATCH 5/7] fix tokenizer patch corrupting AutoTokenizer for other
 engines

Replace the monkey-patch on AutoTokenizer.from_pretrained (which broke
the classmethod descriptor and caused 'Tokenizer not loaded' errors
when loading Qwen after TADA) with two targeted config patches:
- Set AlignerConfig.tokenizer_name to the local ungated tokenizer path
- Pre-load TadaConfig, inject tokenizer_name, pass config= to from_pretrained

No transformers global state is modified (only the class-level default on TADA's own AlignerConfig is overridden), so other engines are unaffected.
---
 backend/backends/hume_backend.py | 75 +++++++++++++++-----------------
 1 file changed, 34 insertions(+), 41 deletions(-)

diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py
index d82afb4e..456fd46e 100644
--- a/backend/backends/hume_backend.py
+++ b/backend/backends/hume_backend.py
@@ -130,11 +130,13 @@ def _load_model_sync(self, model_size: str = "1B"):
                 allow_patterns=["*.safetensors", "*.json", "*.txt", "*.bin", "*.model"],
             )
 
-            # Pre-download the Llama tokenizer from an ungated mirror.
-            # TADA hardcodes "meta-llama/Llama-3.2-1B" which is gated;
-            # we redirect to unsloth's ungated copy at load time.
+            # TADA hardcodes "meta-llama/Llama-3.2-1B" as the tokenizer
+            # source in its Aligner and TadaForCausalLM.from_pretrained().
+            # That repo is gated (requires Meta license acceptance).
+            # Download the tokenizer from an ungated mirror and get its
+            # local cache path so we can point TADA at it directly.
             logger.info("Downloading Llama tokenizer (ungated mirror)...")
-            snapshot_download(
+            tokenizer_path = snapshot_download(
                 repo_id="unsloth/Llama-3.2-1B",
                 token=None,
                 allow_patterns=["tokenizer*", "special_tokens*"],
@@ -146,43 +148,34 @@ def _load_model_sync(self, model_size: str = "1B"):
             else:
                 model_dtype = torch.float32
 
-            # TADA hardcodes "meta-llama/Llama-3.2-1B" as the tokenizer
-            # source in its Aligner and TadaForCausalLM.from_pretrained().
-            # That repo is gated (requires Meta license acceptance on HF).
-            # Monkey-patch AutoTokenizer.from_pretrained to redirect to an
-            # ungated mirror that ships the identical tokenizer files.
-            from transformers import AutoTokenizer
-            _orig_from_pretrained = AutoTokenizer.from_pretrained.__func__
-
-            @classmethod  # type: ignore[misc]
-            def _patched_from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-                if "meta-llama/Llama-3.2" in str(pretrained_model_name_or_path):
-                    pretrained_model_name_or_path = "unsloth/Llama-3.2-1B"
-                    kwargs.setdefault("token", None)
-                    logger.info("Redirecting Llama tokenizer to ungated mirror: unsloth/Llama-3.2-1B")
-                return _orig_from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs)
-
-            AutoTokenizer.from_pretrained = _patched_from_pretrained
-
-            try:
-                # Load encoder (only needed for voice prompt encoding)
-                from tada.modules.encoder import Encoder
-                logger.info("Loading TADA encoder...")
-                self.encoder = Encoder.from_pretrained(
-                    TADA_CODEC_REPO, subfolder="encoder"
-                ).to(device)
-                self.encoder.eval()
-
-                # Load the causal LM (includes decoder for wav generation)
-                from tada.modules.tada import TadaForCausalLM
-                logger.info(f"Loading TADA {model_size} model...")
-                self.model = TadaForCausalLM.from_pretrained(
-                    repo, torch_dtype=model_dtype
-                ).to(device)
-                self.model.eval()
-            finally:
-                # Restore original to avoid affecting other code
-                AutoTokenizer.from_pretrained = _orig_from_pretrained
+            # Override the Aligner config class default so it uses the local
+            # tokenizer path instead of the gated "meta-llama/Llama-3.2-1B".
+            # This is a process-wide class attribute, but it only touches TADA's
+            # own class, unlike patching AutoTokenizer.from_pretrained.
+            from tada.modules.aligner import AlignerConfig
+            AlignerConfig.tokenizer_name = tokenizer_path
+
+            # Load encoder (only needed for voice prompt encoding)
+            from tada.modules.encoder import Encoder
+            logger.info("Loading TADA encoder...")
+            self.encoder = Encoder.from_pretrained(
+                TADA_CODEC_REPO, subfolder="encoder"
+            ).to(device)
+            self.encoder.eval()
+
+            # Load the causal LM (includes decoder for wav generation).
+            # TadaForCausalLM.from_pretrained() calls
+            #   getattr(config, "tokenizer_name", "meta-llama/Llama-3.2-1B")
+            # which hits the gated repo. Pre-load the config from HF,
+            # inject the local tokenizer path, then pass it in.
+            from tada.modules.tada import TadaForCausalLM, TadaConfig
+            logger.info(f"Loading TADA {model_size} model...")
+            config = TadaConfig.from_pretrained(repo)
+            config.tokenizer_name = tokenizer_path
+            self.model = TadaForCausalLM.from_pretrained(
+                repo, config=config, torch_dtype=model_dtype
+            ).to(device)
+            self.model.eval()
 
         logger.info(f"HumeAI TADA {model_size} loaded successfully on {device}")
 

From 5774a168a9874c27acd4854f7194a52eb135318d Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Tue, 17 Mar 2026 03:17:53 -0700
Subject: [PATCH 6/7] fix TADA 3B model name: tada-3b -> tada-3b-ml

---
 app/src/lib/hooks/useGenerationForm.ts | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts
index d174bad0..8e73ce07 100644
--- a/app/src/lib/hooks/useGenerationForm.ts
+++ b/app/src/lib/hooks/useGenerationForm.ts
@@ -80,7 +80,9 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
             : engine === 'chatterbox_turbo'
               ? 'chatterbox-turbo'
               : engine === 'tada'
-                ? `tada-${(data.modelSize || '1B').toLowerCase()}`
+                ? data.modelSize === '3B'
+                  ? 'tada-3b-ml'
+                  : 'tada-1b'
                 : `qwen-tts-${data.modelSize}`;
       const displayName =
         engine === 'luxtts'

From 273483ffcfab2056804ce02559c5d1129d67477c Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Tue, 17 Mar 2026 03:28:58 -0700
Subject: [PATCH 7/7] fix TorchScript error in frozen builds and update docs
 for TADA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove @torch.jit.script from the DAC shim's snake() function —
TorchScript calls inspect.getsource() which fails in PyInstaller
binaries (no .py source files).

Update all user-facing docs: 4 → 5 TTS engines, add TADA row to
every engine comparison table, mark TADA as Shipped in the upcoming
engines list, update architecture diagrams and tech stack tables.
---
 README.md                                   | 11 ++++++-----
 backend/utils/dac_shim.py                   |  4 +++-
 docs/content/docs/developer/tts-engines.mdx |  2 +-
 docs/content/docs/index.mdx                 |  4 ++--
 docs/content/docs/overview/introduction.mdx |  9 +++++----
 docs/notes/PROJECT_STATUS.md                | 18 +++++++++++++-----
 6 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 614e4f21..1acc9a4f 100644
--- a/README.md
+++ b/README.md
@@ -59,10 +59,10 @@
 
 ## What is Voicebox?
 
-Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor.
+Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor.
 
 - **Complete privacy** — models and voice data stay on your machine
-- **4 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo
+- **5 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA
 - **23 languages** — from English to Arabic, Japanese, Hindi, Swahili, and more
 - **Post-processing effects** — pitch shift, reverb, delay, chorus, compression, and filters
 - **Expressive speech** — paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo
@@ -93,7 +93,7 @@ Voicebox is a **local-first voice cloning studio** — a free and open-source al
 
 ### Multi-Engine Voice Cloning
 
-Four TTS engines with different strengths, switchable per-generation:
+Five TTS engines with different strengths, switchable per-generation:
 
 | Engine                      | Languages | Strengths                                                                                                                                |
 | --------------------------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
@@ -101,6 +101,7 @@ Four TTS engines with different strengths, switchable per-generation:
 | **LuxTTS**                  | English   | Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU                                                                              |
 | **Chatterbox Multilingual** | 23        | Broadest language coverage — Arabic, Danish, Finnish, Greek, Hebrew, Hindi, Malay, Norwegian, Polish, Swahili, Swedish, Turkish and more |
 | **Chatterbox Turbo**        | English   | Fast 350M model with paralinguistic emotion/sound tags                                                                                   |
+| **TADA** (1B / 3B)          | EN / 10   | HumeAI speech-language model — 700s+ coherent audio, text-acoustic dual alignment                                                        |
 
 ### Emotions & Paralinguistic Tags
 
@@ -230,7 +231,7 @@ Full API documentation available at `http://localhost:17493/docs`.
 | Frontend      | React, TypeScript, Tailwind CSS                   |
 | State         | Zustand, React Query                              |
 | Backend       | FastAPI (Python)                                  |
-| TTS Engines   | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo   |
+| TTS Engines   | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo, TADA |
 | Effects       | Pedalboard (Spotify)                              |
 | Transcription | Whisper / Whisper Turbo (PyTorch or MLX)          |
 | Inference     | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) |
@@ -245,7 +246,7 @@ Full API documentation available at `http://localhost:17493/docs`.
 | ----------------------- | ---------------------------------------------- |
 | **Real-time Streaming** | Stream audio as it generates, word by word     |
 | **Voice Design**        | Create new voices from text descriptions       |
-| **More Models**         | XTTS, Bark, and other open-source voice models |
+| **More Models**         | XTTS, Bark, and other open-source voice models  |
 | **Plugin Architecture** | Extend with custom models and effects          |
 | **Mobile Companion**    | Control Voicebox from your phone               |
 
diff --git a/backend/utils/dac_shim.py b/backend/utils/dac_shim.py
index ea1294fc..89968c18 100644
--- a/backend/utils/dac_shim.py
+++ b/backend/utils/dac_shim.py
@@ -24,7 +24,9 @@
 
 # ── Snake activation (from dac/nn/layers.py) ────────────────────────
 
-@torch.jit.script
+# NOTE: The original DAC code uses @torch.jit.script here for a 1.4x
+# speedup.  We omit it because TorchScript calls inspect.getsource()
+# which fails inside a PyInstaller frozen binary (no .py source files).
 def snake(x: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
     shape = x.shape
     x = x.reshape(shape[0], shape[1], -1)
diff --git a/docs/content/docs/developer/tts-engines.mdx b/docs/content/docs/developer/tts-engines.mdx
index f95e7a71..90135a37 100644
--- a/docs/content/docs/developer/tts-engines.mdx
+++ b/docs/content/docs/developer/tts-engines.mdx
@@ -490,7 +490,7 @@ Based on the current model landscape, these are candidates for future integratio
 | **Fish Speech** | 50+ | Medium | Word-level control via inline text | Ready |
 | **Kokoro-82M** | English | 82M | CPU realtime, Apache 2.0 | Ready |
 | **XTTS-v2** | 17+ | Medium | Zero-shot cloning | Ready |
-| **HumeAI TADA** | EN (1B), Multi (3B) | Medium | 700s+ coherent audio, synced transcripts | Needs vetting |
+| **HumeAI TADA** | EN (1B), Multi (3B) | Medium | 700s+ coherent audio, synced transcripts | Shipped |
 | **MOSS-TTS** | Multilingual | Medium | Text-to-voice design, multi-speaker dialogue | Needs vetting |
 | **Pocket TTS** | English | ~100M | CPU-first, >1× realtime | Needs vetting |
 
diff --git a/docs/content/docs/index.mdx b/docs/content/docs/index.mdx
index cbe120c3..9a7da078 100644
--- a/docs/content/docs/index.mdx
+++ b/docs/content/docs/index.mdx
@@ -3,12 +3,12 @@ title: "Voicebox Documentation"
 description: "Voicebox is a local-first voice cloning studio -- a free and open-source alternative to ElevenLabs."
 ---
 
-Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor.
+Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor.
 
 ![Voicebox App Screenshot](/images/app-screenshot-1.webp)
 
 - **Complete privacy** -- models and voice data stay on your machine
-- **4 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo
+- **5 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA
 - **23 languages** -- from English to Arabic, Japanese, Hindi, Swahili, and more
 - **Post-processing effects** -- pitch shift, reverb, delay, chorus, compression, and filters
 - **Expressive speech** -- paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo
diff --git a/docs/content/docs/overview/introduction.mdx b/docs/content/docs/overview/introduction.mdx
index 134f1d71..d740a61e 100644
--- a/docs/content/docs/overview/introduction.mdx
+++ b/docs/content/docs/overview/introduction.mdx
@@ -5,10 +5,10 @@ description: "Voicebox is a local-first voice cloning studio -- a free and open-
 
 ## What is Voicebox?
 
-Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor.
+Voicebox is a **local-first voice cloning studio** -- a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor.
 
 - **Complete privacy** -- models and voice data stay on your machine
-- **4 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo
+- **5 TTS engines** -- Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA
 - **23 languages** -- from English to Arabic, Japanese, Hindi, Swahili, and more
 - **Post-processing effects** -- pitch shift, reverb, delay, chorus, compression, and filters
 - **Expressive speech** -- paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo
@@ -20,7 +20,7 @@ Voicebox is a **local-first voice cloning studio** -- a free and open-source alt
 
 ## TTS Engines
 
-Four engines with different strengths, switchable per-generation:
+Five engines with different strengths, switchable per-generation:
 
 | Engine | Languages | Strengths |
 |--------|-----------|-----------|
@@ -28,6 +28,7 @@ Four engines with different strengths, switchable per-generation:
 | **LuxTTS** | English | Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU |
 | **Chatterbox Multilingual** | 23 | Broadest language coverage |
 | **Chatterbox Turbo** | English | Fast 350M model with paralinguistic emotion/sound tags |
+| **TADA** (1B / 3B) | English (1B), 10 (3B) | HumeAI speech-language model -- 700s+ coherent audio |
 
 ## GPU Support
 
@@ -56,7 +57,7 @@ Four engines with different strengths, switchable per-generation:
 | Frontend | React, TypeScript, Tailwind CSS |
 | State | Zustand, React Query |
 | Backend | FastAPI (Python) |
-| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo |
+| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo, TADA |
 | Effects | Pedalboard (Spotify) |
 | Transcription | Whisper / Whisper Turbo (PyTorch or MLX) |
 | Inference | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) |
diff --git a/docs/notes/PROJECT_STATUS.md b/docs/notes/PROJECT_STATUS.md
index 628dfa0a..71c06453 100644
--- a/docs/notes/PROJECT_STATUS.md
+++ b/docs/notes/PROJECT_STATUS.md
@@ -36,6 +36,10 @@
 │  │  │  │ Qwen3-TTS│ │LuxTTS │ │Chatterbox │  │  │  │
 │  │  │  │(Py/MLX)  │ │       │ │(MTL+Turbo)│  │  │  │
 │  │  │  └──────────┘ └───────┘ └───────────┘  │  │  │
+│  │  │  ┌──────────┐                           │  │  │
+│  │  │  │ TADA     │                           │  │  │
+│  │  │  │(1B / 3B) │                           │  │  │
+│  │  │  └──────────┘                           │  │  │
 │  │  └─────────────────────────────────────────┘  │  │
 │  │  ┌───────────┐  ┌─────────┐                   │  │
 │  │  │ STTBackend│  │ Profiles│                   │  │
@@ -59,6 +63,7 @@
 | LuxTTS | `backend/backends/luxtts_backend.py` | LuxTTS — fast, CPU-friendly |
 | Chatterbox MTL | `backend/backends/chatterbox_backend.py` | Chatterbox Multilingual — 23 languages |
 | Chatterbox Turbo | `backend/backends/chatterbox_turbo_backend.py` | Chatterbox Turbo — English, paralinguistic tags |
+| TADA | `backend/backends/hume_backend.py` | HumeAI TADA — 1B English + 3B Multilingual |
 | Platform detect | `backend/platform_detect.py` | Apple Silicon → MLX, else → PyTorch |
 | API types | `backend/models.py` | Pydantic request/response models |
 | HF progress | `backend/utils/hf_progress.py` | HFProgressTracker (tqdm patching for download progress) |
@@ -78,7 +83,7 @@
 ```
 POST /generate
   1. Look up voice profile from DB
-  2. Resolve engine from request (qwen | luxtts | chatterbox | chatterbox_turbo)
+  2. Resolve engine from request (qwen | luxtts | chatterbox | chatterbox_turbo | tada)
   3. Get backend: get_tts_backend_for_engine(engine)  # thread-safe singleton per engine
   4. Check model cache → if missing, trigger background download, return HTTP 202
   5. Load model (lazy): tts_backend.load_model(model_size)
@@ -104,7 +109,8 @@ POST /generate
 - LuxTTS integration — fast, CPU-friendly English TTS (PR #254)
 - Chatterbox Multilingual TTS — 23 languages including Hebrew (PR #257)
 - Instruct parameter UI exists but is non-functional across all backends (see #224, Known Limitations)
-- Single flat model dropdown (Qwen 1.7B, Qwen 0.6B, LuxTTS, Chatterbox, Chatterbox Turbo)
+- HumeAI TADA integration — 1B English + 3B Multilingual speech-language model (PR #296)
+- Single flat model dropdown (Qwen 1.7B, Qwen 0.6B, LuxTTS, Chatterbox, Chatterbox Turbo, TADA 1B, TADA 3B)
 - Centralized model config registry (`ModelConfig` dataclass) — no per-engine dispatch maps in `main.py`
 - Shared `EngineModelSelector` component — engine/model dropdown defined once, used in both generation forms
 
@@ -136,6 +142,8 @@ POST /generate
 | LuxTTS | `luxtts` | English | ~300 MB | CPU-friendly, 48 kHz, fast | None |
 | Chatterbox | `chatterbox-tts` | 23 (incl. Hebrew, Arabic, Hindi, etc.) | ~3.2 GB | Zero-shot cloning, multilingual | Partial — `exaggeration` float (0-1) for expressiveness |
 | Chatterbox Turbo | `chatterbox-turbo` | English | ~1.5 GB | Paralinguistic tags ([laugh], [cough]), 350M params, low latency | Partial — inline tags only, no separate instruct param |
+| TADA 1B | `tada-1b` | English | ~4 GB | HumeAI speech-language model, 700s+ coherent audio | None |
+| TADA 3B Multilingual | `tada-3b-ml` | 10 (en, ar, zh, de, es, fr, it, ja, pl, pt) | ~8 GB | Multilingual, text-acoustic dual alignment | None |
 
 ### Multi-Engine Architecture (Shipped)
 
@@ -143,7 +151,7 @@ The singleton TTS backend blocker described in the previous version of this doc
 
 - **Thread-safe backend registry** (`_tts_backends` dict + `_tts_backends_lock`) with double-checked locking
 - **Per-engine backend instances** — each engine gets its own singleton, loaded lazily
-- **Engine field on GenerationRequest** — frontend sends `engine: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo'`
+- **Engine field on GenerationRequest** — frontend sends `engine: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada'`
 - **Per-engine language filtering** — `ENGINE_LANGUAGES` map in frontend, backend regex accepts all languages
 - **Per-engine voice prompts** — `create_voice_prompt_for_profile()` dispatches to the correct backend
 - **Trim post-processing** — `trim_tts_output()` for Chatterbox engines (cuts trailing silence/hallucination)
@@ -337,7 +345,7 @@ Notable requests:
 | **CosyVoice2-0.5B** | 3-10s zero-shot | Very fast | 24 kHz | Multilingual | Low | **Yes** — `inference_instruct2()`, works with cloning | Ready | Best instruct candidate |
 | **Fish Speech** | 10-30s few-shot | Real-time | 24-44 kHz | 50+ | Medium | **Yes** — inline text descriptions, word-level control | Ready | Multi-engine arch in place |
 | **MOSS-TTS Family** | Zero-shot | — | — | Multilingual | Medium | **Yes** — text prompts for style + timbre design | Needs vetting | Apache 2.0, multi-speaker dialogue |
-| **HumeAI TADA 1B/3B** | Zero-shot | 5× faster than LLM-TTS | — | EN (1B), Multilingual (3B) | Medium | Partial — automatic prosody from text context | Needs vetting | MIT, 700s+ coherent, synced transcript output |
+| **HumeAI TADA 1B/3B** | Zero-shot | 5× faster than LLM-TTS | 24 kHz | EN (1B), Multilingual (3B) | Medium | Partial — automatic prosody from text context | **Shipped** | PR #296, MIT, 700s+ coherent audio |
 | **VoxCPM 1.5** | Zero-shot (seconds) | ~0.15 RTF streaming | — | Bilingual (EN/ZH) | Medium | Partial — automatic context-aware prosody | Needs vetting | Apache 2.0, tokenizer-free continuous diffusion |
 | **Kokoro-82M** | 3s instant | CPU realtime | 24 kHz | English | Tiny (82M) | Partial — automatic style inference | Ready | Apache 2.0, multi-engine arch in place |
 | **XTTS-v2** | 6s zero-shot | Mid-GPU | 24 kHz | 17+ | Medium | Partial — style transfer from ref audio only | Ready | Multi-engine arch in place |
@@ -475,7 +483,7 @@ The generation form now uses a flat model dropdown with engine-based routing. Pe
 | `/history/{id}/export` | GET | Export generation ZIP |
 | `/history/{id}/export-audio` | GET | Export audio only |
 | `/transcribe` | POST | Transcribe audio (Whisper) |
-| `/models/status` | GET | All model statuses (Qwen, LuxTTS, Chatterbox, Chatterbox Turbo, Whisper) |
+| `/models/status` | GET | All model statuses (Qwen, LuxTTS, Chatterbox, Chatterbox Turbo, TADA, Whisper) |
 | `/models/download` | POST | Trigger model download |
 | `/models/download/cancel` | POST | Cancel/dismiss download |
 | `/models/{name}` | DELETE | Delete downloaded model |