Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ jobs:
pip install pyinstaller
pip install -r backend/requirements.txt
pip install --no-deps chatterbox-tts
pip install --no-deps hume-tada

- name: Install MLX dependencies (Apple Silicon only)
if: matrix.backend == 'mlx'
Expand Down Expand Up @@ -188,6 +189,7 @@ jobs:
pip install pyinstaller
pip install -r backend/requirements.txt
pip install --no-deps chatterbox-tts
pip install --no-deps hume-tada

- name: Install PyTorch with CUDA 12.6
run: |
Expand Down
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ RUN pip install --no-cache-dir --upgrade pip

COPY backend/requirements.txt .
RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts
RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada
RUN pip install --no-cache-dir --prefix=/install \
git+https://github.com/QwenLM/Qwen3-TTS.git

Expand Down
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@

## What is Voicebox?

Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 4 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor.
Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor.

- **Complete privacy** — models and voice data stay on your machine
- **4 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo
- **5 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA
- **23 languages** — from English to Arabic, Japanese, Hindi, Swahili, and more
- **Post-processing effects** — pitch shift, reverb, delay, chorus, compression, and filters
- **Expressive speech** — paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo
Expand Down Expand Up @@ -93,14 +93,15 @@ Voicebox is a **local-first voice cloning studio** — a free and open-source al

### Multi-Engine Voice Cloning

Four TTS engines with different strengths, switchable per-generation:
Five TTS engines with different strengths, switchable per-generation:

| Engine | Languages | Strengths |
| --------------------------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| **Qwen3-TTS** (0.6B / 1.7B) | 10 | High-quality multilingual cloning, delivery instructions ("speak slowly", "whisper") |
| **LuxTTS** | English | Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU |
| **Chatterbox Multilingual** | 23 | Broadest language coverage — Arabic, Danish, Finnish, Greek, Hebrew, Hindi, Malay, Norwegian, Polish, Swahili, Swedish, Turkish and more |
| **Chatterbox Turbo** | English | Fast 350M model with paralinguistic emotion/sound tags |
| **TADA** (1B / 3B) | English (1B) / 10 (3B) | HumeAI speech-language model — 700s+ coherent audio, text-acoustic dual alignment |

### Emotions & Paralinguistic Tags

Expand Down Expand Up @@ -230,7 +231,7 @@ Full API documentation available at `http://localhost:17493/docs`.
| Frontend | React, TypeScript, Tailwind CSS |
| State | Zustand, React Query |
| Backend | FastAPI (Python) |
| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo |
| TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo, TADA |
| Effects | Pedalboard (Spotify) |
| Transcription | Whisper / Whisper Turbo (PyTorch or MLX) |
| Inference | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) |
Expand All @@ -245,7 +246,7 @@ Full API documentation available at `http://localhost:17493/docs`.
| ----------------------- | ---------------------------------------------- |
| **Real-time Streaming** | Stream audio as it generates, word by word |
| **Voice Design** | Create new voices from text descriptions |
| **More Models** | XTTS, Bark, and other open-source voice models |
| **More Models** | XTTS, Bark, and other open-source voice models |
| **Plugin Architecture** | Extend with custom models and effects |
| **Mobile Companion** | Control Voicebox from your phone |

Expand Down
18 changes: 18 additions & 0 deletions app/src/components/Generation/EngineModelSelector.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,24 @@ const ENGINE_OPTIONS = [
{ value: 'luxtts', label: 'LuxTTS' },
{ value: 'chatterbox', label: 'Chatterbox' },
{ value: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
{ value: 'tada:1B', label: 'TADA 1B' },
{ value: 'tada:3B', label: 'TADA 3B Multilingual' },
] as const;

/**
 * Short, human-readable blurb for each TTS engine, keyed by bare engine id.
 *
 * NOTE(review): the select options for sized engines use `engine:size`
 * values (e.g. `tada:1B`), while this map is keyed by `tada` / `qwen` —
 * confirm the lookup site strips the size suffix before indexing.
 */
const ENGINE_DESCRIPTIONS: Record<string, string> = {
  'qwen': 'Multi-language, two sizes',
  'luxtts': 'Fast, English-focused',
  'chatterbox': '23 languages, incl. Hebrew',
  'chatterbox_turbo': 'English, [laugh] [cough] tags',
  'tada': 'HumeAI, 700s+ coherent audio',
};

/**
 * Engine ids that can only synthesize English.
 * Selecting one of these should force the form language to 'en'.
 */
const ENGLISH_ONLY_ENGINES = new Set<string>([
  'luxtts',
  'chatterbox_turbo',
]);

/**
 * Build the `<select>` value for an engine / model-size pair.
 *
 * Engines that ship several model sizes are encoded as `engine:size`;
 * every other engine maps to its bare id and `modelSize` is ignored.
 *
 * @param engine - Engine id from the generation form.
 * @param modelSize - Selected size; any falsy value falls back to the
 *   engine's default ('1.7B' for qwen, '1B' for tada).
 * @returns The option value string.
 */
function getSelectValue(engine: string, modelSize?: string): string {
  switch (engine) {
    case 'qwen': {
      const size = modelSize || '1.7B';
      return `qwen:${size}`;
    }
    case 'tada': {
      const size = modelSize || '1B';
      return `tada:${size}`;
    }
    default:
      return engine;
  }
}

Expand All @@ -48,6 +52,20 @@ function handleEngineChange(form: UseFormReturn<GenerationFormValues>, value: st
if (!available.some((l) => l.value === currentLang)) {
form.setValue('language', available[0]?.value ?? 'en');
}
} else if (value.startsWith('tada:')) {
const [, modelSize] = value.split(':');
form.setValue('engine', 'tada');
form.setValue('modelSize', modelSize as '1B' | '3B');
// TADA 1B is English-only; 3B is multilingual
if (modelSize === '1B') {
form.setValue('language', 'en');
} else {
const currentLang = form.getValues('language');
const available = getLanguageOptionsForEngine('tada');
if (!available.some((l) => l.value === currentLang)) {
form.setValue('language', available[0]?.value ?? 'en');
}
}
} else {
form.setValue('engine', value as GenerationFormValues['engine']);
form.setValue('modelSize', undefined as unknown as '1.7B' | '0.6B');
Expand Down
7 changes: 6 additions & 1 deletion app/src/components/ServerSettings/ModelManagement.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ const MODEL_DESCRIPTIONS: Record<string, string> = {
'Production-grade open source TTS by Resemble AI. Supports 23 languages with voice cloning and emotion exaggeration control.',
'chatterbox-turbo':
'Streamlined 350M parameter TTS by Resemble AI. High-quality English speech with less compute and VRAM than larger models.',
'tada-1b':
'HumeAI TADA 1B — English speech-language model built on Llama 3.2 1B. Generates 700s+ of coherent audio with synchronized text-acoustic alignment.',
'tada-3b-ml':
'HumeAI TADA 3B Multilingual — built on Llama 3.2 3B. Supports 10 languages with high-fidelity voice cloning via text-acoustic dual alignment.',
'whisper-base':
'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.',
'whisper-small':
Expand Down Expand Up @@ -391,7 +395,8 @@ export function ModelManagement() {
(m) =>
m.model_name.startsWith('qwen-tts') ||
m.model_name.startsWith('luxtts') ||
m.model_name.startsWith('chatterbox'),
m.model_name.startsWith('chatterbox') ||
m.model_name.startsWith('tada'),
) ?? [];
const whisperModels = modelStatus?.models.filter((m) => m.model_name.startsWith('whisper')) ?? [];

Expand Down
4 changes: 2 additions & 2 deletions app/src/lib/api/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ export interface GenerationRequest {
text: string;
language: LanguageCode;
seed?: number;
model_size?: '1.7B' | '0.6B';
engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo';
model_size?: '1.7B' | '0.6B' | '1B' | '3B';
engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada';
instruct?: string;
max_chunk_chars?: number;
crossfade_ms?: number;
Expand Down
1 change: 1 addition & 0 deletions app/src/lib/constants/languages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ export const ENGINE_LANGUAGES: Record<string, readonly LanguageCode[]> = {
'zh',
],
chatterbox_turbo: ['en'],
tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'],
} as const;

/** Helper: get language options for a given engine. */
Expand Down
26 changes: 17 additions & 9 deletions app/src/lib/hooks/useGenerationForm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ const generationSchema = z.object({
text: z.string().min(1, '').max(50000),
language: z.enum(LANGUAGE_CODES as [LanguageCode, ...LanguageCode[]]),
seed: z.number().int().optional(),
modelSize: z.enum(['1.7B', '0.6B']).optional(),
modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(),
instruct: z.string().max(500).optional(),
engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo']).optional(),
engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']).optional(),
});

export type GenerationFormValues = z.infer<typeof generationSchema>;
Expand Down Expand Up @@ -79,17 +79,25 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
? 'chatterbox-tts'
: engine === 'chatterbox_turbo'
? 'chatterbox-turbo'
: `qwen-tts-${data.modelSize}`;
: engine === 'tada'
? data.modelSize === '3B'
? 'tada-3b-ml'
: 'tada-1b'
: `qwen-tts-${data.modelSize}`;
const displayName =
engine === 'luxtts'
? 'LuxTTS'
: engine === 'chatterbox'
? 'Chatterbox TTS'
: engine === 'chatterbox_turbo'
? 'Chatterbox Turbo'
: data.modelSize === '1.7B'
? 'Qwen TTS 1.7B'
: 'Qwen TTS 0.6B';
: engine === 'tada'
? data.modelSize === '3B'
? 'TADA 3B Multilingual'
: 'TADA 1B'
: data.modelSize === '1.7B'
? 'Qwen TTS 1.7B'
: 'Qwen TTS 0.6B';

// Check if model needs downloading
try {
Expand All @@ -104,17 +112,17 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
console.error('Failed to check model status:', error);
}

const isQwen = engine === 'qwen';
const hasModelSizes = engine === 'qwen' || engine === 'tada';
const effectsChain = options.getEffectsChain?.();
// This now returns immediately with status="generating"
const result = await generation.mutateAsync({
profile_id: selectedProfileId,
text: data.text,
language: data.language,
seed: data.seed,
model_size: isQwen ? data.modelSize : undefined,
model_size: hasModelSizes ? data.modelSize : undefined,
engine,
instruct: isQwen ? data.instruct || undefined : undefined,
instruct: engine === 'qwen' ? data.instruct || undefined : undefined,
max_chunk_chars: maxChunkChars,
crossfade_ms: crossfadeMs,
normalize: normalizeAudio,
Expand Down
29 changes: 27 additions & 2 deletions backend/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def is_loaded(self) -> bool:
"luxtts": "LuxTTS",
"chatterbox": "Chatterbox TTS",
"chatterbox_turbo": "Chatterbox Turbo",
"tada": "TADA",
}


Expand Down Expand Up @@ -259,6 +260,24 @@ def _get_non_qwen_tts_configs() -> list[ModelConfig]:
needs_trim=True,
languages=["en"],
),
ModelConfig(
model_name="tada-1b",
display_name="TADA 1B (English)",
engine="tada",
hf_repo_id="HumeAI/tada-1b",
model_size="1B",
size_mb=4000,
languages=["en"],
),
ModelConfig(
model_name="tada-3b-ml",
display_name="TADA 3B Multilingual",
engine="tada",
hf_repo_id="HumeAI/tada-3b-ml",
model_size="3B",
size_mb=8000,
languages=["en", "ar", "zh", "de", "es", "fr", "it", "ja", "pl", "pt"],
),
]


Expand Down Expand Up @@ -339,10 +358,12 @@ def engine_has_model_sizes(engine: str) -> bool:


async def load_engine_model(engine: str, model_size: str = "default") -> None:
    """Load a model for the given engine, handling engines with multiple model sizes.

    Args:
        engine: Engine identifier, resolved to a backend via
            ``get_tts_backend_for_engine``.
        model_size: Size variant to load. Only forwarded for the engines
            that are branched on below ("qwen" and "tada"); every other
            engine loads its model with no size argument.
    """
    backend = get_tts_backend_for_engine(engine)
    if engine == "qwen":
        await backend.load_model_async(model_size)
    elif engine == "tada":
        # NOTE(review): Qwen is loaded via load_model_async while TADA uses
        # load_model — confirm the TADA backend's load_model is awaitable.
        await backend.load_model(model_size)
    else:
        await backend.load_model()

Expand All @@ -358,7 +379,7 @@ async def ensure_model_cached_or_raise(engine: str, model_size: str = "default")
cfg = c
break

if engine == "qwen":
if engine in ("qwen", "tada"):
if not backend._is_model_cached(model_size):
raise HTTPException(
status_code=400,
Expand Down Expand Up @@ -490,6 +511,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend:
from .chatterbox_turbo_backend import ChatterboxTurboTTSBackend

backend = ChatterboxTurboTTSBackend()
elif engine == "tada":
from .hume_backend import HumeTadaBackend

backend = HumeTadaBackend()
else:
raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}")

Expand Down
Loading