From 3584283d8410b4cf2aed80b6248c06e032106e52 Mon Sep 17 00:00:00 2001 From: James Pine Date: Thu, 19 Mar 2026 10:09:48 -0700 Subject: [PATCH 1/4] feat: Kokoro 82M TTS engine + voice profile type system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Kokoro-82M as a new TTS engine — 82M params, CPU realtime, 8 languages, Apache 2.0. Unlike cloning engines, Kokoro uses pre-built voice styles, which required a new profile type system to support non-cloning engines cleanly. Kokoro engine: - New kokoro_backend.py implementing TTSBackend protocol - 50 built-in voices across en/es/fr/hi/it/pt/ja/zh - KPipeline API with language-aware G2P routing via misaki - PyInstaller bundling for misaki, language_tags, espeakng_loader, en_core_web_sm Voice profile type system: - New voice_type column: 'cloned' | 'preset' | 'designed' (future) - Preset profiles store engine + voice ID instead of audio samples - default_engine field on profiles — auto-selects engine on profile pick - Create Voice dialog: toggle between 'Clone from audio' and 'Built-in voice' - Edit dialog shows preset voice info instead of sample list for preset profiles - Engine selector locks to preset engine when preset profile is selected - Profile grid filters by engine — shows Kokoro voices when Kokoro selected - Custom empty state when no preset profiles exist for selected engine Bug fixes: - Fix relative audio paths in DB causing 404s in production builds - config.set_data_dir() now resolves to absolute paths - Startup migration converts existing relative paths to absolute Also updates PROJECT_STATUS.md and tts-engines.mdx developer guide. --- .../Generation/EngineModelSelector.tsx | 70 ++- .../Generation/FloatingGenerateBox.tsx | 64 ++- .../components/Generation/GenerationForm.tsx | 2 +- .../ServerSettings/ModelManagement.tsx | 5 +- .../components/VoiceProfiles/ProfileCard.tsx | 10 + .../components/VoiceProfiles/ProfileForm.tsx | 481 +++++++++++++----- .../components/VoiceProfiles/ProfileList.tsx | 35 +- app/src/lib/api/client.ts | 11 + app/src/lib/api/types.ts | 21 +- app/src/lib/constants/languages.ts | 2 + app/src/lib/hooks/useGenerationForm.ts | 14 +- app/src/stores/uiStore.ts | 7 + backend/backends/__init__.py | 13 + backend/backends/kokoro_backend.py | 288 +++++++++++ backend/build_binary.py | 38 ++ backend/config.py | 6 +- backend/database/migrations.py | 56 ++ backend/database/models.py | 16 +- backend/models.py | 12 +- backend/requirements.txt | 7 + backend/routes/profiles.py | 97 ++++ backend/services/profiles.py | 54 +- backend/voicebox-server.spec | 17 +- docs/content/docs/developer/tts-engines.mdx | 28 + docs/notes/PROJECT_STATUS.md | 239 ++++----- tauri/src-tauri/Cargo.lock | 2 +- 26 files changed, 1303 insertions(+), 292 deletions(-) create mode 100644 backend/backends/kokoro_backend.py diff --git a/app/src/components/Generation/EngineModelSelector.tsx b/app/src/components/Generation/EngineModelSelector.tsx index 4382d3f7..773aa089 100644 --- a/app/src/components/Generation/EngineModelSelector.tsx +++ b/app/src/components/Generation/EngineModelSelector.tsx @@ -7,6 +7,7 @@ import { SelectTrigger, SelectValue, } from '@/components/ui/select'; +import type { VoiceProfileResponse } from '@/lib/api/types'; import { getLanguageOptionsForEngine } from '@/lib/constants/languages'; import type { GenerationFormValues } from '@/lib/hooks/useGenerationForm'; @@ -15,13 +16,14 @@ import type { GenerationFormValues } from '@/lib/hooks/useGenerationForm'; * Adding a new engine means adding one entry here. */ const ENGINE_OPTIONS = [ - { value: 'qwen:1.7B', label: 'Qwen3-TTS 1.7B' }, - { value: 'qwen:0.6B', label: 'Qwen3-TTS 0.6B' }, - { value: 'luxtts', label: 'LuxTTS' }, - { value: 'chatterbox', label: 'Chatterbox' }, - { value: 'chatterbox_turbo', label: 'Chatterbox Turbo' }, - { value: 'tada:1B', label: 'TADA 1B' }, - { value: 'tada:3B', label: 'TADA 3B Multilingual' }, + { value: 'qwen:1.7B', label: 'Qwen3-TTS 1.7B', engine: 'qwen' }, + { value: 'qwen:0.6B', label: 'Qwen3-TTS 0.6B', engine: 'qwen' }, + { value: 'luxtts', label: 'LuxTTS', engine: 'luxtts' }, + { value: 'chatterbox', label: 'Chatterbox', engine: 'chatterbox' }, + { value: 'chatterbox_turbo', label: 'Chatterbox Turbo', engine: 'chatterbox_turbo' }, + { value: 'tada:1B', label: 'TADA 1B', engine: 'tada' }, + { value: 'tada:3B', label: 'TADA 3B Multilingual', engine: 'tada' }, + { value: 'kokoro', label: 'Kokoro 82M', engine: 'kokoro' }, ] as const; const ENGINE_DESCRIPTIONS: Record = { @@ -30,11 +32,38 @@ const ENGINE_DESCRIPTIONS: Record = { chatterbox: '23 languages, incl. Hebrew', chatterbox_turbo: 'English, [laugh] [cough] tags', tada: 'HumeAI, 700s+ coherent audio', + kokoro: '82M params, CPU realtime, 8 langs', }; /** Engines that only support English and should force language to 'en' on select. */ const ENGLISH_ONLY_ENGINES = new Set(['luxtts', 'chatterbox_turbo']); +/** Engines that support cloned (reference audio) profiles. */ +const CLONING_ENGINES = new Set(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']); + +/** Engines that are preset-only (no cloning). */ +const PRESET_ONLY_ENGINES = new Set(['kokoro']); + +/** + * Get which engine options are available for the selected profile. + * + * - Preset profiles: locked to their preset engine + * - All other profiles: all engines available + */ +function getAvailableOptions(selectedProfile?: VoiceProfileResponse | null) { + if (!selectedProfile) return ENGINE_OPTIONS; + + const voiceType = selectedProfile.voice_type || 'cloned'; + + if (voiceType === 'preset') { + // Preset profiles lock to their specific engine + const presetEngine = selectedProfile.preset_engine; + return ENGINE_OPTIONS.filter((opt) => opt.engine === presetEngine); + } + + return ENGINE_OPTIONS; +} + function getSelectValue(engine: string, modelSize?: string): string { if (engine === 'qwen') return `qwen:${modelSize || '1.7B'}`; if (engine === 'tada') return `tada:${modelSize || '1B'}`; @@ -85,12 +114,21 @@ function handleEngineChange(form: UseFormReturn, value: st interface EngineModelSelectorProps { form: UseFormReturn; compact?: boolean; + selectedProfile?: VoiceProfileResponse | null; } -export function EngineModelSelector({ form, compact }: EngineModelSelectorProps) { +export function EngineModelSelector({ form, compact, selectedProfile }: EngineModelSelectorProps) { const engine = form.watch('engine') || 'qwen'; const modelSize = form.watch('modelSize'); const selectValue = getSelectValue(engine, modelSize); + const availableOptions = getAvailableOptions(selectedProfile); + + // If current engine isn't in available options, auto-switch to first available + const currentEngineAvailable = availableOptions.some((opt) => opt.value === selectValue); + if (!currentEngineAvailable && availableOptions.length > 0) { + // Defer to avoid setting state during render + setTimeout(() => handleEngineChange(form, availableOptions[0].value), 0); + } const itemClass = compact ? 'text-xs text-muted-foreground' : undefined; const triggerClass = compact @@ -105,7 +143,7 @@ export function EngineModelSelector({ form, compact }: EngineModelSelectorProps) - {ENGINE_OPTIONS.map((opt) => ( + {availableOptions.map((opt) => ( {opt.label} @@ -119,3 +157,17 @@ export function EngineModelSelector({ form, compact }: EngineModelSelectorProps) export function getEngineDescription(engine: string): string { return ENGINE_DESCRIPTIONS[engine] ?? ''; } + +/** + * Check if a profile is compatible with the currently selected engine. + * Useful for UI hints. + */ +export function isProfileCompatibleWithEngine( + profile: VoiceProfileResponse, + engine: string, +): boolean { + const voiceType = profile.voice_type || 'cloned'; + if (voiceType === 'preset') return profile.preset_engine === engine; + if (voiceType === 'cloned') return CLONING_ENGINES.has(engine); + return !PRESET_ONLY_ENGINES.has(engine); // designed — future +} diff --git a/app/src/components/Generation/FloatingGenerateBox.tsx b/app/src/components/Generation/FloatingGenerateBox.tsx index 96e8f553..ae1cad26 100644 --- a/app/src/components/Generation/FloatingGenerateBox.tsx +++ b/app/src/components/Generation/FloatingGenerateBox.tsx @@ -36,6 +36,7 @@ export function FloatingGenerateBox({ }: FloatingGenerateBoxProps) { const selectedProfileId = useUIStore((state) => state.selectedProfileId); const setSelectedProfileId = useUIStore((state) => state.setSelectedProfileId); + const setSelectedEngine = useUIStore((state) => state.setSelectedEngine); const { data: selectedProfile } = useProfile(selectedProfileId || ''); const { data: profiles } = useProfiles(); const [isExpanded, setIsExpanded] = useState(false); @@ -67,7 +68,12 @@ export function FloatingGenerateBox({ } }, getEffectsChain: () => { - if (!selectedPresetId || !effectPresets) return undefined; + if (!selectedPresetId) return undefined; + // Profile's own effects chain (no matching preset) + if (selectedPresetId === '_profile') { + return selectedProfile?.effects_chain ?? undefined; + } + if (!effectPresets) return undefined; const preset = effectPresets.find((p) => p.id === selectedPresetId); return preset?.effects_chain; }, @@ -110,12 +116,56 @@ export function FloatingGenerateBox({ } }, [selectedProfileId, profiles, setSelectedProfileId]); - // Sync generation form language with selected profile's language + // Sync engine selection to global store so ProfileList can filter + const watchedEngine = form.watch('engine'); + useEffect(() => { + if (watchedEngine) { + setSelectedEngine(watchedEngine); + } + }, [watchedEngine, setSelectedEngine]); + + // Sync generation form language, engine, and effects with selected profile useEffect(() => { if (selectedProfile?.language) { form.setValue('language', selectedProfile.language as LanguageCode); } - }, [selectedProfile, form]); + // Auto-switch engine if profile has a default + if (selectedProfile?.default_engine) { + form.setValue( + 'engine', + selectedProfile.default_engine as + | 'qwen' + | 'luxtts' + | 'chatterbox' + | 'chatterbox_turbo' + | 'tada' + | 'kokoro', + ); + } + // Pre-fill effects from profile defaults + if ( + selectedProfile?.effects_chain && + selectedProfile.effects_chain.length > 0 && + effectPresets + ) { + // Try to match against a known preset + const profileChainJson = JSON.stringify(selectedProfile.effects_chain); + const matchingPreset = effectPresets.find( + (p) => JSON.stringify(p.effects_chain) === profileChainJson, + ); + if (matchingPreset) { + setSelectedPresetId(matchingPreset.id); + } else { + // No matching preset — use special value to pass profile chain directly + setSelectedPresetId('_profile'); + } + } else if ( + selectedProfile && + (!selectedProfile.effects_chain || selectedProfile.effects_chain.length === 0) + ) { + setSelectedPresetId(null); + } + }, [selectedProfile, effectPresets, form]); // Auto-resize textarea based on content (only when expanded) useEffect(() => { @@ -358,7 +408,7 @@ export function FloatingGenerateBox({ /> - + @@ -375,6 +425,12 @@ export function FloatingGenerateBox({ No effects + {selectedProfile?.effects_chain && + selectedProfile.effects_chain.length > 0 && ( + + Profile default + + )} {effectPresets?.map((preset) => ( {preset.name} diff --git a/app/src/components/Generation/GenerationForm.tsx b/app/src/components/Generation/GenerationForm.tsx index 225e8dfa..9f7a7cd7 100644 --- a/app/src/components/Generation/GenerationForm.tsx +++ b/app/src/components/Generation/GenerationForm.tsx @@ -118,7 +118,7 @@ export function GenerationForm() {
Model - + {getEngineDescription(form.watch('engine') || 'qwen')} diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx index c415306d..e7eda69f 100644 --- a/app/src/components/ServerSettings/ModelManagement.tsx +++ b/app/src/components/ServerSettings/ModelManagement.tsx @@ -66,6 +66,8 @@ const MODEL_DESCRIPTIONS: Record = { 'HumeAI TADA 1B — English speech-language model built on Llama 3.2 1B. Generates 700s+ of coherent audio with synchronized text-acoustic alignment.', 'tada-3b-ml': 'HumeAI TADA 3B Multilingual — built on Llama 3.2 3B. Supports 10 languages with high-fidelity voice cloning via text-acoustic dual alignment.', + kokoro: + 'Kokoro 82M by hexgrad. Tiny 82M-parameter TTS that runs at CPU realtime. Supports 8 languages with pre-built voice styles. Apache 2.0 licensed.', 'whisper-base': 'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.', 'whisper-small': @@ -396,7 +398,8 @@ export function ModelManagement() { m.model_name.startsWith('qwen-tts') || m.model_name.startsWith('luxtts') || m.model_name.startsWith('chatterbox') || - m.model_name.startsWith('tada'), + m.model_name.startsWith('tada') || + m.model_name.startsWith('kokoro'), ) ?? []; const whisperModels = modelStatus?.models.filter((m) => m.model_name.startsWith('whisper')) ?? []; diff --git a/app/src/components/VoiceProfiles/ProfileCard.tsx b/app/src/components/VoiceProfiles/ProfileCard.tsx index 3675b765..e2a9d4d4 100644 --- a/app/src/components/VoiceProfiles/ProfileCard.tsx +++ b/app/src/components/VoiceProfiles/ProfileCard.tsx @@ -97,6 +97,16 @@ export function ProfileCard({ profile }: ProfileCardProps) { {profile.language} + {profile.voice_type === 'preset' && ( + + {profile.preset_engine} + + )} + {profile.voice_type === 'designed' && ( + + designed + + )} {profile.effects_chain && profile.effects_chain.length > 0 && ( )} diff --git a/app/src/components/VoiceProfiles/ProfileForm.tsx b/app/src/components/VoiceProfiles/ProfileForm.tsx index 13edf6f4..d3f53eac 100644 --- a/app/src/components/VoiceProfiles/ProfileForm.tsx +++ b/app/src/components/VoiceProfiles/ProfileForm.tsx @@ -1,9 +1,11 @@ import { zodResolver } from '@hookform/resolvers/zod'; -import { Edit2, Mic, Monitor, Upload, X } from 'lucide-react'; +import { useQuery } from '@tanstack/react-query'; +import { Edit2, Mic, Monitor, Music, Upload, X } from 'lucide-react'; import { useEffect, useRef, useState } from 'react'; import { useForm } from 'react-hook-form'; import * as z from 'zod'; import { EffectsChainEditor } from '@/components/Effects/EffectsChainEditor'; +import { Badge } from '@/components/ui/badge'; import { Button } from '@/components/ui/button'; import { Dialog, @@ -15,6 +17,7 @@ import { import { Form, FormControl, + FormDescription, FormField, FormItem, FormLabel, @@ -32,7 +35,7 @@ import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs'; import { Textarea } from '@/components/ui/textarea'; import { useToast } from '@/components/ui/use-toast'; import { apiClient } from '@/lib/api/client'; -import type { EffectConfig } from '@/lib/api/types'; +import type { EffectConfig, PresetVoice, VoiceType } from '@/lib/api/types'; import { LANGUAGE_CODES, LANGUAGE_OPTIONS, type LanguageCode } from '@/lib/constants/languages'; import { useAudioPlayer } from '@/lib/hooks/useAudioPlayer'; import { useAudioRecording } from '@/lib/hooks/useAudioRecording'; @@ -120,16 +123,20 @@ export function ProfileForm() { const deleteAvatar = useDeleteAvatar(); const transcribe = useTranscription(); const { toast } = useToast(); + const [voiceSource, setVoiceSource] = useState<'clone' | 'builtin'>('clone'); const [sampleMode, setSampleMode] = useState<'upload' | 'record' | 'system'>('record'); const [audioDuration, setAudioDuration] = useState(null); const [isValidatingAudio, setIsValidatingAudio] = useState(false); const [avatarPreview, setAvatarPreview] = useState(null); + const [selectedPresetEngine, setSelectedPresetEngine] = useState('kokoro'); + const [selectedPresetVoiceId, setSelectedPresetVoiceId] = useState(''); const avatarInputRef = useRef(null); const { isPlaying, playPause, cleanup: cleanupAudio } = useAudioPlayer(); const isCreating = !editingProfileId; const serverUrl = useServerStore((state) => state.serverUrl); const [profileEffectsChain, setProfileEffectsChain] = useState([]); const [effectsDirty, setEffectsDirty] = useState(false); + const [defaultEngine, setDefaultEngine] = useState(''); const form = useForm({ resolver: zodResolver(profileSchema), @@ -239,6 +246,20 @@ export function ProfileForm() { }, }); + // Fetch available preset voices for the selected engine + const presetEngineToQuery = isCreating + ? selectedPresetEngine + : (editingProfile?.preset_engine ?? ''); + const { data: presetVoicesData } = useQuery({ + queryKey: ['presetVoices', presetEngineToQuery], + queryFn: () => apiClient.listPresetVoices(presetEngineToQuery), + enabled: + !!presetEngineToQuery && + ((voiceSource === 'builtin' && isCreating) || + (!isCreating && editingProfile?.voice_type === 'preset')), + }); + const presetVoices = presetVoicesData?.voices ?? []; + // Show recording errors useEffect(() => { if (recordingError) { @@ -287,6 +308,7 @@ export function ProfileForm() { }); setProfileEffectsChain(editingProfile.effects_chain ?? []); setEffectsDirty(false); + setDefaultEngine(editingProfile.default_engine ?? ''); } else if (profileFormDraft && open) { // Restore from draft when opening in create mode form.reset({ @@ -415,13 +437,14 @@ export function ProfileForm() { async function onSubmit(data: ProfileFormValues) { try { if (editingProfileId) { - // Editing: just update profile + // Editing: update profile await updateProfile.mutateAsync({ profileId: editingProfileId, data: { name: data.name, description: data.description, language: data.language, + default_engine: defaultEngine || undefined, }, }); @@ -464,8 +487,50 @@ export function ProfileForm() { title: 'Voice updated', description: `"${data.name}" has been updated successfully.`, }); + } else if (voiceSource === 'builtin') { + // Creating preset profile from built-in voice + if (!selectedPresetVoiceId) { + toast({ + title: 'No voice selected', + description: 'Please select a built-in voice.', + variant: 'destructive', + }); + return; + } + + const profile = await createProfile.mutateAsync({ + name: data.name, + description: data.description, + language: data.language, + voice_type: 'preset' as VoiceType, + preset_engine: selectedPresetEngine, + preset_voice_id: selectedPresetVoiceId, + default_engine: selectedPresetEngine, + }); + + // Handle avatar upload if provided + if (data.avatarFile) { + try { + await uploadAvatar.mutateAsync({ + profileId: profile.id, + file: data.avatarFile, + }); + } catch (avatarError) { + toast({ + title: 'Avatar upload failed', + description: + avatarError instanceof Error ? avatarError.message : 'Failed to upload avatar', + variant: 'destructive', + }); + } + } + + toast({ + title: 'Profile created', + description: `"${data.name}" has been created with a built-in voice.`, + }); } else { - // Creating: require sample file and reference text + // Creating cloned profile: require sample file and reference text const sampleFile = form.getValues('sampleFile'); const referenceText = form.getValues('referenceText'); @@ -528,6 +593,7 @@ export function ProfileForm() { name: data.name, description: data.description, language: data.language, + default_engine: defaultEngine || undefined, }); // Convert non-WAV uploads to WAV so the backend can always use soundfile. @@ -642,16 +708,16 @@ export function ProfileForm() { return ( - -
+ +
- {editingProfileId ? 'Edit Voice' : 'Clone voice'} + {editingProfileId ? 'Edit Voice' : 'Create Voice'} {editingProfileId ? 'Update your voice profile details and manage samples.' - : 'Create a new voice profile with an audio sample to clone the voice.'} + : 'Create a new voice profile from an audio sample or a built-in voice.'} {isCreating && profileFormDraft && (
@@ -682,143 +748,275 @@ export function ProfileForm() {
-
+
{/* Left column: Sample management */} -
+
{isCreating ? ( <> - { - const newMode = v as 'upload' | 'record' | 'system'; - // Cancel any active recordings when switching modes - if (isRecording && newMode !== 'record') { - cancelRecording(); - } - if (isSystemRecording && newMode !== 'system') { - cancelSystemRecording(); - } - setSampleMode(newMode); - }} - > - - - - Upload - - - - Record - - {platform.metadata.isTauri && isSystemAudioSupported && ( - - - System Audio - - )} - - - - ( - MAX_AUDIO_DURATION_SECONDS - } - fieldName={name} + {/* Voice source selector */} +
+
+ + +
+
+ + {voiceSource === 'builtin' ? ( +
+ + Choose a pre-built voice. These don't require an audio sample. + + + {/* Engine selector */} + + Engine + + + + {/* Voice picker */} + + Voice +
+ {presetVoices.map((voice: PresetVoice) => ( + + ))} +
+
+
+ ) : ( + <> + { + const newMode = v as 'upload' | 'record' | 'system'; + // Cancel any active recordings when switching modes + if (isRecording && newMode !== 'record') { + cancelRecording(); + } + if (isSystemRecording && newMode !== 'system') { + cancelSystemRecording(); + } + setSampleMode(newMode); + }} + > + + + + Upload + + + + Record + + {platform.metadata.isTauri && isSystemAudioSupported && ( + + + System Audio + + )} + + + + ( + MAX_AUDIO_DURATION_SECONDS + } + fieldName={name} + /> + )} + /> + + + + ( + + )} /> + + + {platform.metadata.isTauri && isSystemAudioSupported && ( + + ( + + )} + /> + )} - /> -
+
- ( - + name="referenceText" + render={({ field }) => ( + + Reference Text + +