From ae44a48227df42dee450e0ded4248674e3dee79e Mon Sep 17 00:00:00 2001 From: Anshuman Suri Date: Tue, 5 Aug 2025 14:11:05 -0400 Subject: [PATCH 01/14] Typing, minor README edits, gitignore --- .gitignore | 1 + README.md | 14 +------------- kittentts/.gitignore | 1 + kittentts/get_model.py | 15 ++++++++------- 4 files changed, 11 insertions(+), 20 deletions(-) create mode 100644 .gitignore create mode 100644 kittentts/.gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aace59f --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.egg-info/ \ No newline at end of file diff --git a/README.md b/README.md index 81536da..5812ba2 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million [Join our discord](https://discord.gg/upcyF5s6) - ## ✨ Features - **Ultra-lightweight**: Model size less than 25MB @@ -14,8 +13,6 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million - **High-quality voices**: Several premium voice options available - **Fast inference**: Optimized for real-time speech synthesis - - ## 🚀 Quick Start ### Installation @@ -24,9 +21,7 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl ``` - - - ### Basic Usage +### Basic Usage ``` from kittentts import KittenTTS @@ -42,20 +37,13 @@ sf.write('output.wav', audio, 24000) ``` - - - - ## 💻 System Requirements Works literally everywhere - - ## Checklist - [x] Release a preview model - [ ] Release the fully trained model weights - [ ] Release mobile SDK - [ ] Release web version - diff --git a/kittentts/.gitignore b/kittentts/.gitignore new file mode 100644 index 0000000..763624e --- /dev/null +++ b/kittentts/.gitignore @@ -0,0 +1 @@ +__pycache__/* \ No newline at end of file diff --git a/kittentts/get_model.py b/kittentts/get_model.py index 
f91c28c..af2febf 100644 --- a/kittentts/get_model.py +++ b/kittentts/get_model.py @@ -1,5 +1,6 @@ import json import os +import numpy as np from huggingface_hub import hf_hub_download from .onnx_model import KittenTTS_1_Onnx @@ -22,8 +23,8 @@ def __init__(self, model_name="KittenML/kitten-tts-nano-0.1", cache_dir=None): repo_id = model_name self.model = download_from_huggingface(repo_id=repo_id, cache_dir=cache_dir) - - def generate(self, text, voice="expr-voice-5-m", speed=1.0): + + def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray: """Generate audio from text. Args: @@ -35,8 +36,8 @@ def generate(self, text, voice="expr-voice-5-m", speed=1.0): Audio data as numpy array """ return self.model.generate(text, voice=voice, speed=speed) - - def generate_to_file(self, text, output_path, voice="expr-voice-5-m", speed=1.0, sample_rate=24000): + + def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice-5-m", speed: float = 1.0, sample_rate: int = 24000): """Generate audio from text and save to file. Args: @@ -46,7 +47,7 @@ def generate_to_file(self, text, output_path, voice="expr-voice-5-m", speed=1.0, speed: Speech speed (1.0 = normal) sample_rate: Audio sample rate """ - return self.model.generate_to_file(text, output_path, voice=voice, speed=speed, sample_rate=sample_rate) + self.model.generate_to_file(text, output_path, voice=voice, speed=speed, sample_rate=sample_rate) @property def available_voices(self): @@ -54,7 +55,7 @@ def available_voices(self): return self.model.available_voices -def download_from_huggingface(repo_id="KittenML/kitten-tts-nano-0.1", cache_dir=None): +def download_from_huggingface(repo_id: str="KittenML/kitten-tts-nano-0.1", cache_dir=None) -> KittenTTS_1_Onnx: """Download model files from Hugging Face repository. 
Args: @@ -97,6 +98,6 @@ def download_from_huggingface(repo_id="KittenML/kitten-tts-nano-0.1", cache_dir= return model -def get_model(repo_id="KittenML/kitten-tts-nano-0.1", cache_dir=None): +def get_model(repo_id: str="KittenML/kitten-tts-nano-0.1", cache_dir=None) -> KittenTTS: """Get a KittenTTS model (legacy function for backward compatibility).""" return KittenTTS(repo_id, cache_dir) From 5f9fe405ad6ad27773229f0addaedaabe94a65ba Mon Sep 17 00:00:00 2001 From: Anshuman Suri Date: Tue, 5 Aug 2025 14:39:24 -0400 Subject: [PATCH 02/14] Minor edits --- README.md | 2 +- kittentts/get_model.py | 2 ++ kittentts/onnx_model.py | 18 ++++++------------ 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 5812ba2..ac37b0a 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentt from kittentts import KittenTTS m = KittenTTS("KittenML/kitten-tts-nano-0.1") -audio = m.generate("This high quality TTS model works without a GPU", voice='expr-voice-2-f' ) +audio = m.generate("This high quality TTS model works without a GPU", voice='expr-voice-2-f') # available_voices : [ 'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f', 'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f' ] diff --git a/kittentts/get_model.py b/kittentts/get_model.py index af2febf..ed8354a 100644 --- a/kittentts/get_model.py +++ b/kittentts/get_model.py @@ -35,6 +35,8 @@ def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) Returns: Audio data as numpy array """ + if not text: + raise ValueError("Input text cannot be empty.") return self.model.generate(text, voice=voice, speed=speed) def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice-5-m", speed: float = 1.0, sample_rate: int = 24000): diff --git a/kittentts/onnx_model.py b/kittentts/onnx_model.py index e93c463..904484f 100644 --- 
a/kittentts/onnx_model.py +++ b/kittentts/onnx_model.py @@ -5,7 +5,7 @@ import onnxruntime as ort -def basic_english_tokenize(text): +def basic_english_tokenize(text: str) -> list: """Basic English tokenizer that splits on whitespace and punctuation.""" import re tokens = re.findall(r"\w+|[^\w\s]", text) @@ -27,14 +27,9 @@ def __init__(self, dummy=None): self.word_index_dictionary = dicts - def __call__(self, text): - indexes = [] - for char in text: - try: - indexes.append(self.word_index_dictionary[char]) - except KeyError: - pass - return indexes + def __call__(self, text: str) -> list: + dicts = self.word_index_dictionary + return [dicts[char] for char in text if char in dicts] class KittenTTS_1_Onnx: @@ -48,7 +43,6 @@ def __init__(self, model_path="kitten_tts_nano_preview.onnx", voices_path="voice self.model_path = model_path self.voices = np.load(voices_path) self.session = ort.InferenceSession(model_path) - self.phonemizer = phonemizer.backend.EspeakBackend( language="en-us", preserve_punctuation=True, with_stress=True ) @@ -124,10 +118,10 @@ def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice # Example usage if __name__ == "__main__": - tts = KittenTTS() + tts = KittenTTS_1_Onnx() text = """ It begins with an "Ugh!" Another mysterious stain appears on a favorite shirt. Every trick has been tried, but the stain persists. 
""" - tts.generate_to_file(text, "inference_output25.wav", voice="expr-voice-5-m") \ No newline at end of file + tts.generate_to_file(text, "inference_output25.wav", voice="expr-voice-5-m") From 3883bdf80d9e9e4bdf0d1d4707fa68d995d41c56 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Aug 2025 13:22:03 +0300 Subject: [PATCH 03/14] Trim generated audio based on edge silence --- kittentts/onnx_model.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kittentts/onnx_model.py b/kittentts/onnx_model.py index e93c463..b9c16c4 100644 --- a/kittentts/onnx_model.py +++ b/kittentts/onnx_model.py @@ -100,10 +100,14 @@ def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) onnx_inputs = self._prepare_inputs(text, voice, speed) outputs = self.session.run(None, onnx_inputs) - - # Trim audio - audio = outputs[0][5000:-10000] + audio = outputs[0] # shape (n,) + # Trim edge silence from audio + non_silent = np.abs(audio) >= 0.01 + if np.any(non_silent): + indices = np.where(non_silent)[0] + start, end = indices[0], indices[-1] + audio = audio[start : end + 1] return audio def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice-5-m", From 8e7213027eef7b65f7aa238498e561f6ae648a28 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Aug 2025 13:38:46 +0300 Subject: [PATCH 04/14] Remove duplicate packaging files; use Hatchling as packaging backend --- MANIFEST.in | 9 --------- pyproject.toml | 33 +++++++++++++++++++-------------- requirements.txt | 8 -------- setup.py | 46 ---------------------------------------------- 4 files changed, 19 insertions(+), 77 deletions(-) delete mode 100644 MANIFEST.in delete mode 100644 requirements.txt delete mode 100644 setup.py diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 953bb15..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,9 +0,0 @@ -include README.md -include LICENSE -include requirements.txt -recursive-include kittentts *.py 
-recursive-include kittentts *.json -recursive-include kittentts *.txt -recursive-include kittentts *.onnx -global-exclude __pycache__ -global-exclude *.py[co] diff --git a/pyproject.toml b/pyproject.toml index c2d1e5c..55ef4df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,41 +1,46 @@ [build-system] -requires = ["setuptools>=45", "wheel"] -build-backend = "setuptools.build_meta" +requires = ["hatchling"] +build-backend = "hatchling.build" [project] name = "kittentts" -version = "0.1.0" description = "Ultra-lightweight text-to-speech model with just 15 million parameters" readme = "README.md" requires-python = ">=3.8" -license = {text = "Apache 2.0"} +license = "Apache-2.0" authors = [ {name = "KittenML"} ] keywords = ["text-to-speech", "tts", "speech-synthesis", "neural-networks", "onnx"] classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Multimedia :: Sound/Audio :: Speech", "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ - "num2words", - "spacy", "espeakng_loader", + "huggingface_hub", "misaki[en]>=0.9.4", + "num2words", + "numpy", "onnxruntime", "soundfile", - "numpy", - "huggingface_hub", + "spacy", ] +dynamic = ["version"] [project.urls] Homepage = "https://github.com/kittenml/kittentts" Repository = "https://github.com/kittenml/kittentts" Issues = "https://github.com/kittenml/kittentts/issues" -[tool.setuptools.packages.find] -where = ["."] -include = ["kittentts*"] - -[tool.setuptools.package-data] -kittentts = ["*.json", "*.txt", "*.onnx"] +[tool.hatch.version] +path = "kittentts/__init__.py" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 
37bfbb3..0000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -num2words -spacy -espeakng_loader -misaki[en]>=0.9.4 -onnxruntime -soundfile -numpy -huggingface_hub diff --git a/setup.py b/setup.py deleted file mode 100644 index d0ac187..0000000 --- a/setup.py +++ /dev/null @@ -1,46 +0,0 @@ -from setuptools import setup, find_packages - -with open("README.md", "r", encoding="utf-8") as fh: - long_description = fh.read() - -setup( - name="kittentts", - version="0.1.0", - author="KittenML", - author_email="", - description="Ultra-lightweight text-to-speech model with just 15 million parameters", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/kittenml/kittentts", - packages=find_packages(), - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Topic :: Multimedia :: Sound/Audio :: Speech", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], - python_requires=">=3.8", - install_requires=[ - "num2words", - "spacy", - "espeakng_loader", - "misaki[en]>=0.9.4", - "onnxruntime", - "soundfile", - "numpy", - "huggingface_hub", - ], - keywords="text-to-speech, tts, speech-synthesis, neural-networks, onnx", - project_urls={ - "Bug Reports": "https://github.com/kittenml/kittentts/issues", - "Source": "https://github.com/kittenml/kittentts", - }, -) From 03853c70da170fa727ace65215af369a6f504030 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Aug 2025 13:45:43 +0300 Subject: [PATCH 05/14] Remove unnecessary misaki dependency * Remove the `misaki` dependency, but directly depend on `phonemizer-fork` instead. 
* Do the side-effect phonemizer initialization call by hand --- kittentts/onnx_model.py | 6 +++++- pyproject.toml | 10 +++++----- requirements.txt | 10 +++++----- setup.py | 10 +++++----- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/kittentts/onnx_model.py b/kittentts/onnx_model.py index e93c463..a53c42d 100644 --- a/kittentts/onnx_model.py +++ b/kittentts/onnx_model.py @@ -1,8 +1,12 @@ -from misaki import en, espeak import numpy as np import phonemizer import soundfile as sf import onnxruntime as ort +import espeakng_loader +from phonemizer.backend.espeak.wrapper import EspeakWrapper + +EspeakWrapper.set_library(espeakng_loader.get_library_path()) +EspeakWrapper.set_data_path(espeakng_loader.get_data_path()) def basic_english_tokenize(text): diff --git a/pyproject.toml b/pyproject.toml index c2d1e5c..246e83b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,14 +18,14 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ - "num2words", - "spacy", "espeakng_loader", - "misaki[en]>=0.9.4", + "huggingface_hub", + "num2words", + "numpy", "onnxruntime", + "phonemizer-fork~=3.3.2", "soundfile", - "numpy", - "huggingface_hub", + "spacy", ] [project.urls] diff --git a/requirements.txt b/requirements.txt index 37bfbb3..5c68793 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -num2words -spacy espeakng_loader -misaki[en]>=0.9.4 +huggingface_hub +num2words +numpy onnxruntime +phonemizer-fork~=3.3.2 soundfile -numpy -huggingface_hub +spacy diff --git a/setup.py b/setup.py index d0ac187..9259fa3 100644 --- a/setup.py +++ b/setup.py @@ -29,14 +29,14 @@ ], python_requires=">=3.8", install_requires=[ - "num2words", - "spacy", "espeakng_loader", - "misaki[en]>=0.9.4", + "huggingface_hub", + "num2words", + "numpy", "onnxruntime", + "phonemizer-fork~=3.3.2", "soundfile", - "numpy", - "huggingface_hub", + "spacy", ], keywords="text-to-speech, tts, speech-synthesis, neural-networks, onnx", 
project_urls={ From 0d7d96e0e2a17374ef425a3a268ef0254ecbe27e Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Wed, 6 Aug 2025 13:23:10 +0200 Subject: [PATCH 06/14] syntax highlighting --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 81536da..e9b3a32 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentt ### Basic Usage -``` +```python from kittentts import KittenTTS m = KittenTTS("KittenML/kitten-tts-nano-0.1") @@ -39,7 +39,6 @@ audio = m.generate("This high quality TTS model works without a GPU", voice='exp # Save the audio import soundfile as sf sf.write('output.wav', audio, 24000) - ``` From 0b4ad69ee103974f46e906e4ade9e6891ff921ba Mon Sep 17 00:00:00 2001 From: Kirby Rs Date: Sat, 8 Nov 2025 16:44:38 +0100 Subject: [PATCH 07/14] Add CLI binary interface for Kitten TTS - Add executable kitten-tts wrapper script - Add kittentts/cli.py with full command-line interface - Configure console script entry point in pyproject.toml - Implement audio fade-out with customizable duration (default: 0.2s) - Add automatic dots suffix to prevent audio cutoff - Support all available voices, speed control, and audio formats - Add joblib dependency for proper package installation - Include comprehensive help documentation and examples Features: - Text-to-speech synthesis via command line - Multiple voice options (expr-voice-2/m/f through expr-voice-5/m/f) - Adjustable speech speed and fade-out duration - Audio file output (WAV, FLAC, OGG) or direct playback - Automatic text preprocessing to prevent abrupt cutoffs --- kitten-tts | 19 ++++ kittentts/cli.py | 241 +++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 3 + 3 files changed, 263 insertions(+) create mode 100755 kitten-tts create mode 100755 kittentts/cli.py diff --git a/kitten-tts b/kitten-tts new file mode 100755 index 0000000..f7f49e0 --- /dev/null +++ 
b/kitten-tts @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +""" +Kitten TTS Binary - Text-to-Speech Command Line Tool +This is a wrapper script for the kittentts/cli.py +""" + +import sys +import os + +# Get the directory where this script is located +script_dir = os.path.dirname(os.path.abspath(__file__)) + +# Import and run the CLI +sys.path.insert(0, script_dir) + +from kittentts.cli import main + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/kittentts/cli.py b/kittentts/cli.py new file mode 100755 index 0000000..093b8c3 --- /dev/null +++ b/kittentts/cli.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +""" +Kitten TTS CLI - Text-to-Speech Command Line Tool + +Usage: + python kittentts_cli.py "Hello world" # Speak text + python kittentts_cli.py "Hello world" --voice expr-voice-2-f # Use specific voice + python kittentts_cli.py "Hello world" --output output.wav # Save to file + python kittentts_cli.py --list-voices # List available voices + python kittentts_cli.py --help # Show help +""" + +import argparse +import sys +import os +import numpy as np +import soundfile as sf + +# Add the current directory to Python path so we can import kittentts +# We need to add the parent directory since we're inside kittentts/cli.py +current_dir = os.path.dirname(os.path.abspath(__file__)) +parent_dir = os.path.dirname(current_dir) +sys.path.insert(0, parent_dir) + +# Default fade out duration in seconds +DEFAULT_FADE_OUT = 0.2 + +try: + from kittentts import KittenTTS +except ImportError: + print("Error: KittenTTS not found. Please install it with:") + print("pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl") + sys.exit(1) + + +def apply_fade_out(audio_data, sample_rate=24000, fade_duration=DEFAULT_FADE_OUT): + """Apply exponential fade out to audio data. 
+ + Args: + audio_data: NumPy array of audio samples + sample_rate: Audio sample rate (default: 24000) + fade_duration: Fade out duration in seconds (default: {DEFAULT_FADE_OUT}s) + + Returns: + Audio data with fade out applied + """ + if len(audio_data) == 0: + return audio_data + + fade_samples = int(fade_duration * sample_rate) + if fade_samples >= len(audio_data): + fade_samples = len(audio_data) // 2 # Limit fade to half of audio if very short + + # Create exponential fade curve + fade_curve = np.linspace(1, 0, fade_samples) ** 2 # Quadratic fade for smoother curve + + # Apply fade to the end of audio + audio_with_fade = audio_data.copy() + audio_with_fade[-fade_samples:] *= fade_curve + + return audio_with_fade + + +def list_voices(model): + """List all available voices.""" + print("Available voices:") + for voice in model.available_voices: + print(f" - {voice}") + + +def play_audio_simple(audio_data, sample_rate=24000): + """Simple audio playback using system command.""" + # Save to temporary file and play with system command + temp_file = "temp_kitten_tts_output.wav" + try: + sf.write(temp_file, audio_data, sample_rate) + + # Try different system audio players based on OS + import subprocess + import platform + + system = platform.system() + if system == "Darwin": # macOS + subprocess.run(["afplay", temp_file], check=True) + elif system == "Linux": + # Try common Linux audio players + for player in ["aplay", "paplay", "mpg123", "mplayer"]: + try: + subprocess.run([player, temp_file], check=True) + break + except (subprocess.CalledProcessError, FileNotFoundError): + continue + else: + print(f"Audio saved to {temp_file} (no suitable audio player found)") + elif system == "Windows": + subprocess.run(["start", temp_file], shell=True, check=True) + else: + print(f"Audio saved to {temp_file} (unsupported OS for direct playback)") + + # Clean up temp file + try: + os.remove(temp_file) + except: + pass + + except Exception as e: + print(f"Error playing audio: {e}") 
+ print(f"Audio saved to {temp_file}") + + +def main(): + parser = argparse.ArgumentParser( + description="Kitten TTS - Ultra-lightweight text-to-speech synthesis", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s "Hello world" # Speak text + %(prog)s "Hello world" --voice expr-voice-2-f # Use specific voice + %(prog)s "Hello world" --output output.wav # Save to file + %(prog)s "Hello world" --speed 1.2 # Faster speech + %(prog)s "Hello world" --fade-out 0.1 # 0.1s fade out + %(prog)s --list-voices # List available voices + """ + ) + + parser.add_argument( + "text", + nargs="?", + help="Text to synthesize into speech" + ) + + parser.add_argument( + "--model", + default="KittenML/kitten-tts-nano-0.2", + help="Model name or path (default: KittenML/kitten-tts-nano-0.2)" + ) + + parser.add_argument( + "--voice", + default="expr-voice-2-m", + help="Voice to use (default: expr-voice-2-m)" + ) + + parser.add_argument( + "--speed", + type=float, + default=1.0, + help="Speech speed (1.0 = normal, higher = faster, lower = slower)" + ) + + parser.add_argument( + "--fade-out", + type=float, + default=DEFAULT_FADE_OUT, + help=f"Fade out duration in seconds (default: {DEFAULT_FADE_OUT}, use 0 to disable)" + ) + + parser.add_argument( + "--output", "-o", + help="Output file path (saves as WAV). If not specified, plays through speakers." 
+ ) + + parser.add_argument( + "--list-voices", + action="store_true", + help="List available voices and exit" + ) + + parser.add_argument( + "--format", + choices=["wav", "flac", "ogg"], + default="wav", + help="Audio format for output file (default: wav)" + ) + + args = parser.parse_args() + + # Handle --list-voices + if args.list_voices: + try: + model = KittenTTS(args.model) + list_voices(model) + return 0 + except Exception as e: + print(f"Error loading model: {e}", file=sys.stderr) + return 1 + + # Check if text is provided + if not args.text: + parser.print_help() + print("\nError: Text input is required", file=sys.stderr) + return 1 + + try: + # Initialize the model + print(f"Loading model: {args.model}...") + model = KittenTTS(args.model) + + # Validate voice + if args.voice not in model.available_voices: + print(f"Error: Voice '{args.voice}' not available.", file=sys.stderr) + print(f"Available voices: {', '.join(model.available_voices)}") + return 1 + + # Add dots at the end to prevent cutoff (simple fix) + if not args.text.endswith('...'): + args.text = args.text + '...' 
+ print(f"Added dots to prevent audio cutoff") + + # Generate audio + print(f"Generating speech using voice: {args.voice}...") + audio = model.generate(args.text, voice=args.voice, speed=args.speed) + + # Apply fade out if specified + if args.fade_out > 0: + print(f"Applying {args.fade_out}s fade out...") + audio = apply_fade_out(audio, sample_rate=24000, fade_duration=args.fade_out) + + if args.output: + # Save to file + print(f"Saving audio to: {args.output}") + sf.write(args.output, audio, 24000) + print("Done!") + else: + # Play through speakers + print("Playing audio...") + play_audio_simple(audio) + print("Done!") + + return 0 + + except KeyboardInterrupt: + print("\nInterrupted by user") + return 1 + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c2d1e5c..7a460b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,9 @@ Homepage = "https://github.com/kittenml/kittentts" Repository = "https://github.com/kittenml/kittentts" Issues = "https://github.com/kittenml/kittentts/issues" +[project.scripts] +kitten-tts = "kittentts.cli:main" + [tool.setuptools.packages.find] where = ["."] include = ["kittentts*"] From 6fa98b8da6b5ade50e2a1b7c93e23233d93a9e11 Mon Sep 17 00:00:00 2001 From: Kirby Rs Date: Sat, 8 Nov 2025 16:58:42 +0100 Subject: [PATCH 08/14] Add stdin pipeline support to CLI interface - Implemented pipeline/stdin reading functionality - Added support for piping text to kitten-tts command - Updated help documentation with pipeline usage examples - Enhanced error handling for stdin operations - Maintained backward compatibility with argument-based input Usage examples: echo "hello world" | ./kitten-tts cat text_file.txt | ./kitten-tts --output audio.wav --- kittentts/cli.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/kittentts/cli.py 
b/kittentts/cli.py index 093b8c3..008916e 100755 --- a/kittentts/cli.py +++ b/kittentts/cli.py @@ -119,6 +119,7 @@ def main(): %(prog)s "Hello world" --output output.wav # Save to file %(prog)s "Hello world" --speed 1.2 # Faster speech %(prog)s "Hello world" --fade-out 0.1 # 0.1s fade out + echo "Hello world" | %(prog)s # Read from stdin %(prog)s --list-voices # List available voices """ ) @@ -126,7 +127,7 @@ def main(): parser.add_argument( "text", nargs="?", - help="Text to synthesize into speech" + help="Text to synthesize into speech (if not provided, reads from stdin)" ) parser.add_argument( @@ -185,11 +186,26 @@ def main(): print(f"Error loading model: {e}", file=sys.stderr) return 1 - # Check if text is provided - if not args.text: - parser.print_help() - print("\nError: Text input is required", file=sys.stderr) - return 1 + # Get text from command line or stdin + if args.text: + text = args.text + else: + # Read from stdin + try: + if sys.stdin.isatty(): + # No pipe, interactive mode + parser.print_help() + print("\nError: Text input is required (provide as argument or pipe from stdin)", file=sys.stderr) + return 1 + else: + # Pipe detected, read from stdin + text = sys.stdin.read().strip() + if not text: + print("\nError: No text received from stdin", file=sys.stderr) + return 1 + except Exception as e: + print(f"Error reading from stdin: {e}", file=sys.stderr) + return 1 try: # Initialize the model @@ -203,13 +219,13 @@ def main(): return 1 # Add dots at the end to prevent cutoff (simple fix) - if not args.text.endswith('...'): - args.text = args.text + '...' + if not text.endswith('...'): + text = text + '...' 
print(f"Added dots to prevent audio cutoff") # Generate audio print(f"Generating speech using voice: {args.voice}...") - audio = model.generate(args.text, voice=args.voice, speed=args.speed) + audio = model.generate(text, voice=args.voice, speed=args.speed) # Apply fade out if specified if args.fade_out > 0: From bdf63530c0c93e7e44a9903673c7d3347b9cfdda Mon Sep 17 00:00:00 2001 From: Kirby Rs Date: Sat, 8 Nov 2025 17:08:14 +0100 Subject: [PATCH 09/14] Add CLI documentation to README.md - Added comprehensive CLI usage section - Documented installation and setup steps for CLI - Listed all CLI features and available voices - Added examples for both argument and stdin/pipeline usage - Organized Python API and CLI sections separately - Updated features list to highlight CLI functionality --- README.md | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0f46ec7..cf893fb 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ Email the creators with any questions : info@stellonlabs.com - **CPU-optimized**: Runs without GPU on any device - **High-quality voices**: Several premium voice options available - **Fast inference**: Optimized for real-time speech synthesis +- **Command-line interface**: Easy-to-use CLI with pipeline support @@ -30,9 +31,10 @@ pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentt - ### Basic Usage + ### Basic Usage -``` +#### Python API +```python from kittentts import KittenTTS m = KittenTTS("KittenML/kitten-tts-nano-0.2") @@ -43,9 +45,43 @@ audio = m.generate("This high quality TTS model works without a GPU", voice='exp # Save the audio import soundfile as sf sf.write('output.wav', audio, 24000) +``` + +#### Command Line Interface (CLI) + +```bash +# Clone the repository +git clone https://github.com/KittenML/KittenTTS.git +cd KittenTTS +# Create and activate virtual environment +python -m venv venv +source venv/bin/activate # On 
Windows: venv\Scripts\activate + +# Install dependencies +pip install -r requirements.txt + +# Use the CLI +./kitten-tts "Hello world" # Speak text +./kitten-tts "Hello world" --output hello.wav # Save to file +echo "Hello world" | ./kitten-tts # Read from stdin +./kitten-tts --list-voices # List available voices ``` +**CLI Features:** +- **Text input** via arguments or stdin (pipeline support) +- **8 different voices** (expr-voice-2/m/f through expr-voice-5/m/f) +- **Speed control** with `--speed` option +- **Audio fade-out** with `--fade-out` option (default: 0.2s) +- **Multiple formats** (WAV, FLAC, OGG) +- **Cross-platform audio playback** (macOS, Linux, Windows) + +**Available Voices:** +- `expr-voice-2-m` / `expr-voice-2-f` +- `expr-voice-3-m` / `expr-voice-3-f` +- `expr-voice-4-m` / `expr-voice-4-f` +- `expr-voice-5-m` / `expr-voice-5-f` + From 283d38cd95b6dd4252ccd99e8694de4b22f2da1f Mon Sep 17 00:00:00 2001 From: Kirby Rs Date: Sat, 8 Nov 2025 17:09:39 +0100 Subject: [PATCH 10/14] Improve CLI documentation with collapsible section - Organized CLI documentation in a collapsible details section - Added structured subsections (Installation, Basic Usage, Advanced Options) - Improved readability with better organization - Maintained all CLI features and examples - Made README more concise while preserving comprehensive information --- README.md | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index cf893fb..0400c55 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,11 @@ sf.write('output.wav', audio, 24000) #### Command Line Interface (CLI) +
+Click to expand CLI usage instructions + +##### Installation + ```bash # Clone the repository git clone https://github.com/KittenML/KittenTTS.git @@ -60,28 +65,51 @@ source venv/bin/activate # On Windows: venv\Scripts\activate # Install dependencies pip install -r requirements.txt +``` + +##### Basic Usage -# Use the CLI +```bash ./kitten-tts "Hello world" # Speak text ./kitten-tts "Hello world" --output hello.wav # Save to file echo "Hello world" | ./kitten-tts # Read from stdin ./kitten-tts --list-voices # List available voices ``` -**CLI Features:** +##### Advanced Options + +```bash +# With specific voice and fade-out +./kitten-tts "Hello world" --voice expr-voice-2-f --fade-out 0.3 + +# Adjust speech speed +./kitten-tts "Hello world" --speed 1.5 + +# Different audio formats +./kitten-tts "Hello world" --output audio.flac --format flac + +# Pipeline usage with files +cat text_file.txt | ./kitten-tts --output speech.wav +``` + +##### CLI Features + - **Text input** via arguments or stdin (pipeline support) - **8 different voices** (expr-voice-2/m/f through expr-voice-5/m/f) -- **Speed control** with `--speed` option -- **Audio fade-out** with `--fade-out` option (default: 0.2s) +- **Speed control** with `--speed` option (1.0 = normal) +- **Audio fade-out** with `--fade-out` option (default: 0.2s, use 0 to disable) - **Multiple formats** (WAV, FLAC, OGG) - **Cross-platform audio playback** (macOS, Linux, Windows) -**Available Voices:** +##### Available Voices + - `expr-voice-2-m` / `expr-voice-2-f` - `expr-voice-3-m` / `expr-voice-3-f` - `expr-voice-4-m` / `expr-voice-4-f` - `expr-voice-5-m` / `expr-voice-5-f` +
+ From ece72eff7302abca741c8c4a866d779d3c84b09d Mon Sep 17 00:00:00 2001 From: Kirby Rs Date: Sat, 8 Nov 2025 17:10:41 +0100 Subject: [PATCH 11/14] Simplify CLI section title - Changed 'Click to expand CLI usage instructions' to 'CLI Usage Instructions' - More concise and cleaner collapsible section header --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0400c55..e5ad399 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ sf.write('output.wav', audio, 24000) #### Command Line Interface (CLI)
-Click to expand CLI usage instructions +CLI Usage Instructions ##### Installation From 6b76cdeeb8f86745dc0017449bc0937a7c475dea Mon Sep 17 00:00:00 2001 From: Kirby Rs Date: Sat, 8 Nov 2025 11:12:57 -0500 Subject: [PATCH 12/14] Update checklist in README for CLI support --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e5ad399..1bf0e68 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,7 @@ Works literally everywhere ## Checklist - [x] Release a preview model +- [x] CLI support - [ ] Release the fully trained model weights - [ ] Release mobile SDK - [ ] Release web version From 2348ebece62e7b9264d5fd093abd1acdc983e82c Mon Sep 17 00:00:00 2001 From: Kirby Rs Date: Sat, 8 Nov 2025 21:47:15 +0100 Subject: [PATCH 13/14] Optimize CLI startup speed and audio playback system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major improvements: 🚀 CLI Performance: - Implement lazy imports for instant help display (0.04s vs 2.2s) - Add optimized entry point that only loads heavy dependencies when needed - Refactor CLI into separate entry and processing modules 🎵 Audio System Enhancements: - Add direct audio streaming with sounddevice library - Implement fallback to system temp directory for temp files - Fix permission issues when running from root directory - Add proper temp file cleanup and error handling 📦 Package Structure: - Update pyproject.toml to use optimized entry point - Make package imports lazy to improve startup performance - Add sounddevice as optional streaming dependency 💡 User Experience: - Help commands now appear instantly - Audio works from any directory including root - Graceful fallback when sounddevice unavailable - Maintains full CLI functionality with all existing features --- kitten-tts | 2 +- kittentts/__init__.py | 13 +++- kittentts/cli_entry.py | 90 ++++++++++++++++++++++++++++ kittentts/{cli.py => cli_process.py} | 64 ++++++++++++++++---- pyproject.toml | 7 ++- 
requirements.txt | 1 + 6 files changed, 161 insertions(+), 16 deletions(-) create mode 100644 kittentts/cli_entry.py rename kittentts/{cli.py => cli_process.py} (78%) diff --git a/kitten-tts b/kitten-tts index f7f49e0..21f42fd 100755 --- a/kitten-tts +++ b/kitten-tts @@ -13,7 +13,7 @@ script_dir = os.path.dirname(os.path.abspath(__file__)) # Import and run the CLI sys.path.insert(0, script_dir) -from kittentts.cli import main +from kittentts.cli_entry import main if __name__ == "__main__": sys.exit(main()) \ No newline at end of file diff --git a/kittentts/__init__.py b/kittentts/__init__.py index 9cf1a2d..6b46051 100644 --- a/kittentts/__init__.py +++ b/kittentts/__init__.py @@ -1,7 +1,16 @@ -from kittentts.get_model import get_model, KittenTTS - __version__ = "0.1.0" __author__ = "KittenML" __description__ = "Ultra-lightweight text-to-speech model with just 15 million parameters" +# Lazy imports - only load heavy dependencies when actually needed +def get_model(*args, **kwargs): + """Lazy import of get_model""" + from .get_model import get_model as _get_model + return _get_model(*args, **kwargs) + +def KittenTTS(*args, **kwargs): + """Lazy import of KittenTTS""" + from .get_model import KittenTTS as _KittenTTS + return _KittenTTS(*args, **kwargs) + __all__ = ["get_model", "KittenTTS"] diff --git a/kittentts/cli_entry.py b/kittentts/cli_entry.py new file mode 100644 index 0000000..3b73e10 --- /dev/null +++ b/kittentts/cli_entry.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +""" +Optimized entry point for KittenTTS with fast help and lazy imports +""" + +import argparse +import sys + +def show_help(): + """Show help message without importing heavy dependencies""" + parser = argparse.ArgumentParser( + description="Kitten TTS - Ultra-lightweight text-to-speech synthesis", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s "Hello world" # Speak text + %(prog)s "Hello world" --voice expr-voice-2-f # Use specific voice + %(prog)s 
"Hello world" --output output.wav # Save to file + %(prog)s "Hello world" --speed 1.2 # Faster speech + %(prog)s "Hello world" --fade-out 0.1 # 0.1s fade out + echo "Hello world" | %(prog)s # Read from stdin + %(prog)s --list-voices # List available voices + """ + ) + + parser.add_argument( + "text", + nargs="?", + help="Text to synthesize into speech (if not provided, reads from stdin)" + ) + + parser.add_argument( + "--model", + default="KittenML/kitten-tts-nano-0.2", + help="Model name or path (default: KittenML/kitten-tts-nano-0.2)" + ) + + parser.add_argument( + "--voice", + default="expr-voice-2-m", + help="Voice to use (default: expr-voice-2-m)" + ) + + parser.add_argument( + "--speed", + type=float, + default=1.0, + help="Speech speed (1.0 = normal, higher = faster, lower = slower)" + ) + + parser.add_argument( + "--fade-out", + type=float, + default=0.2, + help="Fade out duration in seconds (default: 0.2, use 0 to disable)" + ) + + parser.add_argument( + "--output", "-o", + help="Output file path (saves as WAV). If not specified, plays through speakers." 
+ ) + + parser.add_argument( + "--list-voices", + action="store_true", + help="List available voices and exit" + ) + + parser.add_argument( + "--format", + choices=["wav", "flac", "ogg"], + default="wav", + help="Audio format for output file (default: wav)" + ) + + parser.print_help() + +def main(): + """Optimized main entry point - fast help, full functionality when needed""" + # Check if user just wants help + if (len(sys.argv) == 1 and sys.stdin.isatty()) or (len(sys.argv) == 2 and sys.argv[1] in ['-h', '--help']): + show_help() + return 0 + + # For any other operation, run the full CLI + from .cli_process import main as cli_main + return cli_main() + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/kittentts/cli.py b/kittentts/cli_process.py similarity index 78% rename from kittentts/cli.py rename to kittentts/cli_process.py index 008916e..8372836 100755 --- a/kittentts/cli.py +++ b/kittentts/cli_process.py @@ -15,6 +15,7 @@ import os import numpy as np import soundfile as sf +import tempfile # Add the current directory to Python path so we can import kittentts # We need to add the parent directory since we're inside kittentts/cli.py @@ -25,12 +26,18 @@ # Default fade out duration in seconds DEFAULT_FADE_OUT = 0.2 -try: - from kittentts import KittenTTS -except ImportError: - print("Error: KittenTTS not found. Please install it with:") - print("pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl") - sys.exit(1) + +# Lazy import - only load KittenTTS when actually needed (not for help) +def get_kittentts(): + try: + # Import directly from get_model to avoid package-level imports + from kittentts.get_model import KittenTTS + return KittenTTS + except ImportError: + print("Error: KittenTTS not found. 
Please install it with:") + print( + "pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl") + sys.exit(1) def apply_fade_out(audio_data, sample_rate=24000, fade_duration=DEFAULT_FADE_OUT): @@ -69,10 +76,37 @@ def list_voices(model): def play_audio_simple(audio_data, sample_rate=24000): - """Simple audio playback using system command.""" - # Save to temporary file and play with system command - temp_file = "temp_kitten_tts_output.wav" + """Direct audio streaming without temporary files.""" try: + # Try to import sounddevice for direct audio streaming + import sounddevice as sd + import numpy as np + + # Convert audio data to proper format if needed + if audio_data.dtype != np.float32: + audio_data = audio_data.astype(np.float32) + + # Play audio directly + sd.play(audio_data, sample_rate) + sd.wait() # Wait for playback to complete + + except ImportError: + # Fallback to temp file method if sounddevice not available + print("sounddevice not available, falling back to temp file method...") + play_audio_with_tempfile(audio_data, sample_rate) + except Exception as e: + # Try alternative streaming method or fallback + print(f"Direct streaming failed: {e}") + play_audio_with_tempfile(audio_data, sample_rate) + + +def play_audio_with_tempfile(audio_data, sample_rate=24000): + """Fallback method using temporary file in system temp directory.""" + temp_file = None + try: + # Create temp file in system temp directory + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: + temp_file = tmp.name sf.write(temp_file, audio_data, sample_rate) # Try different system audio players based on OS @@ -99,13 +133,17 @@ def play_audio_simple(audio_data, sample_rate=24000): # Clean up temp file try: - os.remove(temp_file) + if temp_file and os.path.exists(temp_file): + os.remove(temp_file) except: pass except Exception as e: print(f"Error playing audio: {e}") - print(f"Audio saved to {temp_file}") + if temp_file and 
os.path.exists(temp_file): + print(f"Audio saved to {temp_file}") + else: + print("Audio could not be saved - temp file creation failed") def main(): @@ -179,6 +217,7 @@ def main(): # Handle --list-voices if args.list_voices: try: + KittenTTS = get_kittentts() model = KittenTTS(args.model) list_voices(model) return 0 @@ -210,6 +249,7 @@ def main(): try: # Initialize the model print(f"Loading model: {args.model}...") + KittenTTS = get_kittentts() model = KittenTTS(args.model) # Validate voice @@ -254,4 +294,4 @@ def main(): if __name__ == "__main__": - sys.exit(main()) \ No newline at end of file + sys.exit(main()) diff --git a/pyproject.toml b/pyproject.toml index 7a460b8..addfa97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,13 +28,18 @@ dependencies = [ "huggingface_hub", ] +[project.optional-dependencies] +streaming = [ + "sounddevice", +] + [project.urls] Homepage = "https://github.com/kittenml/kittentts" Repository = "https://github.com/kittenml/kittentts" Issues = "https://github.com/kittenml/kittentts/issues" [project.scripts] -kitten-tts = "kittentts.cli:main" +kitten-tts = "kittentts.cli_entry:main" [tool.setuptools.packages.find] where = ["."] diff --git a/requirements.txt b/requirements.txt index 37bfbb3..64ffd5b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ onnxruntime soundfile numpy huggingface_hub +sounddevice From fba13265e95fc627a87b1b5ed855930d8e6ba323 Mon Sep 17 00:00:00 2001 From: Kirby Rs Date: Sat, 8 Nov 2025 23:32:04 +0100 Subject: [PATCH 14/14] Add old_trim parameter to generate method for backward compatibility --- kittentts/cli_process.py | 10 ++++---- kittentts/get_model.py | 4 ++-- kittentts/onnx_model.py | 52 ++++++++++++++++++++++------------------ 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/kittentts/cli_process.py b/kittentts/cli_process.py index 8372836..142e3c6 100755 --- a/kittentts/cli_process.py +++ b/kittentts/cli_process.py @@ -24,7 +24,7 @@ sys.path.insert(0, parent_dir) # 
Default fade out duration in seconds -DEFAULT_FADE_OUT = 0.2 +DEFAULT_FADE_OUT = 0.3 # Lazy import - only load KittenTTS when actually needed (not for help) @@ -259,13 +259,13 @@ def main(): return 1 # Add dots at the end to prevent cutoff (simple fix) - if not text.endswith('...'): - text = text + '...' - print(f"Added dots to prevent audio cutoff") + # if not text.endswith('...'): + # text = text + '...' + # print(f"Added dots to prevent audio cutoff") # Generate audio print(f"Generating speech using voice: {args.voice}...") - audio = model.generate(text, voice=args.voice, speed=args.speed) + audio = model.generate(text, voice=args.voice, speed=args.speed, old_trim=True) # Apply fade out if specified if args.fade_out > 0: diff --git a/kittentts/get_model.py b/kittentts/get_model.py index ed8354a..7cd984f 100644 --- a/kittentts/get_model.py +++ b/kittentts/get_model.py @@ -24,7 +24,7 @@ def __init__(self, model_name="KittenML/kitten-tts-nano-0.1", cache_dir=None): self.model = download_from_huggingface(repo_id=repo_id, cache_dir=cache_dir) - def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray: + def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0, old_trim=False) -> np.ndarray: """Generate audio from text. Args: @@ -37,7 +37,7 @@ def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) """ if not text: raise ValueError("Input text cannot be empty.") - return self.model.generate(text, voice=voice, speed=speed) + return self.model.generate(text, voice=voice, speed=speed, old_trim=old_trim) def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice-5-m", speed: float = 1.0, sample_rate: int = 24000): """Generate audio from text and save to file. 
diff --git a/kittentts/onnx_model.py b/kittentts/onnx_model.py index f6ab787..7fe663a 100644 --- a/kittentts/onnx_model.py +++ b/kittentts/onnx_model.py @@ -24,7 +24,7 @@ def __init__(self, dummy=None): _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) - + dicts = {} for i in range(len(symbols)): dicts[symbols[i]] = i @@ -51,40 +51,40 @@ def __init__(self, model_path="kitten_tts_nano_preview.onnx", voices_path="voice language="en-us", preserve_punctuation=True, with_stress=True ) self.text_cleaner = TextCleaner() - + # Available voices self.available_voices = [ - 'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f', + 'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f', 'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f' ] - + def _prepare_inputs(self, text: str, voice: str, speed: float = 1.0) -> dict: """Prepare ONNX model inputs from text and voice parameters.""" if voice not in self.available_voices: raise ValueError(f"Voice '{voice}' not available. Choose from: {self.available_voices}") - + # Phonemize the input text phonemes_list = self.phonemizer.phonemize([text]) - + # Process phonemes to get token IDs phonemes = basic_english_tokenize(phonemes_list[0]) phonemes = ' '.join(phonemes) tokens = self.text_cleaner(phonemes) - + # Add start and end tokens tokens.insert(0, 0) tokens.append(0) - + input_ids = np.array([tokens], dtype=np.int64) ref_s = self.voices[voice] - + return { "input_ids": input_ids, "style": ref_s, "speed": np.array([speed], dtype=np.float32), } - - def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray: + + def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0, old_trim=False) -> np.ndarray: """Synthesize speech from text. 
Args: @@ -96,20 +96,26 @@ def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) Audio data as numpy array """ onnx_inputs = self._prepare_inputs(text, voice, speed) - + outputs = self.session.run(None, onnx_inputs) - audio = outputs[0] # shape (n,) - # Trim edge silence from audio - non_silent = np.abs(audio) >= 0.01 - if np.any(non_silent): - indices = np.where(non_silent)[0] - start, end = indices[0], indices[-1] - audio = audio[start : end + 1] + if old_trim: + return outputs[0][5000:-10000] + else: + # new trim approach, PR link: + # https://github.com/KittenML/KittenTTS/pull/22/commits/3883bdf80d9e9e4bdf0d1d4707fa68d995d41c56 + audio = outputs[0] # shape (n,) + # Trim edge silence from audio + non_silent = np.abs(audio) >= 0.1 + if np.any(non_silent): + indices = np.where(non_silent)[0] + start, end = indices[0], indices[-1] + audio = audio[start: end + 1] + return audio - - def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice-5-m", - speed: float = 1.0, sample_rate: int = 24000) -> None: + + def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice-5-m", + speed: float = 1.0, sample_rate: int = 24000) -> None: """Synthesize speech and save to file. Args: @@ -127,7 +133,7 @@ def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice # Example usage if __name__ == "__main__": tts = KittenTTS_1_Onnx() - + text = """ It begins with an "Ugh!" Another mysterious stain appears on a favorite shirt. Every trick has been tried, but the stain persists. """