diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aace59f --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.egg-info/ \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 953bb15..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,9 +0,0 @@ -include README.md -include LICENSE -include requirements.txt -recursive-include kittentts *.py -recursive-include kittentts *.json -recursive-include kittentts *.txt -recursive-include kittentts *.onnx -global-exclude __pycache__ -global-exclude *.py[co] diff --git a/README.md b/README.md index 0f46ec7..b38a95f 100644 --- a/README.md +++ b/README.md @@ -10,15 +10,13 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million Email the creators with any questions : info@stellonlabs.com - ## ✨ Features - **Ultra-lightweight**: Model size less than 25MB - **CPU-optimized**: Runs without GPU on any device - **High-quality voices**: Several premium voice options available - **Fast inference**: Optimized for real-time speech synthesis - - +- **Command-line interface**: Easy-to-use CLI with pipeline support ## 🚀 Quick Start @@ -28,38 +26,97 @@ Email the creators with any questions : info@stellonlabs.com pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl ``` +### Basic Usage - - ### Basic Usage - -``` +#### Python API +```python from kittentts import KittenTTS m = KittenTTS("KittenML/kitten-tts-nano-0.2") -audio = m.generate("This high quality TTS model works without a GPU", voice='expr-voice-2-f' ) +audio = m.generate("This high quality TTS model works without a GPU", voice='expr-voice-2-f') # available_voices : [ 'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f', 'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f' ] # Save the audio import soundfile as sf sf.write('output.wav', audio, 24000) +``` + +#### Command Line Interface (CLI) +
+CLI Usage Instructions + +##### Installation + +```bash +# Clone the repository +git clone https://github.com/KittenML/KittenTTS.git +cd KittenTTS + +# Create and activate virtual environment +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Install dependencies +pip install -r requirements.txt ``` +##### Basic Usage +```bash +./kitten-tts "Hello world" # Speak text +./kitten-tts "Hello world" --output hello.wav # Save to file +echo "Hello world" | ./kitten-tts # Read from stdin +./kitten-tts --list-voices # List available voices +``` +##### Advanced Options +```bash +# With specific voice and fade-out +./kitten-tts "Hello world" --voice expr-voice-2-f --fade-out 0.3 -## 💻 System Requirements +# Adjust speech speed +./kitten-tts "Hello world" --speed 1.5 -Works literally everywhere +# Different audio formats +./kitten-tts "Hello world" --output audio.flac --format flac + +# Pipeline usage with files +cat text_file.txt | ./kitten-tts --output speech.wav +``` + +##### CLI Features + +- **Text input** via arguments or stdin (pipeline support) +- **8 different voices** (expr-voice-2/m/f through expr-voice-5/m/f) +- **Speed control** with `--speed` option (1.0 = normal) +- **Audio fade-out** with `--fade-out` option (default: 0.2s, use 0 to disable) +- **Multiple formats** (WAV, FLAC, OGG) +- **Cross-platform audio playback** (macOS, Linux, Windows) + +##### Available Voices + +- `expr-voice-2-m` / `expr-voice-2-f` +- `expr-voice-3-m` / `expr-voice-3-f` +- `expr-voice-4-m` / `expr-voice-4-f` +- `expr-voice-5-m` / `expr-voice-5-f` + +
# Lazy imports - only load heavy dependencies when actually needed
def _load_public_api():
    """Import the implementation module and rebind the package's public names.

    Loading the ``get_model`` submodule makes the import system bind the
    package attribute ``get_model`` to the *module* object.  That replaces the
    lazy ``get_model`` wrapper defined below, so a later
    ``kittentts.get_model(...)`` call would fail with "'module' object is not
    callable".  Rebinding both public names to the real objects right after
    the import keeps them callable (and makes ``kittentts.KittenTTS`` the real
    class from then on).

    NOTE(review): if other code imports ``kittentts.get_model`` directly before
    any of these wrappers has run, the attribute is still shadowed by the
    module until one of the wrappers is called.

    Returns:
        Tuple ``(get_model, KittenTTS)`` with the real implementation objects.
    """
    from .get_model import KittenTTS as _KittenTTS
    from .get_model import get_model as _get_model

    globals().update(get_model=_get_model, KittenTTS=_KittenTTS)
    return _get_model, _KittenTTS


def get_model(*args, **kwargs):
    """Lazily import and call the real ``get_model``; arguments are forwarded as-is."""
    real_get_model, _ = _load_public_api()
    return real_get_model(*args, **kwargs)


def KittenTTS(*args, **kwargs):
    """Lazily import ``KittenTTS`` and return a new instance built from the given arguments."""
    _, real_class = _load_public_api()
    return real_class(*args, **kwargs)
def _build_help_parser():
    """Build an argparse parser that mirrors the full CLI's options.

    Only ``argparse`` is used so that printing help does not import the
    heavy synthesis dependencies.
    """
    parser = argparse.ArgumentParser(
        description="Kitten TTS - Ultra-lightweight text-to-speech synthesis",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s "Hello world"                           # Speak text
  %(prog)s "Hello world" --voice expr-voice-2-f    # Use specific voice
  %(prog)s "Hello world" --output output.wav       # Save to file
  %(prog)s "Hello world" --speed 1.2               # Faster speech
  %(prog)s "Hello world" --fade-out 0.1            # 0.1s fade out
  echo "Hello world" | %(prog)s                    # Read from stdin
  %(prog)s --list-voices                           # List available voices
        """
    )

    parser.add_argument(
        "text",
        nargs="?",
        help="Text to synthesize into speech (if not provided, reads from stdin)"
    )

    parser.add_argument(
        "--model",
        default="KittenML/kitten-tts-nano-0.2",
        help="Model name or path (default: KittenML/kitten-tts-nano-0.2)"
    )

    parser.add_argument(
        "--voice",
        default="expr-voice-2-m",
        help="Voice to use (default: expr-voice-2-m)"
    )

    parser.add_argument(
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed (1.0 = normal, higher = faster, lower = slower)"
    )

    # The full CLI parser defaults to DEFAULT_FADE_OUT = 0.3; keep the fast
    # help text in step with what actually happens at runtime.
    parser.add_argument(
        "--fade-out",
        type=float,
        default=0.3,
        help="Fade out duration in seconds (default: 0.3, use 0 to disable)"
    )

    parser.add_argument(
        "--output", "-o",
        help="Output file path (saves as WAV). If not specified, plays through speakers."
    )

    parser.add_argument(
        "--list-voices",
        action="store_true",
        help="List available voices and exit"
    )

    parser.add_argument(
        "--format",
        choices=["wav", "flac", "ogg"],
        default="wav",
        help="Audio format for output file (default: wav)"
    )

    return parser


def _stdin_is_interactive():
    """Return True when there is no piped input waiting on stdin."""
    stdin = sys.stdin
    if stdin is None:
        return True
    try:
        return stdin.isatty()
    except (AttributeError, ValueError):
        # Closed or non-file-like stdin: nothing can be read from it.
        return True


def show_help():
    """Show help message without importing heavy dependencies"""
    _build_help_parser().print_help()


def main():
    """Optimized main entry point - fast help, full functionality when needed.

    Returns:
        Process exit code: 0 after printing help, otherwise whatever the
        full CLI returns.
    """
    cli_args = sys.argv[1:]
    wants_help = cli_args in (["-h"], ["--help"])
    # With no arguments, only fall back to help when nothing is piped in;
    # otherwise `echo "text" | kitten-tts` would never reach the stdin reader.
    nothing_to_do = not cli_args and _stdin_is_interactive()
    if wants_help or nothing_to_do:
        show_help()
        return 0

    # For any other operation, run the full CLI
    from .cli_process import main as cli_main
    return cli_main()
def apply_fade_out(audio_data, sample_rate=24000, fade_duration=DEFAULT_FADE_OUT):
    """Apply a quadratic fade-out to the end of the audio.

    Args:
        audio_data: NumPy array of audio samples.
        sample_rate: Audio sample rate in Hz (default: 24000).
        fade_duration: Fade out duration in seconds (default: DEFAULT_FADE_OUT).
            If the fade would cover the whole clip it is limited to half of it.

    Returns:
        The input itself when it is empty; otherwise a copy of the audio with
        the fade applied to its tail (an unmodified copy when the fade rounds
        down to zero samples).
    """
    total_samples = len(audio_data)
    if total_samples == 0:
        return audio_data

    fade_samples = int(fade_duration * sample_rate)
    if fade_samples >= total_samples:
        fade_samples = total_samples // 2  # Limit fade to half of audio if very short

    audio_with_fade = audio_data.copy()
    if fade_samples <= 0:
        # A zero-length fade must be a no-op: `audio[-0:]` would select the
        # WHOLE array and multiplying it by an empty curve raises.
        return audio_with_fade

    # Quadratic curve from 1 down to 0 for a smoother tail than a linear ramp
    fade_curve = np.linspace(1, 0, fade_samples) ** 2
    audio_with_fade[-fade_samples:] *= fade_curve

    return audio_with_fade
def play_audio_with_tempfile(audio_data, sample_rate=24000):
    """Fallback playback: write a temporary WAV file and hand it to an OS player.

    The temporary file is removed only after a player has finished with it.
    When no player could be used (or playback failed) the file is kept, so
    the "Audio saved to ..." message printed to the user stays true.

    Args:
        audio_data: Audio samples to play.
        sample_rate: Sample rate in Hz used when writing the file.
    """
    import platform
    import subprocess

    temp_file = None
    played = False
    try:
        # Reserve a name in the system temp directory, then close the handle
        # before writing so the file is not opened twice at the same time.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            temp_file = tmp.name
        sf.write(temp_file, audio_data, sample_rate)

        system = platform.system()
        if system == "Darwin":  # macOS
            subprocess.run(["afplay", temp_file], check=True)
            played = True
        elif system == "Linux":
            # Try common Linux audio players until one succeeds
            for player in ["aplay", "paplay", "mpg123", "mplayer"]:
                try:
                    subprocess.run([player, temp_file], check=True)
                except (subprocess.CalledProcessError, FileNotFoundError):
                    continue
                played = True
                break
            else:
                print(f"Audio saved to {temp_file} (no suitable audio player found)")
        elif system == "Windows":
            # `start` returns immediately, which let the cleanup below delete
            # the file before it was played; winsound blocks until done.
            import winsound
            winsound.PlaySound(temp_file, winsound.SND_FILENAME)
            played = True
        else:
            print(f"Audio saved to {temp_file} (unsupported OS for direct playback)")

    except Exception as e:
        print(f"Error playing audio: {e}")
        if temp_file and os.path.exists(temp_file):
            print(f"Audio saved to {temp_file}")
        else:
            print("Audio could not be saved - temp file creation failed")
        return

    # Clean up only when playback actually happened
    if played:
        try:
            os.remove(temp_file)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file is harmless
def _build_parser():
    """Create the argument parser for the full CLI."""
    parser = argparse.ArgumentParser(
        description="Kitten TTS - Ultra-lightweight text-to-speech synthesis",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s "Hello world"                           # Speak text
  %(prog)s "Hello world" --voice expr-voice-2-f    # Use specific voice
  %(prog)s "Hello world" --output output.wav       # Save to file
  %(prog)s "Hello world" --speed 1.2               # Faster speech
  %(prog)s "Hello world" --fade-out 0.1            # 0.1s fade out
  echo "Hello world" | %(prog)s                    # Read from stdin
  %(prog)s --list-voices                           # List available voices
        """
    )

    parser.add_argument(
        "text",
        nargs="?",
        help="Text to synthesize into speech (if not provided, reads from stdin)"
    )

    parser.add_argument(
        "--model",
        default="KittenML/kitten-tts-nano-0.2",
        help="Model name or path (default: KittenML/kitten-tts-nano-0.2)"
    )

    parser.add_argument(
        "--voice",
        default="expr-voice-2-m",
        help="Voice to use (default: expr-voice-2-m)"
    )

    parser.add_argument(
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed (1.0 = normal, higher = faster, lower = slower)"
    )

    parser.add_argument(
        "--fade-out",
        type=float,
        default=DEFAULT_FADE_OUT,
        help=f"Fade out duration in seconds (default: {DEFAULT_FADE_OUT}, use 0 to disable)"
    )

    parser.add_argument(
        "--output", "-o",
        help="Output file path. If not specified, plays through speakers."
    )

    parser.add_argument(
        "--list-voices",
        action="store_true",
        help="List available voices and exit"
    )

    # No default here so an explicit --format can be told apart from "not
    # given"; see the format resolution in main().
    parser.add_argument(
        "--format",
        choices=["wav", "flac", "ogg"],
        default=None,
        help="Audio format for output file (default: inferred from the --output "
             "extension, or wav if it has none)"
    )

    return parser


def _read_text(args, parser):
    """Return the text to synthesize, or None after reporting why there is none.

    The positional argument wins; otherwise text is read from piped stdin.
    """
    if args.text:
        return args.text

    try:
        if sys.stdin.isatty():
            # No pipe, interactive mode
            parser.print_help()
            print("\nError: Text input is required (provide as argument or pipe from stdin)", file=sys.stderr)
            return None

        # Pipe detected, read from stdin
        text = sys.stdin.read().strip()
    except Exception as e:
        print(f"Error reading from stdin: {e}", file=sys.stderr)
        return None

    if not text:
        print("\nError: No text received from stdin", file=sys.stderr)
        return None
    return text


def main():
    """Run the Kitten TTS command line interface.

    Parses ``sys.argv``, then either lists the model's voices or synthesizes
    the given text and saves it to ``--output`` or plays it.

    Returns:
        Process exit code: 0 on success, 1 on any error or interruption.
    """
    sample_rate = 24000
    parser = _build_parser()
    args = parser.parse_args()

    # Handle --list-voices
    if args.list_voices:
        try:
            KittenTTS = get_kittentts()
            model = KittenTTS(args.model)
            list_voices(model)
            return 0
        except Exception as e:
            print(f"Error loading model: {e}", file=sys.stderr)
            return 1

    text = _read_text(args, parser)
    if text is None:
        return 1

    try:
        # Initialize the model
        print(f"Loading model: {args.model}...")
        KittenTTS = get_kittentts()
        model = KittenTTS(args.model)

        # Validate voice
        if args.voice not in model.available_voices:
            print(f"Error: Voice '{args.voice}' not available.", file=sys.stderr)
            print(f"Available voices: {', '.join(model.available_voices)}", file=sys.stderr)
            return 1

        # Generate audio
        print(f"Generating speech using voice: {args.voice}...")
        audio = model.generate(text, voice=args.voice, speed=args.speed, old_trim=True)

        # Apply fade out if specified
        if args.fade_out > 0:
            print(f"Applying {args.fade_out}s fade out...")
            audio = apply_fade_out(audio, sample_rate=sample_rate, fade_duration=args.fade_out)

        if args.output:
            # An explicit --format is honored; otherwise the writer infers the
            # format from the file extension, falling back to WAV when the
            # path has no extension at all.
            has_extension = bool(os.path.splitext(args.output)[1])
            audio_format = args.format or (None if has_extension else "wav")
            print(f"Saving audio to: {args.output}")
            sf.write(
                args.output,
                audio,
                sample_rate,
                format=audio_format.upper() if audio_format else None,
            )
            print("Done!")
        else:
            # Play through speakers
            print("Playing audio...")
            play_audio_simple(audio)
            print("Done!")

        return 0

    except KeyboardInterrupt:
        print("\nInterrupted by user")
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0, old_trim=False) -> np.ndarray:
    """Generate audio from text.

    Args:
        text: Input text to synthesize; must not be empty.
        voice: Voice to use for synthesis.
        speed: Speech speed (1.0 = normal).
        old_trim: Forwarded unchanged to the underlying model's
            ``generate`` to select its trimming mode.

    Returns:
        Audio data as numpy array

    Raises:
        ValueError: If ``text`` is empty.
    """
    if not text:
        raise ValueError("Input text cannot be empty.")
    # old_trim must be passed by keyword: a positional argument after
    # keyword arguments is a SyntaxError.
    return self.model.generate(text, voice=voice, speed=speed, old_trim=old_trim)
def __call__(self, text: str) -> list:
    """Translate ``text`` into symbol indexes, one per known character.

    Characters that have no entry in ``self.word_index_dictionary`` are
    skipped, so the result may be shorter than ``text``.
    """
    table = self.word_index_dictionary
    indexes = []
    for symbol in text:
        if symbol in table:
            indexes.append(table[symbol])
    return indexes
def _prepare_inputs(self, text: str, voice: str, speed: float = 1.0) -> dict:
    """Build the input dictionary fed to the ONNX session.

    Args:
        text: Text to synthesize.
        voice: Voice name; must be one of ``self.available_voices``.
        speed: Speed value stored as a one-element float32 array.

    Returns:
        Dict with keys ``input_ids``, ``style`` and ``speed``.

    Raises:
        ValueError: If ``voice`` is not an available voice.
    """
    if voice not in self.available_voices:
        raise ValueError(f"Voice '{voice}' not available. Choose from: {self.available_voices}")

    # Text -> phoneme string -> space-separated phoneme tokens
    phonemized = self.phonemizer.phonemize([text])[0]
    spaced_phonemes = ' '.join(basic_english_tokenize(phonemized))

    # Token ids, wrapped in the 0 token at both ends
    token_ids = self.text_cleaner(spaced_phonemes)
    token_ids.insert(0, 0)
    token_ids.append(0)

    return {
        "input_ids": np.array([token_ids], dtype=np.int64),
        "style": self.voices[voice],
        "speed": np.array([speed], dtype=np.float32),
    }
def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0, old_trim=False) -> np.ndarray:
    """Synthesize speech from text.

    Args:
        text: Input text to synthesize.
        voice: Voice passed on to ``_prepare_inputs``.
        speed: Speed passed on to ``_prepare_inputs``.
        old_trim: If true, cut a fixed 5000 samples from the start and
            10000 from the end; otherwise trim the edges down to the first
            and last sample whose absolute value is at least 0.1.

    Returns:
        Audio data as numpy array
    """
    raw_audio = self.session.run(None, self._prepare_inputs(text, voice, speed))[0]

    if old_trim:
        return raw_audio[5000:-10000]

    # new trim approach, PR link:
    # https://github.com/KittenML/KittenTTS/pull/22/commits/3883bdf80d9e9e4bdf0d1d4707fa68d995d41c56
    loud_positions = np.where(np.abs(raw_audio) >= 0.1)[0]
    if loud_positions.size == 0:
        # Nothing reaches the threshold: leave the audio untouched
        return raw_audio

    first, last = loud_positions[0], loud_positions[-1]
    return raw_audio[first: last + 1]
""" - tts.generate_to_file(text, "inference_output25.wav", voice="expr-voice-5-m") \ No newline at end of file + tts.generate_to_file(text, "inference_output25.wav", voice="expr-voice-5-m") diff --git a/pyproject.toml b/pyproject.toml index c2d1e5c..189a24f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,31 +1,44 @@ [build-system] -requires = ["setuptools>=45", "wheel"] -build-backend = "setuptools.build_meta" +requires = ["hatchling"] +build-backend = "hatchling.build" [project] name = "kittentts" -version = "0.1.0" description = "Ultra-lightweight text-to-speech model with just 15 million parameters" readme = "README.md" requires-python = ">=3.8" -license = {text = "Apache 2.0"} +license = "Apache-2.0" authors = [ {name = "KittenML"} ] keywords = ["text-to-speech", "tts", "speech-synthesis", "neural-networks", "onnx"] classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Multimedia :: Sound/Audio :: Speech", "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ - "num2words", - "spacy", "espeakng_loader", - "misaki[en]>=0.9.4", + "num2words", + "numpy", "onnxruntime", + "phonemizer-fork~=3.3.2", "soundfile", - "numpy", - "huggingface_hub", + "spacy", +] +dynamic = ["version"] + +[project.optional-dependencies] +streaming = [ + "sounddevice", ] [project.urls] @@ -33,9 +46,8 @@ Homepage = "https://github.com/kittenml/kittentts" Repository = "https://github.com/kittenml/kittentts" Issues = "https://github.com/kittenml/kittentts/issues" -[tool.setuptools.packages.find] -where = ["."] -include = ["kittentts*"] +[project.scripts] +kitten-tts = "kittentts.cli_entry:main" 
-[tool.setuptools.package-data] -kittentts = ["*.json", "*.txt", "*.onnx"] +[tool.hatch.version] +path = "kittentts/__init__.py" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 37bfbb3..0000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -num2words -spacy -espeakng_loader -misaki[en]>=0.9.4 -onnxruntime -soundfile -numpy -huggingface_hub diff --git a/setup.py b/setup.py deleted file mode 100644 index d0ac187..0000000 --- a/setup.py +++ /dev/null @@ -1,46 +0,0 @@ -from setuptools import setup, find_packages - -with open("README.md", "r", encoding="utf-8") as fh: - long_description = fh.read() - -setup( - name="kittentts", - version="0.1.0", - author="KittenML", - author_email="", - description="Ultra-lightweight text-to-speech model with just 15 million parameters", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/kittenml/kittentts", - packages=find_packages(), - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Topic :: Multimedia :: Sound/Audio :: Speech", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], - python_requires=">=3.8", - install_requires=[ - "num2words", - "spacy", - "espeakng_loader", - "misaki[en]>=0.9.4", - "onnxruntime", - "soundfile", - "numpy", - "huggingface_hub", - ], - keywords="text-to-speech, tts, speech-synthesis, neural-networks, onnx", - project_urls={ - "Bug Reports": "https://github.com/kittenml/kittentts/issues", - "Source": "https://github.com/kittenml/kittentts", - }, -)