From fe7670bc631e34ba7780309542451c8b267f2532 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Aug 2025 13:12:26 +0300 Subject: [PATCH 1/4] Add basic CLI --- README.md | 10 ++++++--- kittentts/__main__.py | 51 +++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 3 +++ setup.py | 5 +++++ 4 files changed, 66 insertions(+), 3 deletions(-) create mode 100644 kittentts/__main__.py diff --git a/README.md b/README.md index 81536da..f0ea40e 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,7 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl ``` - - - ### Basic Usage +### API Usage ``` from kittentts import KittenTTS @@ -42,6 +40,12 @@ sf.write('output.wav', audio, 24000) ``` +### CLI usage + +``` +kittentts --output output.wav --text "This high quality TTS model works without a GPU" +``` + diff --git a/kittentts/__main__.py b/kittentts/__main__.py new file mode 100644 index 0000000..bcbd842 --- /dev/null +++ b/kittentts/__main__.py @@ -0,0 +1,51 @@ +import argparse +import datetime + +voices = [ + "expr-voice-2-m", + "expr-voice-2-f", + "expr-voice-3-m", + "expr-voice-3-f", + "expr-voice-4-m", + "expr-voice-4-f", + "expr-voice-5-m", + "expr-voice-5-f", +] + + +def run(*, model: str, voice: str, output: str, text: str) -> datetime.timedelta: + from kittentts import KittenTTS + import soundfile as sf + + m = KittenTTS(model) + t0 = datetime.datetime.now() + audio = m.generate(text, voice=voice) + sf.write(output, audio, 24000) + t1 = datetime.datetime.now() + return t1 - t0 + + +def main() -> None: + ap = argparse.ArgumentParser(prog="kittentts", description="Run Kitten TTS model") + ap.add_argument("--model", default="KittenML/kitten-tts-nano-0.1", help="Model to use") + ap.add_argument("--text", required=True, help="Text to synthesize") + ap.add_argument("--voice", default="expr-voice-2-f", help="Voice to use", choices=voices) + ap.add_argument("--output", help="Output audio file") + + args = ap.parse_args() + + if not args.output: + ts = datetime.datetime.now().isoformat(timespec="seconds").replace(":", "-") + args.output = f"{args.voice}-{ts}.wav" + + gen_time = run( + model=args.model, + voice=args.voice, + output=args.output, + text=args.text, + ) + print(f"Generated audio in {gen_time}, saved to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index c2d1e5c..944ff70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,9 @@ dependencies = [ "huggingface_hub", ] +[project.scripts] +kittentts = "kittentts.__main__:main" + [project.urls] Homepage = "https://github.com/kittenml/kittentts" Repository = "https://github.com/kittenml/kittentts" diff --git a/setup.py b/setup.py index d0ac187..a59cc07 100644 --- a/setup.py +++ b/setup.py @@ -43,4 +43,9 @@ "Bug Reports": "https://github.com/kittenml/kittentts/issues", "Source": "https://github.com/kittenml/kittentts", }, + entry_points={ + "console_scripts": [ + "kittentts=kittentts.__main__:main", + ], + }, ) From ca7c23d1990811f9ffd89353d9a6f56751c0528a Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Aug 2025 13:12:47 +0300 Subject: [PATCH 2/4] Add standard Python gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..59d11d7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.egg-info +*.py[cod] +*.wav From 42ad5ccd60285c1fe2b33b5d7746f9e29524df6a Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Aug 2025 15:19:39 +0300 Subject: [PATCH 3/4] Add support for stdout output --- kittentts/__main__.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kittentts/__main__.py b/kittentts/__main__.py index bcbd842..a694839 100644 --- a/kittentts/__main__.py +++ b/kittentts/__main__.py @@ -1,5 +1,7 @@ import argparse import datetime +import io +import sys voices = [ "expr-voice-2-m", @@ -20,7 +22,13 @@ def run(*, model: str, voice: str, output: str, text: str) -> datetime.timedelta m = KittenTTS(model) t0 = datetime.datetime.now() audio = m.generate(text, voice=voice) - sf.write(output, audio, 24000) + if output == "-": + # sf requires a seekable buffer for writing. + bio = io.BytesIO() + sf.write(bio, audio, 24000, format="WAV", subtype="PCM_16") + sys.stdout.buffer.write(bio.getvalue()) + else: + sf.write(output, audio, 24000) t1 = datetime.datetime.now() return t1 - t0 @@ -30,7 +38,7 @@ def main() -> None: ap.add_argument("--model", default="KittenML/kitten-tts-nano-0.1", help="Model to use") ap.add_argument("--text", required=True, help="Text to synthesize") ap.add_argument("--voice", default="expr-voice-2-f", help="Voice to use", choices=voices) - ap.add_argument("--output", help="Output audio file") + ap.add_argument("--output", help="Output audio file (- for stdout; use with care)") args = ap.parse_args() @@ -44,7 +52,7 @@ def main() -> None: output=args.output, text=args.text, ) - print(f"Generated audio in {gen_time}, saved to {args.output}") + print(f"Generated audio in {gen_time}, saved to {args.output}", file=sys.stderr) if __name__ == "__main__": From 0aacfcd3c2674c0b53afae35ad4b64106517d06e Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Aug 2025 17:14:41 +0300 Subject: [PATCH 4/4] Add optional `--speed` --- kittentts/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kittentts/__main__.py b/kittentts/__main__.py index a694839..73c7da6 100644 --- a/kittentts/__main__.py +++ b/kittentts/__main__.py @@ -15,13 +15,13 @@ ] -def run(*, model: str, voice: str, output: str, text: str) -> datetime.timedelta: +def run(*, model: str, voice: str, output: str, text: str, speed: float=1.0) -> datetime.timedelta: from kittentts import KittenTTS import soundfile as sf m = KittenTTS(model) t0 = datetime.datetime.now() - audio = m.generate(text, voice=voice) + audio = m.generate(text, voice=voice, speed=speed) if output == "-": # sf requires a seekable buffer for writing. bio = io.BytesIO() @@ -38,6 +38,7 @@ def main() -> None: ap.add_argument("--model", default="KittenML/kitten-tts-nano-0.1", help="Model to use") ap.add_argument("--text", required=True, help="Text to synthesize") ap.add_argument("--voice", default="expr-voice-2-f", help="Voice to use", choices=voices) + ap.add_argument("--speed", type=float, default=1.0, help="Speech speed (1.0 = normal)") ap.add_argument("--output", help="Output audio file (- for stdout; use with care)") args = ap.parse_args() @@ -51,6 +52,7 @@ def main() -> None: voice=args.voice, output=args.output, text=args.text, + speed=args.speed, ) print(f"Generated audio in {gen_time}, saved to {args.output}", file=sys.stderr)