diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..59d11d7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.egg-info
+*.py[cod]
+*.wav
diff --git a/README.md b/README.md
index 81536da..f0ea40e 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,7 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million
 pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl
 ```
 
-
-
- ### Basic Usage
+### API Usage
 
 ```
 from kittentts import KittenTTS
@@ -42,6 +40,12 @@ sf.write('output.wav', audio, 24000)
 
 ```
 
+### CLI Usage
+
+```
+kittentts --output output.wav --text "This high quality TTS model works without a GPU"
+```
+
 
 
 
diff --git a/kittentts/__main__.py b/kittentts/__main__.py
new file mode 100644
index 0000000..73c7da6
--- /dev/null
+++ b/kittentts/__main__.py
@@ -0,0 +1,93 @@
+"""Command-line interface for Kitten TTS (installed as the ``kittentts`` script)."""
+
+import argparse
+import datetime
+import io
+import sys
+import time
+
+# Sample rate in Hz passed to soundfile for every write.
+SAMPLE_RATE = 24000
+
+# Voice names accepted by --voice.
+voices = [
+    "expr-voice-2-m",
+    "expr-voice-2-f",
+    "expr-voice-3-m",
+    "expr-voice-3-f",
+    "expr-voice-4-m",
+    "expr-voice-4-f",
+    "expr-voice-5-m",
+    "expr-voice-5-f",
+]
+
+
+def run(*, model: str, voice: str, output: str, text: str, speed: float = 1.0) -> datetime.timedelta:
+    """Synthesize *text* and write the audio to *output*.
+
+    Args:
+        model: Model identifier passed to ``KittenTTS``.
+        voice: Voice name passed to ``KittenTTS.generate``.
+        output: Destination path, or ``"-"`` to write WAV bytes to stdout.
+        text: Text to synthesize.
+        speed: Speed value passed to ``KittenTTS.generate``.
+
+    Returns:
+        Time spent generating and writing the audio; constructing the
+        model happens before the timer starts and is not included.
+    """
+    # Imported here rather than at module top, so argument parsing in
+    # main() finishes before these packages are loaded.
+    from kittentts import KittenTTS
+    import soundfile as sf
+
+    m = KittenTTS(model)
+    # perf_counter() is monotonic: unlike datetime.now(), the measured span
+    # cannot be skewed by wall-clock adjustments during the run.
+    t0 = time.perf_counter()
+    audio = m.generate(text, voice=voice, speed=speed)
+    if output == "-":
+        # sf requires a seekable buffer for writing.
+        bio = io.BytesIO()
+        sf.write(bio, audio, SAMPLE_RATE, format="WAV", subtype="PCM_16")
+        sys.stdout.buffer.write(bio.getvalue())
+    else:
+        sf.write(output, audio, SAMPLE_RATE)
+    return datetime.timedelta(seconds=time.perf_counter() - t0)
+
+
+def main() -> None:
+    """Parse the command line, run synthesis, and report the timing on stderr."""
+    ap = argparse.ArgumentParser(prog="kittentts", description="Run Kitten TTS model")
+    ap.add_argument("--model", default="KittenML/kitten-tts-nano-0.1", help="Model to use")
+    ap.add_argument("--text", required=True, help="Text to synthesize")
+    ap.add_argument("--voice", default="expr-voice-2-f", help="Voice to use", choices=voices)
+    ap.add_argument("--speed", type=float, default=1.0, help="Speech speed (1.0 = normal)")
+    ap.add_argument("--output", help="Output audio file (- for stdout; use with care)")
+
+    args = ap.parse_args()
+
+    # float() also accepts "nan" and "inf"; the chained comparison rejects
+    # those together with zero and negative values before any model work.
+    if not 0 < args.speed < float("inf"):
+        ap.error("--speed must be a finite number greater than 0")
+
+    if not args.output:
+        # No --output given: name the file after the voice and the current
+        # local time, with ":" in the timestamp replaced by "-".
+        ts = datetime.datetime.now().isoformat(timespec="seconds").replace(":", "-")
+        args.output = f"{args.voice}-{ts}.wav"
+
+    gen_time = run(
+        model=args.model,
+        voice=args.voice,
+        output=args.output,
+        text=args.text,
+        speed=args.speed,
+    )
+    # Status goes to stderr so it never mixes with audio written to stdout.
+    print(f"Generated audio in {gen_time}, saved to {args.output}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index c2d1e5c..944ff70 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,9 @@ dependencies = [
     "huggingface_hub",
 ]
 
+[project.scripts]
+kittentts = "kittentts.__main__:main"
+
 [project.urls]
 Homepage = "https://github.com/kittenml/kittentts"
 Repository = "https://github.com/kittenml/kittentts"
diff --git a/setup.py b/setup.py
index d0ac187..a59cc07 100644
--- a/setup.py
+++ b/setup.py
@@ -43,4 +43,9 @@
         "Bug Reports": "https://github.com/kittenml/kittentts/issues",
         "Source": "https://github.com/kittenml/kittentts",
     },
+    entry_points={
+        "console_scripts": [
+            "kittentts=kittentts.__main__:main",
+        ],
+    },
 )