From 0438ffc292353dfd8c7016e46d423c813919b730 Mon Sep 17 00:00:00 2001
From: yousan
Date: Fri, 15 Mar 2024 13:42:36 +0000
Subject: [PATCH 1/2] web_ui

---
 webui.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 webui.py

diff --git a/webui.py b/webui.py
new file mode 100644
index 0000000..b9568d7
--- /dev/null
+++ b/webui.py
@@ -0,0 +1,46 @@
+import argparse
+import gradio as gr
+import soundfile as sf
+import numpy as np
+import tempfile
+from pathlib import Path
+from transformer_infer import PhemeClient
+
+def generate_audio(text, voice, temperature, top_k, max_new_tokens):
+    args = parse_arguments()
+    client = PhemeClient(args)
+    audio_array = client.infer(text, voice=voice, temperature=temperature, top_k=top_k, max_new_tokens=max_new_tokens)
+
+    # NumPy 配列をオーディオファイルに変換
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+        sf.write(temp_file.name, audio_array, args.target_sample_rate)
+        return temp_file.name
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--manifest_path", type=str, default="demo/manifest.json")
+    parser.add_argument("--outputdir", type=str, default="demo/")
+    parser.add_argument("--featuredir", type=str, default="demo/")
+    parser.add_argument("--text_tokens_file", type=str, default="ckpt/unique_text_tokens.k2symbols")
+    parser.add_argument("--t2s_path", type=str, default="ckpt/t2s/")
+    parser.add_argument("--s2a_path", type=str, default="ckpt/s2a/s2a.ckpt")
+    parser.add_argument("--target_sample_rate", type=int, default=16_000)
+    return parser.parse_args()
+
+voice_list = ["male_voice", "female_voice"] # 利用可能な声のリストに置き換えてください
+
+iface = gr.Interface(
+    fn=generate_audio,
+    inputs=[
+        gr.inputs.Textbox(label="Text"),
+        gr.inputs.Dropdown(voice_list, label="Voice"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=1, maximum=500, value=210, step=1, label="Top K"),
+        gr.Slider(minimum=1, maximum=1000, value=750, step=1, label="Max New Tokens")
+    ],
+    outputs=gr.Audio(type="filepath", label="Generated Audio"),
+    title="Pheme TTS Demo",
+    description="Enter text and select a voice to generate speech.",
+)
+
+iface.launch(share=True)
\ No newline at end of file

From 53a318d4dddfe020a9889e0c4dc4efb6ee25d1ad Mon Sep 17 00:00:00 2001
From: yousan
Date: Fri, 15 Mar 2024 14:58:13 +0000
Subject: [PATCH 2/2] fix: init function

---
 webui.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/webui.py b/webui.py
index b9568d7..f2aec15 100644
--- a/webui.py
+++ b/webui.py
@@ -7,14 +7,14 @@
 from transformer_infer import PhemeClient
 
 def generate_audio(text, voice, temperature, top_k, max_new_tokens):
-    args = parse_arguments()
-    client = PhemeClient(args)
+    start_time = time.time() # 開始時間を記録
     audio_array = client.infer(text, voice=voice, temperature=temperature, top_k=top_k, max_new_tokens=max_new_tokens)
 
     # NumPy 配列をオーディオファイルに変換
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
         sf.write(temp_file.name, audio_array, args.target_sample_rate)
-        return temp_file.name
+
+    return temp_file.name
 
 def parse_arguments():
     parser = argparse.ArgumentParser()
@@ -27,8 +27,6 @@ def parse_arguments():
     parser.add_argument("--target_sample_rate", type=int, default=16_000)
     return parser.parse_args()
 
-voice_list = ["male_voice", "female_voice"] # 利用可能な声のリストに置き換えてください
-
 iface = gr.Interface(
     fn=generate_audio,
     inputs=[
@@ -43,4 +41,9 @@ def parse_arguments():
     description="Enter text and select a voice to generate speech.",
 )
 
+voice_list = ["male_voice"] # 利用可能な声のリストに置き換えてください
+
+args = parse_arguments()
+client = PhemeClient(args)
+
 iface.launch(share=True)
\ No newline at end of file