forked from Saganaki22/OrpheusTTS-WebUI
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: orpheus.py
More file actions
166 lines (138 loc) · 5.89 KB
/
orpheus.py
File metadata and controls
166 lines (138 loc) · 5.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import gradio as gr
from orpheus_tts import OrpheusModel
import wave
import time
import os
model = None
def load_model(model_name="canopylabs/orpheus-tts-0.1-finetune-prod"):
global model
# Initialize the Orpheus TTS model according to documentation
model = OrpheusModel(model_name=model_name)
def generate_speech(prompt, voice, temperature, top_p, repetition_penalty):
if model is None:
load_model()
# Start timing
start_time = time.monotonic()
# Generate speech from the provided text
syn_tokens = model.generate_speech(
prompt=prompt,
voice=voice,
temperature=temperature,
top_p=top_p,
repetition_penalty=repetition_penalty
)
# Create a unique output filename to avoid overwriting previous generations
output_filename = f"output_{int(time.time())}.wav"
# Write the audio to a WAV file
with wave.open(output_filename, "wb") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(24000)
total_frames = 0
for audio_chunk in syn_tokens:
frame_count = len(audio_chunk) // (wf.getsampwidth() * wf.getnchannels())
total_frames += frame_count
wf.writeframes(audio_chunk)
duration = total_frames / wf.getframerate()
# Calculate processing time
end_time = time.monotonic()
processing_time = end_time - start_time
# Prepare result message
result_message = f"Generated {duration:.2f} seconds of audio in {processing_time:.2f} seconds"
return output_filename, result_message
# Create the Gradio interface
with gr.Blocks(title="OrpheusTTS-WebUI") as demo:
# Use HTML h1 tag for bigger title without hashtag
gr.Markdown("<div align='center'><h1>OrpheusTTS-WebUI</h1></div>")
# Main description without links
gr.Markdown("""<div align='center'>Generate realistic speech from text using the OrpheusTTS model.
**Available voices:** tara, jess, leo, leah, dan, mia, zac, zoe (in order of conversational realism)
**Available emotive tags:** `<laugh>`, `<chuckle>`, `<sigh>`, `<cough>`, `<sniffle>`, `<groan>`, `<yawn>`, `<gasp>`
**Note:** Increasing repetition_penalty and temperature makes the model speak faster.
</div>
""")
with gr.Row():
with gr.Column(scale=2):
# Text input area
prompt = gr.Textbox(
label="Text Input",
placeholder="Enter text to convert to speech...",
lines=5
)
with gr.Row():
# Voice selection dropdown
voice = gr.Dropdown(
choices=["tara", "jess", "leo", "leah", "dan", "mia", "zac", "zoe"],
label="Voice",
value="tara"
)
with gr.Row():
# Generation parameters
temperature = gr.Slider(
minimum=0.1,
maximum=2.0,
value=1.0,
step=0.1,
label="Temperature"
)
top_p = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.9,
step=0.05,
label="Top P"
)
rep_penalty = gr.Slider(
minimum=1.1,
maximum=2.0,
value=1.2,
step=0.1,
label="Repetition Penalty"
)
# Generate button
submit_btn = gr.Button("Generate Speech")
# Example prompts
gr.Examples(
examples=[
"Man, the way social media has, um, completely changed how we interact is just wild, right?",
"I just got back from my vacation <sigh> and I'm already feeling stressed about work.",
"Did you hear what happened at the party last night? <laugh> It was absolutely ridiculous!",
"I've been working on this project for hours <yawn> and I still have so much to do.",
"The concert was amazing! <gasp> You should have seen the light show!"
],
inputs=prompt,
label="Example Prompts"
)
with gr.Column(scale=1):
# Audio output
audio_output = gr.Audio(label="Generated Speech")
# Generation statistics
result_text = gr.Textbox(label="Generation Stats", interactive=False)
# Connect the generate_speech function to the interface
submit_btn.click(
fn=generate_speech,
inputs=[prompt, voice, temperature, top_p, rep_penalty],
outputs=[audio_output, result_text]
)
# Clean up function to remove generated audio files (won't work in all deployments)
def cleanup():
for file in os.listdir():
if file.startswith("output_") and file.endswith(".wav"):
try:
os.remove(file)
except:
pass
# Register cleanup for when the interface closes
demo.load(cleanup)
# Add footer with links as a separate element at the bottom
gr.Markdown("""<div align='center' style='margin-top: 20px; padding: 10px; border-top: 1px solid #ccc;'>
<a href="https://huggingface.co/canopylabs/orpheus-3b-0.1-pretrained" target="_blank">Hugging Face</a> |
<a href="https://github.com/Saganaki22/OrpheusTTS-WebUI" target="_blank">WebUI GitHub</a> |
<a href="https://github.com/canopyai/Orpheus-TTS" target="_blank">Official GitHub</a>
</div>""")
# Only run when orpheus.py is executed directly, not when imported
# Comment this out when using the wrapper
"""
if __name__ == "__main__":
demo.launch(share=False) # Set share=False to disable public URL
"""