# main.py
import asyncio
import json
import os
import re

import requests

from src.agent.capability import MatchingCapability
from src.main import AgentWorker
from src.agent.capability_worker import CapabilityWorker
# =============================================================================
# SOUND GENERATOR
# Generates AI sound effects using the ElevenLabs Sound Generation API.
# Supports creating, modifying, replaying, and adjusting duration.
# =============================================================================
# --- CONFIGURATION ---
# ElevenLabs API key. Taken from the ELEVENLABS_API_KEY environment variable
# when it is set and non-empty, so the secret does not have to live in source
# control; otherwise the placeholder below is used (replace it with your key).
ELEVENLABS_API_KEY = (
    os.environ.get("ELEVENLABS_API_KEY") or "YOUR_ELEVENLABS_API_KEY_HERE"
)
ELEVENLABS_SOUND_URL = "https://api.elevenlabs.io/v1/sound-generation"

# Clip length used when the user gives none, and the bounds every requested
# length is clamped to before it is sent as "duration_seconds".
DEFAULT_DURATION = 5.0
MAX_DURATION = 30.0
MIN_DURATION = 0.5

# Words/phrases in an utterance that end the session or replay the last clip.
EXIT_WORDS = {"stop", "exit", "quit", "done", "cancel", "bye", "leave"}
REPLAY_WORDS = {"again", "replay", "repeat", "one more time"}

# Template for the LLM request that rewrites a description; fill the
# "{input}" placeholder with str.format(input=...).
ENHANCE_PROMPT = (
    "Enhance this sound effect prompt for an AI generator. "
    "Add details about texture, acoustics, and environment. Keep it under 20 words. "
    "Input: '{input}'\nEnhanced:"
)
class SoundGeneratorCapability(MatchingCapability):
    """Conversational capability that generates and plays AI sound effects.

    The user describes a sound; the description is posted to the ElevenLabs
    Sound Generation endpoint and the returned audio is played back. Follow-up
    utterances can replay the last clip, make it longer or shorter, request an
    enhanced version of the description, or end the session.
    """

    worker: AgentWorker = None
    capability_worker: CapabilityWorker = None
    # Description and clamped duration of the most recent successful generation.
    current_description: str = None
    current_duration: float = None
    # Response body of the most recent successful generation, kept for replay.
    last_audio_bytes: bytes = None

    #{{register_capability}}

    def call(self, worker: AgentWorker):
        """Bind the worker, reset the per-session state and start the loop.

        The interaction loop coroutine is handed to
        ``worker.session_tasks.create``.
        """
        self.worker = worker
        self.capability_worker = CapabilityWorker(self.worker)
        self.current_description = None
        self.current_duration = None
        self.last_audio_bytes = None
        self.worker.session_tasks.create(self.run_sound_loop())

    @staticmethod
    def _mentions(text: str, phrases) -> bool:
        """Return True if ``text`` contains any of ``phrases`` as whole words.

        Word boundaries matter here because sound descriptions are free text:
        plain substring matching would treat "mosquito" as "quit" and
        "leaves" as "leave".
        """
        return any(
            re.search(rf"\b{re.escape(phrase)}\b", text) for phrase in phrases
        )

    async def generate_sound(self, description: str, duration: float) -> bytes | None:
        """Call ElevenLabs Sound Generation API.

        Args:
            description: Text prompt describing the sound.
            duration: Requested length in seconds; clamped to
                ``[MIN_DURATION, MAX_DURATION]`` before sending.

        Returns:
            The response body on HTTP 200, otherwise ``None``. On success the
            description, clamped duration and audio are also remembered on the
            instance for later replay / adjustment.
        """
        duration = max(MIN_DURATION, min(MAX_DURATION, duration))
        payload = {
            "text": description,
            "prompt_influence": 0.3,
            "duration_seconds": duration,
        }
        headers = {
            "xi-api-key": ELEVENLABS_API_KEY,
            "Content-Type": "application/json",
        }
        log = self.worker.editor_logging_handler
        log.info(f"[SoundGen] Generating: '{description}' ({duration}s)")
        try:
            # requests is blocking: run it in a worker thread so the event
            # loop stays responsive, and bound it with a timeout so a stalled
            # connection cannot hang the session forever.
            response = await asyncio.to_thread(
                requests.post,
                ELEVENLABS_SOUND_URL,
                json=payload,
                headers=headers,
                timeout=60,
            )
        except Exception as e:
            # Best-effort boundary: any failure is logged and reported to the
            # caller as "no audio" rather than ending the session.
            log.error(f"[SoundGen] Exception: {e}")
            return None

        if response.status_code != 200:
            log.error(
                f"[SoundGen] API error: {response.status_code} {response.text}"
            )
            return None

        self.current_description = description
        self.current_duration = duration
        self.last_audio_bytes = response.content
        return response.content

    def parse_duration(self, text: str) -> float | None:
        """Extract a duration in seconds from user input.

        An explicit number followed by "sec..." wins (e.g. "3.5 seconds").
        Otherwise the whole words "short", "long", "longer" and "shorter" map
        to preset lengths. Returns ``None`` when nothing matches.
        """
        lowered = text.lower()
        match = re.search(r"(\d+(?:\.\d+)?)\s*sec", lowered)
        if match:
            return float(match.group(1))
        keywords = {"short": 2.0, "long": 8.0, "longer": 10.0, "shorter": 2.0}
        # Compare against whole words: with substring matching "long" shadows
        # "longer" (making its preset unreachable) and words such as "along"
        # or "shortwave" match by accident.
        words = set(re.findall(r"[a-z]+", lowered))
        for word, val in keywords.items():
            if word in words:
                return val
        return None

    async def _play_or_apologize(self, audio) -> bool:
        """Play ``audio`` if there is any; otherwise tell the user it failed.

        Returns True when audio was played.
        """
        if audio:
            await self.capability_worker.play_audio(audio)
            return True
        await self.capability_worker.speak("Sorry, I couldn't generate that sound.")
        return False

    async def run_sound_loop(self):
        """Run the interaction loop until the user asks to leave.

        Each utterance is checked, in order, for: exit, replay of the last
        clip, a longer/shorter adjustment, an "enhance" request, and finally
        is treated as the description of a new sound.
        ``resume_normal_flow`` is always called on the way out, including
        when the loop ends because of an exception.
        """
        try:
            await self.capability_worker.speak(
                "Sound generator ready. Describe what you want to hear."
            )
            while True:
                user_input = await self.capability_worker.user_response()
                # Ignore empty and whitespace-only input instead of sending
                # it to the API as a description.
                if not user_input or not user_input.strip():
                    continue
                lower = user_input.lower().strip()

                # Exit check
                if self._mentions(lower, EXIT_WORDS):
                    await self.capability_worker.speak("Closing sound generator.")
                    break

                # Replay check
                if self.last_audio_bytes and self._mentions(lower, REPLAY_WORDS):
                    await self.capability_worker.speak("Replaying.")
                    await self.capability_worker.play_audio(self.last_audio_bytes)
                    continue

                # Duration adjustment
                wants_longer = self._mentions(lower, ("longer",))
                wants_shorter = self._mentions(lower, ("shorter",))
                if self.current_description and (wants_longer or wants_shorter):
                    current = self.current_duration or DEFAULT_DURATION
                    step = 3.0 if wants_longer else -2.0
                    # Clamp before announcing so the spoken value is the one
                    # that is actually requested from the API.
                    new_dur = max(MIN_DURATION, min(MAX_DURATION, current + step))
                    await self.capability_worker.speak(
                        f"Regenerating at {new_dur} seconds."
                    )
                    audio = await self.generate_sound(self.current_description, new_dur)
                    await self._play_or_apologize(audio)
                    continue

                # Enhance existing sound
                if self.current_description and "enhance" in lower:
                    await self.capability_worker.speak("Enhancing.")
                    rewritten = self.capability_worker.text_to_text_response(
                        ENHANCE_PROMPT.format(input=self.current_description)
                    )
                    new_desc = rewritten.replace('"', "").replace("'", "").strip()
                    # If the rewrite came back empty, keep the current
                    # description rather than sending an empty prompt.
                    new_desc = new_desc or self.current_description
                    audio = await self.generate_sound(
                        new_desc, self.current_duration or DEFAULT_DURATION
                    )
                    await self._play_or_apologize(audio)
                    continue

                # New sound
                duration = self.parse_duration(user_input) or DEFAULT_DURATION
                await self.capability_worker.speak(f"Creating {user_input}.")
                audio = await self.generate_sound(user_input, duration)
                if await self._play_or_apologize(audio):
                    await self.capability_worker.speak(
                        "There we go. You can modify it or ask for a new sound."
                    )
        finally:
            self.capability_worker.resume_normal_flow()