-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathstt_app.py
More file actions
388 lines (343 loc) · 14.5 KB
/
stt_app.py
File metadata and controls
388 lines (343 loc) · 14.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
from __future__ import annotations
import os
import re
import tempfile
import threading
import time
from collections import deque
from enum import Enum
from typing import Callable, Optional
from audio_worker_client import AudioWorkerClient
from issue_capture import maybe_capture_mlx_issue
from providers import get_provider
from recordings import DEFAULT_RECORDINGS_DIR, DEFAULT_RECORDINGS_MAX_BYTES, archive_recording
from stt_defaults import HOTKEY_DISPLAY_NAMES, NullOverlay, noop_sound, noop_text_injector
# Capture format requested from the audio worker each time a recording starts.
SAMPLE_RATE = 16000  # Whisper expects 16kHz
CHANNELS = 1  # Single (mono) channel.
# Peak level below which audio counts as silence: transcription is skipped and
# the overlay is told that the waveform is below the threshold.
SILENCE_THRESHOLD = 0.01
class AppState(Enum):
    """Application state for menu bar icon.

    The current value is kept on ``STTApp._state`` and handed to the registered
    state callback every time it changes.
    """

    IDLE = "idle"  # Initial state; restored after a cancel or a finished run.
    RECORDING = "recording"  # Set when a recording is started.
    TRANSCRIBING = "transcribing"  # Set right before audio is transcribed.
class STTApp:
    """Speech-to-text application controller.

    Records audio through an audio worker, transcribes it with the configured
    provider and hands the resulting text to the injected text backend, while
    reporting progress through an overlay and an optional state callback.
    """

    # Maximum time, in seconds, that stop_recording() waits for a pending
    # start_recording() call to finish before force-stopping the audio worker.
    _MAX_STARTING_TIME_S = 5
def __init__(
    self,
    device_name: str | None = None,
    provider=None,
    *,
    overlay=None,
    sound_player: Callable[[str], None] | None = None,
    text_injector: Callable[[str, bool], None] | None = None,
    language: str | None = None,
    prompt: str | None = None,
    hotkey_id: str | None = None,
    keep_recordings: bool | None = None,
    recordings_dir: str | None = None,
    recordings_max_bytes: int | None = None,
    audio_worker: AudioWorkerClient | None = None,
):
    """Wire up collaborators and initialise the runtime flags.

    Every argument is optional. Collaborators that are missing (or falsy) are
    replaced by project defaults, and ``language``, ``prompt``, ``hotkey_id``
    and ``keep_recordings`` fall back to the ``LANGUAGE``, ``PROMPT``,
    ``HOTKEY`` and ``KEEP_RECORDINGS`` environment variables when ``None``.
    """
    env = os.environ

    self.recording = False
    # Only the device name is kept; it is handed to the audio worker whenever
    # a recording starts.
    self.device_name = device_name

    # Collaborators, created in this order when not injected.
    if not provider:
        provider = get_provider(env.get("PROVIDER", "mlx"))
    self.provider = provider
    if not audio_worker:
        audio_worker = AudioWorkerClient()
    self._audio_worker = audio_worker
    if not overlay:
        overlay = NullOverlay()
    self._overlay = overlay

    # Settings with environment fallbacks (explicit values win, even "").
    if language is None:
        language = env.get("LANGUAGE", "en")
    self.language = language
    if prompt is None:
        prompt = env.get("PROMPT", "")
    self.prompt = prompt
    if hotkey_id is None:
        hotkey_id = env.get("HOTKEY", "cmd_r")
    self.hotkey_id = hotkey_id
    self.keep_recordings = (
        env.get("KEEP_RECORDINGS", "false").lower() == "true"
        if keep_recordings is None
        else keep_recordings
    )
    self.recordings_dir = recordings_dir if recordings_dir else DEFAULT_RECORDINGS_DIR
    self.recordings_max_bytes = (
        recordings_max_bytes if recordings_max_bytes else DEFAULT_RECORDINGS_MAX_BYTES
    )

    self._sound_player = sound_player if sound_player else noop_sound
    self._text_injector = text_injector if text_injector else noop_text_injector

    # Waveform data from the worker drives the overlay.
    self._audio_worker.set_waveform_callback(self._on_waveform)

    # Cross-thread coordination.
    self._lock = threading.Lock()
    self._processing = False  # True while a process_recording call is active.
    self._starting = False  # True while a start_recording call is in flight.
    self._event_log = deque(maxlen=200)
    # Incremented to invalidate work that is still running after a
    # cancel/reset.
    self._op_id = 0

    # State reported to the menu bar.
    self._state = AppState.IDLE
    self._state_callback: Optional[Callable[[AppState], None]] = None
def _on_waveform(self, values: list[float], raw_peak: float):
    """Forward waveform data from the audio worker to the overlay.

    Alongside ``values`` the overlay receives a flag that is true when
    ``raw_peak`` has reached ``SILENCE_THRESHOLD``.
    """
    self._overlay.update_waveform(values, raw_peak >= SILENCE_THRESHOLD)
def set_state_callback(self, callback: Callable[[AppState], None]) -> None:
    """Install the observer that is told about every state change.

    Only one callback is kept; registering again replaces the previous one.
    The callback is invoked directly by whichever thread changes the state.
    """
    self._state_callback = callback
def _set_state(self, new_state: AppState):
"""Update state and notify callback."""
self._state = new_state
self._log_event(f"state:{new_state.value}")
if self._state_callback:
self._state_callback(new_state)
def _log_event(self, message: str) -> None:
ts = time.strftime("%Y-%m-%d %H:%M:%S")
entry = {
"ts": ts,
"message": message,
"state": self._state.value,
"recording": self.recording,
"processing": self._processing,
"starting": self._starting,
}
self._event_log.append(entry)
if os.environ.get("STT_DEBUG"):
print(f"[debug] {entry}")
def start_recording(self):
    """Start recording audio from microphone.

    The call is ignored (and logged) while a previous recording is still being
    processed, while a recording is already active, or while another start is
    in flight. If the audio worker fails to start, the error is printed, the
    worker is force-stopped, the overlay is hidden and the app returns to
    ``AppState.IDLE``.
    """
    with self._lock:
        if self._processing:
            self._log_event("start_ignored_processing")
            return
        if self.recording or self._starting:
            self._log_event("start_ignored_busy")
            return
        # Claim both flags in the same critical section as the checks above so
        # a second caller sees the app as busy.
        self._starting = True
        self.recording = True
        self._log_event("start_recording")
        # NOTE(review): the state callback runs while self._lock is held and
        # threading.Lock is not reentrant - confirm callbacks never call back
        # into this object.
        self._set_state(AppState.RECORDING)
    self._overlay.show()
    self._sound_player("/System/Library/Sounds/Tink.aiff")
    print("Recording...")
    try:
        self._audio_worker.start_recording(
            device_name=self.device_name, sample_rate=SAMPLE_RATE, channels=CHANNELS
        )
        with self._lock:
            # ``recording`` may have been cleared by stop/cancel while the
            # worker was starting; in that case drop the capture again.
            if not self.recording:
                try:
                    self._audio_worker.cancel_recording()
                except Exception:
                    # Cancelling failed: fall back to a forced worker stop.
                    self._audio_worker.stop(force=True)
    except Exception as e:
        print(f"❌ Failed to start recording: {e}")
        self._audio_worker.stop(force=True)
        self._overlay.hide()
        with self._lock:
            self.recording = False
            self._set_state(AppState.IDLE)
    finally:
        with self._lock:
            # stop_recording() polls this flag to learn that the start attempt
            # has finished, whether it succeeded or failed.
            self._starting = False
def stop_recording(self):
    """Stop recording and return (wav_path, frames, peak).

    Returns:
        ``(wav_path, frames, peak)`` as reported by the audio worker on
        success. ``(None, 0, 0.0)`` when no recording is active, when a
        pending start does not finish within ``_MAX_STARTING_TIME_S`` seconds,
        or when stopping fails. After a stop timeout a non-empty wav that was
        not archived is returned as ``(wav_path, 0, 0.0)``.
    """
    with self._lock:
        if not self.recording:
            return None, 0, 0.0
        self.recording = False
        # Remember whether start_recording() was still in flight at this point.
        starting = self._starting
    self._overlay.set_transcribing(True)
    self._sound_player("/System/Library/Sounds/Pop.aiff")
    print("Stopped")
    if starting:
        # Poll until start_recording() clears ``_starting`` or the deadline
        # passes.
        deadline = time.time() + self._MAX_STARTING_TIME_S
        while time.time() < deadline:
            with self._lock:
                if not self._starting:
                    break
            time.sleep(0.01)
        with self._lock:
            if self._starting:
                print("⚠️ Recording start still pending; restarting audio worker...")
                self._starting = False
                # Clearing ``_processing`` makes process_recording() treat the
                # current operation as abandoned.
                self._processing = False
                try:
                    self._audio_worker.stop(force=True)
                except Exception:
                    pass
                return None, 0, 0.0
    # The worker writes the captured audio into this temporary file.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        frames, peak = self._audio_worker.stop_recording(wav_path=wav_path)
        return wav_path, frames, peak
    except TimeoutError:
        print("❌ Audio recording stop timed out. Restarting audio worker...")
        self._audio_worker.stop(force=True)
        try:
            if os.path.exists(wav_path) and os.path.getsize(wav_path) > 0:
                # Some audio reached disk: try to archive it without a
                # transcript.
                archived = archive_recording(
                    wav_path,
                    keep_recordings=self.keep_recordings,
                    recordings_dir=self.recordings_dir,
                    recordings_max_bytes=self.recordings_max_bytes,
                    text=None,
                )
                if not archived:
                    print(f"⚠️ Kept timed-out wav for debugging: {wav_path}")
                    # NOTE(review): nesting of this return was reconstructed
                    # from flattened source - confirm. process_recording()
                    # archives or deletes any path returned to it.
                    return wav_path, 0, 0.0
            else:
                # Missing or empty file: nothing worth keeping.
                os.unlink(wav_path)
        except OSError:
            pass
        return None, 0, 0.0
    except Exception as e:
        print(f"❌ Failed to stop recording: {e}")
        self._audio_worker.stop(force=True)
        try:
            os.unlink(wav_path)
        except OSError:
            pass
        return None, 0, 0.0
def cancel_recording(self):
    """Cancel recording without processing.

    When no recording is active the call only repairs a stale ``RECORDING``
    state. Otherwise the app returns to ``AppState.IDLE``, the overlay is
    hidden, a sound is played and the audio worker is asked to cancel; if that
    request times out or fails, the worker is force-stopped.
    """
    with self._lock:
        if not self.recording:
            # Nothing to cancel; only reset the UI if it still shows recording.
            if self._state == AppState.RECORDING:
                self._set_state(AppState.IDLE)
                # NOTE(review): nesting of this hide() was reconstructed from
                # flattened source - confirm it belongs to the state check.
                self._overlay.hide()
            return
        self.recording = False
        self._set_state(AppState.IDLE)
    self._overlay.hide()
    self._sound_player("/System/Library/Sounds/Basso.aiff")
    print("❌ Recording cancelled")
    try:
        self._audio_worker.cancel_recording()
    except TimeoutError:
        print("⚠️ Audio cancel timed out. Restarting audio worker...")
        self._audio_worker.stop(force=True)
    except Exception as e:
        print(f"⚠️ Error cancelling audio: {e}")
        self._audio_worker.stop(force=True)
def cancel_transcription(self):
    """Abort the transcription that is currently running (best-effort).

    Does nothing unless a processing run is active. Otherwise the run is
    marked stale, the provider's optional ``cancel`` hook is invoked (errors
    are printed, not raised), and the overlay and state go back to idle.
    """
    with self._lock:
        was_processing = self._processing
        if was_processing:
            # A new op id tells the running process_recording() call that its
            # result is no longer wanted.
            self._op_id += 1
            self._processing = False
    if not was_processing:
        return

    provider_cancel = getattr(self.provider, "cancel", None)
    if callable(provider_cancel):
        print("Cancelling...")
        try:
            provider_cancel()
        except Exception as exc:
            print(f"⚠️ Error cancelling transcription: {exc}")

    overlay = self._overlay
    overlay.set_transcribing(False)
    overlay.hide()
    self._set_state(AppState.IDLE)
def transcribe_audio(self, audio_file_path: str, max_retries: int = 2) -> str | None:
    """Transcribe audio using the configured provider (no thread wrapper).

    Args:
        audio_file_path: Path of the audio file handed to the provider.
        max_retries: Number of extra attempts after the first one fails.
            Negative values are treated as 0, so one attempt is always made.

    Returns:
        The value returned by ``provider.transcribe``, or ``None`` when every
        attempt failed or the operation was cancelled while an attempt was
        running.
    """
    # cancel_transcription() bumps the op id; remember the one we started
    # under so a cancelled run is not re-submitted to the provider. A plain
    # attribute read is enough here, the value is only compared.
    started_op = self._op_id
    total_attempts = max(max_retries, 0) + 1

    for attempt in range(total_attempts):
        retries_left = attempt < total_attempts - 1
        try:
            return self.provider.transcribe(audio_file_path, self.language, self.prompt)
        except TimeoutError:
            # Ask the provider to abandon the timed-out request (best-effort).
            cancel = getattr(self.provider, "cancel", None)
            if callable(cancel):
                try:
                    cancel()
                except Exception as cancel_error:
                    print(f"⚠️ Error cancelling transcription: {cancel_error}")
            if self._op_id != started_op:
                # Cancelled meanwhile: retrying would only waste work.
                return None
            if retries_left:
                print(f"⚠️ Transcription timed out, retrying ({attempt + 2}/{total_attempts})...")
                continue
            print("❌ Transcription timed out after all retries")
            return None
        except Exception as e:
            if self._op_id != started_op:
                # The failure is most likely the cancellation itself.
                return None
            if retries_left:
                print(f"⚠️ Transcription failed, retrying ({attempt + 2}/{total_attempts})...")
                continue
            print(f"❌ Transcription error after all retries: {e}")
            return None
    return None
def print_ready_prompt(self):
    """Announce on the console that the app is ready, naming the hotkey.

    The display name comes from ``HOTKEY_DISPLAY_NAMES``; an unknown hotkey id
    is shown as-is.
    """
    from rich.console import Console

    console = Console()
    label = HOTKEY_DISPLAY_NAMES.get(self.hotkey_id, self.hotkey_id)
    banner = f"\n[bold green]Ready[/bold green] [dim]│[/dim] Hold [cyan]{label}[/cyan] to record, +Shift ↵, Esc ✗"
    console.print(banner)
def transform_text(self, text: str) -> str:
    """Rewrite spoken shortcuts into their typed form.

    A leading "slash" or "Slash" followed by whitespace is replaced by "/",
    so a dictated "slash help" is returned as "/help". Any other text is
    returned unchanged.
    """
    leading_slash_word = re.compile(r"^[Ss]lash\s+")
    return leading_slash_word.sub("/", text)
def type_text(self, text: str, send_enter: bool = False) -> None:
    """Send ``text`` to the injected typing backend.

    Empty text is ignored. The text (with a return marker when ``send_enter``
    is set) is echoed to stdout first; a failing backend is reported on stdout
    instead of raising.
    """
    if not text:
        return

    enter_marker = " ↵" if send_enter else ""
    print(f"Typing: {text}" + enter_marker)
    try:
        self._text_injector(text, send_enter)
    except Exception as exc:
        print(f"❌ Failed to type text: {exc}")
def process_recording(self, send_enter: bool = False):
    """Process the recorded audio: transcribe and type.

    Stops the recording, skips audio that is missing, shorter than half a
    second or below ``SILENCE_THRESHOLD``, and otherwise transcribes it,
    applies ``transform_text`` and types the result. The temporary wav is then
    archived or deleted, and - if this run was not superseded by a cancel -
    the processing flag, overlay and state are reset.

    Args:
        send_enter: Forwarded to ``type_text``.
    """
    with self._lock:
        if self._processing:
            self._log_event("process_ignored_processing")
            return
        if self._starting:
            # Not an error: stop_recording() waits for the pending start.
            self._log_event("process_wait_starting")
        self._processing = True
        # A fresh op id identifies this run; a cancel bumps it to mark the
        # run as stale.
        self._op_id += 1
        op_id = self._op_id

    wav_path = None
    transcribed_text = None
    try:
        wav_path, frames, peak = self.stop_recording()
        with self._lock:
            if op_id != self._op_id or not self._processing:
                return
        if not wav_path:
            print("⚠️ No audio captured, skipping...")
        elif frames < int(SAMPLE_RATE * 0.5):  # Less than 0.5 seconds.
            print("⚠️ Recording too short, skipping...")
        elif peak < SILENCE_THRESHOLD:
            print("⚠️ Audio too quiet (silence), skipping...")
        else:
            self._set_state(AppState.TRANSCRIBING)
            text = self.transcribe_audio(wav_path)
            with self._lock:
                # Drop the result if the run was cancelled while transcribing.
                if op_id != self._op_id or not self._processing:
                    return
            if text:
                text = self.transform_text(text)
                transcribed_text = text
                self.type_text(text, send_enter=send_enter)
                print(f"✓ {text}")
            else:
                if maybe_capture_mlx_issue(
                    provider=self.provider,
                    wav_path=wav_path,
                    language=self.language,
                    prompt=self.prompt,
                ):
                    # Clearing the path keeps the cleanup below from touching
                    # the wav once an issue has been captured.
                    wav_path = None
                print("No transcription returned")
        self.print_ready_prompt()
    except Exception as e:
        print(f"❌ Error processing recording: {e}")
    finally:
        # Archive or clean up temp file. A failure here must not skip the
        # state reset below, otherwise ``_processing`` stays set and every
        # later start_recording() call is ignored.
        try:
            if wav_path:
                archived = archive_recording(
                    wav_path,
                    keep_recordings=self.keep_recordings,
                    recordings_dir=self.recordings_dir,
                    recordings_max_bytes=self.recordings_max_bytes,
                    text=transcribed_text,
                )
                if not archived:
                    try:
                        os.unlink(wav_path)
                    except OSError:
                        pass
        except Exception as e:
            print(f"⚠️ Failed to archive recording: {e}")
        with self._lock:
            is_current = op_id == self._op_id
            if is_current:
                self._processing = False
        # Hide overlay and reset state (only if this is still the active op).
        if is_current:
            self._overlay.set_transcribing(False)
            self._overlay.hide()
            self._set_state(AppState.IDLE)