-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
104 lines (90 loc) · 3.67 KB
/
utils.py
File metadata and controls
104 lines (90 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import torch
from transformers import AutoProcessor, SeamlessM4Tv2Model, pipeline
import numpy as np
import sounddevice as sd
import sys
from langdetect import detect
def load_model(with_asr: bool = False):
    """Load the SeamlessM4T processor and model.

    Args:
        with_asr: when True, additionally build a Whisper-small ASR
            pipeline used for source-language detection.

    Returns:
        (processor, model) or (processor, model, asr) depending on with_asr.
    """
    checkpoint = "facebook/seamless-m4t-v2-large"
    processor = AutoProcessor.from_pretrained(checkpoint)
    model = SeamlessM4Tv2Model.from_pretrained(checkpoint)

    if not with_asr:
        return processor, model

    # ASR pipeline for language detection; prefer GPU when available.
    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        chunk_length_s=5,
        device=0 if torch.cuda.is_available() else -1,
    )
    return processor, model, asr
def record_audio_interactive(fs: int = 16000) -> np.ndarray:
    """Record mono audio from the default microphone, start/stop via Enter.

    Args:
        fs: sampling rate in Hz.

    Returns:
        A 1-D float array of recorded samples; an empty array if the
        stream produced no audio before recording was stopped.
    """
    frames = []

    def callback(indata, _frames, _time, status):
        # Collect every incoming block; surface stream warnings (overruns etc.).
        if status:
            print(f"⚠️ {status}")
        frames.append(indata.copy())

    print("Press Enter to start recording.")
    input()
    print("🎙 Recording... press Enter again to stop.")
    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        input()
    print("🛑 Recording stopped.")
    if not frames:
        # Guard: np.concatenate raises ValueError on an empty list, which
        # happens if Enter is pressed again before any block arrived.
        return np.empty(0, dtype=np.float32)
    audio = np.concatenate(frames, axis=0).squeeze()
    return audio
def translate_speech(processor, model, audio: np.ndarray, fs: int, src_lang: str, tgt_lang: str) -> np.ndarray:
    """Translate raw speech audio into tgt_lang speech with SeamlessM4Tv2Model.

    Returns the generated waveform as a 1-D NumPy array.
    """
    model_inputs = processor(audios=audio, sampling_rate=fs, return_tensors="pt")
    # Inference only — no gradients needed.
    with torch.no_grad():
        generated = model.generate(**model_inputs, tgt_lang=tgt_lang)
    waveform = generated[0].cpu().numpy().squeeze()
    return waveform
def play_audio(audio: np.ndarray, fs: int):
    """Play a NumPy audio array at sample rate fs, blocking until done."""
    sd.play(audio, fs)
    # Block until playback finishes so overlapping chunks don't interleave.
    sd.wait()
def detect_language(asr, audio: np.ndarray, fs: int) -> str:
    """Detect whether audio is English or Russian.

    Runs ASR to transcribe the audio, then uses langdetect on the text.

    Args:
        asr: an ASR pipeline returning a dict with a "text" key.
        audio: raw audio samples.
        fs: sampling rate (unused by the ASR call but kept for interface parity).

    Returns:
        "rus" if Russian is detected, otherwise "eng".
    """
    # Transcribe first; langdetect works on text, not audio.
    result = asr(audio)
    text = result.get("text", "").strip()
    if not text:
        # langdetect raises LangDetectException on empty/whitespace input
        # (the old `detect(text or " ")` fallback crashed here), so default
        # to English when ASR produced no text.
        return "eng"
    try:
        lang = detect(text)
    except Exception:
        # Best-effort: treat undetectable text (e.g. pure punctuation) as English.
        return "eng"
    return "rus" if lang.startswith("ru") else "eng"
def translate_worker(processor, model, asr, audio_queue, fs):
    """Worker loop: auto-detect source language, translate, and play back.

    Consumes audio chunks from audio_queue until a None sentinel arrives.
    Delegates to translate_speech/play_audio instead of duplicating their
    logic inline, so translation and playback behavior stays consistent
    with the rest of the module.
    """
    while True:
        audio_chunk = audio_queue.get()
        if audio_chunk is None:
            # None is the shutdown sentinel.
            break
        # Detect source language and translate to the other one.
        src_lang = detect_language(asr, audio_chunk, fs)
        tgt_lang = "eng" if src_lang == "rus" else "rus"
        output_audio = translate_speech(processor, model, audio_chunk, fs, src_lang, tgt_lang)
        play_audio(output_audio, fs)
    print("Translation worker exiting...")
def stream_callback(indata, frames, time, status, buffer, chunk_size, q):
    """Accumulate incoming audio blocks and emit fixed-size chunks onto q.

    Each call appends the new block to `buffer`; once the pooled samples
    reach `chunk_size`, the first chunk_size samples are queued and the
    remainder is carried over to the next call.
    """
    if status:
        print(f"⚠️ {status}", file=sys.stderr)
    buffer.append(indata.copy())
    pooled = np.concatenate(buffer, axis=0).squeeze()
    if pooled.shape[0] < chunk_size:
        return
    q.put(pooled[:chunk_size])
    leftover = pooled[chunk_size:]
    buffer.clear()
    if leftover.size:
        # Keep the tail, restored to column shape for the next concatenate.
        buffer.append(leftover.reshape(-1, 1))