# Jarvis/audio_handler.py (Couvbat/Jarvis, main branch)
"""Audio input/output handler with Voice Activity Detection."""

import numpy as np
import sounddevice as sd
import soundfile as sf
import webrtcvad
from collections import deque
from typing import Optional, Generator
from loguru import logger
from config import settings


class AudioHandler:
    """Handles audio recording and playback with VAD support.

    Recording parameters (sample rate, channel count, chunk size) are read
    from the project ``settings`` object when the handler is constructed.
    """

    # Sample rates (Hz) the WebRTC VAD accepts.
    _VAD_SAMPLE_RATES = (8000, 16000, 32000, 48000)
    # Frame durations (ms) the WebRTC VAD accepts, longest first.
    _VAD_FRAME_MS = (30, 20, 10)
    # Silence can only end a recording once more than this many chunks exist,
    # so a slow start does not cut the recording off immediately.
    _MIN_CHUNKS_BEFORE_STOP = 10

    def __init__(self):
        self.sample_rate = settings.sample_rate
        self.channels = settings.channels
        self.chunk_size = settings.chunk_size
        self.vad = webrtcvad.Vad(2)  # Aggressiveness 0-3, 2 is moderate

    @classmethod
    def _vad_frames(cls, audio_chunk: np.ndarray, sample_rate: int) -> list[bytes]:
        """
        Split a recorded chunk into byte frames the WebRTC VAD can analyse.

        The VAD only accepts mono 16-bit PCM in frames of exactly 10, 20 or
        30 ms, so the first channel is extracted and cut into frames of one
        legal duration. A duration that divides the chunk evenly is preferred;
        otherwise the longest one that fits is used and the trailing samples
        are ignored (for detection only).

        Args:
            audio_chunk: int16 samples, shaped (frames,) or (frames, channels)
            sample_rate: Sample rate of the chunk in Hz

        Returns:
            List of PCM byte strings; empty if the sample rate is not
            supported by the VAD or the chunk is shorter than 10 ms
        """
        if sample_rate not in cls._VAD_SAMPLE_RATES:
            return []

        samples = np.asarray(audio_chunk)
        if samples.ndim > 1:
            # Interleaved multi-channel bytes would corrupt the frame length.
            samples = samples[:, 0]

        total = len(samples)
        candidates = [sample_rate * ms // 1000 for ms in cls._VAD_FRAME_MS]
        fitting = [n for n in candidates if n <= total]
        if not fitting:
            return []

        frame_len = next((n for n in fitting if total % n == 0), fitting[0])
        return [
            samples[start:start + frame_len].tobytes()
            for start in range(0, total - frame_len + 1, frame_len)
        ]

    def record_until_silence(
        self,
        silence_threshold: float = 1.0,
        max_duration: float = 30.0
    ) -> np.ndarray:
        """
        Record audio until silence is detected.

        Audio is read chunk by chunk; each chunk counts as speech if the VAD
        flags any of its frames. Recording stops after ``silence_threshold``
        seconds of consecutive non-speech chunks or after ``max_duration``
        seconds, whichever comes first.

        Args:
            silence_threshold: Seconds of silence before stopping
            max_duration: Maximum recording duration in seconds

        Returns:
            Audio data as numpy array (empty int16 array if nothing was read)

        Raises:
            Exception: Any error raised while opening or reading the input
                stream is logged and re-raised
        """
        logger.info("Starting audio recording...")

        frames = []
        silence_frames = 0
        # At least one silent chunk is required; a count of 0 would make the
        # stop condition true even right after speech.
        silence_frame_count = max(
            1, int(silence_threshold * self.sample_rate / self.chunk_size)
        )
        max_frames = int(max_duration * self.sample_rate / self.chunk_size)

        min_vad_samples = self.sample_rate * min(self._VAD_FRAME_MS) // 1000
        if (
            self.sample_rate not in self._VAD_SAMPLE_RATES
            or self.chunk_size < min_vad_samples
        ):
            logger.warning(
                f"VAD unavailable for sample_rate={self.sample_rate}, "
                f"chunk_size={self.chunk_size}; recording will run for max_duration"
            )

        try:
            with sd.InputStream(
                samplerate=self.sample_rate,
                channels=self.channels,
                dtype='int16',
                blocksize=self.chunk_size
            ) as stream:
                logger.info("Listening... (speak now)")

                for _ in range(max_frames):
                    audio_chunk, _overflowed = stream.read(self.chunk_size)
                    frames.append(audio_chunk.copy())

                    # Check for voice activity
                    # VAD requires mono 16-bit PCM at 8kHz, 16kHz, 32kHz, or
                    # 48kHz in 10/20/30 ms frames
                    vad_frames = self._vad_frames(audio_chunk, self.sample_rate)
                    if not vad_frames:
                        continue

                    try:
                        is_speech = any(
                            self.vad.is_speech(frame, self.sample_rate)
                            for frame in vad_frames
                        )
                    except Exception as e:
                        # VAD can fail with certain audio, continue anyway
                        logger.debug(f"VAD error: {e}")
                        continue

                    silence_frames = 0 if is_speech else silence_frames + 1

                    # Stop if we've had enough silence
                    if (
                        silence_frames >= silence_frame_count
                        and len(frames) > self._MIN_CHUNKS_BEFORE_STOP
                    ):
                        logger.info("Silence detected, stopping recording")
                        break

        except Exception as e:
            logger.error(f"Recording error: {e}")
            raise

        # Concatenate all frames
        if not frames:
            return np.array([], dtype=np.int16)

        audio_data = np.concatenate(frames, axis=0)
        logger.info(f"Recording complete: {len(audio_data) / self.sample_rate:.2f}s")

        return audio_data

    def play_audio(self, audio_data: np.ndarray, sample_rate: Optional[int] = None):
        """
        Play audio data and block until playback has finished.

        Args:
            audio_data: Audio samples as numpy array
            sample_rate: Sample rate (uses default if None)

        Raises:
            Exception: Any playback error is logged and re-raised
        """
        if sample_rate is None:
            sample_rate = self.sample_rate

        logger.info(f"Playing audio: {len(audio_data) / sample_rate:.2f}s")

        try:
            sd.play(audio_data, sample_rate)
            sd.wait()
            logger.debug("Playback complete")
        except Exception as e:
            logger.error(f"Playback error: {e}")
            raise

    def save_audio(
        self,
        audio_data: np.ndarray,
        filename: str,
        sample_rate: Optional[int] = None
    ):
        """
        Save audio data to file.

        Args:
            audio_data: Audio samples as numpy array
            filename: Destination path
            sample_rate: Sample rate to write (uses default if None), e.g. the
                rate returned by ``load_audio`` for audio from another file

        Raises:
            Exception: Any write error is logged and re-raised
        """
        if sample_rate is None:
            sample_rate = self.sample_rate

        try:
            sf.write(filename, audio_data, sample_rate)
            logger.info(f"Audio saved to {filename}")
        except Exception as e:
            logger.error(f"Error saving audio: {e}")
            raise

    def load_audio(self, filename: str) -> tuple[np.ndarray, int]:
        """
        Load audio from file as int16 samples.

        Args:
            filename: Path of the audio file to read

        Returns:
            Tuple of (audio data, sample rate of the file)

        Raises:
            Exception: Any read error is logged and re-raised
        """
        try:
            audio_data, sample_rate = sf.read(filename, dtype='int16')
            logger.info(f"Audio loaded from {filename}")
            return audio_data, sample_rate
        except Exception as e:
            logger.error(f"Error loading audio: {e}")
            raise