Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,81 @@ Enable with `ENABLE_API_SERVER=true` and `ENABLE_SCHEDULER=true`. See [docs/setu

- Plugin system for third-party extensions

## Voice Transcription

The bot can transcribe Telegram voice messages and pass them as text to Claude. Three providers are supported:

| Provider | Type | Best for |
|----------|------|----------|
| `parakeet` (default) | Local GPU | Privacy, no API cost, fast on NVIDIA GPU |
| `mistral` | Cloud (Voxtral) | Quality without local GPU |
| `openai` | Cloud (Whisper) | Widely supported cloud option |

### Enable voice processing

```bash
ENABLE_VOICE_PROCESSING=true
VOICE_PROVIDER=parakeet # or: mistral, openai
```

### Parakeet (local GPU, no API key required)

[NVIDIA NeMo Parakeet TDT 0.6B v3](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3) runs on your GPU — no cloud API needed, no per-minute cost.

**Requirements:**

- NVIDIA GPU with CUDA support
- [ffmpeg](https://ffmpeg.org/download.html) installed and on PATH (or set `FFMPEG_PATH`)
- `claude-code-telegram[parakeet]` extras

**Install:**

```bash
pip install "claude-code-telegram[parakeet]"
# or with poetry:
poetry install --with parakeet
```

**Configure:**

```bash
ENABLE_VOICE_PROCESSING=true
VOICE_PROVIDER=parakeet
# Optional: explicit path to ffmpeg if not on PATH
FFMPEG_PATH=/usr/bin/ffmpeg
```

The NeMo model (~600 MB) is downloaded automatically on first use and cached locally.

### Cloud providers (Mistral / OpenAI)

```bash
# Mistral Voxtral
ENABLE_VOICE_PROCESSING=true
VOICE_PROVIDER=mistral
MISTRAL_API_KEY=your-mistral-api-key

# OpenAI Whisper
ENABLE_VOICE_PROCESSING=true
VOICE_PROVIDER=openai
OPENAI_API_KEY=your-openai-api-key
```

**Install cloud extras:**

```bash
pip install "claude-code-telegram[voice]"
# or with poetry:
poetry install --with voice
```

### Additional voice settings

```bash
VOICE_MAX_FILE_SIZE_MB=20 # Max voice message size (default: 20 MB)
FFMPEG_PATH= # Optional explicit path to ffmpeg binary
```

## Configuration

### Required
Expand Down
15 changes: 15 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,21 @@ uvicorn = {version = "^0.34.0", extras = ["standard"]}
apscheduler = "^3.10"
PyYAML = "^6.0.2"

[tool.poetry.group.voice.dependencies]
# Cloud providers (mistral or openai)
# NOTE(review): Poetry extras can only reference packages declared in the main
# [tool.poetry.dependencies] table; dependencies declared inside a group cannot
# be exposed through [tool.poetry.extras], and per-dependency `optional = true`
# inside a group is not meaningful. As written, `pip install
# "claude-code-telegram[voice]"` / `[parakeet]` will likely fail `poetry check`.
# Either move these packages to the main dependencies with `optional = true`,
# or keep them as groups marked optional at the group level
# ([tool.poetry.group.voice] with `optional = true`) and drop the extras.
mistralai = {version = "^1.0.0", optional = true}
openai = {version = "^1.0.0", optional = true}

[tool.poetry.group.parakeet.dependencies]
# Local GPU transcription via NVIDIA NeMo (requires CUDA)
# Install with: pip install "claude-code-telegram[parakeet]"
nemo_toolkit = {version = "^2.3.0", extras = ["asr"], optional = true}
torch = {version = ">=2.0.0", optional = true}

[tool.poetry.extras]
voice = ["mistralai", "openai"]
parakeet = ["nemo_toolkit", "torch"]

[tool.poetry.group.dev.dependencies]
pytest = "^8.4.0"
pytest-asyncio = "^1.0.0"
Expand Down
13 changes: 13 additions & 0 deletions src/bot/features/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .image_handler import ImageHandler
from .quick_actions import QuickActionManager
from .session_export import SessionExporter
from .voice_handler import VoiceHandler

logger = structlog.get_logger(__name__)

Expand Down Expand Up @@ -77,6 +78,14 @@ def _initialize_features(self):
except Exception as e:
logger.error("Failed to initialize image handler", error=str(e))

# Voice handling - conditionally enabled
if self.config.enable_voice_processing:
try:
self.features["voice_handler"] = VoiceHandler(config=self.config)
logger.info("Voice handler feature enabled")
except Exception as e:
logger.error("Failed to initialize voice handler", error=str(e))

# Conversation enhancements - skip in agentic mode
if not self.config.agentic_mode:
try:
Expand Down Expand Up @@ -118,6 +127,10 @@ def get_image_handler(self) -> Optional[ImageHandler]:
"""Get image handler feature"""
return self.get_feature("image_handler")

def get_voice_handler(self) -> Optional[VoiceHandler]:
    """Return the registered voice handler, or None when voice processing is disabled."""
    handler = self.get_feature("voice_handler")
    return handler

def get_conversation_enhancer(self) -> Optional[ConversationEnhancer]:
    """Return the conversation enhancer feature if registered, else None."""
    feature_key = "conversation"
    return self.get_feature(feature_key)
Expand Down
220 changes: 220 additions & 0 deletions src/bot/features/voice_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
"""Handle voice message transcription.

Supported providers (VOICE_PROVIDER):
parakeet — local GPU inference via NVIDIA NeMo (default, free, requires CUDA)
mistral — Mistral Voxtral API (cloud, requires MISTRAL_API_KEY)
openai — OpenAI Whisper API (cloud, requires OPENAI_API_KEY)
"""

import asyncio
import subprocess
import tempfile
import threading
from dataclasses import dataclass
from datetime import timedelta
from pathlib import Path
from typing import Any, Optional

import structlog
from telegram import Voice

from src.config.settings import Settings

logger = structlog.get_logger(__name__)


@dataclass
class ProcessedVoice:
    """Result of voice message processing."""

    # Raw transcription text as returned by the provider (stripped, non-empty).
    transcription: str
    # Claude-ready prompt: the caption (or a generic label) followed by the transcription.
    prompt: str
    # Voice message duration in whole seconds; 0 when unknown.
    duration: int = 0


class VoiceHandler:
    """Transcribe Telegram voice/audio messages.

    Delegates to one of three backends based on config.voice_provider:
    - 'parakeet': local NVIDIA NeMo model (no API key required)
    - 'mistral': Mistral Voxtral cloud API
    - 'openai': OpenAI Whisper cloud API

    Any provider value other than 'parakeet'/'openai' falls through to the
    Mistral branch (preserves the original dispatch behavior).
    """

    def __init__(self, config: Settings):
        """Store config; all heavy resources (model, API clients) are created lazily."""
        self.config = config
        self._parakeet_model = None  # lazy-loaded on first use
        # Guards the one-time model load: the default executor may run several
        # transcriptions concurrently, and loading the model twice wastes time
        # and GPU memory.
        self._parakeet_load_lock = threading.Lock()
        self._mistral_client: Optional[Any] = None
        self._openai_client: Optional[Any] = None

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    async def process_voice_message(
        self, voice: Voice, caption: Optional[str] = None
    ) -> ProcessedVoice:
        """Download and transcribe a Telegram voice message.

        Args:
            voice: Telegram Voice object to fetch and transcribe.
            caption: Optional user caption; used as the prompt label when
                present, otherwise a generic label is prepended.

        Returns:
            ProcessedVoice with the transcription, a Claude-ready prompt, and
            the duration normalized to whole seconds.

        Raises:
            ValueError: if the file exceeds the configured size limit or the
                provider returned an empty transcription.
            RuntimeError: if a provider dependency is missing, an API key is
                not configured, or the conversion/request fails.
        """
        # Size is checked three times on purpose: Telegram's advertised size,
        # the fetched File metadata, and the downloaded byte count can each
        # disagree — fail before the expensive transcription step.
        self._check_file_size(getattr(voice, "file_size", None))

        file = await voice.get_file()
        self._check_file_size(getattr(file, "file_size", None))

        voice_bytes = bytes(await file.download_as_bytearray())
        self._check_file_size(len(voice_bytes))

        provider = self.config.voice_provider
        logger.info("Transcribing voice message", provider=provider, duration=voice.duration)

        if provider == "parakeet":
            transcription = await self._transcribe_parakeet(voice_bytes)
        elif provider == "openai":
            transcription = await self._transcribe_openai(voice_bytes)
        else:
            # Default/fallback branch: any other value is treated as Mistral.
            transcription = await self._transcribe_mistral(voice_bytes)

        logger.info("Voice transcription complete", length=len(transcription))

        label = caption if caption else "Voice message transcription:"
        # python-telegram-bot exposes duration as a timedelta in newer versions
        # and as an int (seconds) in older ones — normalize to int seconds.
        dur = voice.duration
        duration_secs = int(dur.total_seconds()) if isinstance(dur, timedelta) else (dur or 0)

        return ProcessedVoice(
            transcription=transcription,
            prompt=f"{label}\n\n{transcription}",
            duration=duration_secs,
        )

    # ------------------------------------------------------------------
    # Parakeet (local)
    # ------------------------------------------------------------------

    @property
    def _parakeet(self):
        """Lazy-load the Parakeet TDT 0.6B v3 model on first use.

        Thread-safe: this is reached from executor threads, so the one-time
        load is serialized with a lock (double-checked so the loaded path
        stays lock-free).
        """
        if self._parakeet_model is None:
            with self._parakeet_load_lock:
                if self._parakeet_model is None:  # re-check under the lock
                    try:
                        import nemo.collections.asr as nemo_asr
                    except ModuleNotFoundError as exc:
                        raise RuntimeError(
                            "Optional dependency 'nemo_toolkit' is missing for Parakeet transcription. "
                            "Install parakeet extras: "
                            'pip install "claude-code-telegram[parakeet]"'
                        ) from exc

                    logger.info("Loading Parakeet TDT 0.6B v3 model (first use)…")
                    self._parakeet_model = nemo_asr.models.ASRModel.from_pretrained(
                        "nvidia/parakeet-tdt-0.6b-v3"
                    )
                    logger.info("Parakeet model loaded")
        return self._parakeet_model

    async def _transcribe_parakeet(self, voice_bytes: bytes) -> str:
        """Transcribe using the local Parakeet model without blocking the event loop."""
        # get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._run_parakeet, voice_bytes)

    def _run_parakeet(self, voice_bytes: bytes) -> str:
        """Blocking transcription (ffmpeg conversion + NeMo inference) — called from executor.

        Note: only the ffmpeg conversion is CPU work; NeMo inference runs on
        the GPU when CUDA is available.
        """
        ffmpeg = self.config.resolved_ffmpeg_path
        with tempfile.TemporaryDirectory() as tmp:
            ogg_path = Path(tmp) / "voice.ogg"
            wav_path = Path(tmp) / "voice.wav"
            ogg_path.write_bytes(voice_bytes)

            # NeMo ASR expects 16 kHz mono WAV input.
            try:
                subprocess.run(
                    [ffmpeg, "-y", "-i", str(ogg_path), "-ar", "16000", "-ac", "1", str(wav_path)],
                    check=True,
                    capture_output=True,
                )
            except (OSError, subprocess.CalledProcessError) as exc:
                # Surface ffmpeg's stderr — a bare CalledProcessError hides it.
                # OSError covers a missing/unexecutable ffmpeg binary.
                stderr = getattr(exc, "stderr", b"") or b""
                if isinstance(stderr, bytes):
                    stderr = stderr.decode("utf-8", errors="replace")
                logger.warning("ffmpeg conversion failed", error_type=type(exc).__name__)
                raise RuntimeError(
                    f"ffmpeg failed to convert voice message to WAV: {stderr.strip()[-500:]}"
                ) from exc

            # NeMo returns a list of hypothesis objects exposing .text —
            # assumed from current NeMo ASR API; confirm against the pinned
            # nemo_toolkit version.
            output = self._parakeet.transcribe([str(wav_path)])
            text = output[0].text.strip()

        if not text:
            raise ValueError("Parakeet transcription returned an empty result.")
        return text

    # ------------------------------------------------------------------
    # Mistral (cloud)
    # ------------------------------------------------------------------

    async def _transcribe_mistral(self, voice_bytes: bytes) -> str:
        """Transcribe via the Mistral Voxtral API; raises RuntimeError on request failure."""
        client = self._get_mistral_client()
        try:
            response = await client.audio.transcriptions.complete_async(
                model="voxtral-mini-2507",
                file={"content": voice_bytes, "file_name": "voice.ogg"},
            )
        except Exception as exc:
            # Log only the exception type — the message may contain request details.
            logger.warning("Mistral transcription failed", error_type=type(exc).__name__)
            raise RuntimeError("Mistral transcription request failed.") from exc

        text = (getattr(response, "text", "") or "").strip()
        if not text:
            raise ValueError("Mistral transcription returned an empty response.")
        return text

    def _get_mistral_client(self) -> Any:
        """Return a cached Mistral client, constructing it on first call.

        Raises:
            RuntimeError: if 'mistralai' is not installed or MISTRAL_API_KEY is unset.
        """
        if self._mistral_client is not None:
            return self._mistral_client
        try:
            from mistralai import Mistral
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "Optional dependency 'mistralai' is missing. "
                'Install voice extras: pip install "claude-code-telegram[voice]"'
            ) from exc

        api_key = self.config.mistral_api_key_str
        if not api_key:
            raise RuntimeError("MISTRAL_API_KEY is not configured.")
        self._mistral_client = Mistral(api_key=api_key)
        return self._mistral_client

    # ------------------------------------------------------------------
    # OpenAI Whisper (cloud)
    # ------------------------------------------------------------------

    async def _transcribe_openai(self, voice_bytes: bytes) -> str:
        """Transcribe via the OpenAI Whisper API; raises RuntimeError on request failure."""
        client = self._get_openai_client()
        try:
            response = await client.audio.transcriptions.create(
                model="whisper-1",
                # (filename, bytes) tuple — the name's extension tells the API the format.
                file=("voice.ogg", voice_bytes),
            )
        except Exception as exc:
            logger.warning("OpenAI transcription failed", error_type=type(exc).__name__)
            raise RuntimeError("OpenAI transcription request failed.") from exc

        text = (getattr(response, "text", "") or "").strip()
        if not text:
            raise ValueError("OpenAI transcription returned an empty response.")
        return text

    def _get_openai_client(self) -> Any:
        """Return a cached AsyncOpenAI client, constructing it on first call.

        Raises:
            RuntimeError: if 'openai' is not installed or OPENAI_API_KEY is unset.
        """
        if self._openai_client is not None:
            return self._openai_client
        try:
            from openai import AsyncOpenAI
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "Optional dependency 'openai' is missing. "
                'Install voice extras: pip install "claude-code-telegram[voice]"'
            ) from exc

        api_key = self.config.openai_api_key_str
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY is not configured.")
        self._openai_client = AsyncOpenAI(api_key=api_key)
        return self._openai_client

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _check_file_size(self, size: Optional[int]) -> None:
        """Raise ValueError when a known size exceeds the configured maximum.

        A None/unknown size passes — the downloaded byte count is checked later.
        """
        if isinstance(size, int) and size > self.config.voice_max_file_size_bytes:
            raise ValueError(
                f"Voice message too large ({size / 1024 / 1024:.1f} MB). "
                f"Max: {self.config.voice_max_file_size_mb} MB."
            )
Loading