From 14d665b64dc714c9718a59dfee86e0e05e8d6ee8 Mon Sep 17 00:00:00 2001 From: BasilPadre Date: Fri, 6 Mar 2026 18:14:52 +0300 Subject: [PATCH] feat: add multi-provider voice transcription (Parakeet, Mistral, OpenAI) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds voice message transcription support with three backends: - `parakeet` (default): local NVIDIA NeMo Parakeet TDT 0.6B v3, runs on GPU, no API key or cloud cost required - `mistral`: Mistral Voxtral cloud API - `openai`: OpenAI Whisper cloud API New settings: - ENABLE_VOICE_PROCESSING (bool, default false) - VOICE_PROVIDER (mistral | openai | parakeet, default parakeet) - FFMPEG_PATH (optional explicit path, falls back to PATH) - VOICE_MAX_FILE_SIZE_MB (default 20) - MISTRAL_API_KEY / OPENAI_API_KEY (for cloud providers) Optional dependency groups added to pyproject.toml: - `[voice]` for mistral + openai cloud providers - `[parakeet]` for local GPU transcription via NeMo The Parakeet model (~600 MB) is downloaded and cached automatically on first use. Audio is converted ogg→wav via ffmpeg before transcription. Co-Authored-By: Claude Sonnet 4.6 --- README.md | 75 ++++++++++ pyproject.toml | 15 ++ src/bot/features/registry.py | 13 ++ src/bot/features/voice_handler.py | 220 ++++++++++++++++++++++++++++++ src/bot/orchestrator.py | 100 ++++++++++++++ src/config/settings.py | 38 ++++++ 6 files changed, 461 insertions(+) create mode 100644 src/bot/features/voice_handler.py diff --git a/README.md b/README.md index 559e19e6..0eb9dd3b 100644 --- a/README.md +++ b/README.md @@ -213,6 +213,81 @@ Enable with `ENABLE_API_SERVER=true` and `ENABLE_SCHEDULER=true`. See [docs/setu - Plugin system for third-party extensions +## Voice Transcription + +The bot can transcribe Telegram voice messages and pass them as text to Claude. 
Three providers are supported: 
+
+| Provider | Type | Best for |
+|----------|------|----------|
+| `parakeet` (default) | Local GPU | Privacy, no API cost, fast on NVIDIA GPU |
+| `mistral` | Cloud (Voxtral) | Quality without local GPU |
+| `openai` | Cloud (Whisper) | Widely supported cloud option |
+
+### Enable voice processing
+
+```bash
+ENABLE_VOICE_PROCESSING=true
+VOICE_PROVIDER=parakeet # or: mistral, openai
+```
+
+### Parakeet (local GPU, no API key required)
+
+[NVIDIA NeMo Parakeet TDT 0.6B v3](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3) runs on your GPU — no cloud API needed, no per-minute cost.
+
+**Requirements:**
+
+- NVIDIA GPU with CUDA support
+- [ffmpeg](https://ffmpeg.org/download.html) installed and on PATH (or set `FFMPEG_PATH`)
+- `claude-code-telegram[parakeet]` extras
+
+**Install:**
+
+```bash
+pip install "claude-code-telegram[parakeet]"
+# or with poetry:
+poetry install --with parakeet
+```
+
+**Configure:**
+
+```bash
+ENABLE_VOICE_PROCESSING=true
+VOICE_PROVIDER=parakeet
+# Optional: explicit path to ffmpeg if not on PATH
+FFMPEG_PATH=/usr/bin/ffmpeg
+```
+
+The NeMo model (~600 MB) is downloaded automatically on first use and cached locally. 
+ +### Cloud providers (Mistral / OpenAI) + +```bash +# Mistral Voxtral +ENABLE_VOICE_PROCESSING=true +VOICE_PROVIDER=mistral +MISTRAL_API_KEY=your-mistral-api-key + +# OpenAI Whisper +ENABLE_VOICE_PROCESSING=true +VOICE_PROVIDER=openai +OPENAI_API_KEY=your-openai-api-key +``` + +**Install cloud extras:** + +```bash +pip install "claude-code-telegram[voice]" +# or with poetry: +poetry install --with voice +``` + +### Additional voice settings + +```bash +VOICE_MAX_FILE_SIZE_MB=20 # Max voice message size (default: 20 MB) +FFMPEG_PATH= # Optional explicit path to ffmpeg binary +``` + ## Configuration ### Required diff --git a/pyproject.toml b/pyproject.toml index b4fb3f79..96880782 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,21 @@ uvicorn = {version = "^0.34.0", extras = ["standard"]} apscheduler = "^3.10" PyYAML = "^6.0.2" +[tool.poetry.group.voice.dependencies] +# Cloud providers (mistral or openai) +mistralai = {version = "^1.0.0", optional = true} +openai = {version = "^1.0.0", optional = true} + +[tool.poetry.group.parakeet.dependencies] +# Local GPU transcription via NVIDIA NeMo (requires CUDA) +# Install with: pip install "claude-code-telegram[parakeet]" +nemo_toolkit = {version = "^2.3.0", extras = ["asr"], optional = true} +torch = {version = ">=2.0.0", optional = true} + +[tool.poetry.extras] +voice = ["mistralai", "openai"] +parakeet = ["nemo_toolkit", "torch"] + [tool.poetry.group.dev.dependencies] pytest = "^8.4.0" pytest-asyncio = "^1.0.0" diff --git a/src/bot/features/registry.py b/src/bot/features/registry.py index 572338f3..04ab72a1 100644 --- a/src/bot/features/registry.py +++ b/src/bot/features/registry.py @@ -16,6 +16,7 @@ from .image_handler import ImageHandler from .quick_actions import QuickActionManager from .session_export import SessionExporter +from .voice_handler import VoiceHandler logger = structlog.get_logger(__name__) @@ -77,6 +78,14 @@ def _initialize_features(self): except Exception as e: logger.error("Failed to 
initialize image handler", error=str(e)) + # Voice handling - conditionally enabled + if self.config.enable_voice_processing: + try: + self.features["voice_handler"] = VoiceHandler(config=self.config) + logger.info("Voice handler feature enabled") + except Exception as e: + logger.error("Failed to initialize voice handler", error=str(e)) + # Conversation enhancements - skip in agentic mode if not self.config.agentic_mode: try: @@ -118,6 +127,10 @@ def get_image_handler(self) -> Optional[ImageHandler]: """Get image handler feature""" return self.get_feature("image_handler") + def get_voice_handler(self) -> Optional[VoiceHandler]: + """Get voice handler feature""" + return self.get_feature("voice_handler") + def get_conversation_enhancer(self) -> Optional[ConversationEnhancer]: """Get conversation enhancer feature""" return self.get_feature("conversation") diff --git a/src/bot/features/voice_handler.py b/src/bot/features/voice_handler.py new file mode 100644 index 00000000..24035137 --- /dev/null +++ b/src/bot/features/voice_handler.py @@ -0,0 +1,220 @@ +"""Handle voice message transcription. + +Supported providers (VOICE_PROVIDER): + parakeet — local GPU inference via NVIDIA NeMo (default, free, requires CUDA) + mistral — Mistral Voxtral API (cloud, requires MISTRAL_API_KEY) + openai — OpenAI Whisper API (cloud, requires OPENAI_API_KEY) +""" + +import asyncio +import subprocess +import tempfile +from dataclasses import dataclass +from datetime import timedelta +from pathlib import Path +from typing import Any, Optional + +import structlog +from telegram import Voice + +from src.config.settings import Settings + +logger = structlog.get_logger(__name__) + + +@dataclass +class ProcessedVoice: + """Result of voice message processing.""" + + transcription: str + prompt: str + duration: int = 0 + + +class VoiceHandler: + """Transcribe Telegram voice/audio messages. 
+ + Delegates to one of three backends based on config.voice_provider: + - 'parakeet': local NVIDIA NeMo model (no API key required) + - 'mistral': Mistral Voxtral cloud API + - 'openai': OpenAI Whisper cloud API + """ + + def __init__(self, config: Settings): + self.config = config + self._parakeet_model = None # lazy-loaded on first use + self._mistral_client: Optional[Any] = None + self._openai_client: Optional[Any] = None + + # ------------------------------------------------------------------ + # Public interface + # ------------------------------------------------------------------ + + async def process_voice_message( + self, voice: Voice, caption: Optional[str] = None + ) -> ProcessedVoice: + """Download and transcribe a Telegram voice message.""" + self._check_file_size(getattr(voice, "file_size", None)) + + file = await voice.get_file() + self._check_file_size(getattr(file, "file_size", None)) + + voice_bytes = bytes(await file.download_as_bytearray()) + self._check_file_size(len(voice_bytes)) + + provider = self.config.voice_provider + logger.info("Transcribing voice message", provider=provider, duration=voice.duration) + + if provider == "parakeet": + transcription = await self._transcribe_parakeet(voice_bytes) + elif provider == "openai": + transcription = await self._transcribe_openai(voice_bytes) + else: + transcription = await self._transcribe_mistral(voice_bytes) + + logger.info("Voice transcription complete", length=len(transcription)) + + label = caption if caption else "Voice message transcription:" + dur = voice.duration + duration_secs = int(dur.total_seconds()) if isinstance(dur, timedelta) else (dur or 0) + + return ProcessedVoice( + transcription=transcription, + prompt=f"{label}\n\n{transcription}", + duration=duration_secs, + ) + + # ------------------------------------------------------------------ + # Parakeet (local) + # ------------------------------------------------------------------ + + @property + def _parakeet(self): + 
"""Lazy-load the Parakeet TDT 0.6B v3 model on first use.""" + if self._parakeet_model is None: + try: + import nemo.collections.asr as nemo_asr + except ModuleNotFoundError as exc: + raise RuntimeError( + "Optional dependency 'nemo_toolkit' is missing for Parakeet transcription. " + "Install parakeet extras: " + 'pip install "claude-code-telegram[parakeet]"' + ) from exc + + logger.info("Loading Parakeet TDT 0.6B v3 model (first use)…") + self._parakeet_model = nemo_asr.models.ASRModel.from_pretrained( + "nvidia/parakeet-tdt-0.6b-v3" + ) + logger.info("Parakeet model loaded") + return self._parakeet_model + + async def _transcribe_parakeet(self, voice_bytes: bytes) -> str: + """Transcribe using local Parakeet model (runs in thread pool to avoid blocking).""" + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, self._run_parakeet, voice_bytes) + + def _run_parakeet(self, voice_bytes: bytes) -> str: + """CPU-bound transcription — called from executor.""" + ffmpeg = self.config.resolved_ffmpeg_path + with tempfile.TemporaryDirectory() as tmp: + ogg_path = Path(tmp) / "voice.ogg" + wav_path = Path(tmp) / "voice.wav" + ogg_path.write_bytes(voice_bytes) + + subprocess.run( + [ffmpeg, "-y", "-i", str(ogg_path), "-ar", "16000", "-ac", "1", str(wav_path)], + check=True, + capture_output=True, + ) + + output = self._parakeet.transcribe([str(wav_path)]) + text = output[0].text.strip() + + if not text: + raise ValueError("Parakeet transcription returned an empty result.") + return text + + # ------------------------------------------------------------------ + # Mistral (cloud) + # ------------------------------------------------------------------ + + async def _transcribe_mistral(self, voice_bytes: bytes) -> str: + client = self._get_mistral_client() + try: + response = await client.audio.transcriptions.complete_async( + model="voxtral-mini-2507", + file={"content": voice_bytes, "file_name": "voice.ogg"}, + ) + except Exception as exc: + 
logger.warning("Mistral transcription failed", error_type=type(exc).__name__) + raise RuntimeError("Mistral transcription request failed.") from exc + + text = (getattr(response, "text", "") or "").strip() + if not text: + raise ValueError("Mistral transcription returned an empty response.") + return text + + def _get_mistral_client(self) -> Any: + if self._mistral_client is not None: + return self._mistral_client + try: + from mistralai import Mistral + except ModuleNotFoundError as exc: + raise RuntimeError( + "Optional dependency 'mistralai' is missing. " + 'Install voice extras: pip install "claude-code-telegram[voice]"' + ) from exc + + api_key = self.config.mistral_api_key_str + if not api_key: + raise RuntimeError("MISTRAL_API_KEY is not configured.") + self._mistral_client = Mistral(api_key=api_key) + return self._mistral_client + + # ------------------------------------------------------------------ + # OpenAI Whisper (cloud) + # ------------------------------------------------------------------ + + async def _transcribe_openai(self, voice_bytes: bytes) -> str: + client = self._get_openai_client() + try: + response = await client.audio.transcriptions.create( + model="whisper-1", + file=("voice.ogg", voice_bytes), + ) + except Exception as exc: + logger.warning("OpenAI transcription failed", error_type=type(exc).__name__) + raise RuntimeError("OpenAI transcription request failed.") from exc + + text = (getattr(response, "text", "") or "").strip() + if not text: + raise ValueError("OpenAI transcription returned an empty response.") + return text + + def _get_openai_client(self) -> Any: + if self._openai_client is not None: + return self._openai_client + try: + from openai import AsyncOpenAI + except ModuleNotFoundError as exc: + raise RuntimeError( + "Optional dependency 'openai' is missing. 
" + 'Install voice extras: pip install "claude-code-telegram[voice]"' + ) from exc + + api_key = self.config.openai_api_key_str + if not api_key: + raise RuntimeError("OPENAI_API_KEY is not configured.") + self._openai_client = AsyncOpenAI(api_key=api_key) + return self._openai_client + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + def _check_file_size(self, size: Optional[int]) -> None: + if isinstance(size, int) and size > self.config.voice_max_file_size_bytes: + raise ValueError( + f"Voice message too large ({size / 1024 / 1024:.1f} MB). " + f"Max: {self.config.voice_max_file_size_mb} MB." + ) diff --git a/src/bot/orchestrator.py b/src/bot/orchestrator.py index faacabb8..ebe6ebf8 100644 --- a/src/bot/orchestrator.py +++ b/src/bot/orchestrator.py @@ -336,6 +336,15 @@ def _register_agentic_handlers(self, app: Application) -> None: group=10, ) + # Voice/audio messages -> transcribe -> Claude + app.add_handler( + MessageHandler( + filters.VOICE | filters.AUDIO, + self._inject_deps(self.agentic_voice), + ), + group=10, + ) + # Only cd: callbacks (for project selection), scoped by pattern app.add_handler( CallbackQueryHandler( @@ -1330,6 +1339,97 @@ async def agentic_photo( "Claude photo processing failed", error=str(e), user_id=user_id ) + async def agentic_voice( + self, update: Update, context: ContextTypes.DEFAULT_TYPE + ) -> None: + """Transcribe voice/audio message via configured provider, then pass to Claude.""" + user_id = update.effective_user.id + + features = context.bot_data.get("features") + voice_handler = features.get_voice_handler() if features else None + + if not voice_handler: + await update.message.reply_text("Voice processing is not available.") + return + + chat = update.message.chat + await chat.send_action("typing") + progress_msg = await update.message.reply_text("Transcribing...") + + try: + voice = update.message.voice or 
update.message.audio + processed = await voice_handler.process_voice_message( + voice, caption=update.message.caption + ) + + # Show transcription to user + await progress_msg.edit_text(f"\U0001f3a4 {processed.transcription}") + + claude_integration = context.bot_data.get("claude_integration") + if not claude_integration: + return + + current_dir = context.user_data.get( + "current_directory", self.settings.approved_directory + ) + session_id = context.user_data.get("claude_session_id") + force_new = bool(context.user_data.get("force_new_session")) + verbose_level = self._get_verbose_level(context) + tool_log: List[Dict[str, Any]] = [] + mcp_images_voice: List[Any] = [] + on_stream = self._make_stream_callback( + verbose_level, + progress_msg, + tool_log, + time.time(), + mcp_images=mcp_images_voice, + approved_directory=self.settings.approved_directory, + ) + + heartbeat = self._start_typing_heartbeat(chat) + try: + claude_response = await claude_integration.run_command( + prompt=processed.prompt, + working_directory=current_dir, + user_id=user_id, + session_id=session_id, + on_stream=on_stream, + force_new=force_new, + ) + finally: + heartbeat.cancel() + + if force_new: + context.user_data["force_new_session"] = False + + context.user_data["claude_session_id"] = claude_response.session_id + + from .utils.formatting import ResponseFormatter + + formatter = ResponseFormatter(self.settings) + formatted_messages = formatter.format_claude_response( + claude_response.content + ) + + for i, message in enumerate(formatted_messages): + await update.message.reply_text( + message.text, + parse_mode=message.parse_mode, + reply_to_message_id=( + update.message.message_id if i == 0 else None + ), + ) + if i < len(formatted_messages) - 1: + await asyncio.sleep(0.5) + + except Exception as e: + from .handlers.message import _format_error_message + + await progress_msg.edit_text(_format_error_message(e), parse_mode="HTML") + logger.error( + "Voice processing failed", 
error=str(e), user_id=user_id + ) + async def agentic_repo( self, update: Update, context: ContextTypes.DEFAULT_TYPE ) -> None: diff --git a/src/config/settings.py b/src/config/settings.py index 7c32eaba..f8bd7bb3 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -160,6 +160,24 @@ class Settings(BaseSettings): ) # Features + enable_voice_processing: bool = Field(False, description="Enable voice message transcription") + voice_provider: Literal["mistral", "openai", "parakeet"] = Field( + "parakeet", + description="Voice transcription provider: 'mistral', 'openai', or 'parakeet' (local GPU)", + ) + ffmpeg_path: Optional[str] = Field( + None, + description="Path to ffmpeg binary. Falls back to 'ffmpeg' from PATH if not set.", + ) + voice_max_file_size_mb: int = Field( + 20, + description="Maximum voice message size in MB", + ge=1, + le=200, + ) + # Mistral / OpenAI voice keys (used when voice_provider != parakeet) + mistral_api_key: Optional[SecretStr] = Field(None, description="Mistral API key for Voxtral") + openai_api_key: Optional[SecretStr] = Field(None, description="OpenAI API key for Whisper") enable_mcp: bool = Field(False, description="Enable Model Context Protocol") mcp_config_path: Optional[Path] = Field( None, description="MCP configuration file path" @@ -433,3 +451,23 @@ def anthropic_api_key_str(self) -> Optional[str]: if self.anthropic_api_key else None ) + + @property + def mistral_api_key_str(self) -> Optional[str]: + """Get Mistral API key as string.""" + return self.mistral_api_key.get_secret_value() if self.mistral_api_key else None + + @property + def openai_api_key_str(self) -> Optional[str]: + """Get OpenAI API key as string.""" + return self.openai_api_key.get_secret_value() if self.openai_api_key else None + + @property + def voice_max_file_size_bytes(self) -> int: + """Maximum allowed voice message size in bytes.""" + return self.voice_max_file_size_mb * 1024 * 1024 + + @property + def resolved_ffmpeg_path(self) -> str: + 
"""ffmpeg binary path: explicit setting or fall back to PATH.""" + return self.ffmpeg_path or "ffmpeg"