From 14d665b64dc714c9718a59dfee86e0e05e8d6ee8 Mon Sep 17 00:00:00 2001 From: BasilPadre Date: Fri, 6 Mar 2026 18:14:52 +0300 Subject: [PATCH] feat: add multi-provider voice transcription (Parakeet, Mistral, OpenAI) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds voice message transcription support with three backends: - `parakeet` (default): local NVIDIA NeMo Parakeet TDT 0.6B v3, runs on GPU, no API key or cloud cost required - `mistral`: Mistral Voxtral cloud API - `openai`: OpenAI Whisper cloud API New settings: - ENABLE_VOICE_PROCESSING (bool, default false) - VOICE_PROVIDER (mistral | openai | parakeet, default parakeet) - FFMPEG_PATH (optional explicit path, falls back to PATH) - VOICE_MAX_FILE_SIZE_MB (default 20) - MISTRAL_API_KEY / OPENAI_API_KEY (for cloud providers) Optional dependency groups added to pyproject.toml: - `[voice]` for mistral + openai cloud providers - `[parakeet]` for local GPU transcription via NeMo The Parakeet model (~600 MB) is downloaded and cached automatically on first use. Audio is converted ogg→wav via ffmpeg before transcription. Co-Authored-By: Claude Sonnet 4.6 --- README.md | 75 ++++++++++ pyproject.toml | 15 ++ src/bot/features/registry.py | 13 ++ src/bot/features/voice_handler.py | 220 ++++++++++++++++++++++++++++++ src/bot/orchestrator.py | 100 ++++++++++++++ src/config/settings.py | 38 ++++++ 6 files changed, 461 insertions(+) create mode 100644 src/bot/features/voice_handler.py diff --git a/README.md b/README.md index 559e19e6..0eb9dd3b 100644 --- a/README.md +++ b/README.md @@ -213,6 +213,81 @@ Enable with `ENABLE_API_SERVER=true` and `ENABLE_SCHEDULER=true`. See [docs/setu - Plugin system for third-party extensions +## Voice Transcription + +The bot can transcribe Telegram voice messages and pass them as text to Claude. 
Three providers are supported: 
+
+| Provider | Type | Best for |
+|----------|------|----------|
+| `parakeet` (default) | Local GPU | Privacy, no API cost, fast on NVIDIA GPU |
+| `mistral` | Cloud (Voxtral) | Quality without local GPU |
+| `openai` | Cloud (Whisper) | Widely supported cloud option |
+
+### Enable voice processing
+
+```bash
+ENABLE_VOICE_PROCESSING=true
+VOICE_PROVIDER=parakeet # or: mistral, openai
+```
+
+### Parakeet (local GPU, no API key required)
+
+[NVIDIA NeMo Parakeet TDT 0.6B v3](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3) runs on your GPU — no cloud API needed, no per-minute cost.
+
+**Requirements:**
+
+- NVIDIA GPU with CUDA support
+- [ffmpeg](https://ffmpeg.org/download.html) installed and on PATH (or set `FFMPEG_PATH`)
+- `claude-code-telegram[parakeet]` extras
+
+**Install:**
+
+```bash
+pip install "claude-code-telegram[parakeet]"
+# or with poetry:
+poetry install --with parakeet
+```
+
+**Configure:**
+
+```bash
+ENABLE_VOICE_PROCESSING=true
+VOICE_PROVIDER=parakeet
+# Optional: explicit path to ffmpeg if not on PATH
+FFMPEG_PATH=/usr/bin/ffmpeg
+```
+
+The NeMo model (~600 MB) is downloaded automatically on first use and cached locally. 
+ +### Cloud providers (Mistral / OpenAI) + +```bash +# Mistral Voxtral +ENABLE_VOICE_PROCESSING=true +VOICE_PROVIDER=mistral +MISTRAL_API_KEY=your-mistral-api-key + +# OpenAI Whisper +ENABLE_VOICE_PROCESSING=true +VOICE_PROVIDER=openai +OPENAI_API_KEY=your-openai-api-key +``` + +**Install cloud extras:** + +```bash +pip install "claude-code-telegram[voice]" +# or with poetry: +poetry install --with voice +``` + +### Additional voice settings + +```bash +VOICE_MAX_FILE_SIZE_MB=20 # Max voice message size (default: 20 MB) +FFMPEG_PATH= # Optional explicit path to ffmpeg binary +``` + ## Configuration ### Required diff --git a/pyproject.toml b/pyproject.toml index b4fb3f79..96880782 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,21 @@ uvicorn = {version = "^0.34.0", extras = ["standard"]} apscheduler = "^3.10" PyYAML = "^6.0.2" +[tool.poetry.group.voice.dependencies] +# Cloud providers (mistral or openai) +mistralai = {version = "^1.0.0", optional = true} +openai = {version = "^1.0.0", optional = true} + +[tool.poetry.group.parakeet.dependencies] +# Local GPU transcription via NVIDIA NeMo (requires CUDA) +# Install with: pip install "claude-code-telegram[parakeet]" +nemo_toolkit = {version = "^2.3.0", extras = ["asr"], optional = true} +torch = {version = ">=2.0.0", optional = true} + +[tool.poetry.extras] +voice = ["mistralai", "openai"] +parakeet = ["nemo_toolkit", "torch"] + [tool.poetry.group.dev.dependencies] pytest = "^8.4.0" pytest-asyncio = "^1.0.0" diff --git a/src/bot/features/registry.py b/src/bot/features/registry.py index 572338f3..04ab72a1 100644 --- a/src/bot/features/registry.py +++ b/src/bot/features/registry.py @@ -16,6 +16,7 @@ from .image_handler import ImageHandler from .quick_actions import QuickActionManager from .session_export import SessionExporter +from .voice_handler import VoiceHandler logger = structlog.get_logger(__name__) @@ -77,6 +78,14 @@ def _initialize_features(self): except Exception as e: logger.error("Failed to 
initialize image handler", error=str(e)) + # Voice handling - conditionally enabled + if self.config.enable_voice_processing: + try: + self.features["voice_handler"] = VoiceHandler(config=self.config) + logger.info("Voice handler feature enabled") + except Exception as e: + logger.error("Failed to initialize voice handler", error=str(e)) + # Conversation enhancements - skip in agentic mode if not self.config.agentic_mode: try: @@ -118,6 +127,10 @@ def get_image_handler(self) -> Optional[ImageHandler]: """Get image handler feature""" return self.get_feature("image_handler") + def get_voice_handler(self) -> Optional[VoiceHandler]: + """Get voice handler feature""" + return self.get_feature("voice_handler") + def get_conversation_enhancer(self) -> Optional[ConversationEnhancer]: """Get conversation enhancer feature""" return self.get_feature("conversation") diff --git a/src/bot/features/voice_handler.py b/src/bot/features/voice_handler.py new file mode 100644 index 00000000..24035137 --- /dev/null +++ b/src/bot/features/voice_handler.py @@ -0,0 +1,220 @@ +"""Handle voice message transcription. + +Supported providers (VOICE_PROVIDER): + parakeet — local GPU inference via NVIDIA NeMo (default, free, requires CUDA) + mistral — Mistral Voxtral API (cloud, requires MISTRAL_API_KEY) + openai — OpenAI Whisper API (cloud, requires OPENAI_API_KEY) +""" + +import asyncio +import subprocess +import tempfile +from dataclasses import dataclass +from datetime import timedelta +from pathlib import Path +from typing import Any, Optional + +import structlog +from telegram import Voice + +from src.config.settings import Settings + +logger = structlog.get_logger(__name__) + + +@dataclass +class ProcessedVoice: + """Result of voice message processing.""" + + transcription: str + prompt: str + duration: int = 0 + + +class VoiceHandler: + """Transcribe Telegram voice/audio messages. 
+ + Delegates to one of three backends based on config.voice_provider: + - 'parakeet': local NVIDIA NeMo model (no API key required) + - 'mistral': Mistral Voxtral cloud API + - 'openai': OpenAI Whisper cloud API + """ + + def __init__(self, config: Settings): + self.config = config + self._parakeet_model = None # lazy-loaded on first use + self._mistral_client: Optional[Any] = None + self._openai_client: Optional[Any] = None + + # ------------------------------------------------------------------ + # Public interface + # ------------------------------------------------------------------ + + async def process_voice_message( + self, voice: Voice, caption: Optional[str] = None + ) -> ProcessedVoice: + """Download and transcribe a Telegram voice message.""" + self._check_file_size(getattr(voice, "file_size", None)) + + file = await voice.get_file() + self._check_file_size(getattr(file, "file_size", None)) + + voice_bytes = bytes(await file.download_as_bytearray()) + self._check_file_size(len(voice_bytes)) + + provider = self.config.voice_provider + logger.info("Transcribing voice message", provider=provider, duration=voice.duration) + + if provider == "parakeet": + transcription = await self._transcribe_parakeet(voice_bytes) + elif provider == "openai": + transcription = await self._transcribe_openai(voice_bytes) + else: + transcription = await self._transcribe_mistral(voice_bytes) + + logger.info("Voice transcription complete", length=len(transcription)) + + label = caption if caption else "Voice message transcription:" + dur = voice.duration + duration_secs = int(dur.total_seconds()) if isinstance(dur, timedelta) else (dur or 0) + + return ProcessedVoice( + transcription=transcription, + prompt=f"{label}\n\n{transcription}", + duration=duration_secs, + ) + + # ------------------------------------------------------------------ + # Parakeet (local) + # ------------------------------------------------------------------ + + @property + def _parakeet(self): + 
"""Lazy-load the Parakeet TDT 0.6B v3 model on first use.""" + if self._parakeet_model is None: + try: + import nemo.collections.asr as nemo_asr + except ModuleNotFoundError as exc: + raise RuntimeError( + "Optional dependency 'nemo_toolkit' is missing for Parakeet transcription. " + "Install parakeet extras: " + 'pip install "claude-code-telegram[parakeet]"' + ) from exc + + logger.info("Loading Parakeet TDT 0.6B v3 model (first use)…") + self._parakeet_model = nemo_asr.models.ASRModel.from_pretrained( + "nvidia/parakeet-tdt-0.6b-v3" + ) + logger.info("Parakeet model loaded") + return self._parakeet_model + + async def _transcribe_parakeet(self, voice_bytes: bytes) -> str: + """Transcribe using local Parakeet model (runs in thread pool to avoid blocking).""" + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, self._run_parakeet, voice_bytes) + + def _run_parakeet(self, voice_bytes: bytes) -> str: + """CPU-bound transcription — called from executor.""" + ffmpeg = self.config.resolved_ffmpeg_path + with tempfile.TemporaryDirectory() as tmp: + ogg_path = Path(tmp) / "voice.ogg" + wav_path = Path(tmp) / "voice.wav" + ogg_path.write_bytes(voice_bytes) + + subprocess.run( + [ffmpeg, "-y", "-i", str(ogg_path), "-ar", "16000", "-ac", "1", str(wav_path)], + check=True, + capture_output=True, + ) + + output = self._parakeet.transcribe([str(wav_path)]) + text = output[0].text.strip() + + if not text: + raise ValueError("Parakeet transcription returned an empty result.") + return text + + # ------------------------------------------------------------------ + # Mistral (cloud) + # ------------------------------------------------------------------ + + async def _transcribe_mistral(self, voice_bytes: bytes) -> str: + client = self._get_mistral_client() + try: + response = await client.audio.transcriptions.complete_async( + model="voxtral-mini-2507", + file={"content": voice_bytes, "file_name": "voice.ogg"}, + ) + except Exception as exc: + 
logger.warning("Mistral transcription failed", error_type=type(exc).__name__) + raise RuntimeError("Mistral transcription request failed.") from exc + + text = (getattr(response, "text", "") or "").strip() + if not text: + raise ValueError("Mistral transcription returned an empty response.") + return text + + def _get_mistral_client(self) -> Any: + if self._mistral_client is not None: + return self._mistral_client + try: + from mistralai import Mistral + except ModuleNotFoundError as exc: + raise RuntimeError( + "Optional dependency 'mistralai' is missing. " + 'Install voice extras: pip install "claude-code-telegram[voice]"' + ) from exc + + api_key = self.config.mistral_api_key_str + if not api_key: + raise RuntimeError("MISTRAL_API_KEY is not configured.") + self._mistral_client = Mistral(api_key=api_key) + return self._mistral_client + + # ------------------------------------------------------------------ + # OpenAI Whisper (cloud) + # ------------------------------------------------------------------ + + async def _transcribe_openai(self, voice_bytes: bytes) -> str: + client = self._get_openai_client() + try: + response = await client.audio.transcriptions.create( + model="whisper-1", + file=("voice.ogg", voice_bytes), + ) + except Exception as exc: + logger.warning("OpenAI transcription failed", error_type=type(exc).__name__) + raise RuntimeError("OpenAI transcription request failed.") from exc + + text = (getattr(response, "text", "") or "").strip() + if not text: + raise ValueError("OpenAI transcription returned an empty response.") + return text + + def _get_openai_client(self) -> Any: + if self._openai_client is not None: + return self._openai_client + try: + from openai import AsyncOpenAI + except ModuleNotFoundError as exc: + raise RuntimeError( + "Optional dependency 'openai' is missing. 
" + 'Install voice extras: pip install "claude-code-telegram[voice]"' + ) from exc + + api_key = self.config.openai_api_key_str + if not api_key: + raise RuntimeError("OPENAI_API_KEY is not configured.") + self._openai_client = AsyncOpenAI(api_key=api_key) + return self._openai_client + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + def _check_file_size(self, size: Optional[int]) -> None: + if isinstance(size, int) and size > self.config.voice_max_file_size_bytes: + raise ValueError( + f"Voice message too large ({size / 1024 / 1024:.1f} MB). " + f"Max: {self.config.voice_max_file_size_mb} MB." + ) diff --git a/src/bot/orchestrator.py b/src/bot/orchestrator.py index faacabb8..ebe6ebf8 100644 --- a/src/bot/orchestrator.py +++ b/src/bot/orchestrator.py @@ -336,6 +336,15 @@ def _register_agentic_handlers(self, app: Application) -> None: group=10, ) + # Voice/audio messages -> transcribe -> Claude + app.add_handler( + MessageHandler( + filters.VOICE | filters.AUDIO, + self._inject_deps(self.agentic_voice), + ), + group=10, + ) + # Only cd: callbacks (for project selection), scoped by pattern app.add_handler( CallbackQueryHandler( @@ -1330,6 +1339,97 @@ async def agentic_photo( "Claude photo processing failed", error=str(e), user_id=user_id ) + async def agentic_voice( + self, update: Update, context: ContextTypes.DEFAULT_TYPE + ) -> None: + """Transcribe voice/audio message via configured provider, then pass to Claude.""" + user_id = update.effective_user.id + + features = context.bot_data.get("features") + voice_handler = features.get_voice_handler() if features else None + + if not voice_handler: + await update.message.reply_text("Voice processing is not available.") + return + + chat = update.message.chat + await chat.send_action("typing") + progress_msg = await update.message.reply_text("Transcribing...") + + try: + voice = update.message.voice or 
update.message.audio + processed = await voice_handler.process_voice_message( + voice, caption=update.message.caption + ) + + # Show transcription to user + await progress_msg.edit_text(f"\U0001f3a4 {processed.transcription}") + + claude_integration = context.bot_data.get("claude_integration") + if not claude_integration: + return + + current_dir = context.user_data.get( + "current_directory", self.settings.approved_directory + ) + session_id = context.user_data.get("claude_session_id") + force_new = bool(context.user_data.get("force_new_session")) + verbose_level = self._get_verbose_level(context) + tool_log: List[Dict[str, Any]] = [] + mcp_images_voice: List[Any] = [] + on_stream = self._make_stream_callback( + verbose_level, + progress_msg, + tool_log, + time.time(), + mcp_images=mcp_images_voice, + approved_directory=self.settings.approved_directory, + ) + + heartbeat = self._start_typing_heartbeat(chat) + try: + claude_response = await claude_integration.run_command( + prompt=processed.prompt, + working_directory=current_dir, + user_id=user_id, + session_id=session_id, + on_stream=on_stream, + force_new=force_new, + ) + finally: + heartbeat.cancel() + + if force_new: + context.user_data["force_new_session"] = False + + context.user_data["claude_session_id"] = claude_response.session_id + + from .utils.formatting import ResponseFormatter + + formatter = ResponseFormatter(self.settings) + formatted_messages = formatter.format_claude_response( + claude_response.content + ) + + for i, message in enumerate(formatted_messages): + await update.message.reply_text( + message.text, + parse_mode=message.parse_mode, + reply_to_message_id=( + update.message.message_id if i == 0 else None + ), + ) + if i < len(formatted_messages) - 1: + await asyncio.sleep(0.5) + + except Exception as e: + from .handlers.message import _format_error_message + + await progress_msg.edit_text(_format_error_message(e), parse_mode="HTML") + logger.error( + "Voice processing failed", 
error=str(e), user_id=user_id + ) + async def agentic_repo( self, update: Update, context: ContextTypes.DEFAULT_TYPE ) -> None: diff --git a/src/config/settings.py b/src/config/settings.py index 7c32eaba..f8bd7bb3 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -160,6 +160,24 @@ class Settings(BaseSettings): ) # Features + enable_voice_processing: bool = Field(False, description="Enable voice message transcription") + voice_provider: Literal["mistral", "openai", "parakeet"] = Field( + "parakeet", + description="Voice transcription provider: 'mistral', 'openai', or 'parakeet' (local GPU)", + ) + ffmpeg_path: Optional[str] = Field( + None, + description="Path to ffmpeg binary. Falls back to 'ffmpeg' from PATH if not set.", + ) + voice_max_file_size_mb: int = Field( + 20, + description="Maximum voice message size in MB", + ge=1, + le=200, + ) + # Mistral / OpenAI voice keys (used when voice_provider != parakeet) + mistral_api_key: Optional[SecretStr] = Field(None, description="Mistral API key for Voxtral") + openai_api_key: Optional[SecretStr] = Field(None, description="OpenAI API key for Whisper") enable_mcp: bool = Field(False, description="Enable Model Context Protocol") mcp_config_path: Optional[Path] = Field( None, description="MCP configuration file path" @@ -433,3 +451,23 @@ def anthropic_api_key_str(self) -> Optional[str]: if self.anthropic_api_key else None ) + + @property + def mistral_api_key_str(self) -> Optional[str]: + """Get Mistral API key as string.""" + return self.mistral_api_key.get_secret_value() if self.mistral_api_key else None + + @property + def openai_api_key_str(self) -> Optional[str]: + """Get OpenAI API key as string.""" + return self.openai_api_key.get_secret_value() if self.openai_api_key else None + + @property + def voice_max_file_size_bytes(self) -> int: + """Maximum allowed voice message size in bytes.""" + return self.voice_max_file_size_mb * 1024 * 1024 + + @property + def resolved_ffmpeg_path(self) -> str: + 
"""ffmpeg binary path: explicit setting or fall back to PATH.""" + return self.ffmpeg_path or "ffmpeg"