From 5501304c74d1ffa7d4dd8791adddef456f03c5da Mon Sep 17 00:00:00 2001 From: thereisnotime <37583483+thereisnotime@users.noreply.github.com> Date: Fri, 20 Mar 2026 02:30:16 +0200 Subject: [PATCH] feat: add local whisper.cpp voice transcription provider Add a third voice provider option (VOICE_PROVIDER=local) that transcribes Telegram voice messages entirely offline using whisper.cpp and ffmpeg. No API keys or cloud services required. - New local provider in voice_handler.py (OGG->WAV via ffmpeg, then whisper.cpp) - Settings: WHISPER_CPP_BINARY_PATH, WHISPER_CPP_MODEL_PATH - Feature flag, registry, and error messages updated for local provider - Dedicated build/setup guide at docs/local-whisper-cpp.md - Full test coverage for the local provider path - Updated .env.example, CLAUDE.md, README.md, docs/configuration.md Co-Authored-By: Claude Opus 4.6 --- .env.example | 28 ++++ CLAUDE.md | 2 +- README.md | 2 +- docs/configuration.md | 6 +- docs/local-whisper-cpp.md | 170 ++++++++++++++++++++ docs/setup.md | 15 +- src/bot/features/registry.py | 8 +- src/bot/features/voice_handler.py | 168 +++++++++++++++++++- src/bot/handlers/message.py | 29 ++-- src/bot/orchestrator.py | 6 + src/config/features.py | 2 + src/config/settings.py | 46 +++++- tests/unit/test_bot/test_message_voice.py | 25 +++ tests/unit/test_bot/test_voice_handler.py | 181 +++++++++++++++++++++- tests/unit/test_config.py | 22 ++- 15 files changed, 684 insertions(+), 26 deletions(-) create mode 100644 docs/local-whisper-cpp.md diff --git a/.env.example b/.env.example index dfd70908..8c59a4b4 100644 --- a/.env.example +++ b/.env.example @@ -140,6 +140,34 @@ QUICK_ACTIONS_TIMEOUT=120 # Git operations timeout in seconds GIT_OPERATIONS_TIMEOUT=30 +# === VOICE TRANSCRIPTION === +# Enable voice message transcription +ENABLE_VOICE_MESSAGES=true + +# Voice transcription provider: mistral, openai, or local +# - mistral: Uses Mistral Voxtral (requires MISTRAL_API_KEY) +# - openai: Uses OpenAI Whisper API (requires 
OPENAI_API_KEY) +# - local: Uses whisper.cpp binary (requires ffmpeg + whisper.cpp installed) +VOICE_PROVIDER=mistral + +# API keys (only needed for cloud providers) +MISTRAL_API_KEY= +OPENAI_API_KEY= + +# Override transcription model for cloud providers (optional) +# Defaults: voxtral-mini-latest (mistral), whisper-1 (openai); the local provider selects its model via WHISPER_CPP_MODEL_PATH +VOICE_TRANSCRIPTION_MODEL= + +# Maximum voice message size in MB +VOICE_MAX_FILE_SIZE_MB=20 + +# Local whisper.cpp settings (only used when VOICE_PROVIDER=local) +# Path to whisper.cpp binary (auto-detected from PATH if unset) +WHISPER_CPP_BINARY_PATH= +# Path to GGML model file, or model name like "base", "small", "medium" +# Named models look for ~/.cache/whisper-cpp/ggml-{name}.bin +WHISPER_CPP_MODEL_PATH=base + # === PROJECT THREAD MODE === # Enable strict routing by Telegram project topics ENABLE_PROJECT_THREADS=false diff --git a/CLAUDE.md b/CLAUDE.md index 0917d335..b29f5871 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -102,7 +102,7 @@ Multi-project topics: `ENABLE_PROJECT_THREADS` (default false), `PROJECT_THREADS Output verbosity: `VERBOSE_LEVEL` (default 1, range 0-2). Controls how much of Claude's background activity is shown to the user in real-time. 0 = quiet (only final response, typing indicator still active), 1 = normal (tool names + reasoning snippets shown during execution), 2 = detailed (tool names with input summaries + longer reasoning text). Users can override per-session via `/verbose 0|1|2`. A persistent typing indicator is refreshed every ~2 seconds at all levels. -Voice transcription: `ENABLE_VOICE_MESSAGES` (default true), `VOICE_PROVIDER` (`mistral`|`openai`, default `mistral`), `MISTRAL_API_KEY`, `OPENAI_API_KEY`, `VOICE_TRANSCRIPTION_MODEL`. Provider implementation is in `src/bot/features/voice_handler.py`. +Voice transcription: `ENABLE_VOICE_MESSAGES` (default true), `VOICE_PROVIDER` (`mistral`|`openai`|`local`, default `mistral`), `MISTRAL_API_KEY`, `OPENAI_API_KEY`, `VOICE_TRANSCRIPTION_MODEL`. 
For local provider: `WHISPER_CPP_BINARY_PATH`, `WHISPER_CPP_MODEL_PATH` (requires ffmpeg + whisper.cpp installed). Provider implementation is in `src/bot/features/voice_handler.py`. Feature flags in `src/config/features.py` control: MCP, git integration, file uploads, quick actions, session export, image uploads, voice messages, conversation mode, agentic mode, API server, scheduler. diff --git a/README.md b/README.md index 34ca52d3..0bd27ae2 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ Enable with `ENABLE_API_SERVER=true` and `ENABLE_SCHEDULER=true`. See [docs/setu - Directory sandboxing with path traversal prevention - File upload handling with archive extraction - Image/screenshot upload with analysis -- Voice message transcription (Mistral Voxtral / OpenAI Whisper) +- Voice message transcription (Mistral Voxtral / OpenAI Whisper / [local whisper.cpp](docs/local-whisper-cpp.md)) - Git integration with safe repository operations - Quick actions system with context-aware buttons - Session export in Markdown, HTML, and JSON formats diff --git a/docs/configuration.md b/docs/configuration.md index 2098f8be..2bba7d9f 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -135,11 +135,15 @@ ENABLE_QUICK_ACTIONS=true # Enable voice message transcription ENABLE_VOICE_MESSAGES=true -VOICE_PROVIDER=mistral # 'mistral' (default) or 'openai' +VOICE_PROVIDER=mistral # 'mistral', 'openai', or 'local' MISTRAL_API_KEY= # Required when VOICE_PROVIDER=mistral OPENAI_API_KEY= # Required when VOICE_PROVIDER=openai VOICE_TRANSCRIPTION_MODEL= # Default: voxtral-mini-latest (Mistral) or whisper-1 (OpenAI) VOICE_MAX_FILE_SIZE_MB=20 # Max Telegram voice file size to download (1-200MB) + +# Local whisper.cpp settings (only used when VOICE_PROVIDER=local) +WHISPER_CPP_BINARY_PATH= # Path to whisper.cpp binary (auto-detected from PATH if unset) +WHISPER_CPP_MODEL_PATH=base # Path to GGML model file or model name (base, small, medium, large) ``` #### Agentic Platform 
diff --git a/docs/local-whisper-cpp.md b/docs/local-whisper-cpp.md new file mode 100644 index 00000000..b23773cb --- /dev/null +++ b/docs/local-whisper-cpp.md @@ -0,0 +1,170 @@ +# Local Voice Transcription with whisper.cpp + +This guide explains how to build and configure [whisper.cpp](https://github.com/ggerganov/whisper.cpp) for **offline** voice message transcription — no API keys or cloud services required. + +## Overview + +When `VOICE_PROVIDER=local` the bot transcribes Telegram voice messages entirely on your machine using: + +| Component | Purpose | +|---|---| +| **ffmpeg** | Converts Telegram OGG/Opus audio to 16 kHz mono WAV | +| **whisper.cpp** | Runs OpenAI's Whisper model locally via optimised C/C++ | +| **GGML model** | Quantised model weights (downloaded once) | + +## Prerequisites + +- A C/C++ toolchain (`gcc`/`clang`, `cmake`, `make`) +- `ffmpeg` installed and on PATH +- ~400 MB disk space for the `base` model (~1.5 GB for `medium`) + +## 1. Install ffmpeg + +### Ubuntu / Debian + +```bash +sudo apt update && sudo apt install -y ffmpeg +``` + +### macOS (Homebrew) + +```bash +brew install ffmpeg +``` + +### Alpine + +```bash +apk add ffmpeg +``` + +Verify: + +```bash +ffmpeg -version +``` + +## 2. Build whisper.cpp from source + +```bash +# Clone the repository +git clone https://github.com/ggerganov/whisper.cpp.git +cd whisper.cpp + +# Build with CMake (recommended) +cmake -B build +cmake --build build --config Release + +# The binary is at build/bin/whisper-cli (or build/bin/main on older versions) +ls build/bin/whisper-cli +``` + +> **Tip:** For GPU acceleration add `-DWHISPER_CUBLAS=ON` (NVIDIA) or `-DWHISPER_METAL=ON` (Apple Silicon) to the cmake configure step. + +### Install system-wide (optional) + +```bash +sudo cp build/bin/whisper-cli /usr/local/bin/whisper-cpp +``` + +Or add the build directory to your `PATH`: + +```bash +export PATH="$PWD/build/bin:$PATH" +``` + +## 3. Download a GGML model + +Models are hosted on Hugging Face. 
Pick one based on your hardware: + +| Model | Size | RAM (approx.) | Quality | +|---|---|---|---| +| `tiny` | ~75 MB | ~400 MB | Fast but lower accuracy | +| `base` | ~142 MB | ~500 MB | Good balance (default) | +| `small` | ~466 MB | ~1 GB | Better accuracy | +| `medium` | ~1.5 GB | ~2.5 GB | High accuracy | +| `large-v3` | ~3 GB | ~5 GB | Best accuracy, slow on CPU | + +```bash +# Create the model cache directory +mkdir -p ~/.cache/whisper-cpp + +# Download the base model (recommended starting point) +curl -L -o ~/.cache/whisper-cpp/ggml-base.bin \ + https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin + +# Or download small for better accuracy +curl -L -o ~/.cache/whisper-cpp/ggml-small.bin \ + https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin +``` + +## 4. Configure the bot + +Add the following to your `.env`: + +```bash +# Enable voice transcription with local provider +ENABLE_VOICE_MESSAGES=true +VOICE_PROVIDER=local + +# Path to the whisper.cpp binary (omit if already on PATH as "whisper-cpp") +WHISPER_CPP_BINARY_PATH=/usr/local/bin/whisper-cpp + +# Model: a name like "base", "small", "medium" or a full file path +# Named models resolve to ~/.cache/whisper-cpp/ggml-{name}.bin +WHISPER_CPP_MODEL_PATH=base +``` + +### Minimal configuration + +If `whisper-cpp` is on your PATH and you downloaded the `base` model to the default location, you only need: + +```bash +VOICE_PROVIDER=local +``` + +## 5. Verify the setup + +```bash +# Test ffmpeg conversion +ffmpeg -f lavfi -i "sine=frequency=440:duration=2" -ar 16000 -ac 1 /tmp/test.wav -y + +# Test whisper.cpp +whisper-cpp -m ~/.cache/whisper-cpp/ggml-base.bin -f /tmp/test.wav --no-timestamps +``` + +You should see a transcription attempt (it will be empty or nonsensical for a sine wave, but the binary should run without errors). + +## Troubleshooting + +### `whisper.cpp binary not found on PATH` + +The bot could not locate the binary. 
Either: +- Install it system-wide: `sudo cp build/bin/whisper-cli /usr/local/bin/whisper-cpp` +- Or set the full path: `WHISPER_CPP_BINARY_PATH=/path/to/whisper-cli` + +### `whisper.cpp model not found` + +The model file does not exist at the expected path. Download it: + +```bash +mkdir -p ~/.cache/whisper-cpp +curl -L -o ~/.cache/whisper-cpp/ggml-base.bin \ + https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin +``` + +### `ffmpeg is required for local voice transcription but was not found` + +Install ffmpeg for your platform (see step 1 above). + +### Poor transcription quality + +- Try a larger model (`small` or `medium` instead of `base`) +- Ensure audio is not too short (< 1 second) or too noisy +- The bot runs whisper.cpp with `-l auto` (automatic language detection); this works well for most languages + +### High CPU usage / slow transcription + +- Use a smaller model (`tiny` or `base`) +- Enable GPU acceleration when building whisper.cpp (CUDA / Metal) +- Consider using the `mistral` or `openai` cloud providers for faster results on low-powered machines diff --git a/docs/setup.md b/docs/setup.md index 5b0670bb..acb7f906 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -197,12 +197,23 @@ VOICE_PROVIDER=openai OPENAI_API_KEY=your-openai-api-key ``` -If you installed via pip/uv, make sure voice extras are installed: +**Local whisper.cpp (offline, no API key needed):** +```bash +VOICE_PROVIDER=local +# Optional — auto-detected from PATH if unset +WHISPER_CPP_BINARY_PATH=/usr/local/bin/whisper-cpp +# Model name ("base", "small", "medium") or full path to .bin file +WHISPER_CPP_MODEL_PATH=base +``` + +Requires `ffmpeg` and a locally built `whisper.cpp` binary. See the full [local whisper.cpp setup guide](local-whisper-cpp.md) for build instructions and model downloads. 
+ +If you installed via pip/uv, make sure voice extras are installed (cloud providers only): ```bash pip install "claude-code-telegram[voice]" ``` -Optionally override the transcription model with `VOICE_TRANSCRIPTION_MODEL` (defaults to `voxtral-mini-latest` for Mistral, `whisper-1` for OpenAI). +Optionally override the transcription model with `VOICE_TRANSCRIPTION_MODEL` (defaults to `voxtral-mini-latest` for Mistral, `whisper-1` for OpenAI, `base` for local). ### Notification Recipients diff --git a/src/bot/features/registry.py b/src/bot/features/registry.py index 953c228b..43e7e0e0 100644 --- a/src/bot/features/registry.py +++ b/src/bot/features/registry.py @@ -78,10 +78,14 @@ def _initialize_features(self): except Exception as e: logger.error("Failed to initialize image handler", error=str(e)) - # Voice transcription - requires provider-specific API key + # Voice transcription - requires provider-specific API key (or local) voice_key_available = ( + self.config.voice_provider == "local" + ) or ( self.config.voice_provider == "openai" and self.config.openai_api_key - ) or (self.config.voice_provider == "mistral" and self.config.mistral_api_key) + ) or ( + self.config.voice_provider == "mistral" and self.config.mistral_api_key + ) if self.config.enable_voice_messages and voice_key_available: try: self.features["voice_handler"] = VoiceHandler(config=self.config) diff --git a/src/bot/features/voice_handler.py b/src/bot/features/voice_handler.py index 11daa10c..5ed31822 100644 --- a/src/bot/features/voice_handler.py +++ b/src/bot/features/voice_handler.py @@ -1,7 +1,11 @@ -"""Handle voice message transcription via Mistral (Voxtral) or OpenAI (Whisper).""" +"""Handle voice message transcription via Mistral (Voxtral), OpenAI (Whisper), or local whisper.cpp.""" +import asyncio +import shutil +import tempfile from dataclasses import dataclass from datetime import timedelta +from pathlib import Path from typing import Any, Optional import structlog @@ -22,12 +26,16 @@ 
class ProcessedVoice: class VoiceHandler: - """Transcribe Telegram voice messages using Mistral or OpenAI.""" + """Transcribe Telegram voice messages using Mistral, OpenAI, or local whisper.cpp.""" + + # Timeout (seconds) for ffmpeg and whisper.cpp subprocess calls. + LOCAL_SUBPROCESS_TIMEOUT: int = 120 def __init__(self, config: Settings): self.config = config self._mistral_client: Optional[Any] = None self._openai_client: Optional[Any] = None + self._resolved_whisper_binary: Optional[str] = None def _ensure_allowed_file_size(self, file_size: Optional[int]) -> None: """Reject files that exceed the configured max size.""" @@ -48,7 +56,7 @@ async def process_voice_message( """Download and transcribe a voice message. 1. Download .ogg bytes from Telegram - 2. Call the configured transcription API (Mistral or OpenAI) + 2. Call the configured transcription provider (Mistral, OpenAI, or local) 3. Build a prompt combining caption + transcription """ initial_file_size = getattr(voice, "file_size", None) @@ -79,7 +87,9 @@ async def process_voice_message( file_size=initial_file_size or resolved_file_size or len(voice_bytes), ) - if self.config.voice_provider == "openai": + if self.config.voice_provider == "local": + transcription = await self._transcribe_local(voice_bytes) + elif self.config.voice_provider == "openai": transcription = await self._transcribe_openai(voice_bytes) else: transcription = await self._transcribe_mistral(voice_bytes) @@ -103,6 +113,8 @@ async def process_voice_message( duration=duration_secs, ) + # -- Mistral provider -- + async def _transcribe_mistral(self, voice_bytes: bytes) -> str: """Transcribe audio using the Mistral API (Voxtral).""" client = self._get_mistral_client() @@ -147,6 +159,8 @@ def _get_mistral_client(self) -> Any: self._mistral_client = Mistral(api_key=api_key) return self._mistral_client + # -- OpenAI provider -- + async def _transcribe_openai(self, voice_bytes: bytes) -> str: """Transcribe audio using the OpenAI Whisper API.""" 
client = self._get_openai_client() @@ -187,3 +201,149 @@ def _get_openai_client(self) -> Any: self._openai_client = AsyncOpenAI(api_key=api_key) return self._openai_client + + # -- Local whisper.cpp provider -- + + async def _transcribe_local(self, voice_bytes: bytes) -> str: + """Transcribe audio locally using whisper.cpp binary.""" + binary = self._resolve_whisper_binary() + model_path = self.config.resolved_whisper_cpp_model_path + + if not Path(model_path).is_file(): + raise RuntimeError( + f"whisper.cpp model not found at {model_path}. " + "Download it with: " + "curl -L -o ~/.cache/whisper-cpp/ggml-base.bin " + "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin" + ) + + tmp_dir = None + try: + tmp_dir = tempfile.mkdtemp(prefix="voice_") + ogg_path = Path(tmp_dir) / "voice.ogg" + wav_path = Path(tmp_dir) / "voice.wav" + + ogg_path.write_bytes(voice_bytes) + + # Convert OGG/Opus -> WAV (16kHz mono PCM) + await self._convert_ogg_to_wav(ogg_path, wav_path) + + # Run whisper.cpp + text = await self._run_whisper_cpp(binary, model_path, wav_path) + + finally: + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + + text = text.strip() + if not text: + raise ValueError( + "Local whisper.cpp transcription returned an empty response." 
+ ) + return text + + async def _convert_ogg_to_wav(self, ogg_path: Path, wav_path: Path) -> None: + """Convert OGG/Opus to WAV (16kHz mono PCM) using ffmpeg.""" + try: + process = await asyncio.create_subprocess_exec( + "ffmpeg", + "-i", + str(ogg_path), + "-ar", + "16000", + "-ac", + "1", + "-f", + "wav", + str(wav_path), + "-y", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await asyncio.wait_for( + process.communicate(), + timeout=self.LOCAL_SUBPROCESS_TIMEOUT, + ) + + if process.returncode != 0: + raise RuntimeError( + f"ffmpeg conversion failed (exit {process.returncode}): " + f"{stderr.decode()[:200]}" + ) + except asyncio.TimeoutError: + process.kill() + raise RuntimeError( + f"ffmpeg conversion timed out after {self.LOCAL_SUBPROCESS_TIMEOUT}s." + ) + except FileNotFoundError: + raise RuntimeError( + "ffmpeg is required for local voice transcription but was not found. " + "Install it with: apt install ffmpeg" + ) + + async def _run_whisper_cpp( + self, binary: str, model_path: str, wav_path: Path + ) -> str: + """Execute whisper.cpp binary and return transcription text.""" + try: + process = await asyncio.create_subprocess_exec( + binary, + "-m", + model_path, + "-f", + str(wav_path), + "--no-timestamps", + "-l", + "auto", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for( + process.communicate(), + timeout=self.LOCAL_SUBPROCESS_TIMEOUT, + ) + + if process.returncode != 0: + logger.warning( + "whisper.cpp transcription failed", + return_code=process.returncode, + stderr=stderr.decode()[:300], + ) + raise RuntimeError("Local whisper.cpp transcription failed.") + + return stdout.decode() + + except asyncio.TimeoutError: + process.kill() + raise RuntimeError( + f"whisper.cpp transcription timed out after " + f"{self.LOCAL_SUBPROCESS_TIMEOUT}s." + ) + except FileNotFoundError: + raise RuntimeError( + f"whisper.cpp binary not found at '{binary}'. 
" + "Set WHISPER_CPP_BINARY_PATH or install whisper.cpp." + ) + except RuntimeError: + raise + except Exception as exc: + logger.warning( + "whisper.cpp transcription request failed", + error_type=type(exc).__name__, + ) + raise RuntimeError("Local whisper.cpp transcription failed.") from exc + + def _resolve_whisper_binary(self) -> str: + """Resolve and validate the whisper.cpp binary path on first use.""" + if self._resolved_whisper_binary is not None: + return self._resolved_whisper_binary + + binary = self.config.resolved_whisper_cpp_binary + resolved = shutil.which(binary) + if not resolved: + raise RuntimeError( + f"whisper.cpp binary '{binary}' not found on PATH. " + "Set WHISPER_CPP_BINARY_PATH to the full path." + ) + self._resolved_whisper_binary = resolved + return resolved diff --git a/src/bot/handlers/message.py b/src/bot/handlers/message.py index e5fa9f78..935d7917 100644 --- a/src/bot/handlers/message.py +++ b/src/bot/handlers/message.py @@ -1021,15 +1021,26 @@ async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No voice_handler = features.get_voice_handler() if features else None if not voice_handler: - await update.message.reply_text( - "🎙️ Voice Messages\n\n" - "Voice transcription is not available.\n" - f"Provider: {settings.voice_provider_display_name}\n" - f"Set {settings.voice_provider_api_key_env} to enable.\n" - "Install optional voice deps with " - 'pip install "claude-code-telegram[voice]".', - parse_mode="HTML", - ) + if settings.voice_provider == "local": + await update.message.reply_text( + "🎙️ Voice Messages\n\n" + "Voice transcription is not available.\n" + "Provider: Local whisper.cpp\n" + "Ensure whisper.cpp is installed and model file exists.\n" + "Set WHISPER_CPP_BINARY_PATH and " + "WHISPER_CPP_MODEL_PATH if needed.", + parse_mode="HTML", + ) + else: + await update.message.reply_text( + "🎙️ Voice Messages\n\n" + "Voice transcription is not available.\n" + f"Provider: 
{settings.voice_provider_display_name}\n" + f"Set {settings.voice_provider_api_key_env} to enable.\n" + "Install optional voice deps with " + 'pip install "claude-code-telegram[voice]".', + parse_mode="HTML", + ) return try: diff --git a/src/bot/orchestrator.py b/src/bot/orchestrator.py index ac1d5304..609e42d1 100644 --- a/src/bot/orchestrator.py +++ b/src/bot/orchestrator.py @@ -1452,6 +1452,12 @@ async def _handle_agentic_media_message( def _voice_unavailable_message(self) -> str: """Return provider-aware guidance when voice feature is unavailable.""" + if self.settings.voice_provider == "local": + return ( + "Voice processing is not available. " + "Ensure whisper.cpp is installed and the model file exists. " + "Check WHISPER_CPP_BINARY_PATH and WHISPER_CPP_MODEL_PATH settings." + ) return ( "Voice processing is not available. " f"Set {self.settings.voice_provider_api_key_env} " diff --git a/src/config/features.py b/src/config/features.py index 03b54a86..e5561d40 100644 --- a/src/config/features.py +++ b/src/config/features.py @@ -76,6 +76,8 @@ def voice_messages_enabled(self) -> bool: """Check if voice message transcription is enabled.""" if not self.settings.enable_voice_messages: return False + if self.settings.voice_provider == "local": + return True # No API key needed for local whisper.cpp if self.settings.voice_provider == "openai": return self.settings.openai_api_key is not None return self.settings.mistral_api_key is not None diff --git a/src/config/settings.py b/src/config/settings.py index 77c34ea4..f276168c 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -169,9 +169,9 @@ class Settings(BaseSettings): enable_voice_messages: bool = Field( True, description="Enable voice message transcription" ) - voice_provider: Literal["mistral", "openai"] = Field( + voice_provider: Literal["mistral", "openai", "local"] = Field( "mistral", - description="Voice transcription provider: 'mistral' or 'openai'", + description="Voice transcription 
provider: 'mistral', 'openai', or 'local'", ) mistral_api_key: Optional[SecretStr] = Field( None, description="Mistral API key for voice transcription" @@ -195,6 +195,21 @@ class Settings(BaseSettings): ge=1, le=200, ) + whisper_cpp_binary_path: Optional[str] = Field( + None, + description=( + "Path to whisper.cpp binary. " + "Required when VOICE_PROVIDER=local. Auto-detected from PATH if unset." + ), + ) + whisper_cpp_model_path: Optional[str] = Field( + None, + description=( + "Path to whisper.cpp GGML model file, or model name " + "(e.g. 'base', 'small'). Defaults to 'base'. " + "Named models resolve to ~/.cache/whisper-cpp/ggml-{name}.bin" + ), + ) enable_quick_actions: bool = Field(True, description="Enable quick action buttons") agentic_mode: bool = Field( True, @@ -395,8 +410,10 @@ def validate_voice_provider(cls, v: Any) -> str: if v is None: return "mistral" provider = str(v).strip().lower() - if provider not in {"mistral", "openai"}: - raise ValueError("voice_provider must be one of ['mistral', 'openai']") + if provider not in {"mistral", "openai", "local"}: + raise ValueError( + "voice_provider must be one of ['mistral', 'openai', 'local']" + ) return provider @field_validator("project_threads_chat_id", mode="before") @@ -503,6 +520,8 @@ def resolved_voice_model(self) -> str: return self.voice_transcription_model if self.voice_provider == "openai": return "whisper-1" + if self.voice_provider == "local": + return self.whisper_cpp_model_path or "base" return "voxtral-mini-latest" @property @@ -515,6 +534,8 @@ def voice_provider_api_key_env(self) -> str: """API key environment variable required for the configured voice provider.""" if self.voice_provider == "openai": return "OPENAI_API_KEY" + if self.voice_provider == "local": + return "" return "MISTRAL_API_KEY" @property @@ -522,4 +543,21 @@ def voice_provider_display_name(self) -> str: """Human-friendly label for the configured voice provider.""" if self.voice_provider == "openai": return "OpenAI 
Whisper" + if self.voice_provider == "local": + return "Local whisper.cpp" return "Mistral Voxtral" + + @property + def resolved_whisper_cpp_binary(self) -> str: + """Resolve whisper.cpp binary path, defaulting to 'whisper-cpp' on PATH.""" + return self.whisper_cpp_binary_path or "whisper-cpp" + + @property + def resolved_whisper_cpp_model_path(self) -> str: + """Resolve whisper.cpp model file path from name or explicit path.""" + path_or_name = self.whisper_cpp_model_path or "base" + if "/" in path_or_name or path_or_name.endswith(".bin"): + return path_or_name + return str( + Path.home() / ".cache" / "whisper-cpp" / f"ggml-{path_or_name}.bin" + ) diff --git a/tests/unit/test_bot/test_message_voice.py b/tests/unit/test_bot/test_message_voice.py index d6f2deb6..e5bfe1a7 100644 --- a/tests/unit/test_bot/test_message_voice.py +++ b/tests/unit/test_bot/test_message_voice.py @@ -53,3 +53,28 @@ async def test_handle_voice_missing_handler_uses_mistral_key(tmp_path): call_args = update.message.reply_text.call_args assert "MISTRAL_API_KEY" in call_args.args[0] + + +async def test_handle_voice_missing_handler_local_provider(tmp_path): + """Classic handler fallback shows whisper.cpp guidance for local provider.""" + settings = create_test_config( + approved_directory=str(tmp_path), + voice_provider="local", + ) + + features = MagicMock() + features.get_voice_handler.return_value = None + + update = MagicMock() + update.effective_user.id = 123 + update.message.reply_text = AsyncMock() + + context = MagicMock() + context.bot_data = {"settings": settings, "features": features} + context.user_data = {} + + await handle_voice(update, context) + + call_args = update.message.reply_text.call_args + assert "WHISPER_CPP_BINARY_PATH" in call_args.args[0] + assert call_args.kwargs["parse_mode"] == "HTML" diff --git a/tests/unit/test_bot/test_voice_handler.py b/tests/unit/test_bot/test_voice_handler.py index 2caddf86..d97237a4 100644 --- a/tests/unit/test_bot/test_voice_handler.py +++ 
b/tests/unit/test_bot/test_voice_handler.py @@ -1,9 +1,11 @@ """Tests for voice handler feature.""" +import asyncio import sys from datetime import timedelta +from pathlib import Path from types import SimpleNamespace -from unittest.mock import AsyncMock, MagicMock +from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -352,3 +354,180 @@ async def test_transcribe_openai_reuses_cached_client(openai_voice_handler): openai_ctor.assert_called_once_with(api_key="test-openai-key") assert mock_transcriptions.create.await_count == 2 + + +# --- Local whisper.cpp provider tests --- + + +@pytest.fixture +def local_config(): + """Create a mock config with local whisper.cpp settings.""" + cfg = MagicMock() + cfg.voice_provider = "local" + cfg.resolved_whisper_cpp_binary = "whisper-cpp" + cfg.resolved_whisper_cpp_model_path = "/tmp/models/ggml-base.bin" + cfg.voice_max_file_size_mb = 20 + cfg.voice_max_file_size_bytes = 20 * 1024 * 1024 + return cfg + + +@pytest.fixture +def local_voice_handler(local_config): + """Create a VoiceHandler instance with local config.""" + return VoiceHandler(config=local_config) + + +async def test_process_voice_message_local_dispatches(local_voice_handler): + """process_voice_message routes to _transcribe_local for local provider.""" + voice = _mock_voice(duration=5) + local_voice_handler._transcribe_local = AsyncMock( + return_value="Local transcription" + ) + + result = await local_voice_handler.process_voice_message(voice) + + assert isinstance(result, ProcessedVoice) + assert result.transcription == "Local transcription" + assert result.duration == 5 + local_voice_handler._transcribe_local.assert_awaited_once() + + +async def test_transcribe_local_runs_ffmpeg_and_whisper(local_voice_handler): + """Local transcription converts OGG->WAV then calls whisper.cpp binary.""" + mock_ffmpeg = AsyncMock() + mock_ffmpeg.communicate = AsyncMock(return_value=(b"", b"")) + mock_ffmpeg.returncode = 0 + + mock_whisper = AsyncMock() + 
mock_whisper.communicate = AsyncMock(return_value=(b"Hello world", b"")) + mock_whisper.returncode = 0 + + call_count = 0 + + async def fake_subprocess(*args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + return mock_ffmpeg + return mock_whisper + + with ( + patch("shutil.which", return_value="/usr/bin/whisper-cpp"), + patch( + "src.bot.features.voice_handler.Path.is_file", + return_value=True, + ), + patch( + "asyncio.create_subprocess_exec", + side_effect=fake_subprocess, + ), + ): + result = await local_voice_handler._transcribe_local(b"fake-ogg-bytes") + + assert result == "Hello world" + assert call_count == 2 + + +async def test_transcribe_local_ffmpeg_not_found(local_voice_handler): + """Missing ffmpeg gives a clear install hint.""" + with ( + patch("shutil.which", return_value="/usr/bin/whisper-cpp"), + patch( + "src.bot.features.voice_handler.Path.is_file", + return_value=True, + ), + patch( + "asyncio.create_subprocess_exec", + side_effect=FileNotFoundError, + ), + ): + with pytest.raises(RuntimeError, match="ffmpeg is required"): + await local_voice_handler._transcribe_local(b"fake-ogg") + + +async def test_transcribe_local_model_not_found(local_voice_handler): + """Missing model file raises a clear error with download hint.""" + with ( + patch("shutil.which", return_value="/usr/bin/whisper-cpp"), + patch( + "src.bot.features.voice_handler.Path.is_file", + return_value=False, + ), + ): + with pytest.raises(RuntimeError, match="model not found"): + await local_voice_handler._transcribe_local(b"fake-ogg") + + +async def test_transcribe_local_whisper_binary_not_found(local_voice_handler): + """Missing whisper.cpp binary raises a clear error.""" + with patch("shutil.which", return_value=None): + with pytest.raises(RuntimeError, match="not found on PATH"): + await local_voice_handler._transcribe_local(b"fake-ogg") + + +async def test_transcribe_local_empty_response(local_voice_handler): + """Empty whisper.cpp output raises 
ValueError.""" + mock_ffmpeg = AsyncMock() + mock_ffmpeg.communicate = AsyncMock(return_value=(b"", b"")) + mock_ffmpeg.returncode = 0 + + mock_whisper = AsyncMock() + mock_whisper.communicate = AsyncMock(return_value=(b" ", b"")) + mock_whisper.returncode = 0 + + call_count = 0 + + async def fake_subprocess(*args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + return mock_ffmpeg + return mock_whisper + + with ( + patch("shutil.which", return_value="/usr/bin/whisper-cpp"), + patch( + "src.bot.features.voice_handler.Path.is_file", + return_value=True, + ), + patch( + "asyncio.create_subprocess_exec", + side_effect=fake_subprocess, + ), + ): + with pytest.raises(ValueError, match="empty response"): + await local_voice_handler._transcribe_local(b"fake-ogg") + + +async def test_transcribe_local_whisper_nonzero_exit(local_voice_handler): + """Non-zero whisper.cpp exit code raises RuntimeError.""" + mock_ffmpeg = AsyncMock() + mock_ffmpeg.communicate = AsyncMock(return_value=(b"", b"")) + mock_ffmpeg.returncode = 0 + + mock_whisper = AsyncMock() + mock_whisper.communicate = AsyncMock(return_value=(b"", b"model load fail")) + mock_whisper.returncode = 1 + + call_count = 0 + + async def fake_subprocess(*args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + return mock_ffmpeg + return mock_whisper + + with ( + patch("shutil.which", return_value="/usr/bin/whisper-cpp"), + patch( + "src.bot.features.voice_handler.Path.is_file", + return_value=True, + ), + patch( + "asyncio.create_subprocess_exec", + side_effect=fake_subprocess, + ), + ): + with pytest.raises(RuntimeError, match="transcription failed"): + await local_voice_handler._transcribe_local(b"fake-ogg") diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 6b20c6fe..2f0dcd9e 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -457,7 +457,7 @@ def test_project_threads_validation_invalid_mode(tmp_path): def 
test_voice_provider_validation_and_normalization(tmp_path): - """VOICE_PROVIDER accepts only mistral/openai and normalizes casing.""" + """VOICE_PROVIDER accepts mistral/openai/local and normalizes casing.""" project_dir = tmp_path / "projects" project_dir.mkdir() @@ -483,6 +483,26 @@ def test_voice_provider_validation_and_normalization(tmp_path): assert "voice_provider must be one of" in str(exc_info.value) +def test_voice_provider_local_requires_no_api_key(tmp_path): + """VOICE_PROVIDER=local needs no API key and has correct display properties.""" + project_dir = tmp_path / "projects" + project_dir.mkdir() + + settings = Settings( + telegram_bot_token="test_token", + telegram_bot_username="test_bot", + approved_directory=str(project_dir), + voice_provider="local", + ) + + assert settings.voice_provider == "local" + assert settings.voice_provider_api_key_env == "" + assert settings.voice_provider_display_name == "Local whisper.cpp" + assert settings.resolved_voice_model == "base" + assert settings.resolved_whisper_cpp_binary == "whisper-cpp" + assert settings.resolved_whisper_cpp_model_path.endswith("ggml-base.bin") + + def test_voice_max_file_size_configuration(tmp_path): """Voice max file size should be configurable and validated.""" project_dir = tmp_path / "projects"