diff --git a/.env.example b/.env.example
index dfd70908..8c59a4b4 100644
--- a/.env.example
+++ b/.env.example
@@ -140,6 +140,34 @@ QUICK_ACTIONS_TIMEOUT=120
# Git operations timeout in seconds
GIT_OPERATIONS_TIMEOUT=30
+# === VOICE TRANSCRIPTION ===
+# Enable voice message transcription
+ENABLE_VOICE_MESSAGES=true
+
+# Voice transcription provider: mistral, openai, or local
+# - mistral: Uses Mistral Voxtral (requires MISTRAL_API_KEY)
+# - openai: Uses OpenAI Whisper API (requires OPENAI_API_KEY)
+# - local: Uses whisper.cpp binary (requires ffmpeg + whisper.cpp installed)
+VOICE_PROVIDER=mistral
+
+# API keys (only needed for cloud providers)
+MISTRAL_API_KEY=
+OPENAI_API_KEY=
+
+# Override transcription model (optional)
+# Defaults: voxtral-mini-latest (mistral), whisper-1 (openai), base (local)
+VOICE_TRANSCRIPTION_MODEL=
+
+# Maximum voice message size in MB
+VOICE_MAX_FILE_SIZE_MB=20
+
+# Local whisper.cpp settings (only used when VOICE_PROVIDER=local)
+# Path to whisper.cpp binary (auto-detected from PATH if unset)
+WHISPER_CPP_BINARY_PATH=
+# Path to GGML model file, or model name like "base", "small", "medium"
+# Named models resolve to ~/.cache/whisper-cpp/ggml-{name}.bin
+WHISPER_CPP_MODEL_PATH=base
+
# === PROJECT THREAD MODE ===
# Enable strict routing by Telegram project topics
ENABLE_PROJECT_THREADS=false
diff --git a/CLAUDE.md b/CLAUDE.md
index 0917d335..b29f5871 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -102,7 +102,7 @@ Multi-project topics: `ENABLE_PROJECT_THREADS` (default false), `PROJECT_THREADS
Output verbosity: `VERBOSE_LEVEL` (default 1, range 0-2). Controls how much of Claude's background activity is shown to the user in real-time. 0 = quiet (only final response, typing indicator still active), 1 = normal (tool names + reasoning snippets shown during execution), 2 = detailed (tool names with input summaries + longer reasoning text). Users can override per-session via `/verbose 0|1|2`. A persistent typing indicator is refreshed every ~2 seconds at all levels.
-Voice transcription: `ENABLE_VOICE_MESSAGES` (default true), `VOICE_PROVIDER` (`mistral`|`openai`, default `mistral`), `MISTRAL_API_KEY`, `OPENAI_API_KEY`, `VOICE_TRANSCRIPTION_MODEL`. Provider implementation is in `src/bot/features/voice_handler.py`.
+Voice transcription: `ENABLE_VOICE_MESSAGES` (default true), `VOICE_PROVIDER` (`mistral`|`openai`|`local`, default `mistral`), `MISTRAL_API_KEY`, `OPENAI_API_KEY`, `VOICE_TRANSCRIPTION_MODEL`. For local provider: `WHISPER_CPP_BINARY_PATH`, `WHISPER_CPP_MODEL_PATH` (requires ffmpeg + whisper.cpp installed). Provider implementation is in `src/bot/features/voice_handler.py`.
Feature flags in `src/config/features.py` control: MCP, git integration, file uploads, quick actions, session export, image uploads, voice messages, conversation mode, agentic mode, API server, scheduler.
diff --git a/README.md b/README.md
index 34ca52d3..0bd27ae2 100644
--- a/README.md
+++ b/README.md
@@ -194,7 +194,7 @@ Enable with `ENABLE_API_SERVER=true` and `ENABLE_SCHEDULER=true`. See [docs/setu
- Directory sandboxing with path traversal prevention
- File upload handling with archive extraction
- Image/screenshot upload with analysis
-- Voice message transcription (Mistral Voxtral / OpenAI Whisper)
+- Voice message transcription (Mistral Voxtral / OpenAI Whisper / [local whisper.cpp](docs/local-whisper-cpp.md))
- Git integration with safe repository operations
- Quick actions system with context-aware buttons
- Session export in Markdown, HTML, and JSON formats
diff --git a/docs/configuration.md b/docs/configuration.md
index 2098f8be..2bba7d9f 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -135,11 +135,15 @@ ENABLE_QUICK_ACTIONS=true
# Enable voice message transcription
ENABLE_VOICE_MESSAGES=true
-VOICE_PROVIDER=mistral # 'mistral' (default) or 'openai'
+VOICE_PROVIDER=mistral # 'mistral', 'openai', or 'local'
MISTRAL_API_KEY= # Required when VOICE_PROVIDER=mistral
OPENAI_API_KEY= # Required when VOICE_PROVIDER=openai
VOICE_TRANSCRIPTION_MODEL= # Default: voxtral-mini-latest (Mistral) or whisper-1 (OpenAI)
VOICE_MAX_FILE_SIZE_MB=20 # Max Telegram voice file size to download (1-200MB)
+
+# Local whisper.cpp settings (only used when VOICE_PROVIDER=local)
+WHISPER_CPP_BINARY_PATH= # Path to whisper.cpp binary (auto-detected from PATH if unset)
+WHISPER_CPP_MODEL_PATH=base # Path to GGML model file or model name (base, small, medium, large)
```
#### Agentic Platform
diff --git a/docs/local-whisper-cpp.md b/docs/local-whisper-cpp.md
new file mode 100644
index 00000000..b23773cb
--- /dev/null
+++ b/docs/local-whisper-cpp.md
@@ -0,0 +1,170 @@
+# Local Voice Transcription with whisper.cpp
+
+This guide explains how to build and configure [whisper.cpp](https://github.com/ggerganov/whisper.cpp) for **offline** voice message transcription — no API keys or cloud services required.
+
+## Overview
+
+When `VOICE_PROVIDER=local` the bot transcribes Telegram voice messages entirely on your machine using:
+
+| Component | Purpose |
+|---|---|
+| **ffmpeg** | Converts Telegram OGG/Opus audio to 16 kHz mono WAV |
+| **whisper.cpp** | Runs OpenAI's Whisper model locally via optimised C/C++ |
+| **GGML model** | Quantised model weights (downloaded once) |
+
+## Prerequisites
+
+- A C/C++ toolchain (`gcc`/`clang`, `cmake`, `make`)
+- `ffmpeg` installed and on PATH
+- ~150 MB disk space for the `base` model (~1.5 GB for `medium`)
+
+## 1. Install ffmpeg
+
+### Ubuntu / Debian
+
+```bash
+sudo apt update && sudo apt install -y ffmpeg
+```
+
+### macOS (Homebrew)
+
+```bash
+brew install ffmpeg
+```
+
+### Alpine
+
+```bash
+apk add ffmpeg
+```
+
+Verify:
+
+```bash
+ffmpeg -version
+```
+
+## 2. Build whisper.cpp from source
+
+```bash
+# Clone the repository
+git clone https://github.com/ggerganov/whisper.cpp.git
+cd whisper.cpp
+
+# Build with CMake (recommended)
+cmake -B build
+cmake --build build --config Release
+
+# The binary is at build/bin/whisper-cli (or build/bin/main on older versions)
+ls build/bin/whisper-cli
+```
+
+> **Tip:** For GPU acceleration on NVIDIA add `-DGGML_CUDA=1` to the cmake configure step (older whisper.cpp versions used `-DWHISPER_CUBLAS=ON`). On Apple Silicon, Metal acceleration is enabled by default.
+
+### Install system-wide (optional)
+
+```bash
+sudo cp build/bin/whisper-cli /usr/local/bin/whisper-cpp
+```
+
+Or add the build directory to your `PATH`:
+
+```bash
+export PATH="$PWD/build/bin:$PATH"
+```
+
+## 3. Download a GGML model
+
+Models are hosted on Hugging Face. Pick one based on your hardware:
+
+| Model | Size | RAM (approx.) | Quality |
+|---|---|---|---|
+| `tiny` | ~75 MB | ~400 MB | Fast but lower accuracy |
+| `base` | ~142 MB | ~500 MB | Good balance (default) |
+| `small` | ~466 MB | ~1 GB | Better accuracy |
+| `medium` | ~1.5 GB | ~2.5 GB | High accuracy |
+| `large-v3` | ~3 GB | ~5 GB | Best accuracy, slow on CPU |
+
+```bash
+# Create the model cache directory
+mkdir -p ~/.cache/whisper-cpp
+
+# Download the base model (recommended starting point)
+curl -L -o ~/.cache/whisper-cpp/ggml-base.bin \
+ https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin
+
+# Or download small for better accuracy
+curl -L -o ~/.cache/whisper-cpp/ggml-small.bin \
+ https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin
+```
+
+## 4. Configure the bot
+
+Add the following to your `.env`:
+
+```bash
+# Enable voice transcription with local provider
+ENABLE_VOICE_MESSAGES=true
+VOICE_PROVIDER=local
+
+# Path to the whisper.cpp binary (omit if already on PATH as "whisper-cpp")
+WHISPER_CPP_BINARY_PATH=/usr/local/bin/whisper-cpp
+
+# Model: a name like "base", "small", "medium" or a full file path
+# Named models resolve to ~/.cache/whisper-cpp/ggml-{name}.bin
+WHISPER_CPP_MODEL_PATH=base
+```
+
+### Minimal configuration
+
+If `whisper-cpp` is on your PATH and you downloaded the `base` model to the default location, you only need:
+
+```bash
+VOICE_PROVIDER=local
+```
+
+## 5. Verify the setup
+
+```bash
+# Test ffmpeg conversion
+ffmpeg -f lavfi -i "sine=frequency=440:duration=2" -ar 16000 -ac 1 /tmp/test.wav -y
+
+# Test whisper.cpp
+whisper-cpp -m ~/.cache/whisper-cpp/ggml-base.bin -f /tmp/test.wav --no-timestamps
+```
+
+You should see a transcription attempt (it will be empty or nonsensical for a sine wave, but the binary should run without errors).
+
+## Troubleshooting
+
+### `whisper.cpp binary not found on PATH`
+
+The bot could not locate the binary. Either:
+- Install it system-wide: `sudo cp build/bin/whisper-cli /usr/local/bin/whisper-cpp`
+- Or set the full path: `WHISPER_CPP_BINARY_PATH=/path/to/whisper-cli`
+
+### `whisper.cpp model not found`
+
+The model file does not exist at the expected path. Download it:
+
+```bash
+mkdir -p ~/.cache/whisper-cpp
+curl -L -o ~/.cache/whisper-cpp/ggml-base.bin \
+ https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin
+```
+
+### `ffmpeg is required but was not found`
+
+Install ffmpeg for your platform (see step 1 above).
+
+### Poor transcription quality
+
+- Try a larger model (`small` or `medium` instead of `base`)
+- Ensure audio is not too short (< 1 second) or too noisy
+- The bot passes `-l auto` to whisper.cpp for automatic language detection; this works well for most languages
+
+### High CPU usage / slow transcription
+
+- Use a smaller model (`tiny` or `base`)
+- Enable GPU acceleration when building whisper.cpp (CUDA / Metal)
+- Consider using the `mistral` or `openai` cloud providers for faster results on low-powered machines
diff --git a/docs/setup.md b/docs/setup.md
index 5b0670bb..acb7f906 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -197,12 +197,23 @@ VOICE_PROVIDER=openai
OPENAI_API_KEY=your-openai-api-key
```
-If you installed via pip/uv, make sure voice extras are installed:
+**Local whisper.cpp (offline, no API key needed):**
+```bash
+VOICE_PROVIDER=local
+# Optional — auto-detected from PATH if unset
+WHISPER_CPP_BINARY_PATH=/usr/local/bin/whisper-cpp
+# Model name ("base", "small", "medium") or full path to .bin file
+WHISPER_CPP_MODEL_PATH=base
+```
+
+Requires `ffmpeg` and a locally built `whisper.cpp` binary. See the full [local whisper.cpp setup guide](local-whisper-cpp.md) for build instructions and model downloads.
+
+If you installed via pip/uv, make sure voice extras are installed (cloud providers only):
```bash
pip install "claude-code-telegram[voice]"
```
-Optionally override the transcription model with `VOICE_TRANSCRIPTION_MODEL` (defaults to `voxtral-mini-latest` for Mistral, `whisper-1` for OpenAI).
+Optionally override the transcription model with `VOICE_TRANSCRIPTION_MODEL` (defaults to `voxtral-mini-latest` for Mistral, `whisper-1` for OpenAI, `base` for local).
### Notification Recipients
diff --git a/src/bot/features/registry.py b/src/bot/features/registry.py
index 953c228b..43e7e0e0 100644
--- a/src/bot/features/registry.py
+++ b/src/bot/features/registry.py
@@ -78,10 +78,14 @@ def _initialize_features(self):
except Exception as e:
logger.error("Failed to initialize image handler", error=str(e))
- # Voice transcription - requires provider-specific API key
+ # Voice transcription - requires provider-specific API key (or local)
voice_key_available = (
+ self.config.voice_provider == "local"
+ ) or (
self.config.voice_provider == "openai" and self.config.openai_api_key
- ) or (self.config.voice_provider == "mistral" and self.config.mistral_api_key)
+ ) or (
+ self.config.voice_provider == "mistral" and self.config.mistral_api_key
+ )
if self.config.enable_voice_messages and voice_key_available:
try:
self.features["voice_handler"] = VoiceHandler(config=self.config)
diff --git a/src/bot/features/voice_handler.py b/src/bot/features/voice_handler.py
index 11daa10c..5ed31822 100644
--- a/src/bot/features/voice_handler.py
+++ b/src/bot/features/voice_handler.py
@@ -1,7 +1,11 @@
-"""Handle voice message transcription via Mistral (Voxtral) or OpenAI (Whisper)."""
+"""Handle voice message transcription via Mistral (Voxtral), OpenAI (Whisper), or local whisper.cpp."""
+import asyncio
+import shutil
+import tempfile
from dataclasses import dataclass
from datetime import timedelta
+from pathlib import Path
from typing import Any, Optional
import structlog
@@ -22,12 +26,16 @@ class ProcessedVoice:
class VoiceHandler:
- """Transcribe Telegram voice messages using Mistral or OpenAI."""
+ """Transcribe Telegram voice messages using Mistral, OpenAI, or local whisper.cpp."""
+
+ # Timeout (seconds) for ffmpeg and whisper.cpp subprocess calls.
+ LOCAL_SUBPROCESS_TIMEOUT: int = 120
def __init__(self, config: Settings):
self.config = config
self._mistral_client: Optional[Any] = None
self._openai_client: Optional[Any] = None
+ self._resolved_whisper_binary: Optional[str] = None
def _ensure_allowed_file_size(self, file_size: Optional[int]) -> None:
"""Reject files that exceed the configured max size."""
@@ -48,7 +56,7 @@ async def process_voice_message(
"""Download and transcribe a voice message.
1. Download .ogg bytes from Telegram
- 2. Call the configured transcription API (Mistral or OpenAI)
+ 2. Call the configured transcription provider (Mistral, OpenAI, or local)
3. Build a prompt combining caption + transcription
"""
initial_file_size = getattr(voice, "file_size", None)
@@ -79,7 +87,9 @@ async def process_voice_message(
file_size=initial_file_size or resolved_file_size or len(voice_bytes),
)
- if self.config.voice_provider == "openai":
+ if self.config.voice_provider == "local":
+ transcription = await self._transcribe_local(voice_bytes)
+ elif self.config.voice_provider == "openai":
transcription = await self._transcribe_openai(voice_bytes)
else:
transcription = await self._transcribe_mistral(voice_bytes)
@@ -103,6 +113,8 @@ async def process_voice_message(
duration=duration_secs,
)
+ # -- Mistral provider --
+
async def _transcribe_mistral(self, voice_bytes: bytes) -> str:
"""Transcribe audio using the Mistral API (Voxtral)."""
client = self._get_mistral_client()
@@ -147,6 +159,8 @@ def _get_mistral_client(self) -> Any:
self._mistral_client = Mistral(api_key=api_key)
return self._mistral_client
+ # -- OpenAI provider --
+
async def _transcribe_openai(self, voice_bytes: bytes) -> str:
"""Transcribe audio using the OpenAI Whisper API."""
client = self._get_openai_client()
@@ -187,3 +201,149 @@ def _get_openai_client(self) -> Any:
self._openai_client = AsyncOpenAI(api_key=api_key)
return self._openai_client
+
+ # -- Local whisper.cpp provider --
+
+ async def _transcribe_local(self, voice_bytes: bytes) -> str:
+ """Transcribe audio locally using whisper.cpp binary."""
+ binary = self._resolve_whisper_binary()
+ model_path = self.config.resolved_whisper_cpp_model_path
+
+ if not Path(model_path).is_file():
+ raise RuntimeError(
+ f"whisper.cpp model not found at {model_path}. "
+ "Download it with: "
+ "curl -L -o ~/.cache/whisper-cpp/ggml-base.bin "
+ "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
+ )
+
+ tmp_dir = None
+ try:
+ tmp_dir = tempfile.mkdtemp(prefix="voice_")
+ ogg_path = Path(tmp_dir) / "voice.ogg"
+ wav_path = Path(tmp_dir) / "voice.wav"
+
+ ogg_path.write_bytes(voice_bytes)
+
+ # Convert OGG/Opus -> WAV (16kHz mono PCM)
+ await self._convert_ogg_to_wav(ogg_path, wav_path)
+
+ # Run whisper.cpp
+ text = await self._run_whisper_cpp(binary, model_path, wav_path)
+
+ finally:
+ if tmp_dir:
+ shutil.rmtree(tmp_dir, ignore_errors=True)
+
+ text = text.strip()
+ if not text:
+ raise ValueError(
+ "Local whisper.cpp transcription returned an empty response."
+ )
+ return text
+
+ async def _convert_ogg_to_wav(self, ogg_path: Path, wav_path: Path) -> None:
+ """Convert OGG/Opus to WAV (16kHz mono PCM) using ffmpeg."""
+ try:
+ process = await asyncio.create_subprocess_exec(
+ "ffmpeg",
+ "-i",
+ str(ogg_path),
+ "-ar",
+ "16000",
+ "-ac",
+ "1",
+ "-f",
+ "wav",
+ str(wav_path),
+ "-y",
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ )
+ _, stderr = await asyncio.wait_for(
+ process.communicate(),
+ timeout=self.LOCAL_SUBPROCESS_TIMEOUT,
+ )
+
+ if process.returncode != 0:
+ raise RuntimeError(
+ f"ffmpeg conversion failed (exit {process.returncode}): "
+ f"{stderr.decode()[:200]}"
+ )
+ except asyncio.TimeoutError:
+ process.kill()
+ raise RuntimeError(
+ f"ffmpeg conversion timed out after {self.LOCAL_SUBPROCESS_TIMEOUT}s."
+ )
+ except FileNotFoundError:
+ raise RuntimeError(
+ "ffmpeg is required for local voice transcription but was not found. "
+ "Install it with: apt install ffmpeg"
+ )
+
+ async def _run_whisper_cpp(
+ self, binary: str, model_path: str, wav_path: Path
+ ) -> str:
+ """Execute whisper.cpp binary and return transcription text."""
+ try:
+ process = await asyncio.create_subprocess_exec(
+ binary,
+ "-m",
+ model_path,
+ "-f",
+ str(wav_path),
+ "--no-timestamps",
+ "-l",
+ "auto",
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ )
+ stdout, stderr = await asyncio.wait_for(
+ process.communicate(),
+ timeout=self.LOCAL_SUBPROCESS_TIMEOUT,
+ )
+
+ if process.returncode != 0:
+ logger.warning(
+ "whisper.cpp transcription failed",
+ return_code=process.returncode,
+ stderr=stderr.decode()[:300],
+ )
+ raise RuntimeError("Local whisper.cpp transcription failed.")
+
+ return stdout.decode()
+
+ except asyncio.TimeoutError:
+ process.kill()
+ raise RuntimeError(
+ f"whisper.cpp transcription timed out after "
+ f"{self.LOCAL_SUBPROCESS_TIMEOUT}s."
+ )
+ except FileNotFoundError:
+ raise RuntimeError(
+ f"whisper.cpp binary not found at '{binary}'. "
+ "Set WHISPER_CPP_BINARY_PATH or install whisper.cpp."
+ )
+ except RuntimeError:
+ raise
+ except Exception as exc:
+ logger.warning(
+ "whisper.cpp transcription request failed",
+ error_type=type(exc).__name__,
+ )
+ raise RuntimeError("Local whisper.cpp transcription failed.") from exc
+
+ def _resolve_whisper_binary(self) -> str:
+ """Resolve and validate the whisper.cpp binary path on first use."""
+ if self._resolved_whisper_binary is not None:
+ return self._resolved_whisper_binary
+
+ binary = self.config.resolved_whisper_cpp_binary
+ resolved = shutil.which(binary)
+ if not resolved:
+ raise RuntimeError(
+ f"whisper.cpp binary '{binary}' not found on PATH. "
+ "Set WHISPER_CPP_BINARY_PATH to the full path."
+ )
+ self._resolved_whisper_binary = resolved
+ return resolved
diff --git a/src/bot/handlers/message.py b/src/bot/handlers/message.py
index e5fa9f78..935d7917 100644
--- a/src/bot/handlers/message.py
+++ b/src/bot/handlers/message.py
@@ -1021,15 +1021,26 @@ async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
voice_handler = features.get_voice_handler() if features else None
if not voice_handler:
- await update.message.reply_text(
- "🎙️ Voice Messages\n\n"
- "Voice transcription is not available.\n"
- f"Provider: {settings.voice_provider_display_name}\n"
- f"Set {settings.voice_provider_api_key_env} to enable.\n"
- "Install optional voice deps with "
- 'pip install "claude-code-telegram[voice]".',
- parse_mode="HTML",
- )
+ if settings.voice_provider == "local":
+ await update.message.reply_text(
+ "🎙️ Voice Messages\n\n"
+ "Voice transcription is not available.\n"
+ "Provider: Local whisper.cpp\n"
+ "Ensure whisper.cpp is installed and model file exists.\n"
+ "Set WHISPER_CPP_BINARY_PATH and "
+ "WHISPER_CPP_MODEL_PATH if needed.",
+ parse_mode="HTML",
+ )
+ else:
+ await update.message.reply_text(
+ "🎙️ Voice Messages\n\n"
+ "Voice transcription is not available.\n"
+ f"Provider: {settings.voice_provider_display_name}\n"
+ f"Set {settings.voice_provider_api_key_env} to enable.\n"
+ "Install optional voice deps with "
+ 'pip install "claude-code-telegram[voice]".',
+ parse_mode="HTML",
+ )
return
try:
diff --git a/src/bot/orchestrator.py b/src/bot/orchestrator.py
index ac1d5304..609e42d1 100644
--- a/src/bot/orchestrator.py
+++ b/src/bot/orchestrator.py
@@ -1452,6 +1452,12 @@ async def _handle_agentic_media_message(
def _voice_unavailable_message(self) -> str:
"""Return provider-aware guidance when voice feature is unavailable."""
+ if self.settings.voice_provider == "local":
+ return (
+ "Voice processing is not available. "
+ "Ensure whisper.cpp is installed and the model file exists. "
+ "Check WHISPER_CPP_BINARY_PATH and WHISPER_CPP_MODEL_PATH settings."
+ )
return (
"Voice processing is not available. "
f"Set {self.settings.voice_provider_api_key_env} "
diff --git a/src/config/features.py b/src/config/features.py
index 03b54a86..e5561d40 100644
--- a/src/config/features.py
+++ b/src/config/features.py
@@ -76,6 +76,8 @@ def voice_messages_enabled(self) -> bool:
"""Check if voice message transcription is enabled."""
if not self.settings.enable_voice_messages:
return False
+ if self.settings.voice_provider == "local":
+ return True # No API key needed for local whisper.cpp
if self.settings.voice_provider == "openai":
return self.settings.openai_api_key is not None
return self.settings.mistral_api_key is not None
diff --git a/src/config/settings.py b/src/config/settings.py
index 77c34ea4..f276168c 100644
--- a/src/config/settings.py
+++ b/src/config/settings.py
@@ -169,9 +169,9 @@ class Settings(BaseSettings):
enable_voice_messages: bool = Field(
True, description="Enable voice message transcription"
)
- voice_provider: Literal["mistral", "openai"] = Field(
+ voice_provider: Literal["mistral", "openai", "local"] = Field(
"mistral",
- description="Voice transcription provider: 'mistral' or 'openai'",
+ description="Voice transcription provider: 'mistral', 'openai', or 'local'",
)
mistral_api_key: Optional[SecretStr] = Field(
None, description="Mistral API key for voice transcription"
@@ -195,6 +195,21 @@ class Settings(BaseSettings):
ge=1,
le=200,
)
+ whisper_cpp_binary_path: Optional[str] = Field(
+ None,
+ description=(
+ "Path to whisper.cpp binary. "
+ "Required when VOICE_PROVIDER=local. Auto-detected from PATH if unset."
+ ),
+ )
+ whisper_cpp_model_path: Optional[str] = Field(
+ None,
+ description=(
+ "Path to whisper.cpp GGML model file, or model name "
+ "(e.g. 'base', 'small'). Defaults to 'base'. "
+ "Named models resolve to ~/.cache/whisper-cpp/ggml-{name}.bin"
+ ),
+ )
enable_quick_actions: bool = Field(True, description="Enable quick action buttons")
agentic_mode: bool = Field(
True,
@@ -395,8 +410,10 @@ def validate_voice_provider(cls, v: Any) -> str:
if v is None:
return "mistral"
provider = str(v).strip().lower()
- if provider not in {"mistral", "openai"}:
- raise ValueError("voice_provider must be one of ['mistral', 'openai']")
+ if provider not in {"mistral", "openai", "local"}:
+ raise ValueError(
+ "voice_provider must be one of ['mistral', 'openai', 'local']"
+ )
return provider
@field_validator("project_threads_chat_id", mode="before")
@@ -503,6 +520,8 @@ def resolved_voice_model(self) -> str:
return self.voice_transcription_model
if self.voice_provider == "openai":
return "whisper-1"
+ if self.voice_provider == "local":
+ return self.whisper_cpp_model_path or "base"
return "voxtral-mini-latest"
@property
@@ -515,6 +534,8 @@ def voice_provider_api_key_env(self) -> str:
"""API key environment variable required for the configured voice provider."""
if self.voice_provider == "openai":
return "OPENAI_API_KEY"
+ if self.voice_provider == "local":
+ return ""
return "MISTRAL_API_KEY"
@property
@@ -522,4 +543,21 @@ def voice_provider_display_name(self) -> str:
"""Human-friendly label for the configured voice provider."""
if self.voice_provider == "openai":
return "OpenAI Whisper"
+ if self.voice_provider == "local":
+ return "Local whisper.cpp"
return "Mistral Voxtral"
+
+ @property
+ def resolved_whisper_cpp_binary(self) -> str:
+ """Resolve whisper.cpp binary path, defaulting to 'whisper-cpp' on PATH."""
+ return self.whisper_cpp_binary_path or "whisper-cpp"
+
+ @property
+ def resolved_whisper_cpp_model_path(self) -> str:
+ """Resolve whisper.cpp model file path from name or explicit path."""
+ path_or_name = self.whisper_cpp_model_path or "base"
+ if "/" in path_or_name or path_or_name.endswith(".bin"):
+ return path_or_name
+ return str(
+ Path.home() / ".cache" / "whisper-cpp" / f"ggml-{path_or_name}.bin"
+ )
diff --git a/tests/unit/test_bot/test_message_voice.py b/tests/unit/test_bot/test_message_voice.py
index d6f2deb6..e5bfe1a7 100644
--- a/tests/unit/test_bot/test_message_voice.py
+++ b/tests/unit/test_bot/test_message_voice.py
@@ -53,3 +53,28 @@ async def test_handle_voice_missing_handler_uses_mistral_key(tmp_path):
call_args = update.message.reply_text.call_args
assert "MISTRAL_API_KEY" in call_args.args[0]
+
+
+async def test_handle_voice_missing_handler_local_provider(tmp_path):
+ """Classic handler fallback shows whisper.cpp guidance for local provider."""
+ settings = create_test_config(
+ approved_directory=str(tmp_path),
+ voice_provider="local",
+ )
+
+ features = MagicMock()
+ features.get_voice_handler.return_value = None
+
+ update = MagicMock()
+ update.effective_user.id = 123
+ update.message.reply_text = AsyncMock()
+
+ context = MagicMock()
+ context.bot_data = {"settings": settings, "features": features}
+ context.user_data = {}
+
+ await handle_voice(update, context)
+
+ call_args = update.message.reply_text.call_args
+ assert "WHISPER_CPP_BINARY_PATH" in call_args.args[0]
+ assert call_args.kwargs["parse_mode"] == "HTML"
diff --git a/tests/unit/test_bot/test_voice_handler.py b/tests/unit/test_bot/test_voice_handler.py
index 2caddf86..d97237a4 100644
--- a/tests/unit/test_bot/test_voice_handler.py
+++ b/tests/unit/test_bot/test_voice_handler.py
@@ -1,9 +1,11 @@
"""Tests for voice handler feature."""
+import asyncio
import sys
from datetime import timedelta
+from pathlib import Path
from types import SimpleNamespace
-from unittest.mock import AsyncMock, MagicMock
+from unittest.mock import AsyncMock, MagicMock, patch
import pytest
@@ -352,3 +354,180 @@ async def test_transcribe_openai_reuses_cached_client(openai_voice_handler):
openai_ctor.assert_called_once_with(api_key="test-openai-key")
assert mock_transcriptions.create.await_count == 2
+
+
+# --- Local whisper.cpp provider tests ---
+
+
+@pytest.fixture
+def local_config():
+ """Create a mock config with local whisper.cpp settings."""
+ cfg = MagicMock()
+ cfg.voice_provider = "local"
+ cfg.resolved_whisper_cpp_binary = "whisper-cpp"
+ cfg.resolved_whisper_cpp_model_path = "/tmp/models/ggml-base.bin"
+ cfg.voice_max_file_size_mb = 20
+ cfg.voice_max_file_size_bytes = 20 * 1024 * 1024
+ return cfg
+
+
+@pytest.fixture
+def local_voice_handler(local_config):
+ """Create a VoiceHandler instance with local config."""
+ return VoiceHandler(config=local_config)
+
+
+async def test_process_voice_message_local_dispatches(local_voice_handler):
+ """process_voice_message routes to _transcribe_local for local provider."""
+ voice = _mock_voice(duration=5)
+ local_voice_handler._transcribe_local = AsyncMock(
+ return_value="Local transcription"
+ )
+
+ result = await local_voice_handler.process_voice_message(voice)
+
+ assert isinstance(result, ProcessedVoice)
+ assert result.transcription == "Local transcription"
+ assert result.duration == 5
+ local_voice_handler._transcribe_local.assert_awaited_once()
+
+
+async def test_transcribe_local_runs_ffmpeg_and_whisper(local_voice_handler):
+ """Local transcription converts OGG->WAV then calls whisper.cpp binary."""
+ mock_ffmpeg = AsyncMock()
+ mock_ffmpeg.communicate = AsyncMock(return_value=(b"", b""))
+ mock_ffmpeg.returncode = 0
+
+ mock_whisper = AsyncMock()
+ mock_whisper.communicate = AsyncMock(return_value=(b"Hello world", b""))
+ mock_whisper.returncode = 0
+
+ call_count = 0
+
+ async def fake_subprocess(*args, **kwargs):
+ nonlocal call_count
+ call_count += 1
+ if call_count == 1:
+ return mock_ffmpeg
+ return mock_whisper
+
+ with (
+ patch("shutil.which", return_value="/usr/bin/whisper-cpp"),
+ patch(
+ "src.bot.features.voice_handler.Path.is_file",
+ return_value=True,
+ ),
+ patch(
+ "asyncio.create_subprocess_exec",
+ side_effect=fake_subprocess,
+ ),
+ ):
+ result = await local_voice_handler._transcribe_local(b"fake-ogg-bytes")
+
+ assert result == "Hello world"
+ assert call_count == 2
+
+
+async def test_transcribe_local_ffmpeg_not_found(local_voice_handler):
+ """Missing ffmpeg gives a clear install hint."""
+ with (
+ patch("shutil.which", return_value="/usr/bin/whisper-cpp"),
+ patch(
+ "src.bot.features.voice_handler.Path.is_file",
+ return_value=True,
+ ),
+ patch(
+ "asyncio.create_subprocess_exec",
+ side_effect=FileNotFoundError,
+ ),
+ ):
+ with pytest.raises(RuntimeError, match="ffmpeg is required"):
+ await local_voice_handler._transcribe_local(b"fake-ogg")
+
+
+async def test_transcribe_local_model_not_found(local_voice_handler):
+ """Missing model file raises a clear error with download hint."""
+ with (
+ patch("shutil.which", return_value="/usr/bin/whisper-cpp"),
+ patch(
+ "src.bot.features.voice_handler.Path.is_file",
+ return_value=False,
+ ),
+ ):
+ with pytest.raises(RuntimeError, match="model not found"):
+ await local_voice_handler._transcribe_local(b"fake-ogg")
+
+
+async def test_transcribe_local_whisper_binary_not_found(local_voice_handler):
+ """Missing whisper.cpp binary raises a clear error."""
+ with patch("shutil.which", return_value=None):
+ with pytest.raises(RuntimeError, match="not found on PATH"):
+ await local_voice_handler._transcribe_local(b"fake-ogg")
+
+
+async def test_transcribe_local_empty_response(local_voice_handler):
+ """Empty whisper.cpp output raises ValueError."""
+ mock_ffmpeg = AsyncMock()
+ mock_ffmpeg.communicate = AsyncMock(return_value=(b"", b""))
+ mock_ffmpeg.returncode = 0
+
+ mock_whisper = AsyncMock()
+ mock_whisper.communicate = AsyncMock(return_value=(b" ", b""))
+ mock_whisper.returncode = 0
+
+ call_count = 0
+
+ async def fake_subprocess(*args, **kwargs):
+ nonlocal call_count
+ call_count += 1
+ if call_count == 1:
+ return mock_ffmpeg
+ return mock_whisper
+
+ with (
+ patch("shutil.which", return_value="/usr/bin/whisper-cpp"),
+ patch(
+ "src.bot.features.voice_handler.Path.is_file",
+ return_value=True,
+ ),
+ patch(
+ "asyncio.create_subprocess_exec",
+ side_effect=fake_subprocess,
+ ),
+ ):
+ with pytest.raises(ValueError, match="empty response"):
+ await local_voice_handler._transcribe_local(b"fake-ogg")
+
+
+async def test_transcribe_local_whisper_nonzero_exit(local_voice_handler):
+ """Non-zero whisper.cpp exit code raises RuntimeError."""
+ mock_ffmpeg = AsyncMock()
+ mock_ffmpeg.communicate = AsyncMock(return_value=(b"", b""))
+ mock_ffmpeg.returncode = 0
+
+ mock_whisper = AsyncMock()
+ mock_whisper.communicate = AsyncMock(return_value=(b"", b"model load fail"))
+ mock_whisper.returncode = 1
+
+ call_count = 0
+
+ async def fake_subprocess(*args, **kwargs):
+ nonlocal call_count
+ call_count += 1
+ if call_count == 1:
+ return mock_ffmpeg
+ return mock_whisper
+
+ with (
+ patch("shutil.which", return_value="/usr/bin/whisper-cpp"),
+ patch(
+ "src.bot.features.voice_handler.Path.is_file",
+ return_value=True,
+ ),
+ patch(
+ "asyncio.create_subprocess_exec",
+ side_effect=fake_subprocess,
+ ),
+ ):
+ with pytest.raises(RuntimeError, match="transcription failed"):
+ await local_voice_handler._transcribe_local(b"fake-ogg")
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index 6b20c6fe..2f0dcd9e 100644
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -457,7 +457,7 @@ def test_project_threads_validation_invalid_mode(tmp_path):
def test_voice_provider_validation_and_normalization(tmp_path):
- """VOICE_PROVIDER accepts only mistral/openai and normalizes casing."""
+ """VOICE_PROVIDER accepts mistral/openai/local and normalizes casing."""
project_dir = tmp_path / "projects"
project_dir.mkdir()
@@ -483,6 +483,26 @@ def test_voice_provider_validation_and_normalization(tmp_path):
assert "voice_provider must be one of" in str(exc_info.value)
+def test_voice_provider_local_requires_no_api_key(tmp_path):
+ """VOICE_PROVIDER=local needs no API key and has correct display properties."""
+ project_dir = tmp_path / "projects"
+ project_dir.mkdir()
+
+ settings = Settings(
+ telegram_bot_token="test_token",
+ telegram_bot_username="test_bot",
+ approved_directory=str(project_dir),
+ voice_provider="local",
+ )
+
+ assert settings.voice_provider == "local"
+ assert settings.voice_provider_api_key_env == ""
+ assert settings.voice_provider_display_name == "Local whisper.cpp"
+ assert settings.resolved_voice_model == "base"
+ assert settings.resolved_whisper_cpp_binary == "whisper-cpp"
+ assert settings.resolved_whisper_cpp_model_path.endswith("ggml-base.bin")
+
+
def test_voice_max_file_size_configuration(tmp_path):
"""Voice max file size should be configurable and validated."""
project_dir = tmp_path / "projects"