From 5501304c74d1ffa7d4dd8791adddef456f03c5da Mon Sep 17 00:00:00 2001
From: thereisnotime <37583483+thereisnotime@users.noreply.github.com>
Date: Fri, 20 Mar 2026 02:30:16 +0200
Subject: [PATCH] feat: add local whisper.cpp voice transcription provider

Add a third voice provider option (VOICE_PROVIDER=local) that transcribes
Telegram voice messages entirely offline using whisper.cpp and ffmpeg.
No API keys or cloud services required.

- New local provider in voice_handler.py (OGG->WAV via ffmpeg, then whisper.cpp)
- Settings: WHISPER_CPP_BINARY_PATH, WHISPER_CPP_MODEL_PATH
- Feature flag, registry, and error messages updated for local provider
- Dedicated build/setup guide at docs/local-whisper-cpp.md
- Full test coverage for the local provider path
- Updated .env.example, CLAUDE.md, README.md, docs/configuration.md

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example                              |  28 ++++
 CLAUDE.md                                 |   2 +-
 README.md                                 |   2 +-
 docs/configuration.md                     |   6 +-
 docs/local-whisper-cpp.md                 | 170 ++++++++++++++++++++
 docs/setup.md                             |  15 +-
 src/bot/features/registry.py              |   8 +-
 src/bot/features/voice_handler.py         | 168 +++++++++++++++++++-
 src/bot/handlers/message.py               |  29 ++--
 src/bot/orchestrator.py                   |   6 +
 src/config/features.py                    |   2 +
 src/config/settings.py                    |  46 +++++-
 tests/unit/test_bot/test_message_voice.py |  25 +++
 tests/unit/test_bot/test_voice_handler.py | 181 +++++++++++++++++++++-
 tests/unit/test_config.py                 |  22 ++-
 15 files changed, 684 insertions(+), 26 deletions(-)
 create mode 100644 docs/local-whisper-cpp.md

diff --git a/.env.example b/.env.example
index dfd70908..8c59a4b4 100644
--- a/.env.example
+++ b/.env.example
@@ -140,6 +140,34 @@ QUICK_ACTIONS_TIMEOUT=120
 # Git operations timeout in seconds
 GIT_OPERATIONS_TIMEOUT=30
 
+# === VOICE TRANSCRIPTION ===
+# Enable voice message transcription
+ENABLE_VOICE_MESSAGES=true
+
+# Voice transcription provider: mistral, openai, or local
+# - mistral: Uses Mistral Voxtral (requires MISTRAL_API_KEY)
+# - openai: Uses OpenAI Whisper API (requires OPENAI_API_KEY)
+# - local: Uses whisper.cpp binary (requires ffmpeg + whisper.cpp installed)
+VOICE_PROVIDER=mistral
+
+# API keys (only needed for cloud providers)
+MISTRAL_API_KEY=
+OPENAI_API_KEY=
+
+# Override transcription model (optional)
+# Defaults: voxtral-mini-latest (mistral), whisper-1 (openai); the local provider loads its model from WHISPER_CPP_MODEL_PATH instead
+VOICE_TRANSCRIPTION_MODEL=
+
+# Maximum voice message size in MB
+VOICE_MAX_FILE_SIZE_MB=20
+
+# Local whisper.cpp settings (only used when VOICE_PROVIDER=local)
+# Path to whisper.cpp binary (auto-detected from PATH if unset)
+WHISPER_CPP_BINARY_PATH=
+# Path to GGML model file, or model name like "base", "small", "medium"
+# Named models look for ~/.cache/whisper-cpp/ggml-{name}.bin
+WHISPER_CPP_MODEL_PATH=base
+
 # === PROJECT THREAD MODE ===
 # Enable strict routing by Telegram project topics
 ENABLE_PROJECT_THREADS=false
diff --git a/CLAUDE.md b/CLAUDE.md
index 0917d335..b29f5871 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -102,7 +102,7 @@ Multi-project topics: `ENABLE_PROJECT_THREADS` (default false), `PROJECT_THREADS
 
 Output verbosity: `VERBOSE_LEVEL` (default 1, range 0-2). Controls how much of Claude's background activity is shown to the user in real-time. 0 = quiet (only final response, typing indicator still active), 1 = normal (tool names + reasoning snippets shown during execution), 2 = detailed (tool names with input summaries + longer reasoning text). Users can override per-session via `/verbose 0|1|2`. A persistent typing indicator is refreshed every ~2 seconds at all levels.
 
-Voice transcription: `ENABLE_VOICE_MESSAGES` (default true), `VOICE_PROVIDER` (`mistral`|`openai`, default `mistral`), `MISTRAL_API_KEY`, `OPENAI_API_KEY`, `VOICE_TRANSCRIPTION_MODEL`. Provider implementation is in `src/bot/features/voice_handler.py`.
+Voice transcription: `ENABLE_VOICE_MESSAGES` (default true), `VOICE_PROVIDER` (`mistral`|`openai`|`local`, default `mistral`), `MISTRAL_API_KEY`, `OPENAI_API_KEY`, `VOICE_TRANSCRIPTION_MODEL`. For local provider: `WHISPER_CPP_BINARY_PATH`, `WHISPER_CPP_MODEL_PATH` (requires ffmpeg + whisper.cpp installed). Provider implementation is in `src/bot/features/voice_handler.py`.
 
 Feature flags in `src/config/features.py` control: MCP, git integration, file uploads, quick actions, session export, image uploads, voice messages, conversation mode, agentic mode, API server, scheduler.
 
diff --git a/README.md b/README.md
index 34ca52d3..0bd27ae2 100644
--- a/README.md
+++ b/README.md
@@ -194,7 +194,7 @@ Enable with `ENABLE_API_SERVER=true` and `ENABLE_SCHEDULER=true`. See [docs/setu
 - Directory sandboxing with path traversal prevention
 - File upload handling with archive extraction
 - Image/screenshot upload with analysis
-- Voice message transcription (Mistral Voxtral / OpenAI Whisper)
+- Voice message transcription (Mistral Voxtral / OpenAI Whisper / [local whisper.cpp](docs/local-whisper-cpp.md))
 - Git integration with safe repository operations
 - Quick actions system with context-aware buttons
 - Session export in Markdown, HTML, and JSON formats
diff --git a/docs/configuration.md b/docs/configuration.md
index 2098f8be..2bba7d9f 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -135,11 +135,15 @@ ENABLE_QUICK_ACTIONS=true
 
 # Enable voice message transcription
 ENABLE_VOICE_MESSAGES=true
-VOICE_PROVIDER=mistral              # 'mistral' (default) or 'openai'
+VOICE_PROVIDER=mistral              # 'mistral', 'openai', or 'local'
 MISTRAL_API_KEY=                     # Required when VOICE_PROVIDER=mistral
 OPENAI_API_KEY=                      # Required when VOICE_PROVIDER=openai
 VOICE_TRANSCRIPTION_MODEL=           # Default: voxtral-mini-latest (Mistral) or whisper-1 (OpenAI)
 VOICE_MAX_FILE_SIZE_MB=20            # Max Telegram voice file size to download (1-200MB)
+
+# Local whisper.cpp settings (only used when VOICE_PROVIDER=local)
+WHISPER_CPP_BINARY_PATH=             # Path to whisper.cpp binary (auto-detected from PATH if unset)
+WHISPER_CPP_MODEL_PATH=base          # Path to GGML model file or model name (base, small, medium, large-v3)
 ```
 
 #### Agentic Platform
diff --git a/docs/local-whisper-cpp.md b/docs/local-whisper-cpp.md
new file mode 100644
index 00000000..b23773cb
--- /dev/null
+++ b/docs/local-whisper-cpp.md
@@ -0,0 +1,170 @@
+# Local Voice Transcription with whisper.cpp
+
+This guide explains how to build and configure [whisper.cpp](https://github.com/ggerganov/whisper.cpp) for **offline** voice message transcription — no API keys or cloud services required.
+
+## Overview
+
+When `VOICE_PROVIDER=local` the bot transcribes Telegram voice messages entirely on your machine using:
+
+| Component | Purpose |
+|---|---|
+| **ffmpeg** | Converts Telegram OGG/Opus audio to 16 kHz mono WAV |
+| **whisper.cpp** | Runs OpenAI's Whisper model locally via optimised C/C++ |
+| **GGML model** | Quantised model weights (downloaded once) |
+
+## Prerequisites
+
+- A C/C++ toolchain (`gcc`/`clang`, `cmake`, `make`)
+- `ffmpeg` installed and on PATH
+- ~150 MB disk space for the `base` model (~1.5 GB for `medium`)
+
+## 1. Install ffmpeg
+
+### Ubuntu / Debian
+
+```bash
+sudo apt update && sudo apt install -y ffmpeg
+```
+
+### macOS (Homebrew)
+
+```bash
+brew install ffmpeg
+```
+
+### Alpine
+
+```bash
+apk add ffmpeg
+```
+
+Verify:
+
+```bash
+ffmpeg -version
+```
+
+## 2. Build whisper.cpp from source
+
+```bash
+# Clone the repository
+git clone https://github.com/ggerganov/whisper.cpp.git
+cd whisper.cpp
+
+# Build with CMake (recommended)
+cmake -B build
+cmake --build build --config Release
+
+# The binary is at build/bin/whisper-cli (or build/bin/main on older versions)
+ls build/bin/whisper-cli
+```
+
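+> **Tip:** For NVIDIA GPU acceleration add `-DGGML_CUDA=1` to the cmake configure step (older releases used `-DWHISPER_CUBLAS=ON`). On Apple Silicon, Metal is enabled by default in current releases (older ones needed `-DWHISPER_METAL=ON`).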
+
+### Install system-wide (optional)
+
+```bash
+sudo cp build/bin/whisper-cli /usr/local/bin/whisper-cpp
+```
+
+Or add the build directory to your `PATH`:
+
+```bash
+export PATH="$PWD/build/bin:$PATH"
+```
+
+## 3. Download a GGML model
+
+Models are hosted on Hugging Face. Pick one based on your hardware:
+
+| Model | Size | RAM (approx.) | Quality |
+|---|---|---|---|
+| `tiny` | ~75 MB | ~400 MB | Fast but lower accuracy |
+| `base` | ~142 MB | ~500 MB | Good balance (default) |
+| `small` | ~466 MB | ~1 GB | Better accuracy |
+| `medium` | ~1.5 GB | ~2.5 GB | High accuracy |
+| `large-v3` | ~3 GB | ~5 GB | Best accuracy, slow on CPU |
+
+```bash
+# Create the model cache directory
+mkdir -p ~/.cache/whisper-cpp
+
+# Download the base model (recommended starting point)
+curl -L -o ~/.cache/whisper-cpp/ggml-base.bin \
+  https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin
+
+# Or download small for better accuracy
+curl -L -o ~/.cache/whisper-cpp/ggml-small.bin \
+  https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin
+```
+
+## 4. Configure the bot
+
+Add the following to your `.env`:
+
+```bash
+# Enable voice transcription with local provider
+ENABLE_VOICE_MESSAGES=true
+VOICE_PROVIDER=local
+
+# Path to the whisper.cpp binary (omit if already on PATH as "whisper-cpp")
+WHISPER_CPP_BINARY_PATH=/usr/local/bin/whisper-cpp
+
+# Model: a name like "base", "small", "medium" or a full file path
+# Named models resolve to ~/.cache/whisper-cpp/ggml-{name}.bin
+WHISPER_CPP_MODEL_PATH=base
+```
+
+### Minimal configuration
+
+If `whisper-cpp` is on your PATH and you downloaded the `base` model to the default location, you only need:
+
+```bash
+VOICE_PROVIDER=local
+```
+
+## 5. Verify the setup
+
+```bash
+# Test ffmpeg conversion
+ffmpeg -f lavfi -i "sine=frequency=440:duration=2" -ar 16000 -ac 1 /tmp/test.wav -y
+
+# Test whisper.cpp
+whisper-cpp -m ~/.cache/whisper-cpp/ggml-base.bin -f /tmp/test.wav --no-timestamps
+```
+
+You should see a transcription attempt (it will be empty or nonsensical for a sine wave, but the binary should run without errors).
+
+## Troubleshooting
+
+### `whisper.cpp binary not found on PATH`
+
+The bot could not locate the binary. Either:
+- Install it system-wide: `sudo cp build/bin/whisper-cli /usr/local/bin/whisper-cpp`
+- Or set the full path: `WHISPER_CPP_BINARY_PATH=/path/to/whisper-cli`
+
+### `whisper.cpp model not found`
+
+The model file does not exist at the expected path. Download it:
+
+```bash
+mkdir -p ~/.cache/whisper-cpp
+curl -L -o ~/.cache/whisper-cpp/ggml-base.bin \
+  https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin
+```
+
+### `ffmpeg is required but was not found`
+
+Install ffmpeg for your platform (see step 1 above).
+
+### Poor transcription quality
+
+- Try a larger model (`small` or `medium` instead of `base`)
+- Ensure audio is not too short (< 1 second) or too noisy
+- The bot always runs whisper.cpp with `-l auto` (automatic language detection); this works well for most languages
+
+### High CPU usage / slow transcription
+
+- Use a smaller model (`tiny` or `base`)
+- Enable GPU acceleration when building whisper.cpp (CUDA / Metal)
+- Consider using the `mistral` or `openai` cloud providers for faster results on low-powered machines
diff --git a/docs/setup.md b/docs/setup.md
index 5b0670bb..acb7f906 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -197,12 +197,23 @@ VOICE_PROVIDER=openai
 OPENAI_API_KEY=your-openai-api-key
 ```
 
-If you installed via pip/uv, make sure voice extras are installed:
+**Local whisper.cpp (offline, no API key needed):**
+```bash
+VOICE_PROVIDER=local
+# Optional — auto-detected from PATH if unset
+WHISPER_CPP_BINARY_PATH=/usr/local/bin/whisper-cpp
+# Model name ("base", "small", "medium") or full path to .bin file
+WHISPER_CPP_MODEL_PATH=base
+```
+
+Requires `ffmpeg` and a locally built `whisper.cpp` binary. See the full [local whisper.cpp setup guide](local-whisper-cpp.md) for build instructions and model downloads.
+
+If you installed via pip/uv, make sure voice extras are installed (cloud providers only):
 ```bash
 pip install "claude-code-telegram[voice]"
 ```
 
-Optionally override the transcription model with `VOICE_TRANSCRIPTION_MODEL` (defaults to `voxtral-mini-latest` for Mistral, `whisper-1` for OpenAI).
+Optionally override the transcription model with `VOICE_TRANSCRIPTION_MODEL` (defaults to `voxtral-mini-latest` for Mistral, `whisper-1` for OpenAI). The local provider selects its model with `WHISPER_CPP_MODEL_PATH` (default `base`).
 
 ### Notification Recipients
 
diff --git a/src/bot/features/registry.py b/src/bot/features/registry.py
index 953c228b..43e7e0e0 100644
--- a/src/bot/features/registry.py
+++ b/src/bot/features/registry.py
@@ -78,10 +78,14 @@ def _initialize_features(self):
         except Exception as e:
             logger.error("Failed to initialize image handler", error=str(e))
 
-        # Voice transcription - requires provider-specific API key
+        # Voice transcription - requires provider-specific API key (or local)
         voice_key_available = (
+            self.config.voice_provider == "local"
+        ) or (
             self.config.voice_provider == "openai" and self.config.openai_api_key
-        ) or (self.config.voice_provider == "mistral" and self.config.mistral_api_key)
+        ) or (
+            self.config.voice_provider == "mistral" and self.config.mistral_api_key
+        )
         if self.config.enable_voice_messages and voice_key_available:
             try:
                 self.features["voice_handler"] = VoiceHandler(config=self.config)
diff --git a/src/bot/features/voice_handler.py b/src/bot/features/voice_handler.py
index 11daa10c..5ed31822 100644
--- a/src/bot/features/voice_handler.py
+++ b/src/bot/features/voice_handler.py
@@ -1,7 +1,11 @@
-"""Handle voice message transcription via Mistral (Voxtral) or OpenAI (Whisper)."""
+"""Handle voice message transcription via Mistral (Voxtral), OpenAI (Whisper), or local whisper.cpp."""
 
+import asyncio
+import shutil
+import tempfile
 from dataclasses import dataclass
 from datetime import timedelta
+from pathlib import Path
 from typing import Any, Optional
 
 import structlog
@@ -22,12 +26,16 @@ class ProcessedVoice:
 
 
 class VoiceHandler:
-    """Transcribe Telegram voice messages using Mistral or OpenAI."""
+    """Transcribe Telegram voice messages using Mistral, OpenAI, or local whisper.cpp."""
+
+    # Timeout (seconds) for ffmpeg and whisper.cpp subprocess calls.
+    LOCAL_SUBPROCESS_TIMEOUT: int = 120
 
     def __init__(self, config: Settings):
         self.config = config
         self._mistral_client: Optional[Any] = None
         self._openai_client: Optional[Any] = None
+        self._resolved_whisper_binary: Optional[str] = None
 
     def _ensure_allowed_file_size(self, file_size: Optional[int]) -> None:
         """Reject files that exceed the configured max size."""
@@ -48,7 +56,7 @@ async def process_voice_message(
         """Download and transcribe a voice message.
 
         1. Download .ogg bytes from Telegram
-        2. Call the configured transcription API (Mistral or OpenAI)
+        2. Call the configured transcription provider (Mistral, OpenAI, or local)
         3. Build a prompt combining caption + transcription
         """
         initial_file_size = getattr(voice, "file_size", None)
@@ -79,7 +87,9 @@ async def process_voice_message(
             file_size=initial_file_size or resolved_file_size or len(voice_bytes),
         )
 
-        if self.config.voice_provider == "openai":
+        if self.config.voice_provider == "local":
+            transcription = await self._transcribe_local(voice_bytes)
+        elif self.config.voice_provider == "openai":
             transcription = await self._transcribe_openai(voice_bytes)
         else:
             transcription = await self._transcribe_mistral(voice_bytes)
@@ -103,6 +113,8 @@ async def process_voice_message(
             duration=duration_secs,
         )
 
+    # -- Mistral provider --
+
     async def _transcribe_mistral(self, voice_bytes: bytes) -> str:
         """Transcribe audio using the Mistral API (Voxtral)."""
         client = self._get_mistral_client()
@@ -147,6 +159,8 @@ def _get_mistral_client(self) -> Any:
         self._mistral_client = Mistral(api_key=api_key)
         return self._mistral_client
 
+    # -- OpenAI provider --
+
     async def _transcribe_openai(self, voice_bytes: bytes) -> str:
         """Transcribe audio using the OpenAI Whisper API."""
         client = self._get_openai_client()
@@ -187,3 +201,149 @@ def _get_openai_client(self) -> Any:
 
         self._openai_client = AsyncOpenAI(api_key=api_key)
         return self._openai_client
+
+    # -- Local whisper.cpp provider --
+
+    async def _transcribe_local(self, voice_bytes: bytes) -> str:
+        """Transcribe audio locally using whisper.cpp binary."""
+        binary = self._resolve_whisper_binary()
+        model_path = self.config.resolved_whisper_cpp_model_path
+
+        if not Path(model_path).is_file():
+            raise RuntimeError(
+                f"whisper.cpp model not found at {model_path}. "
+                "Download it with: "
+                "curl -L -o ~/.cache/whisper-cpp/ggml-base.bin "
+                "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
+            )
+
+        tmp_dir = None
+        try:
+            tmp_dir = tempfile.mkdtemp(prefix="voice_")
+            ogg_path = Path(tmp_dir) / "voice.ogg"
+            wav_path = Path(tmp_dir) / "voice.wav"
+
+            ogg_path.write_bytes(voice_bytes)
+
+            # Convert OGG/Opus -> WAV (16kHz mono PCM)
+            await self._convert_ogg_to_wav(ogg_path, wav_path)
+
+            # Run whisper.cpp
+            text = await self._run_whisper_cpp(binary, model_path, wav_path)
+
+        finally:
+            if tmp_dir:
+                shutil.rmtree(tmp_dir, ignore_errors=True)
+
+        text = text.strip()
+        if not text:
+            raise ValueError(
+                "Local whisper.cpp transcription returned an empty response."
+            )
+        return text
+
+    async def _convert_ogg_to_wav(self, ogg_path: Path, wav_path: Path) -> None:
+        """Convert OGG/Opus to WAV (16kHz mono PCM) using ffmpeg.
+
+        Raises RuntimeError if ffmpeg is missing, exits non-zero, or times out.
+        """
+        timeout = self.LOCAL_SUBPROCESS_TIMEOUT
+        try:
+            process = await asyncio.create_subprocess_exec(
+                "ffmpeg",
+                "-i",
+                str(ogg_path),
+                "-ar",
+                "16000",
+                "-ac",
+                "1",
+                "-f",
+                "wav",
+                str(wav_path),
+                "-y",
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            _, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
+            if process.returncode != 0:
+                # The real error is at the end; the start is only the banner.
+                detail = stderr.decode(errors="replace")[-200:]
+                raise RuntimeError(
+                    f"ffmpeg conversion failed (exit {process.returncode}): {detail}"
+                )
+        except asyncio.TimeoutError:
+            process.kill()
+            await process.wait()  # reap the child so it is not left as a zombie
+            raise RuntimeError(f"ffmpeg conversion timed out after {timeout}s.")
+        except FileNotFoundError:
+            raise RuntimeError(
+                "ffmpeg is required for local voice transcription but was not found. "
+                "Install it with: apt install ffmpeg"
+            )
+
+    async def _run_whisper_cpp(
+        self, binary: str, model_path: str, wav_path: Path
+    ) -> str:
+        """Execute whisper.cpp binary and return transcription text.
+
+        Raises RuntimeError if the binary is missing, fails, or times out.
+        """
+        timeout = self.LOCAL_SUBPROCESS_TIMEOUT
+        try:
+            process = await asyncio.create_subprocess_exec(
+                binary,
+                "-m",
+                model_path,
+                "-f",
+                str(wav_path),
+                "--no-timestamps",
+                "-l",
+                "auto",
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            stdout, stderr = await asyncio.wait_for(
+                process.communicate(), timeout=timeout
+            )
+            if process.returncode != 0:
+                logger.warning(
+                    "whisper.cpp transcription failed",
+                    return_code=process.returncode,
+                    # Tail, not head: stderr starts with model-loading logs.
+                    stderr=stderr.decode(errors="replace")[-300:],
+                )
+                raise RuntimeError("Local whisper.cpp transcription failed.")
+            # Tolerate stray bytes rather than failing the whole transcription.
+            return stdout.decode(errors="replace")
+        except asyncio.TimeoutError:
+            process.kill()
+            await process.wait()  # reap the child so it is not left as a zombie
+            raise RuntimeError(f"whisper.cpp transcription timed out after {timeout}s.")
+        except FileNotFoundError:
+            raise RuntimeError(
+                f"whisper.cpp binary not found at '{binary}'. "
+                "Set WHISPER_CPP_BINARY_PATH or install whisper.cpp."
+            )
+        except RuntimeError:
+            raise
+        except Exception as exc:
+            logger.warning(
+                "whisper.cpp transcription request failed",
+                error_type=type(exc).__name__,
+            )
+            raise RuntimeError("Local whisper.cpp transcription failed.") from exc
+
+    def _resolve_whisper_binary(self) -> str:
+        """Resolve and validate the whisper.cpp binary path on first use."""
+        if self._resolved_whisper_binary is not None:
+            return self._resolved_whisper_binary
+
+        binary = self.config.resolved_whisper_cpp_binary
+        resolved = shutil.which(binary)
+        if not resolved:
+            raise RuntimeError(
+                f"whisper.cpp binary '{binary}' not found on PATH. "
+                "Set WHISPER_CPP_BINARY_PATH to the full path."
+            )
+        self._resolved_whisper_binary = resolved
+        return resolved
diff --git a/src/bot/handlers/message.py b/src/bot/handlers/message.py
index e5fa9f78..935d7917 100644
--- a/src/bot/handlers/message.py
+++ b/src/bot/handlers/message.py
@@ -1021,15 +1021,26 @@ async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE) -> No
     voice_handler = features.get_voice_handler() if features else None
 
     if not voice_handler:
-        await update.message.reply_text(
-            "🎙️ <b>Voice Messages</b>\n\n"
-            "Voice transcription is not available.\n"
-            f"Provider: <code>{settings.voice_provider_display_name}</code>\n"
-            f"Set <code>{settings.voice_provider_api_key_env}</code> to enable.\n"
-            "Install optional voice deps with "
-            '<code>pip install "claude-code-telegram[voice]"</code>.',
-            parse_mode="HTML",
-        )
+        if settings.voice_provider == "local":
+            await update.message.reply_text(
+                "🎙️ <b>Voice Messages</b>\n\n"
+                "Voice transcription is not available.\n"
+                "Provider: <code>Local whisper.cpp</code>\n"
+                "Ensure whisper.cpp is installed and model file exists.\n"
+                "Set <code>WHISPER_CPP_BINARY_PATH</code> and "
+                "<code>WHISPER_CPP_MODEL_PATH</code> if needed.",
+                parse_mode="HTML",
+            )
+        else:
+            await update.message.reply_text(
+                "🎙️ <b>Voice Messages</b>\n\n"
+                "Voice transcription is not available.\n"
+                f"Provider: <code>{settings.voice_provider_display_name}</code>\n"
+                f"Set <code>{settings.voice_provider_api_key_env}</code> to enable.\n"
+                "Install optional voice deps with "
+                '<code>pip install "claude-code-telegram[voice]"</code>.',
+                parse_mode="HTML",
+            )
         return
 
     try:
diff --git a/src/bot/orchestrator.py b/src/bot/orchestrator.py
index ac1d5304..609e42d1 100644
--- a/src/bot/orchestrator.py
+++ b/src/bot/orchestrator.py
@@ -1452,6 +1452,12 @@ async def _handle_agentic_media_message(
 
     def _voice_unavailable_message(self) -> str:
         """Return provider-aware guidance when voice feature is unavailable."""
+        if self.settings.voice_provider == "local":
+            return (
+                "Voice processing is not available. "
+                "Ensure whisper.cpp is installed and the model file exists. "
+                "Check WHISPER_CPP_BINARY_PATH and WHISPER_CPP_MODEL_PATH settings."
+            )
         return (
             "Voice processing is not available. "
             f"Set {self.settings.voice_provider_api_key_env} "
diff --git a/src/config/features.py b/src/config/features.py
index 03b54a86..e5561d40 100644
--- a/src/config/features.py
+++ b/src/config/features.py
@@ -76,6 +76,8 @@ def voice_messages_enabled(self) -> bool:
         """Check if voice message transcription is enabled."""
         if not self.settings.enable_voice_messages:
             return False
+        if self.settings.voice_provider == "local":
+            return True  # No API key needed for local whisper.cpp
         if self.settings.voice_provider == "openai":
             return self.settings.openai_api_key is not None
         return self.settings.mistral_api_key is not None
diff --git a/src/config/settings.py b/src/config/settings.py
index 77c34ea4..f276168c 100644
--- a/src/config/settings.py
+++ b/src/config/settings.py
@@ -169,9 +169,9 @@ class Settings(BaseSettings):
     enable_voice_messages: bool = Field(
         True, description="Enable voice message transcription"
     )
-    voice_provider: Literal["mistral", "openai"] = Field(
+    voice_provider: Literal["mistral", "openai", "local"] = Field(
         "mistral",
-        description="Voice transcription provider: 'mistral' or 'openai'",
+        description="Voice transcription provider: 'mistral', 'openai', or 'local'",
     )
     mistral_api_key: Optional[SecretStr] = Field(
         None, description="Mistral API key for voice transcription"
@@ -195,6 +195,21 @@ class Settings(BaseSettings):
         ge=1,
         le=200,
     )
+    whisper_cpp_binary_path: Optional[str] = Field(
+        None,
+        description=(
+            "Path to whisper.cpp binary. "
+            "Used when VOICE_PROVIDER=local. Auto-detected from PATH if unset."
+        ),
+    )
+    whisper_cpp_model_path: Optional[str] = Field(
+        None,
+        description=(
+            "Path to whisper.cpp GGML model file, or model name "
+            "(e.g. 'base', 'small'). Defaults to 'base'. "
+            "Named models resolve to ~/.cache/whisper-cpp/ggml-{name}.bin"
+        ),
+    )
     enable_quick_actions: bool = Field(True, description="Enable quick action buttons")
     agentic_mode: bool = Field(
         True,
@@ -395,8 +410,10 @@ def validate_voice_provider(cls, v: Any) -> str:
         if v is None:
             return "mistral"
         provider = str(v).strip().lower()
-        if provider not in {"mistral", "openai"}:
-            raise ValueError("voice_provider must be one of ['mistral', 'openai']")
+        if provider not in {"mistral", "openai", "local"}:
+            raise ValueError(
+                "voice_provider must be one of ['mistral', 'openai', 'local']"
+            )
         return provider
 
     @field_validator("project_threads_chat_id", mode="before")
@@ -503,6 +520,8 @@ def resolved_voice_model(self) -> str:
             return self.voice_transcription_model
         if self.voice_provider == "openai":
             return "whisper-1"
+        if self.voice_provider == "local":
+            return self.whisper_cpp_model_path or "base"
         return "voxtral-mini-latest"
 
     @property
@@ -515,6 +534,8 @@ def voice_provider_api_key_env(self) -> str:
         """API key environment variable required for the configured voice provider."""
         if self.voice_provider == "openai":
             return "OPENAI_API_KEY"
+        if self.voice_provider == "local":
+            return ""
         return "MISTRAL_API_KEY"
 
     @property
@@ -522,4 +543,21 @@ def voice_provider_display_name(self) -> str:
         """Human-friendly label for the configured voice provider."""
         if self.voice_provider == "openai":
             return "OpenAI Whisper"
+        if self.voice_provider == "local":
+            return "Local whisper.cpp"
         return "Mistral Voxtral"
+
+    @property
+    def resolved_whisper_cpp_binary(self) -> str:
+        """Resolve whisper.cpp binary path, defaulting to 'whisper-cpp' on PATH."""
+        return self.whisper_cpp_binary_path or "whisper-cpp"
+
+    @property
+    def resolved_whisper_cpp_model_path(self) -> str:
+        """Resolve the model file path from a bare name or an explicit path."""
+        model = self.whisper_cpp_model_path or "base"
+        if model.startswith("~/"):  # .env values are not shell-expanded
+            return str(Path(model).expanduser())
+        if "/" in model or model.endswith(".bin"):
+            return model
+        return str(Path.home() / ".cache" / "whisper-cpp" / f"ggml-{model}.bin")
diff --git a/tests/unit/test_bot/test_message_voice.py b/tests/unit/test_bot/test_message_voice.py
index d6f2deb6..e5bfe1a7 100644
--- a/tests/unit/test_bot/test_message_voice.py
+++ b/tests/unit/test_bot/test_message_voice.py
@@ -53,3 +53,28 @@ async def test_handle_voice_missing_handler_uses_mistral_key(tmp_path):
 
     call_args = update.message.reply_text.call_args
     assert "MISTRAL_API_KEY" in call_args.args[0]
+
+
+async def test_handle_voice_missing_handler_local_provider(tmp_path):
+    """Classic handler fallback shows whisper.cpp guidance for local provider."""
+    settings = create_test_config(
+        approved_directory=str(tmp_path),
+        voice_provider="local",
+    )
+
+    features = MagicMock()
+    features.get_voice_handler.return_value = None
+
+    update = MagicMock()
+    update.effective_user.id = 123
+    update.message.reply_text = AsyncMock()
+
+    context = MagicMock()
+    context.bot_data = {"settings": settings, "features": features}
+    context.user_data = {}
+
+    await handle_voice(update, context)
+
+    call_args = update.message.reply_text.call_args
+    assert "WHISPER_CPP_BINARY_PATH" in call_args.args[0]
+    assert call_args.kwargs["parse_mode"] == "HTML"
diff --git a/tests/unit/test_bot/test_voice_handler.py b/tests/unit/test_bot/test_voice_handler.py
index 2caddf86..d97237a4 100644
--- a/tests/unit/test_bot/test_voice_handler.py
+++ b/tests/unit/test_bot/test_voice_handler.py
@@ -1,9 +1,11 @@
 """Tests for voice handler feature."""
 
+import asyncio
 import sys
 from datetime import timedelta
+from pathlib import Path
 from types import SimpleNamespace
-from unittest.mock import AsyncMock, MagicMock
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 
@@ -352,3 +354,180 @@ async def test_transcribe_openai_reuses_cached_client(openai_voice_handler):
 
     openai_ctor.assert_called_once_with(api_key="test-openai-key")
     assert mock_transcriptions.create.await_count == 2
+
+
+# --- Local whisper.cpp provider tests ---
+
+
+@pytest.fixture
+def local_config():
+    """Create a mock config with local whisper.cpp settings."""
+    cfg = MagicMock()
+    cfg.voice_provider = "local"
+    cfg.resolved_whisper_cpp_binary = "whisper-cpp"  # bare command name, not a path
+    cfg.resolved_whisper_cpp_model_path = "/tmp/models/ggml-base.bin"
+    cfg.voice_max_file_size_mb = 20
+    cfg.voice_max_file_size_bytes = 20 * 1024 * 1024  # same 20 MB limit, in bytes
+    return cfg
+
+
+@pytest.fixture
+def local_voice_handler(local_config):
+    """Create a VoiceHandler built from the ``local_config`` fixture."""
+    return VoiceHandler(config=local_config)
+
+
+async def test_process_voice_message_local_dispatches(local_voice_handler):
+    """process_voice_message routes to _transcribe_local for local provider."""
+    voice = _mock_voice(duration=5)
+    local_voice_handler._transcribe_local = AsyncMock(  # stub the real transcription
+        return_value="Local transcription"
+    )
+
+    result = await local_voice_handler.process_voice_message(voice)
+
+    assert isinstance(result, ProcessedVoice)
+    assert result.transcription == "Local transcription"
+    assert result.duration == 5  # duration is carried over from the voice object
+    local_voice_handler._transcribe_local.assert_awaited_once()
+
+
+async def test_transcribe_local_runs_ffmpeg_and_whisper(local_voice_handler):
+    """Local transcription converts OGG->WAV then calls whisper.cpp binary."""
+    mock_ffmpeg = AsyncMock()
+    mock_ffmpeg.communicate = AsyncMock(return_value=(b"", b""))
+    mock_ffmpeg.returncode = 0
+
+    mock_whisper = AsyncMock()
+    mock_whisper.communicate = AsyncMock(return_value=(b"Hello world", b""))
+    mock_whisper.returncode = 0
+
+    call_count = 0  # number of subprocesses spawned so far
+
+    async def fake_subprocess(*args, **kwargs):
+        nonlocal call_count
+        call_count += 1
+        if call_count == 1:  # first spawn is served the ffmpeg mock
+            return mock_ffmpeg
+        return mock_whisper  # every later spawn is served the whisper.cpp mock
+
+    with (
+        patch("shutil.which", return_value="/usr/bin/whisper-cpp"),
+        patch(
+            "src.bot.features.voice_handler.Path.is_file",
+            return_value=True,  # every is_file() check passes
+        ),
+        patch(
+            "asyncio.create_subprocess_exec",
+            side_effect=fake_subprocess,
+        ),
+    ):
+        result = await local_voice_handler._transcribe_local(b"fake-ogg-bytes")
+
+    assert result == "Hello world"
+    assert call_count == 2  # exactly two subprocesses were spawned
+
+
+async def test_transcribe_local_ffmpeg_not_found(local_voice_handler):
+    """Missing ffmpeg gives a clear install hint."""
+    with (
+        patch("shutil.which", return_value="/usr/bin/whisper-cpp"),
+        patch(
+            "src.bot.features.voice_handler.Path.is_file",
+            return_value=True,  # every is_file() check passes
+        ),
+        patch(
+            "asyncio.create_subprocess_exec",
+            side_effect=FileNotFoundError,  # spawning raises: executable missing
+        ),
+    ):
+        with pytest.raises(RuntimeError, match="ffmpeg is required"):
+            await local_voice_handler._transcribe_local(b"fake-ogg")
+
+
+async def test_transcribe_local_model_not_found(local_voice_handler):
+    """Missing model file raises a clear error with download hint."""
+    with (
+        patch("shutil.which", return_value="/usr/bin/whisper-cpp"),
+        patch(
+            "src.bot.features.voice_handler.Path.is_file",
+            return_value=False,  # every is_file() check fails
+        ),
+    ):
+        with pytest.raises(RuntimeError, match="model not found"):
+            await local_voice_handler._transcribe_local(b"fake-ogg")
+
+
+async def test_transcribe_local_whisper_binary_not_found(local_voice_handler):
+    """Missing whisper.cpp binary raises a clear error."""
+    with patch("shutil.which", return_value=None):  # PATH lookup finds nothing
+        with pytest.raises(RuntimeError, match="not found on PATH"):
+            await local_voice_handler._transcribe_local(b"fake-ogg")
+
+
+async def test_transcribe_local_empty_response(local_voice_handler):
+    """Empty whisper.cpp output raises ValueError."""
+    mock_ffmpeg = AsyncMock()
+    mock_ffmpeg.communicate = AsyncMock(return_value=(b"", b""))
+    mock_ffmpeg.returncode = 0
+
+    mock_whisper = AsyncMock()
+    mock_whisper.communicate = AsyncMock(return_value=(b"   ", b""))  # blank stdout
+    mock_whisper.returncode = 0  # exit code is fine; only the output is empty
+
+    call_count = 0  # number of subprocesses spawned so far
+
+    async def fake_subprocess(*args, **kwargs):
+        nonlocal call_count
+        call_count += 1
+        if call_count == 1:  # first spawn is served the ffmpeg mock
+            return mock_ffmpeg
+        return mock_whisper
+
+    with (
+        patch("shutil.which", return_value="/usr/bin/whisper-cpp"),
+        patch(
+            "src.bot.features.voice_handler.Path.is_file",
+            return_value=True,  # every is_file() check passes
+        ),
+        patch(
+            "asyncio.create_subprocess_exec",
+            side_effect=fake_subprocess,
+        ),
+    ):
+        with pytest.raises(ValueError, match="empty response"):
+            await local_voice_handler._transcribe_local(b"fake-ogg")
+
+
+async def test_transcribe_local_whisper_nonzero_exit(local_voice_handler):
+    """Non-zero whisper.cpp exit code raises RuntimeError."""
+    mock_ffmpeg = AsyncMock()
+    mock_ffmpeg.communicate = AsyncMock(return_value=(b"", b""))
+    mock_ffmpeg.returncode = 0
+
+    mock_whisper = AsyncMock()
+    mock_whisper.communicate = AsyncMock(return_value=(b"", b"model load fail"))
+    mock_whisper.returncode = 1  # the failure condition under test
+
+    call_count = 0  # number of subprocesses spawned so far
+
+    async def fake_subprocess(*args, **kwargs):
+        nonlocal call_count
+        call_count += 1
+        if call_count == 1:  # first spawn is served the ffmpeg mock
+            return mock_ffmpeg
+        return mock_whisper
+
+    with (
+        patch("shutil.which", return_value="/usr/bin/whisper-cpp"),
+        patch(
+            "src.bot.features.voice_handler.Path.is_file",
+            return_value=True,  # every is_file() check passes
+        ),
+        patch(
+            "asyncio.create_subprocess_exec",
+            side_effect=fake_subprocess,
+        ),
+    ):
+        with pytest.raises(RuntimeError, match="transcription failed"):
+            await local_voice_handler._transcribe_local(b"fake-ogg")
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index 6b20c6fe..2f0dcd9e 100644
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -457,7 +457,7 @@ def test_project_threads_validation_invalid_mode(tmp_path):
 
 
 def test_voice_provider_validation_and_normalization(tmp_path):
-    """VOICE_PROVIDER accepts only mistral/openai and normalizes casing."""
+    """VOICE_PROVIDER accepts mistral/openai/local and normalizes casing."""
     project_dir = tmp_path / "projects"
     project_dir.mkdir()
 
@@ -483,6 +483,26 @@ def test_voice_provider_validation_and_normalization(tmp_path):
     assert "voice_provider must be one of" in str(exc_info.value)
 
 
+def test_voice_provider_local_requires_no_api_key(tmp_path):
+    """VOICE_PROVIDER=local needs no API key and has correct display properties."""
+    project_dir = tmp_path / "projects"
+    project_dir.mkdir()
+
+    settings = Settings(
+        telegram_bot_token="test_token",
+        telegram_bot_username="test_bot",
+        approved_directory=str(project_dir),
+        voice_provider="local",
+    )
+
+    assert settings.voice_provider == "local"
+    assert settings.voice_provider_api_key_env == ""  # no key env var for local
+    assert settings.voice_provider_display_name == "Local whisper.cpp"
+    assert settings.resolved_voice_model == "base"  # values with nothing overridden
+    assert settings.resolved_whisper_cpp_binary == "whisper-cpp"
+    assert settings.resolved_whisper_cpp_model_path.endswith("ggml-base.bin")
+
+
 def test_voice_max_file_size_configuration(tmp_path):
     """Voice max file size should be configurable and validated."""
     project_dir = tmp_path / "projects"