Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,81 @@ Enable with `ENABLE_API_SERVER=true` and `ENABLE_SCHEDULER=true`. See [docs/setu

- Plugin system for third-party extensions

## Voice Transcription

The bot can transcribe Telegram voice messages and pass them as text to Claude. Three providers are supported:

| Provider | Type | Best for |
|----------|------|----------|
| `parakeet` (default) | Local GPU | Privacy, no API cost, fast on NVIDIA GPU |
| `mistral` | Cloud (Voxtral) | Quality without local GPU |
| `openai` | Cloud (Whisper) | Widely supported cloud option |

### Enable voice processing

```bash
ENABLE_VOICE_PROCESSING=true
VOICE_PROVIDER=parakeet # or: mistral, openai
```

### Parakeet (local GPU, no API key required)

[NVIDIA NeMo Parakeet TDT 0.6B v3](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3) runs on your GPU — no cloud API needed, no per-minute cost.

**Requirements:**

- NVIDIA GPU with CUDA support
- [ffmpeg](https://ffmpeg.org/download.html) installed and on PATH (or set `FFMPEG_PATH`)
- `claude-code-telegram[parakeet]` extras

**Install:**

```bash
pip install "claude-code-telegram[parakeet]"
# or with poetry:
poetry install --with parakeet
```

**Configure:**

```bash
ENABLE_VOICE_PROCESSING=true
VOICE_PROVIDER=parakeet
# Optional: explicit path to ffmpeg if not on PATH
FFMPEG_PATH=/usr/bin/ffmpeg
```

The NeMo model (~600 MB) is downloaded automatically on first use and cached locally.

### Cloud providers (Mistral / OpenAI)

```bash
# Mistral Voxtral
ENABLE_VOICE_PROCESSING=true
VOICE_PROVIDER=mistral
MISTRAL_API_KEY=your-mistral-api-key

# OpenAI Whisper
ENABLE_VOICE_PROCESSING=true
VOICE_PROVIDER=openai
OPENAI_API_KEY=your-openai-api-key
```

**Install cloud extras:**

```bash
pip install "claude-code-telegram[voice]"
# or with poetry:
poetry install --with voice
```

### Additional voice settings

```bash
VOICE_MAX_FILE_SIZE_MB=20 # Max voice message size (default: 20 MB)
FFMPEG_PATH= # Optional explicit path to ffmpeg binary
```

## Configuration

### Required
Expand Down
15 changes: 15 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,21 @@ uvicorn = {version = "^0.34.0", extras = ["standard"]}
apscheduler = "^3.10"
PyYAML = "^6.0.2"

[tool.poetry.group.voice.dependencies]
# Cloud providers (mistral or openai)
# NOTE(review): Poetry extras can only reference packages declared in the main
# [tool.poetry.dependencies] table; dependencies declared inside a group cannot
# be exposed through [tool.poetry.extras], and per-dependency `optional = true`
# inside a group is not meaningful. As written, `pip install
# "claude-code-telegram[voice]"` / `[parakeet]` will likely fail `poetry check`.
# Either move these packages to the main dependencies with `optional = true`,
# or keep them as groups marked optional at the group level
# ([tool.poetry.group.voice] with `optional = true`) and drop the extras.
mistralai = {version = "^1.0.0", optional = true}
openai = {version = "^1.0.0", optional = true}

[tool.poetry.group.parakeet.dependencies]
# Local GPU transcription via NVIDIA NeMo (requires CUDA)
# Install with: pip install "claude-code-telegram[parakeet]"
nemo_toolkit = {version = "^2.3.0", extras = ["asr"], optional = true}
torch = {version = ">=2.0.0", optional = true}

[tool.poetry.extras]
voice = ["mistralai", "openai"]
parakeet = ["nemo_toolkit", "torch"]

[tool.poetry.group.dev.dependencies]
pytest = "^8.4.0"
pytest-asyncio = "^1.0.0"
Expand Down
13 changes: 13 additions & 0 deletions src/bot/features/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .image_handler import ImageHandler
from .quick_actions import QuickActionManager
from .session_export import SessionExporter
from .voice_handler import VoiceHandler

logger = structlog.get_logger(__name__)

Expand Down Expand Up @@ -77,6 +78,14 @@ def _initialize_features(self):
except Exception as e:
logger.error("Failed to initialize image handler", error=str(e))

# Voice handling - conditionally enabled
if self.config.enable_voice_processing:
try:
self.features["voice_handler"] = VoiceHandler(config=self.config)
logger.info("Voice handler feature enabled")
except Exception as e:
logger.error("Failed to initialize voice handler", error=str(e))

# Conversation enhancements - skip in agentic mode
if not self.config.agentic_mode:
try:
Expand Down Expand Up @@ -118,6 +127,10 @@ def get_image_handler(self) -> Optional[ImageHandler]:
"""Get image handler feature"""
return self.get_feature("image_handler")

def get_voice_handler(self) -> Optional[VoiceHandler]:
    """Return the registered voice handler, or None when voice processing is disabled."""
    handler = self.get_feature("voice_handler")
    return handler

def get_conversation_enhancer(self) -> Optional[ConversationEnhancer]:
    """Return the conversation enhancer feature if registered, else None."""
    feature_key = "conversation"
    return self.get_feature(feature_key)
Expand Down
220 changes: 220 additions & 0 deletions src/bot/features/voice_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
"""Handle voice message transcription.

Supported providers (VOICE_PROVIDER):
parakeet — local GPU inference via NVIDIA NeMo (default, free, requires CUDA)
mistral — Mistral Voxtral API (cloud, requires MISTRAL_API_KEY)
openai — OpenAI Whisper API (cloud, requires OPENAI_API_KEY)
"""

import asyncio
import subprocess
import tempfile
import threading
from dataclasses import dataclass
from datetime import timedelta
from pathlib import Path
from typing import Any, Optional

import structlog
from telegram import Voice

from src.config.settings import Settings

logger = structlog.get_logger(__name__)


@dataclass
class ProcessedVoice:
    """Result of voice message processing."""

    # Raw transcription text as returned by the provider (stripped, non-empty).
    transcription: str
    # Claude-ready prompt: the caption (or a generic label) followed by the transcription.
    prompt: str
    # Voice message duration in whole seconds; 0 when unknown.
    duration: int = 0


class VoiceHandler:
    """Transcribe Telegram voice/audio messages.

    Delegates to one of three backends based on config.voice_provider:
    - 'parakeet': local NVIDIA NeMo model (no API key required)
    - 'mistral': Mistral Voxtral cloud API
    - 'openai': OpenAI Whisper cloud API

    Any provider value other than 'parakeet'/'openai' falls through to the
    Mistral branch (preserves the original dispatch behavior).
    """

    def __init__(self, config: Settings):
        """Store config; all heavy resources (model, API clients) are created lazily."""
        self.config = config
        self._parakeet_model = None  # lazy-loaded on first use
        # Guards the one-time model load: the default executor may run several
        # transcriptions concurrently, and loading the model twice wastes time
        # and GPU memory.
        self._parakeet_load_lock = threading.Lock()
        self._mistral_client: Optional[Any] = None
        self._openai_client: Optional[Any] = None

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    async def process_voice_message(
        self, voice: Voice, caption: Optional[str] = None
    ) -> ProcessedVoice:
        """Download and transcribe a Telegram voice message.

        Args:
            voice: Telegram Voice object to fetch and transcribe.
            caption: Optional user caption; used as the prompt label when
                present, otherwise a generic label is prepended.

        Returns:
            ProcessedVoice with the transcription, a Claude-ready prompt, and
            the duration normalized to whole seconds.

        Raises:
            ValueError: if the file exceeds the configured size limit or the
                provider returned an empty transcription.
            RuntimeError: if a provider dependency is missing, an API key is
                not configured, or the conversion/request fails.
        """
        # Size is checked three times on purpose: Telegram's advertised size,
        # the fetched File metadata, and the downloaded byte count can each
        # disagree — fail before the expensive transcription step.
        self._check_file_size(getattr(voice, "file_size", None))

        file = await voice.get_file()
        self._check_file_size(getattr(file, "file_size", None))

        voice_bytes = bytes(await file.download_as_bytearray())
        self._check_file_size(len(voice_bytes))

        provider = self.config.voice_provider
        logger.info("Transcribing voice message", provider=provider, duration=voice.duration)

        if provider == "parakeet":
            transcription = await self._transcribe_parakeet(voice_bytes)
        elif provider == "openai":
            transcription = await self._transcribe_openai(voice_bytes)
        else:
            # Default/fallback branch: any other value is treated as Mistral.
            transcription = await self._transcribe_mistral(voice_bytes)

        logger.info("Voice transcription complete", length=len(transcription))

        label = caption if caption else "Voice message transcription:"
        # python-telegram-bot exposes duration as a timedelta in newer versions
        # and as an int (seconds) in older ones — normalize to int seconds.
        dur = voice.duration
        duration_secs = int(dur.total_seconds()) if isinstance(dur, timedelta) else (dur or 0)

        return ProcessedVoice(
            transcription=transcription,
            prompt=f"{label}\n\n{transcription}",
            duration=duration_secs,
        )

    # ------------------------------------------------------------------
    # Parakeet (local)
    # ------------------------------------------------------------------

    @property
    def _parakeet(self):
        """Lazy-load the Parakeet TDT 0.6B v3 model on first use.

        Thread-safe: this is reached from executor threads, so the one-time
        load is serialized with a lock (double-checked so the loaded path
        stays lock-free).
        """
        if self._parakeet_model is None:
            with self._parakeet_load_lock:
                if self._parakeet_model is None:  # re-check under the lock
                    try:
                        import nemo.collections.asr as nemo_asr
                    except ModuleNotFoundError as exc:
                        raise RuntimeError(
                            "Optional dependency 'nemo_toolkit' is missing for Parakeet transcription. "
                            "Install parakeet extras: "
                            'pip install "claude-code-telegram[parakeet]"'
                        ) from exc

                    logger.info("Loading Parakeet TDT 0.6B v3 model (first use)…")
                    self._parakeet_model = nemo_asr.models.ASRModel.from_pretrained(
                        "nvidia/parakeet-tdt-0.6b-v3"
                    )
                    logger.info("Parakeet model loaded")
        return self._parakeet_model

    async def _transcribe_parakeet(self, voice_bytes: bytes) -> str:
        """Transcribe using the local Parakeet model without blocking the event loop."""
        # get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._run_parakeet, voice_bytes)

    def _run_parakeet(self, voice_bytes: bytes) -> str:
        """Blocking transcription (ffmpeg conversion + NeMo inference) — called from executor.

        Note: only the ffmpeg conversion is CPU work; NeMo inference runs on
        the GPU when CUDA is available.
        """
        ffmpeg = self.config.resolved_ffmpeg_path
        with tempfile.TemporaryDirectory() as tmp:
            ogg_path = Path(tmp) / "voice.ogg"
            wav_path = Path(tmp) / "voice.wav"
            ogg_path.write_bytes(voice_bytes)

            # NeMo ASR expects 16 kHz mono WAV input.
            try:
                subprocess.run(
                    [ffmpeg, "-y", "-i", str(ogg_path), "-ar", "16000", "-ac", "1", str(wav_path)],
                    check=True,
                    capture_output=True,
                )
            except (OSError, subprocess.CalledProcessError) as exc:
                # Surface ffmpeg's stderr — a bare CalledProcessError hides it.
                # OSError covers a missing/unexecutable ffmpeg binary.
                stderr = getattr(exc, "stderr", b"") or b""
                if isinstance(stderr, bytes):
                    stderr = stderr.decode("utf-8", errors="replace")
                logger.warning("ffmpeg conversion failed", error_type=type(exc).__name__)
                raise RuntimeError(
                    f"ffmpeg failed to convert voice message to WAV: {stderr.strip()[-500:]}"
                ) from exc

            # NeMo returns a list of hypothesis objects exposing .text —
            # assumed from current NeMo ASR API; confirm against the pinned
            # nemo_toolkit version.
            output = self._parakeet.transcribe([str(wav_path)])
            text = output[0].text.strip()

        if not text:
            raise ValueError("Parakeet transcription returned an empty result.")
        return text

    # ------------------------------------------------------------------
    # Mistral (cloud)
    # ------------------------------------------------------------------

    async def _transcribe_mistral(self, voice_bytes: bytes) -> str:
        """Transcribe via the Mistral Voxtral API; raises RuntimeError on request failure."""
        client = self._get_mistral_client()
        try:
            response = await client.audio.transcriptions.complete_async(
                model="voxtral-mini-2507",
                file={"content": voice_bytes, "file_name": "voice.ogg"},
            )
        except Exception as exc:
            # Log only the exception type — the message may contain request details.
            logger.warning("Mistral transcription failed", error_type=type(exc).__name__)
            raise RuntimeError("Mistral transcription request failed.") from exc

        text = (getattr(response, "text", "") or "").strip()
        if not text:
            raise ValueError("Mistral transcription returned an empty response.")
        return text

    def _get_mistral_client(self) -> Any:
        """Return a cached Mistral client, constructing it on first call.

        Raises:
            RuntimeError: if 'mistralai' is not installed or MISTRAL_API_KEY is unset.
        """
        if self._mistral_client is not None:
            return self._mistral_client
        try:
            from mistralai import Mistral
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "Optional dependency 'mistralai' is missing. "
                'Install voice extras: pip install "claude-code-telegram[voice]"'
            ) from exc

        api_key = self.config.mistral_api_key_str
        if not api_key:
            raise RuntimeError("MISTRAL_API_KEY is not configured.")
        self._mistral_client = Mistral(api_key=api_key)
        return self._mistral_client

    # ------------------------------------------------------------------
    # OpenAI Whisper (cloud)
    # ------------------------------------------------------------------

    async def _transcribe_openai(self, voice_bytes: bytes) -> str:
        """Transcribe via the OpenAI Whisper API; raises RuntimeError on request failure."""
        client = self._get_openai_client()
        try:
            response = await client.audio.transcriptions.create(
                model="whisper-1",
                # (filename, bytes) tuple — the name's extension tells the API the format.
                file=("voice.ogg", voice_bytes),
            )
        except Exception as exc:
            logger.warning("OpenAI transcription failed", error_type=type(exc).__name__)
            raise RuntimeError("OpenAI transcription request failed.") from exc

        text = (getattr(response, "text", "") or "").strip()
        if not text:
            raise ValueError("OpenAI transcription returned an empty response.")
        return text

    def _get_openai_client(self) -> Any:
        """Return a cached AsyncOpenAI client, constructing it on first call.

        Raises:
            RuntimeError: if 'openai' is not installed or OPENAI_API_KEY is unset.
        """
        if self._openai_client is not None:
            return self._openai_client
        try:
            from openai import AsyncOpenAI
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "Optional dependency 'openai' is missing. "
                'Install voice extras: pip install "claude-code-telegram[voice]"'
            ) from exc

        api_key = self.config.openai_api_key_str
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY is not configured.")
        self._openai_client = AsyncOpenAI(api_key=api_key)
        return self._openai_client

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _check_file_size(self, size: Optional[int]) -> None:
        """Raise ValueError when a known size exceeds the configured maximum.

        A None/unknown size passes — the downloaded byte count is checked later.
        """
        if isinstance(size, int) and size > self.config.voice_max_file_size_bytes:
            raise ValueError(
                f"Voice message too large ({size / 1024 / 1024:.1f} MB). "
                f"Max: {self.config.voice_max_file_size_mb} MB."
            )
Loading