Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 26 additions & 17 deletions src/listenr/build_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,33 @@
import csv
import json
import logging
import os
import random
import sys
from pathlib import Path

import listenr.config_manager as cfg
from listenr.constants import (
DATASET_FORMAT,
DATASET_MIN_CHARS,
DATASET_MIN_DURATION,
DATASET_OUTPUT,
DATASET_SEED,
DATASET_SPLIT,
STORAGE_BASE,
)

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger("listenr.build_dataset")

# ---------------------------------------------------------------------------
# Defaults (sourced from constants, which read from config at import time)
# ---------------------------------------------------------------------------

# Each DEFAULT_* name is an alias for the config-backed value imported from
# listenr.constants; no literal fallbacks are defined in this module.
DEFAULT_OUTPUT = DATASET_OUTPUT
DEFAULT_SPLIT = DATASET_SPLIT
DEFAULT_MIN_DURATION = DATASET_MIN_DURATION  # seconds
DEFAULT_MIN_CHARS = DATASET_MIN_CHARS  # minimum non-whitespace chars in transcription
DEFAULT_SEED = DATASET_SEED
DEFAULT_FORMAT = DATASET_FORMAT

CSV_COLUMNS = [
"uuid",
Expand All @@ -65,8 +75,7 @@

def _manifest_path() -> Path:
    """Return the path of ``manifest.jsonl`` inside the clips storage directory.

    The directory is ``STORAGE_BASE`` as imported from
    :mod:`listenr.constants`, so this module does not look up the storage
    setting itself.
    """
    return STORAGE_BASE / "manifest.jsonl"


def load_manifest(manifest_path: Path) -> list[dict]:
Expand Down Expand Up @@ -244,36 +253,36 @@ def main() -> None:
"--output",
type=Path,
default=DEFAULT_OUTPUT,
help=f"Output directory for dataset files (default: {DEFAULT_OUTPUT})",
help=f"Output directory for dataset files (default: from config, currently {DEFAULT_OUTPUT})",
)
parser.add_argument(
"--split",
default=DEFAULT_SPLIT,
help=f"Train/dev/test split percentages, e.g. 80/10/10 (default: {DEFAULT_SPLIT})",
help=f"Train/dev/test split percentages, e.g. 80/10/10 (default: from config, currently {DEFAULT_SPLIT})",
)
parser.add_argument(
"--min-duration",
type=float,
default=DEFAULT_MIN_DURATION,
help=f"Minimum clip duration in seconds (default: {DEFAULT_MIN_DURATION})",
help=f"Minimum clip duration in seconds (default: from config, currently {DEFAULT_MIN_DURATION})",
)
parser.add_argument(
"--min-chars",
type=int,
default=DEFAULT_MIN_CHARS,
help=f"Minimum non-whitespace chars in transcription (default: {DEFAULT_MIN_CHARS})",
help=f"Minimum non-whitespace chars in transcription (default: from config, currently {DEFAULT_MIN_CHARS})",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="Random seed for reproducible splits (default: 42)",
default=DEFAULT_SEED,
help=f"Random seed for reproducible splits (default: from config, currently {DEFAULT_SEED})",
)
parser.add_argument(
"--format",
choices=["csv", "hf", "both"],
default="csv",
help="Output format: csv, hf (HuggingFace datasets), or both (default: csv)",
default=DEFAULT_FORMAT,
help=f"Output format: csv, hf (HuggingFace datasets), or both (default: from config, currently {DEFAULT_FORMAT})",
)
Comment on lines 282 to 286
Copy link

Copilot AI Mar 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

--format restricts choices to ['csv','hf','both'], but DEFAULT_FORMAT comes from DATASET_FORMAT (config-backed). If a user config sets an unsupported value (e.g. 'parquet'), argparse will error immediately because the default is invalid. Either validate/normalize DATASET_FORMAT in constants/config_manager, or keep the CLI choices in sync with all documented/accepted config values.

Copilot uses AI. Check for mistakes.
parser.add_argument(
"--dry-run",
Expand Down
40 changes: 15 additions & 25 deletions src/listenr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,47 +23,38 @@
from collections import deque
from math import gcd
from scipy.signal import resample_poly
from pathlib import Path

from listenr.unified_asr import LemonadeUnifiedASR
from listenr.llm_processor import lemonade_llm_correct, lemonade_load_model, lemonade_unload_models
from listenr.transcript_utils import is_hallucination, strip_noise_tags
from listenr.storage import save_recording
import listenr.config_manager as cfg
from listenr.constants import (
ASR_RATE,
CAPTURE_RATE,
CHANNELS,
CHUNK_SIZE,
INPUT_DEVICE,
LLM_CONTEXT_WINDOW,
LLM_ENABLED as USE_LLM,
LLM_MODEL,
STORAGE_BASE,
WHISPER_MODEL,
)

logging.basicConfig(level=logging.WARNING, format='%(levelname)s: %(message)s')
log = logging.getLogger('listenr.cli')

# Audio, storage, LLM and Whisper settings are imported from listenr.constants
# at the top of this module. They are deliberately not re-read from config
# here: a local re-definition would shadow the shared constants.

# Compute resample ratio once (e.g. 48000→16000 = up 1, down 3)
_gcd = gcd(CAPTURE_RATE, ASR_RATE)
_RESAMPLE_UP = ASR_RATE // _gcd
_RESAMPLE_DOWN = CAPTURE_RATE // _gcd
_NEED_RESAMPLE = (CAPTURE_RATE != ASR_RATE)


def get_lemonade_ws_url() -> str:
"""Discover Lemonade WebSocket URL from /api/v1/health."""
api_base = cfg.get_setting('LLM', 'api_base', 'http://localhost:8000/api/v1') or 'http://localhost:8000/api/v1'
health_url = api_base.rstrip('/').replace('/api/v1', '') + '/api/v1/health'
from listenr.constants import LLM_API_BASE
health_url = LLM_API_BASE.rstrip('/').replace('/api/v1', '') + '/api/v1/health'
try:
resp = requests.get(health_url, timeout=2)
resp.raise_for_status()
Expand Down Expand Up @@ -165,8 +156,7 @@ async def _run(save: bool, show_raw: bool, debug: bool):
asr = LemonadeUnifiedASR(use_llm=False) # LLM correction handled here for saving
pcm_buffer: list = []
# Rolling window of (raw, corrected) pairs passed as context to the LLM
_context_size = cfg.get_int_setting('LLM', 'context_window', 3)
llm_context: deque[tuple[str, str]] = deque(maxlen=_context_size)
llm_context: deque[tuple[str, str]] = deque(maxlen=LLM_CONTEXT_WINDOW)

async for result in asr.stream_transcribe(
mic_stream(pcm_buffer, debug=debug),
Expand Down
10 changes: 9 additions & 1 deletion src/listenr/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,18 @@
'timeout': '30',
'context_window': '10', # Number of preceding segments passed as context to the LLM
},
'Dataset': {
'output_path': '~/listenr_dataset', # Where build_dataset writes CSV/HF output
'split': '80/10/10', # Train/dev/test split percentages
'min_duration': '0.3', # Minimum clip duration in seconds
'min_chars': '2', # Minimum non-whitespace chars in transcription
'seed': '42', # Random seed for reproducible splits
'format': 'csv', # Output format: csv, hf, or both
},
'Output': {
'file': '~/transcripts_raw.txt',
'llm_file': '~/transcripts_clean.txt',
'format': '[{timestamp}] {text}',
'line_format': '[{timestamp}] {text}',
'timestamp_format': '%%Y-%%m-%%d %%H:%%M:%%S', # Double %% for configparser escaping
'show_raw': 'false',
Comment on lines 65 to 70
Copy link

Copilot AI Mar 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renaming the Output key from format to line_format in DEFAULT_CONFIG is a breaking change for existing config.ini files that still set format—their customized value will be ignored and the fallback will be used. Consider adding a small backward-compat alias/migration (e.g., when reading line_format, fall back to format if present) or documenting the migration clearly.

Copilot uses AI. Check for mistakes.
},
Expand Down
179 changes: 179 additions & 0 deletions src/listenr/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
"""
constants.py — Typed, config-backed constants for the listenr package.

All values are read **once** at import time from ``~/.config/listenr/config.ini``
(via :mod:`listenr.config_manager`).

Downstream modules should import individual names::

from listenr.constants import CAPTURE_RATE, LLM_MODEL, WHISPER_MODEL

If you need to refresh constants at runtime (e.g. tests that patch config),
call :func:`reload` to re-read all values from the current config state.
"""

from __future__ import annotations

from pathlib import Path

import listenr.config_manager as cfg

# ---------------------------------------------------------------------------
# Lemonade
# ---------------------------------------------------------------------------

LEMONADE_API_BASE: str = (
cfg.get_setting("Lemonade", "api_base", "http://localhost:8000/api/v1")
or "http://localhost:8000/api/v1"
)

# ---------------------------------------------------------------------------
# Whisper
# ---------------------------------------------------------------------------

WHISPER_MODEL: str = (
cfg.get_setting("Whisper", "model", "Whisper-Tiny") or "Whisper-Tiny"
)

# ---------------------------------------------------------------------------
# Audio
# ---------------------------------------------------------------------------

CAPTURE_RATE: int = cfg.get_int_setting("Audio", "sample_rate", 48000)
CHANNELS: int = cfg.get_int_setting("Audio", "channels", 1)
CHUNK_SIZE: int = cfg.get_int_setting("Audio", "blocksize", 4096)
INPUT_DEVICE: str | None = (
cfg.get_setting("Audio", "input_device", "pipewire") or None
)
if INPUT_DEVICE == "default":
INPUT_DEVICE = None

# Lemonade /realtime always requires 16 kHz PCM-16 — this is not configurable.
ASR_RATE: int = 16000

# ---------------------------------------------------------------------------
# Storage
# ---------------------------------------------------------------------------

STORAGE_BASE: Path = Path(
cfg.get_setting("Storage", "audio_clips_path", "~/.listenr/audio_clips")
or "~/.listenr/audio_clips"
).expanduser()

STORAGE_CLIPS_ENABLED: bool = cfg.get_bool_setting(
"Storage", "audio_clips_enabled", True
)
STORAGE_RETENTION_DAYS: int = cfg.get_int_setting("Storage", "retention_days", 90)
STORAGE_MAX_GB: float = cfg.get_float_setting("Storage", "max_storage_gb", 10.0)

# ---------------------------------------------------------------------------
# VAD
# ---------------------------------------------------------------------------

VAD_THRESHOLD: float = cfg.get_float_setting("VAD", "threshold", 0.05)
VAD_SILENCE_MS: int = cfg.get_int_setting("VAD", "silence_duration_ms", 800)
VAD_PREFIX_PADDING_MS: int = cfg.get_int_setting("VAD", "prefix_padding_ms", 250)

# ---------------------------------------------------------------------------
# LLM
# ---------------------------------------------------------------------------

LLM_ENABLED: bool = cfg.get_bool_setting("LLM", "enabled", True)
LLM_MODEL: str = (
cfg.get_setting("LLM", "model", "gpt-oss-20b-mxfp4-GGUF")
or "gpt-oss-20b-mxfp4-GGUF"
)
LLM_API_BASE: str = (
cfg.get_setting("LLM", "api_base", "http://localhost:8000/api/v1")
or "http://localhost:8000/api/v1"
)
LLM_TEMPERATURE: float = cfg.get_float_setting("LLM", "temperature", 0.3)
LLM_MAX_TOKENS: int = cfg.get_int_setting("LLM", "max_tokens", 1500)
LLM_TIMEOUT: int = cfg.get_int_setting("LLM", "timeout", 30)
LLM_CONTEXT_WINDOW: int = cfg.get_int_setting("LLM", "context_window", 10)

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------

DATASET_OUTPUT: Path = Path(
cfg.get_setting("Dataset", "output_path", "~/listenr_dataset")
or "~/listenr_dataset"
).expanduser()
DATASET_SPLIT: str = cfg.get_setting("Dataset", "split", "80/10/10") or "80/10/10"
DATASET_MIN_DURATION: float = cfg.get_float_setting("Dataset", "min_duration", 0.3)
DATASET_MIN_CHARS: int = cfg.get_int_setting("Dataset", "min_chars", 2)
DATASET_SEED: int = cfg.get_int_setting("Dataset", "seed", 42)

# Accepted values for the ``[Dataset] format`` setting.
_VALID_DATASET_FORMATS: frozenset[str] = frozenset({"csv", "hf", "both"})
_raw_dataset_format: str = cfg.get_setting("Dataset", "format", "csv") or "csv"
# Validate a normalised copy so that differences in case or surrounding
# whitespace (e.g. "CSV") do not trigger the fallback.
_normalized_dataset_format: str = _raw_dataset_format.strip().lower()
if _normalized_dataset_format not in _VALID_DATASET_FORMATS:
    import warnings

    # Report the value exactly as it was read so the user can find it in
    # their config file.
    warnings.warn(
        f"Config [Dataset] format={_raw_dataset_format!r} is not a recognised value "
        f"({', '.join(sorted(_VALID_DATASET_FORMATS))}); falling back to 'csv'.",
        UserWarning,
        stacklevel=2,
    )
    _normalized_dataset_format = "csv"
DATASET_FORMAT: str = _normalized_dataset_format

# ---------------------------------------------------------------------------
# Output / transcript files
# ---------------------------------------------------------------------------

def _optional_path(section: str, key: str) -> Path | None:
    """Return the configured path with ``~`` expanded, or ``None`` if unset.

    The setting is read with an empty-string fallback; any falsy value is
    treated as "no file configured". A helper is used instead of a
    module-level assignment expression so that no temporary name ends up as
    a global attribute of this module.
    """
    value = cfg.get_setting(section, key, "")
    return Path(value).expanduser() if value else None


OUTPUT_FILE: Path | None = _optional_path("Output", "file")
OUTPUT_LLM_FILE: Path | None = _optional_path("Output", "llm_file")
OUTPUT_LINE_FORMAT: str = (
    cfg.get_setting("Output", "line_format", "[{timestamp}] {text}")
    or "[{timestamp}] {text}"
)
OUTPUT_TIMESTAMP_FORMAT: str = (
    cfg.get_setting("Output", "timestamp_format", "%Y-%m-%d %H:%M:%S")
    or "%Y-%m-%d %H:%M:%S"
)
OUTPUT_SHOW_RAW: bool = cfg.get_bool_setting("Output", "show_raw", False)

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

LOG_LEVEL: str = cfg.get_setting("Logging", "level", "INFO") or "INFO"
LOG_FILE: Path | None = _optional_path("Logging", "file")


# ---------------------------------------------------------------------------
# Reload helper (used by tests and advanced callers)
# ---------------------------------------------------------------------------

def reload() -> None:
    """Refresh every constant in this module from the current config state.

    The module is re-executed in place, so attribute access on the module
    object reflects the new values. Intended for tests and advanced callers
    that change :mod:`listenr.config_manager` settings after import::

        cfg.update_setting('LLM', 'model', 'my-test-model')
        import listenr.constants as C
        C.reload()
        assert C.LLM_MODEL == 'my-test-model'
    """
    from importlib import reload as _reimport
    from sys import modules as _loaded_modules

    # Names already bound elsewhere via ``from listenr.constants import X``
    # keep their old values; only lookups that go through the module object
    # (``constants.X``) observe the refreshed ones.
    _reimport(_loaded_modules[__name__])
Loading