Rebreda · Rebreda · Mar 2, 2026 · Mar 1, 2026 · Mar 1, 2026 · Mar 1, 2026
diff --git a/src/listenr/build_dataset.py b/src/listenr/build_dataset.py
@@ -26,23 +26,33 @@
 import csv
 import json
 import logging
-import os
 import random
 import sys
 from pathlib import Path
 
-import listenr.config_manager as cfg
+from listenr.constants import (
+    DATASET_FORMAT,
+    DATASET_MIN_CHARS,
+    DATASET_MIN_DURATION,
+    DATASET_OUTPUT,
+    DATASET_SEED,
+    DATASET_SPLIT,
+    STORAGE_BASE,
+)
 
 logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
 logger = logging.getLogger("listenr.build_dataset")
 
 # ---------------------------------------------------------------------------
-# Defaults
+# Defaults (sourced from constants, which read from config at import time)
 # ---------------------------------------------------------------------------
-DEFAULT_OUTPUT = Path("~/listenr_dataset").expanduser()
-DEFAULT_SPLIT = "80/10/10"
-DEFAULT_MIN_DURATION = 0.3  # seconds
-DEFAULT_MIN_CHARS = 2  # minimum non-whitespace chars in transcription
+
+DEFAULT_OUTPUT       = DATASET_OUTPUT
+DEFAULT_SPLIT        = DATASET_SPLIT
+DEFAULT_MIN_DURATION = DATASET_MIN_DURATION
+DEFAULT_MIN_CHARS    = DATASET_MIN_CHARS
+DEFAULT_SEED         = DATASET_SEED
+DEFAULT_FORMAT       = DATASET_FORMAT
 
 CSV_COLUMNS = [
     "uuid",
@@ -65,8 +75,7 @@
 
 def _manifest_path() -> Path:
     """Return the manifest.jsonl path from config."""
-    clips_path = cfg.get_setting("Storage", "audio_clips_path", "~/.listenr/audio_clips")
-    return Path(clips_path).expanduser() / "manifest.jsonl"
+    return STORAGE_BASE / "manifest.jsonl"
 
 
 def load_manifest(manifest_path: Path) -> list[dict]:
@@ -244,36 +253,36 @@ def main() -> None:
         "--output",
         type=Path,
         default=DEFAULT_OUTPUT,
-        help=f"Output directory for dataset files (default: {DEFAULT_OUTPUT})",
+        help=f"Output directory for dataset files (default: from config, currently {DEFAULT_OUTPUT})",
     )
     parser.add_argument(
         "--split",
         default=DEFAULT_SPLIT,
-        help=f"Train/dev/test split percentages, e.g. 80/10/10 (default: {DEFAULT_SPLIT})",
+        help=f"Train/dev/test split percentages, e.g. 80/10/10 (default: from config, currently {DEFAULT_SPLIT})",
     )
     parser.add_argument(
         "--min-duration",
         type=float,
         default=DEFAULT_MIN_DURATION,
-        help=f"Minimum clip duration in seconds (default: {DEFAULT_MIN_DURATION})",
+        help=f"Minimum clip duration in seconds (default: from config, currently {DEFAULT_MIN_DURATION})",
     )
     parser.add_argument(
         "--min-chars",
         type=int,
         default=DEFAULT_MIN_CHARS,
-        help=f"Minimum non-whitespace chars in transcription (default: {DEFAULT_MIN_CHARS})",
+        help=f"Minimum non-whitespace chars in transcription (default: from config, currently {DEFAULT_MIN_CHARS})",
     )
     parser.add_argument(
         "--seed",
         type=int,
-        default=42,
-        help="Random seed for reproducible splits (default: 42)",
+        default=DEFAULT_SEED,
+        help=f"Random seed for reproducible splits (default: from config, currently {DEFAULT_SEED})",
     )
     parser.add_argument(
         "--format",
         choices=["csv", "hf", "both"],
-        default="csv",
-        help="Output format: csv, hf (HuggingFace datasets), or both (default: csv)",
+        default=DEFAULT_FORMAT,
+        help=f"Output format: csv, hf (HuggingFace datasets), or both (default: from config, currently {DEFAULT_FORMAT})",
     )
     parser.add_argument(
         "--dry-run",

diff --git a/src/listenr/cli.py b/src/listenr/cli.py
@@ -23,47 +23,38 @@
 from collections import deque
 from math import gcd
 from scipy.signal import resample_poly
-from pathlib import Path
 
 from listenr.unified_asr import LemonadeUnifiedASR
 from listenr.llm_processor import lemonade_llm_correct, lemonade_load_model, lemonade_unload_models
 from listenr.transcript_utils import is_hallucination, strip_noise_tags
 from listenr.storage import save_recording
-import listenr.config_manager as cfg
+from listenr.constants import (
+    ASR_RATE,
+    CAPTURE_RATE,
+    CHANNELS,
+    CHUNK_SIZE,
+    INPUT_DEVICE,
+    LLM_CONTEXT_WINDOW,
+    LLM_ENABLED as USE_LLM,
+    LLM_MODEL,
+    STORAGE_BASE,
+    WHISPER_MODEL,
+)
 
 logging.basicConfig(level=logging.WARNING, format='%(levelname)s: %(message)s')
 log = logging.getLogger('listenr.cli')
 
-# Audio settings from config
-CAPTURE_RATE = cfg.get_int_setting('Audio', 'sample_rate', 16000)
-ASR_RATE = 16000  # Lemonade /realtime always requires 16kHz PCM16
-CHUNK_SIZE = cfg.get_int_setting('Audio', 'blocksize', 1360)
-CHANNELS = cfg.get_int_setting('Audio', 'channels', 1)
-INPUT_DEVICE = cfg.get_setting('Audio', 'input_device', 'default') or None
-if INPUT_DEVICE == 'default':
-    INPUT_DEVICE = None
-
 # Compute resample ratio once (e.g. 48000→16000 = up 1, down 3)
 _gcd = gcd(CAPTURE_RATE, ASR_RATE)
 _RESAMPLE_UP = ASR_RATE // _gcd
 _RESAMPLE_DOWN = CAPTURE_RATE // _gcd
 _NEED_RESAMPLE = (CAPTURE_RATE != ASR_RATE)
 
-# Storage
-STORAGE_BASE = Path(
-    cfg.get_setting('Storage', 'audio_clips_path', '~/listenr_recordings') or '~/listenr_recordings'
-).expanduser()
-
-# LLM settings
-USE_LLM = cfg.get_bool_setting('LLM', 'enabled', False)
-LLM_MODEL = cfg.get_setting('LLM', 'model', 'gpt-oss-20b-mxfp4-GGUF') or 'gpt-oss-20b-mxfp4-GGUF'
-WHISPER_MODEL = cfg.get_setting('Whisper', 'model', 'Whisper-Large-v3-Turbo') or 'Whisper-Large-v3-Turbo'
-
 
 def get_lemonade_ws_url() -> str:
     """Discover Lemonade WebSocket URL from /api/v1/health."""
-    api_base = cfg.get_setting('LLM', 'api_base', 'http://localhost:8000/api/v1') or 'http://localhost:8000/api/v1'
-    health_url = api_base.rstrip('/').replace('/api/v1', '') + '/api/v1/health'
+    from listenr.constants import LLM_API_BASE
+    health_url = LLM_API_BASE.rstrip('/').replace('/api/v1', '') + '/api/v1/health'
     try:
         resp = requests.get(health_url, timeout=2)
         resp.raise_for_status()
@@ -165,8 +156,7 @@ async def _run(save: bool, show_raw: bool, debug: bool):
     asr = LemonadeUnifiedASR(use_llm=False)  # LLM correction handled here for saving
     pcm_buffer: list = []
     # Rolling window of (raw, corrected) pairs passed as context to the LLM
-    _context_size = cfg.get_int_setting('LLM', 'context_window', 3)
-    llm_context: deque[tuple[str, str]] = deque(maxlen=_context_size)
+    llm_context: deque[tuple[str, str]] = deque(maxlen=LLM_CONTEXT_WINDOW)
 
     async for result in asr.stream_transcribe(
         mic_stream(pcm_buffer, debug=debug),

diff --git a/src/listenr/config_manager.py b/src/listenr/config_manager.py
@@ -54,10 +54,18 @@
         'timeout': '30',
         'context_window': '10',  # Number of preceding segments passed as context to the LLM
     },
+    'Dataset': {
+        'output_path': '~/listenr_dataset',   # Where build_dataset writes CSV/HF output
+        'split': '80/10/10',                  # Train/dev/test split percentages
+        'min_duration': '0.3',                # Minimum clip duration in seconds
+        'min_chars': '2',                     # Minimum non-whitespace chars in transcription
+        'seed': '42',                         # Random seed for reproducible splits
+        'format': 'csv',                      # Output format: csv, hf, or both
+    },
     'Output': {
         'file': '~/transcripts_raw.txt',
         'llm_file': '~/transcripts_clean.txt',
-        'format': '[{timestamp}] {text}',
+        'line_format': '[{timestamp}] {text}',
         'timestamp_format': '%%Y-%%m-%%d %%H:%%M:%%S',  # Double %% for configparser escaping
         'show_raw': 'false',
     },

diff --git a/src/listenr/constants.py b/src/listenr/constants.py
@@ -0,0 +1,179 @@
+"""
+constants.py — Typed, config-backed constants for the listenr package.
+
+All values are read **once** at import time from ``~/.config/listenr/config.ini``
+(via :mod:`listenr.config_manager`).
+
+Downstream modules should import individual names::
+
+    from listenr.constants import CAPTURE_RATE, LLM_MODEL, WHISPER_MODEL
+
+To refresh at runtime (e.g. tests that patch config), call :func:`reload`; only
+``constants.X`` attribute access sees new values, ``from``-imports stay stale.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import listenr.config_manager as cfg
+
+# ---------------------------------------------------------------------------
+# Lemonade
+# ---------------------------------------------------------------------------
+
+LEMONADE_API_BASE: str = (
+    cfg.get_setting("Lemonade", "api_base", "http://localhost:8000/api/v1")
+    or "http://localhost:8000/api/v1"
+)
+
+# ---------------------------------------------------------------------------
+# Whisper
+# ---------------------------------------------------------------------------
+
+WHISPER_MODEL: str = (
+    cfg.get_setting("Whisper", "model", "Whisper-Tiny") or "Whisper-Tiny"
+)
+
+# ---------------------------------------------------------------------------
+# Audio
+# ---------------------------------------------------------------------------
+
+CAPTURE_RATE: int = cfg.get_int_setting("Audio", "sample_rate", 48000)
+CHANNELS: int = cfg.get_int_setting("Audio", "channels", 1)
+CHUNK_SIZE: int = cfg.get_int_setting("Audio", "blocksize", 4096)
+INPUT_DEVICE: str | None = (
+    cfg.get_setting("Audio", "input_device", "pipewire") or None
+)
+if INPUT_DEVICE == "default":
+    INPUT_DEVICE = None
+
+# Lemonade /realtime always requires 16 kHz PCM-16 — this is not configurable.
+ASR_RATE: int = 16000
+
+# ---------------------------------------------------------------------------
+# Storage
+# ---------------------------------------------------------------------------
+
+STORAGE_BASE: Path = Path(
+    cfg.get_setting("Storage", "audio_clips_path", "~/.listenr/audio_clips")
+    or "~/.listenr/audio_clips"
+).expanduser()
+
+STORAGE_CLIPS_ENABLED: bool = cfg.get_bool_setting(
+    "Storage", "audio_clips_enabled", True
+)
+STORAGE_RETENTION_DAYS: int = cfg.get_int_setting("Storage", "retention_days", 90)
+STORAGE_MAX_GB: float = cfg.get_float_setting("Storage", "max_storage_gb", 10.0)
+
+# ---------------------------------------------------------------------------
+# VAD
+# ---------------------------------------------------------------------------
+
+VAD_THRESHOLD: float = cfg.get_float_setting("VAD", "threshold", 0.05)
+VAD_SILENCE_MS: int = cfg.get_int_setting("VAD", "silence_duration_ms", 800)
+VAD_PREFIX_PADDING_MS: int = cfg.get_int_setting("VAD", "prefix_padding_ms", 250)
+
+# ---------------------------------------------------------------------------
+# LLM
+# ---------------------------------------------------------------------------
+
+LLM_ENABLED: bool = cfg.get_bool_setting("LLM", "enabled", True)
+LLM_MODEL: str = (
+    cfg.get_setting("LLM", "model", "gpt-oss-20b-mxfp4-GGUF")
+    or "gpt-oss-20b-mxfp4-GGUF"
+)
+LLM_API_BASE: str = (
+    cfg.get_setting("LLM", "api_base", "http://localhost:8000/api/v1")
+    or "http://localhost:8000/api/v1"
+)
+LLM_TEMPERATURE: float = cfg.get_float_setting("LLM", "temperature", 0.3)
+LLM_MAX_TOKENS: int = cfg.get_int_setting("LLM", "max_tokens", 1500)
+LLM_TIMEOUT: int = cfg.get_int_setting("LLM", "timeout", 30)
+LLM_CONTEXT_WINDOW: int = cfg.get_int_setting("LLM", "context_window", 10)
+
+# ---------------------------------------------------------------------------
+# Dataset
+# ---------------------------------------------------------------------------
+
+DATASET_OUTPUT: Path = Path(
+    cfg.get_setting("Dataset", "output_path", "~/listenr_dataset")
+    or "~/listenr_dataset"
+).expanduser()
+DATASET_SPLIT: str = cfg.get_setting("Dataset", "split", "80/10/10") or "80/10/10"
+DATASET_MIN_DURATION: float = cfg.get_float_setting("Dataset", "min_duration", 0.3)
+DATASET_MIN_CHARS: int = cfg.get_int_setting("Dataset", "min_chars", 2)
+DATASET_SEED: int = cfg.get_int_setting("Dataset", "seed", 42)
+
+_VALID_DATASET_FORMATS: frozenset[str] = frozenset({"csv", "hf", "both"})
+_raw_dataset_format: str = cfg.get_setting("Dataset", "format", "csv") or "csv"
+if _raw_dataset_format not in _VALID_DATASET_FORMATS:
+    import warnings
+    warnings.warn(
+        f"Config [Dataset] format={_raw_dataset_format!r} is not a recognised value "
+        f"({', '.join(sorted(_VALID_DATASET_FORMATS))}); falling back to 'csv'.",
+        UserWarning,
+        stacklevel=2,
+    )
+    _raw_dataset_format = "csv"
+DATASET_FORMAT: str = _raw_dataset_format
+
+# ---------------------------------------------------------------------------
+# Output / transcript files
+# ---------------------------------------------------------------------------
+
+def _optional_path(section: str, key: str) -> Path | None:
+    """Return the ``~``-expanded path stored under ``[section] key``.
+
+    Returns ``None`` when the setting is missing or empty.  Doing the lookup
+    in a function keeps the temporary local; a module-level walrus would
+    leave a stray public ``v`` attribute on this module.
+    """
+    raw = cfg.get_setting(section, key, "")
+    return Path(raw).expanduser() if raw else None
+
+
+# File destinations are optional: an empty or missing value resolves to None.
+OUTPUT_FILE: Path | None = _optional_path("Output", "file")
+OUTPUT_LLM_FILE: Path | None = _optional_path("Output", "llm_file")
+OUTPUT_LINE_FORMAT: str = (
+    cfg.get_setting("Output", "line_format", "[{timestamp}] {text}")
+    or "[{timestamp}] {text}"
+)
+OUTPUT_TIMESTAMP_FORMAT: str = (
+    cfg.get_setting("Output", "timestamp_format", "%Y-%m-%d %H:%M:%S")
+    or "%Y-%m-%d %H:%M:%S"
+)
+OUTPUT_SHOW_RAW: bool = cfg.get_bool_setting("Output", "show_raw", False)
+
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+
+LOG_LEVEL: str = cfg.get_setting("Logging", "level", "INFO") or "INFO"
+LOG_FILE: Path | None = _optional_path("Logging", "file")
+
+
+# ---------------------------------------------------------------------------
+# Reload helper (used by tests and advanced callers)
+# ---------------------------------------------------------------------------
+
+def reload() -> None:
+    """Refresh every constant by re-executing this module against current config.
+
+    Intended for tests that change :mod:`listenr.config_manager` after import::
+
+        cfg.update_setting('LLM', 'model', 'my-test-model')
+        import listenr.constants as C
+        C.reload()
+        assert C.LLM_MODEL == 'my-test-model'
+    """
+    import importlib
+    import sys
+
+    # importlib.reload() runs the module body again inside the existing module
+    # object, so attribute access (``constants.X``) observes the new values.
+    # Names already copied out via ``from listenr.constants import X`` in
+    # other modules keep their old bindings and are NOT refreshed.
+    this_module = sys.modules[__name__]
+    importlib.reload(this_module)