diff --git a/src/listenr/build_dataset.py b/src/listenr/build_dataset.py index fc80002..e7e7201 100644 --- a/src/listenr/build_dataset.py +++ b/src/listenr/build_dataset.py @@ -26,23 +26,33 @@ import csv import json import logging -import os import random import sys from pathlib import Path -import listenr.config_manager as cfg +from listenr.constants import ( + DATASET_FORMAT, + DATASET_MIN_CHARS, + DATASET_MIN_DURATION, + DATASET_OUTPUT, + DATASET_SEED, + DATASET_SPLIT, + STORAGE_BASE, +) logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") logger = logging.getLogger("listenr.build_dataset") # --------------------------------------------------------------------------- -# Defaults +# Defaults (sourced from constants, which read from config at import time) # --------------------------------------------------------------------------- -DEFAULT_OUTPUT = Path("~/listenr_dataset").expanduser() -DEFAULT_SPLIT = "80/10/10" -DEFAULT_MIN_DURATION = 0.3 # seconds -DEFAULT_MIN_CHARS = 2 # minimum non-whitespace chars in transcription + +DEFAULT_OUTPUT = DATASET_OUTPUT +DEFAULT_SPLIT = DATASET_SPLIT +DEFAULT_MIN_DURATION = DATASET_MIN_DURATION +DEFAULT_MIN_CHARS = DATASET_MIN_CHARS +DEFAULT_SEED = DATASET_SEED +DEFAULT_FORMAT = DATASET_FORMAT CSV_COLUMNS = [ "uuid", @@ -65,8 +75,7 @@ def _manifest_path() -> Path: """Return the manifest.jsonl path from config.""" - clips_path = cfg.get_setting("Storage", "audio_clips_path", "~/.listenr/audio_clips") - return Path(clips_path).expanduser() / "manifest.jsonl" + return STORAGE_BASE / "manifest.jsonl" def load_manifest(manifest_path: Path) -> list[dict]: @@ -244,36 +253,36 @@ def main() -> None: "--output", type=Path, default=DEFAULT_OUTPUT, - help=f"Output directory for dataset files (default: {DEFAULT_OUTPUT})", + help=f"Output directory for dataset files (default: from config, currently {DEFAULT_OUTPUT})", ) parser.add_argument( "--split", default=DEFAULT_SPLIT, - help=f"Train/dev/test split percentages, e.g. 80/10/10 (default: {DEFAULT_SPLIT})", + help=f"Train/dev/test split percentages, e.g. 80/10/10 (default: from config, currently {DEFAULT_SPLIT})", ) parser.add_argument( "--min-duration", type=float, default=DEFAULT_MIN_DURATION, - help=f"Minimum clip duration in seconds (default: {DEFAULT_MIN_DURATION})", + help=f"Minimum clip duration in seconds (default: from config, currently {DEFAULT_MIN_DURATION})", ) parser.add_argument( "--min-chars", type=int, default=DEFAULT_MIN_CHARS, - help=f"Minimum non-whitespace chars in transcription (default: {DEFAULT_MIN_CHARS})", + help=f"Minimum non-whitespace chars in transcription (default: from config, currently {DEFAULT_MIN_CHARS})", ) parser.add_argument( "--seed", type=int, - default=42, - help="Random seed for reproducible splits (default: 42)", + default=DEFAULT_SEED, + help=f"Random seed for reproducible splits (default: from config, currently {DEFAULT_SEED})", ) parser.add_argument( "--format", choices=["csv", "hf", "both"], - default="csv", - help="Output format: csv, hf (HuggingFace datasets), or both (default: csv)", + default=DEFAULT_FORMAT, + help=f"Output format: csv, hf (HuggingFace datasets), or both (default: from config, currently {DEFAULT_FORMAT})", ) parser.add_argument( "--dry-run", diff --git a/src/listenr/cli.py b/src/listenr/cli.py index f395ba5..ec55d68 100644 --- a/src/listenr/cli.py +++ b/src/listenr/cli.py @@ -23,47 +23,38 @@ from collections import deque from math import gcd from scipy.signal import resample_poly -from pathlib import Path from listenr.unified_asr import LemonadeUnifiedASR from listenr.llm_processor import lemonade_llm_correct, lemonade_load_model, lemonade_unload_models from listenr.transcript_utils import is_hallucination, strip_noise_tags from listenr.storage import save_recording -import listenr.config_manager as cfg +from listenr.constants import ( + ASR_RATE, + CAPTURE_RATE, + CHANNELS, + CHUNK_SIZE, + INPUT_DEVICE, + LLM_CONTEXT_WINDOW, + LLM_ENABLED as USE_LLM, + LLM_MODEL, + STORAGE_BASE, + WHISPER_MODEL, +) logging.basicConfig(level=logging.WARNING, format='%(levelname)s: %(message)s') log = logging.getLogger('listenr.cli') -# Audio settings from config -CAPTURE_RATE = cfg.get_int_setting('Audio', 'sample_rate', 16000) -ASR_RATE = 16000 # Lemonade /realtime always requires 16kHz PCM16 -CHUNK_SIZE = cfg.get_int_setting('Audio', 'blocksize', 1360) -CHANNELS = cfg.get_int_setting('Audio', 'channels', 1) -INPUT_DEVICE = cfg.get_setting('Audio', 'input_device', 'default') or None -if INPUT_DEVICE == 'default': - INPUT_DEVICE = None - # Compute resample ratio once (e.g. 48000→16000 = up 1, down 3) _gcd = gcd(CAPTURE_RATE, ASR_RATE) _RESAMPLE_UP = ASR_RATE // _gcd _RESAMPLE_DOWN = CAPTURE_RATE // _gcd _NEED_RESAMPLE = (CAPTURE_RATE != ASR_RATE) -# Storage -STORAGE_BASE = Path( - cfg.get_setting('Storage', 'audio_clips_path', '~/listenr_recordings') or '~/listenr_recordings' -).expanduser() - -# LLM settings -USE_LLM = cfg.get_bool_setting('LLM', 'enabled', False) -LLM_MODEL = cfg.get_setting('LLM', 'model', 'gpt-oss-20b-mxfp4-GGUF') or 'gpt-oss-20b-mxfp4-GGUF' -WHISPER_MODEL = cfg.get_setting('Whisper', 'model', 'Whisper-Large-v3-Turbo') or 'Whisper-Large-v3-Turbo' - def get_lemonade_ws_url() -> str: """Discover Lemonade WebSocket URL from /api/v1/health.""" - api_base = cfg.get_setting('LLM', 'api_base', 'http://localhost:8000/api/v1') or 'http://localhost:8000/api/v1' - health_url = api_base.rstrip('/').replace('/api/v1', '') + '/api/v1/health' + from listenr.constants import LLM_API_BASE + health_url = LLM_API_BASE.rstrip('/').replace('/api/v1', '') + '/api/v1/health' try: resp = requests.get(health_url, timeout=2) resp.raise_for_status() @@ -165,8 +156,7 @@ async def _run(save: bool, show_raw: bool, debug: bool): asr = LemonadeUnifiedASR(use_llm=False) # LLM correction handled here for saving pcm_buffer: list = [] # Rolling window of (raw, corrected) pairs passed as context to the LLM - _context_size = cfg.get_int_setting('LLM', 'context_window', 3) - llm_context: deque[tuple[str, str]] = deque(maxlen=_context_size) + llm_context: deque[tuple[str, str]] = deque(maxlen=LLM_CONTEXT_WINDOW) async for result in asr.stream_transcribe( mic_stream(pcm_buffer, debug=debug), diff --git a/src/listenr/config_manager.py b/src/listenr/config_manager.py index c3d9edc..0d93610 100644 --- a/src/listenr/config_manager.py +++ b/src/listenr/config_manager.py @@ -54,10 +54,18 @@ 'timeout': '30', 'context_window': '10', # Number of preceding segments passed as context to the LLM }, + 'Dataset': { + 'output_path': '~/listenr_dataset', # Where build_dataset writes CSV/HF output + 'split': '80/10/10', # Train/dev/test split percentages + 'min_duration': '0.3', # Minimum clip duration in seconds + 'min_chars': '2', # Minimum non-whitespace chars in transcription + 'seed': '42', # Random seed for reproducible splits + 'format': 'csv', # Output format: csv, hf, or both + }, 'Output': { 'file': '~/transcripts_raw.txt', 'llm_file': '~/transcripts_clean.txt', - 'format': '[{timestamp}] {text}', + 'line_format': '[{timestamp}] {text}', 'timestamp_format': '%%Y-%%m-%%d %%H:%%M:%%S', # Double %% for configparser escaping 'show_raw': 'false', }, diff --git a/src/listenr/constants.py b/src/listenr/constants.py new file mode 100644 index 0000000..ed79e54 --- /dev/null +++ b/src/listenr/constants.py @@ -0,0 +1,179 @@ +""" +constants.py — Typed, config-backed constants for the listenr package. + +All values are read **once** at import time from ``~/.config/listenr/config.ini`` +(via :mod:`listenr.config_manager`). + +Downstream modules should import individual names:: + + from listenr.constants import CAPTURE_RATE, LLM_MODEL, WHISPER_MODEL + +If you need to refresh constants at runtime (e.g. tests that patch config), +call :func:`reload` to re-read all values from the current config state. +""" + +from __future__ import annotations + +from pathlib import Path + +import listenr.config_manager as cfg + +# --------------------------------------------------------------------------- +# Lemonade +# --------------------------------------------------------------------------- + +LEMONADE_API_BASE: str = ( + cfg.get_setting("Lemonade", "api_base", "http://localhost:8000/api/v1") + or "http://localhost:8000/api/v1" +) + +# --------------------------------------------------------------------------- +# Whisper +# --------------------------------------------------------------------------- + +WHISPER_MODEL: str = ( + cfg.get_setting("Whisper", "model", "Whisper-Tiny") or "Whisper-Tiny" +) + +# --------------------------------------------------------------------------- +# Audio +# --------------------------------------------------------------------------- + +CAPTURE_RATE: int = cfg.get_int_setting("Audio", "sample_rate", 48000) +CHANNELS: int = cfg.get_int_setting("Audio", "channels", 1) +CHUNK_SIZE: int = cfg.get_int_setting("Audio", "blocksize", 4096) +INPUT_DEVICE: str | None = ( + cfg.get_setting("Audio", "input_device", "pipewire") or None +) +if INPUT_DEVICE == "default": + INPUT_DEVICE = None + +# Lemonade /realtime always requires 16 kHz PCM-16 — this is not configurable. +ASR_RATE: int = 16000 + +# --------------------------------------------------------------------------- +# Storage +# --------------------------------------------------------------------------- + +STORAGE_BASE: Path = Path( + cfg.get_setting("Storage", "audio_clips_path", "~/.listenr/audio_clips") + or "~/.listenr/audio_clips" +).expanduser() + +STORAGE_CLIPS_ENABLED: bool = cfg.get_bool_setting( + "Storage", "audio_clips_enabled", True +) +STORAGE_RETENTION_DAYS: int = cfg.get_int_setting("Storage", "retention_days", 90) +STORAGE_MAX_GB: float = cfg.get_float_setting("Storage", "max_storage_gb", 10.0) + +# --------------------------------------------------------------------------- +# VAD +# --------------------------------------------------------------------------- + +VAD_THRESHOLD: float = cfg.get_float_setting("VAD", "threshold", 0.05) +VAD_SILENCE_MS: int = cfg.get_int_setting("VAD", "silence_duration_ms", 800) +VAD_PREFIX_PADDING_MS: int = cfg.get_int_setting("VAD", "prefix_padding_ms", 250) + +# --------------------------------------------------------------------------- +# LLM +# --------------------------------------------------------------------------- + +LLM_ENABLED: bool = cfg.get_bool_setting("LLM", "enabled", True) +LLM_MODEL: str = ( + cfg.get_setting("LLM", "model", "gpt-oss-20b-mxfp4-GGUF") + or "gpt-oss-20b-mxfp4-GGUF" +) +LLM_API_BASE: str = ( + cfg.get_setting("LLM", "api_base", "http://localhost:8000/api/v1") + or "http://localhost:8000/api/v1" +) +LLM_TEMPERATURE: float = cfg.get_float_setting("LLM", "temperature", 0.3) +LLM_MAX_TOKENS: int = cfg.get_int_setting("LLM", "max_tokens", 1500) +LLM_TIMEOUT: int = cfg.get_int_setting("LLM", "timeout", 30) +LLM_CONTEXT_WINDOW: int = cfg.get_int_setting("LLM", "context_window", 10) + +# --------------------------------------------------------------------------- +# Dataset +# --------------------------------------------------------------------------- + +DATASET_OUTPUT: Path = Path( + cfg.get_setting("Dataset", "output_path", "~/listenr_dataset") + or "~/listenr_dataset" +).expanduser() +DATASET_SPLIT: str = cfg.get_setting("Dataset", "split", "80/10/10") or "80/10/10" +DATASET_MIN_DURATION: float = cfg.get_float_setting("Dataset", "min_duration", 0.3) +DATASET_MIN_CHARS: int = cfg.get_int_setting("Dataset", "min_chars", 2) +DATASET_SEED: int = cfg.get_int_setting("Dataset", "seed", 42) + +_VALID_DATASET_FORMATS: frozenset[str] = frozenset({"csv", "hf", "both"}) +_raw_dataset_format: str = cfg.get_setting("Dataset", "format", "csv") or "csv" +if _raw_dataset_format not in _VALID_DATASET_FORMATS: + import warnings + warnings.warn( + f"Config [Dataset] format={_raw_dataset_format!r} is not a recognised value " + f"({', '.join(sorted(_VALID_DATASET_FORMATS))}); falling back to 'csv'.", + UserWarning, + stacklevel=2, + ) + _raw_dataset_format = "csv" +DATASET_FORMAT: str = _raw_dataset_format + +# --------------------------------------------------------------------------- +# Output / transcript files +# --------------------------------------------------------------------------- + +OUTPUT_FILE: Path | None = ( + Path(v).expanduser() + if (v := cfg.get_setting("Output", "file", "")) + else None +) +OUTPUT_LLM_FILE: Path | None = ( + Path(v).expanduser() + if (v := cfg.get_setting("Output", "llm_file", "")) + else None +) +OUTPUT_LINE_FORMAT: str = ( + cfg.get_setting("Output", "line_format", "[{timestamp}] {text}") + or "[{timestamp}] {text}" +) +OUTPUT_TIMESTAMP_FORMAT: str = ( + cfg.get_setting("Output", "timestamp_format", "%Y-%m-%d %H:%M:%S") + or "%Y-%m-%d %H:%M:%S" +) +OUTPUT_SHOW_RAW: bool = cfg.get_bool_setting("Output", "show_raw", False) + +# --------------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------------- + +LOG_LEVEL: str = cfg.get_setting("Logging", "level", "INFO") or "INFO" +LOG_FILE: Path | None = ( + Path(v).expanduser() + if (v := cfg.get_setting("Logging", "file", "")) + else None +) + + +# --------------------------------------------------------------------------- +# Reload helper (used by tests and advanced callers) +# --------------------------------------------------------------------------- + +def reload() -> None: + """Re-read all constants from the current config state (in-place update). + + Useful in tests that patch :mod:`listenr.config_manager` after import:: + + cfg.update_setting('LLM', 'model', 'my-test-model') + import listenr.constants as C + C.reload() + assert C.LLM_MODEL == 'my-test-model' + """ + import sys + import importlib + + # Re-execute this module in the same module object so all names are updated + # in place — existing ``from listenr.constants import X`` bindings in already- + # imported modules won't see the change, but direct attribute access on the + # module object (``constants.X``) will. + module = sys.modules[__name__] + importlib.reload(module) diff --git a/src/listenr/llm_processor.py b/src/listenr/llm_processor.py index 649f20c..0213a29 100644 --- a/src/listenr/llm_processor.py +++ b/src/listenr/llm_processor.py @@ -7,8 +7,14 @@ import re import requests import listenr.config_manager as cfg - -_DEFAULT_API_BASE = "http://localhost:8000/api/v1" +from listenr.constants import ( + LLM_API_BASE as _DEFAULT_API_BASE, + LLM_MAX_TOKENS, + LLM_MODEL, + LLM_TEMPERATURE, + LLM_TIMEOUT, + WHISPER_MODEL, +) # System prompt for transcription post-processing. # The model must return ONLY a JSON object — no prose, no markdown fences. @@ -139,11 +145,11 @@ def lemonade_llm_correct( Never raises — on failure returns the original text with is_improved=False. """ if model is None: - model = cfg.get_setting('LLM', 'model', 'gpt-oss-20b-mxfp4-GGUF') + model = LLM_MODEL - temperature = cfg.get_float_setting('LLM', 'temperature', 0.1) - max_tokens = cfg.get_int_setting('LLM', 'max_tokens', 1500) - timeout = cfg.get_int_setting('LLM', 'timeout', 30) + temperature = LLM_TEMPERATURE + max_tokens = LLM_MAX_TOKENS + timeout = LLM_TIMEOUT # Build message list: system + interleaved context turns + current segment messages: list[dict] = [{"role": "system", "content": _CORRECTION_SYSTEM_PROMPT}] @@ -184,7 +190,7 @@ def lemonade_transcribe_audio(audio_path, model=None): Use Lemonade's HTTP transcription endpoint for audio files. """ if model is None: - model = cfg.get_setting('Whisper', 'model', 'Whisper-Tiny') + model = WHISPER_MODEL with open(audio_path, "rb") as f: resp = requests.post( f"{_api_base()}/audio/transcriptions", diff --git a/src/listenr/unified_asr.py b/src/listenr/unified_asr.py index 67d56ba..5577c1f 100644 --- a/src/listenr/unified_asr.py +++ b/src/listenr/unified_asr.py @@ -20,8 +20,14 @@ import websockets import asyncio -import listenr.config_manager as cfg from listenr.llm_processor import lemonade_llm_correct, lemonade_transcribe_audio +from listenr.constants import ( + LLM_API_BASE, + VAD_THRESHOLD, + VAD_SILENCE_MS, + VAD_PREFIX_PADDING_MS, + WHISPER_MODEL as _DEFAULT_WHISPER_MODEL, +) logger = logging.getLogger('unified_asr') @@ -72,11 +78,10 @@ async def stream_transcribe(self, audio_stream, whisper_model=None, on_result=No import base64 if whisper_model is None: - whisper_model = cfg.get_setting('Whisper', 'model', 'Whisper-Large-v3-Turbo') + whisper_model = _DEFAULT_WHISPER_MODEL if lemonade_ws_url is None: - api_base = cfg.get_setting('LLM', 'api_base', 'http://localhost:8000/api/v1') or 'http://localhost:8000/api/v1' try: - resp = requests.get(f"{api_base}/health", timeout=5) + resp = requests.get(f"{LLM_API_BASE}/health", timeout=5) ws_port = resp.json().get('websocket_port', 8001) except Exception: ws_port = 8001 @@ -87,9 +92,9 @@ async def stream_transcribe(self, audio_stream, whisper_model=None, on_result=No "session": { "model": whisper_model, "turn_detection": { - "threshold": cfg.get_float_setting('VAD', 'threshold', 0.01), - "silence_duration_ms": cfg.get_int_setting('VAD', 'silence_duration_ms', 800), - "prefix_padding_ms": cfg.get_int_setting('VAD', 'prefix_padding_ms', 250), + "threshold": VAD_THRESHOLD, + "silence_duration_ms": VAD_SILENCE_MS, + "prefix_padding_ms": VAD_PREFIX_PADDING_MS, }, }, } diff --git a/tests/test_constants.py b/tests/test_constants.py new file mode 100644 index 0000000..3402e77 --- /dev/null +++ b/tests/test_constants.py @@ -0,0 +1,344 @@ +""" +Unit tests for listenr.constants. + +Verifies: + - Every public constant exists and has the expected Python type. + - Path constants are absolute (already expanded). + - Numeric constants are within sane ranges. + - reload() correctly picks up config changes made at runtime. + - Constants are consumed by the modules that use them (smoke-import checks). +""" + +import importlib +from pathlib import Path + +import pytest + +import listenr.constants as C + + +# --------------------------------------------------------------------------- +# Type checks +# --------------------------------------------------------------------------- + +class TestConstantTypes: + def test_lemonade_api_base_is_str(self): + assert isinstance(C.LEMONADE_API_BASE, str) + + def test_whisper_model_is_str(self): + assert isinstance(C.WHISPER_MODEL, str) + + def test_capture_rate_is_int(self): + assert isinstance(C.CAPTURE_RATE, int) + + def test_asr_rate_is_int(self): + assert isinstance(C.ASR_RATE, int) + + def test_channels_is_int(self): + assert isinstance(C.CHANNELS, int) + + def test_chunk_size_is_int(self): + assert isinstance(C.CHUNK_SIZE, int) + + def test_input_device_is_str_or_none(self): + assert C.INPUT_DEVICE is None or isinstance(C.INPUT_DEVICE, str) + + def test_storage_base_is_path(self): + assert isinstance(C.STORAGE_BASE, Path) + + def test_storage_clips_enabled_is_bool(self): + assert isinstance(C.STORAGE_CLIPS_ENABLED, bool) + + def test_storage_retention_days_is_int(self): + assert isinstance(C.STORAGE_RETENTION_DAYS, int) + + def test_storage_max_gb_is_float(self): + assert isinstance(C.STORAGE_MAX_GB, float) + + def test_vad_threshold_is_float(self): + assert isinstance(C.VAD_THRESHOLD, float) + + def test_vad_silence_ms_is_int(self): + assert isinstance(C.VAD_SILENCE_MS, int) + + def test_vad_prefix_padding_ms_is_int(self): + assert isinstance(C.VAD_PREFIX_PADDING_MS, int) + + def test_llm_enabled_is_bool(self): + assert isinstance(C.LLM_ENABLED, bool) + + def test_llm_model_is_str(self): + assert isinstance(C.LLM_MODEL, str) + + def test_llm_api_base_is_str(self): + assert isinstance(C.LLM_API_BASE, str) + + def test_llm_temperature_is_float(self): + assert isinstance(C.LLM_TEMPERATURE, float) + + def test_llm_max_tokens_is_int(self): + assert isinstance(C.LLM_MAX_TOKENS, int) + + def test_llm_timeout_is_int(self): + assert isinstance(C.LLM_TIMEOUT, int) + + def test_llm_context_window_is_int(self): + assert isinstance(C.LLM_CONTEXT_WINDOW, int) + + def test_dataset_output_is_path(self): + assert isinstance(C.DATASET_OUTPUT, Path) + + def test_dataset_split_is_str(self): + assert isinstance(C.DATASET_SPLIT, str) + + def test_dataset_min_duration_is_float(self): + assert isinstance(C.DATASET_MIN_DURATION, float) + + def test_dataset_min_chars_is_int(self): + assert isinstance(C.DATASET_MIN_CHARS, int) + + def test_dataset_seed_is_int(self): + assert isinstance(C.DATASET_SEED, int) + + def test_dataset_format_is_str(self): + assert isinstance(C.DATASET_FORMAT, str) + + def test_output_file_is_path_or_none(self): + assert C.OUTPUT_FILE is None or isinstance(C.OUTPUT_FILE, Path) + + def test_output_llm_file_is_path_or_none(self): + assert C.OUTPUT_LLM_FILE is None or isinstance(C.OUTPUT_LLM_FILE, Path) + + def test_output_line_format_is_str(self): + assert isinstance(C.OUTPUT_LINE_FORMAT, str) + + def test_output_timestamp_format_is_str(self): + assert isinstance(C.OUTPUT_TIMESTAMP_FORMAT, str) + + def test_output_show_raw_is_bool(self): + assert isinstance(C.OUTPUT_SHOW_RAW, bool) + + def test_log_level_is_str(self): + assert isinstance(C.LOG_LEVEL, str) + + def test_log_file_is_path_or_none(self): + assert C.LOG_FILE is None or isinstance(C.LOG_FILE, Path) + + +# --------------------------------------------------------------------------- +# Value sanity checks +# --------------------------------------------------------------------------- + +class TestConstantValues: + def test_asr_rate_always_16000(self): + """ASR_RATE is fixed — Lemonade /realtime always requires 16 kHz.""" + assert C.ASR_RATE == 16000 + + def test_capture_rate_positive(self): + assert C.CAPTURE_RATE > 0 + + def test_chunk_size_positive(self): + assert C.CHUNK_SIZE > 0 + + def test_channels_at_least_one(self): + assert C.CHANNELS >= 1 + + def test_vad_threshold_between_0_and_1(self): + assert 0.0 < C.VAD_THRESHOLD < 1.0 + + def test_vad_silence_ms_positive(self): + assert C.VAD_SILENCE_MS > 0 + + def test_vad_prefix_padding_ms_non_negative(self): + assert C.VAD_PREFIX_PADDING_MS >= 0 + + def test_llm_temperature_in_range(self): + assert 0.0 <= C.LLM_TEMPERATURE <= 2.0 + + def test_llm_max_tokens_positive(self): + assert C.LLM_MAX_TOKENS > 0 + + def test_llm_timeout_positive(self): + assert C.LLM_TIMEOUT > 0 + + def test_llm_context_window_positive(self): + assert C.LLM_CONTEXT_WINDOW > 0 + + def test_dataset_split_parses_as_three_ints(self): + parts = C.DATASET_SPLIT.split('/') + assert len(parts) == 3, f"Expected 3 parts, got: {C.DATASET_SPLIT!r}" + assert all(p.strip().isdigit() for p in parts) + + def test_dataset_split_sums_to_100(self): + parts = [int(p) for p in C.DATASET_SPLIT.split('/')] + assert sum(parts) == 100, f"Split must sum to 100, got {sum(parts)}" + + def test_dataset_min_duration_non_negative(self): + assert C.DATASET_MIN_DURATION >= 0.0 + + def test_dataset_min_chars_non_negative(self): + assert C.DATASET_MIN_CHARS >= 0 + + def test_dataset_format_valid(self): + assert C.DATASET_FORMAT in {'csv', 'hf', 'both'} + + def test_lemonade_api_base_starts_with_http(self): + assert C.LEMONADE_API_BASE.startswith('http') + + def test_llm_api_base_starts_with_http(self): + assert C.LLM_API_BASE.startswith('http') + + def test_storage_base_is_absolute(self): + assert C.STORAGE_BASE.is_absolute() + + def test_dataset_output_is_absolute(self): + assert C.DATASET_OUTPUT.is_absolute() + + def test_whisper_model_non_empty(self): + assert C.WHISPER_MODEL.strip() != '' + + def test_llm_model_non_empty(self): + assert C.LLM_MODEL.strip() != '' + + def test_log_level_is_valid(self): + import logging + assert C.LOG_LEVEL.upper() in {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'} + + def test_output_line_format_contains_text_placeholder(self): + assert '{text}' in C.OUTPUT_LINE_FORMAT + + def test_retention_days_positive(self): + assert C.STORAGE_RETENTION_DAYS > 0 + + def test_storage_max_gb_positive(self): + assert C.STORAGE_MAX_GB > 0 + + +# --------------------------------------------------------------------------- +# reload() smoke test — checks the mechanism works without real config change +# --------------------------------------------------------------------------- + +class TestReload: + def test_reload_returns_same_types(self): + """reload() should succeed and constants should still have correct types.""" + C.reload() + # Re-import to get updated module state + importlib.invalidate_caches() + import listenr.constants as C2 + assert isinstance(C2.CAPTURE_RATE, int) + assert isinstance(C2.LLM_MODEL, str) + assert isinstance(C2.STORAGE_BASE, Path) + assert isinstance(C2.DATASET_MIN_DURATION, float) + + def test_reload_preserves_asr_rate(self): + """ASR_RATE must remain 16000 regardless of any reload.""" + C.reload() + import listenr.constants as C2 + assert C2.ASR_RATE == 16000 + + def test_reload_with_patched_config(self, monkeypatch): + """Constants module attribute reflects patched config after reload().""" + import listenr.config_manager as cfg_mod + monkeypatch.setattr( + cfg_mod, 'get_int_setting', + lambda section, key, fallback=0: 9999 if (section, key) == ('LLM', 'max_tokens') else fallback, + ) + C.reload() + import listenr.constants as C2 + assert C2.LLM_MAX_TOKENS == 9999 + + def test_reload_restores_after_patch(self, monkeypatch): + """After monkeypatch teardown + reload, value returns to real config.""" + import listenr.config_manager as cfg_mod + original_fn = cfg_mod.get_int_setting + monkeypatch.setattr( + cfg_mod, 'get_int_setting', + lambda section, key, fallback=0: 1 if (section, key) == ('LLM', 'context_window') else original_fn(section, key, fallback), + ) + C.reload() + import listenr.constants as C2 + assert C2.LLM_CONTEXT_WINDOW == 1 + + # monkeypatch teardown restores original_fn automatically; + # call reload once more here to re-read with real config + monkeypatch.undo() + C.reload() + import listenr.constants as C3 + assert isinstance(C3.LLM_CONTEXT_WINDOW, int) + + +# --------------------------------------------------------------------------- +# Smoke imports — ensure migrated modules still import cleanly +# --------------------------------------------------------------------------- + +class TestModuleImports: + def test_cli_imports_constants(self): + import listenr.cli # noqa: F401 — just verify no ImportError + + def test_llm_processor_imports_constants(self): + import listenr.llm_processor # noqa: F401 + + def test_unified_asr_imports_constants(self): + import listenr.unified_asr # noqa: F401 + + def test_build_dataset_imports_constants(self): + import listenr.build_dataset # noqa: F401 + + def test_build_dataset_defaults_match_constants(self): + import listenr.build_dataset as bd + assert bd.DEFAULT_OUTPUT == C.DATASET_OUTPUT + assert bd.DEFAULT_SPLIT == C.DATASET_SPLIT + assert bd.DEFAULT_MIN_DURATION == C.DATASET_MIN_DURATION + assert bd.DEFAULT_MIN_CHARS == C.DATASET_MIN_CHARS + assert bd.DEFAULT_SEED == C.DATASET_SEED + assert bd.DEFAULT_FORMAT == C.DATASET_FORMAT + + +# --------------------------------------------------------------------------- +# No stale cfg.get_* calls in migrated modules (grep-based AST-free check) +# --------------------------------------------------------------------------- + +class TestNoCfgCallsInMigratedModules: + """ + Ensure migrated modules do not contain inline cfg.get_*_setting() calls + that duplicate what constants.py already exposes. We allow cfg.get_setting + ONLY inside _api_base() in llm_processor (URL may be overridden at runtime). + """ + + def _source(self, module_name: str) -> str: + mod = importlib.import_module(module_name) + import inspect + return inspect.getsource(mod) + + def test_cli_has_no_inline_cfg_get_calls(self): + src = self._source('listenr.cli') + # cli.py should contain no cfg.get_*_setting() calls at all + import re + calls = re.findall(r'cfg\.get_\w+_setting\(', src) + assert calls == [], f"cli.py still has inline cfg calls: {calls}" + + def test_build_dataset_has_no_inline_cfg_get_calls(self): + src = self._source('listenr.build_dataset') + import re + calls = re.findall(r'cfg\.get_\w+_setting\(', src) + assert calls == [], f"build_dataset.py still has inline cfg calls: {calls}" + + def test_unified_asr_has_no_inline_cfg_get_calls(self): + src = self._source('listenr.unified_asr') + import re + calls = re.findall(r'cfg\.get_\w+_setting\(', src) + assert calls == [], f"unified_asr.py still has inline cfg calls: {calls}" + + def test_llm_processor_cfg_calls_only_in_api_base(self): + src = self._source('listenr.llm_processor') + import re + # Find all lines with cfg.get_*_setting + lines_with_cfg = [ + (i + 1, line.strip()) + for i, line in enumerate(src.splitlines()) + if re.search(r'cfg\.get_\w+_setting\(', line) + ] + for lineno, line in lines_with_cfg: + assert '_api_base' in src.splitlines()[lineno - 2] or 'def _api_base' in line or '_api_base' in line, ( + f"llm_processor.py has unexpected cfg call outside _api_base at line ~{lineno}: {line!r}" + )