From e49d6353b5212c89ae71240a33bf56a428c8b552 Mon Sep 17 00:00:00 2001 From: Greensand321 Date: Sat, 13 Dec 2025 08:41:34 -0500 Subject: [PATCH 1/3] Add task to fix interactive HDBSCAN preprocessing and UI --- docs/hdbscan_task.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 docs/hdbscan_task.md diff --git a/docs/hdbscan_task.md b/docs/hdbscan_task.md new file mode 100644 index 0000000..56dd2e2 --- /dev/null +++ b/docs/hdbscan_task.md @@ -0,0 +1,27 @@ +# Task: Fix HDBSCAN visualization & parameter handling in Clustered Playlists + +## Problem +The interactive HDBSCAN view in the Playlist Creator tab often renders a single-color scatter plot because clustering is recomputed on raw, unscaled feature vectors (`np.vstack(self.features)`) while offline generation scales features with `StandardScaler`. Without normalization or dimensionality reduction, HDBSCAN frequently labels all points as noise, leading to the uninformative visualization. + +## Scope +- `cluster_graph_panel.py`: interactive reclustering and parameter dialog. +- `main_gui.py`: plugin wiring for interactive HDBSCAN/KMeans panels. +- `clustered_playlists.py`: feature extraction and storage feeding the interactive view. + +## Goals +1. Ensure the interactive HDBSCAN workflow uses the same preprocessing as offline generation (feature scaling and optional dimensionality reduction) so clusters can form when density exists. +2. Let users adjust key HDBSCAN parameters before running the initial interactive plot, not only after clicking “Redo Values.” +3. Keep K-Means behavior unchanged while preventing shared-state regressions between the two interactive modes. +4. Make the visualization clearly differentiate noise points from clustered points after preprocessing fixes. 
+ +## Proposed Approach +- Pass scaled (and optionally reduced) features into the interactive panel, reusing the standardized matrix already produced for playlist generation instead of the raw feature list. +- Centralize HDBSCAN parameter defaults so both the initial render and the edit dialog share the same values, and surface the dialog before the first run (or persist last-used values from `cluster_params`). +- Update recoloring logic if needed to keep noise (`label == -1`) visibly distinct after preprocessing changes. +- Add defensive logging to confirm when HDBSCAN returns only noise versus multiple clusters to aid troubleshooting. + +## Acceptance Criteria +- Interactive HDBSCAN plots use normalized feature data and, when clusters exist, show multiple colors; an all-noise result is logged explicitly rather than silently appearing unclustered. +- Users can configure `min_cluster_size`, `min_samples`, and `cluster_selection_epsilon` prior to the first interactive HDBSCAN run, with sensible defaults and validation. +- K-Means interactive behavior remains the same, and shared UI elements (buttons, dialogs) still function for both clustering modes. +- Manual test instructions are added to the issue or commit notes describing how to verify the corrected visualization and parameter entry. 
From c96677edccbe573e9d2eb1c96fb4f5c58ee71f6b Mon Sep 17 00:00:00 2001 From: Greensand321 Date: Sat, 13 Dec 2025 08:41:49 -0500 Subject: [PATCH 2/3] Add genre playlist generation tool --- controllers/genre_playlist_controller.py | 111 +++++++++++++++++++++++ main_gui.py | 65 +++++++++++++ tests/test_genre_playlist_controller.py | 77 ++++++++++++++++ 3 files changed, 253 insertions(+) create mode 100644 controllers/genre_playlist_controller.py create mode 100644 tests/test_genre_playlist_controller.py diff --git a/controllers/genre_playlist_controller.py b/controllers/genre_playlist_controller.py new file mode 100644 index 0000000..ee2129b --- /dev/null +++ b/controllers/genre_playlist_controller.py @@ -0,0 +1,111 @@ +"""Helpers for generating playlists grouped by genre.""" + +from __future__ import annotations + +import os +import re +from typing import Callable, Dict, Iterable, List, Mapping + +from mutagen import File as MutagenFile + +from controllers.normalize_controller import normalize_genres +from playlist_generator import write_playlist + +GenreGroups = Dict[str, List[str]] + +_SPLIT_RE = re.compile(r"[;,/|]") + + +def _safe_name(text: str) -> str: + cleaned = re.sub(r"[^\w\- ]+", "_", text).strip(" _") + return cleaned or "Unknown" + + +def read_genres(path: str, split_multi: bool = True) -> list[str]: + """Return list of genres found in ``path``. + + Parameters + ---------- + path: + Audio file path. + split_multi: + When True, split combined genre strings on common separators. 
+ """ + + try: + audio = MutagenFile(path, easy=True) + except Exception: + return [] + + if not audio or not audio.tags: + return [] + + genres = audio.tags.get("genre", []) or [] + results: list[str] = [] + for raw in genres: + parts = _SPLIT_RE.split(raw) if split_multi else [raw] + for part in parts: + part = part.strip() + if part: + results.append(part) + return results + + +def group_tracks_by_genre( + tracks: Iterable[str], + mapping: Mapping[str, str] | None = None, + include_unknown: bool = False, + split_multi: bool = True, + log_callback: Callable[[str], None] | None = None, +) -> GenreGroups: + """Group ``tracks`` into playlists keyed by genre.""" + + mapping = mapping or {} + log = log_callback or (lambda _m: None) + grouped: GenreGroups = {} + + for track in tracks: + genres = read_genres(track, split_multi=split_multi) + if mapping: + genres = normalize_genres(genres, mapping) + + if not genres: + if include_unknown: + grouped.setdefault("Unknown", []).append(track) + log(f"! 
No genre tag for {track}") + continue + + for genre in genres: + grouped.setdefault(genre, []).append(track) + log(f"• {os.path.basename(track)} → {genre}") + + return grouped + + +def write_genre_playlists( + grouped: Mapping[str, List[str]], + playlists_dir: str, + log_callback: Callable[[str], None] | None = None, +) -> Dict[str, str]: + """Write one playlist per genre and return mapping of genre->path.""" + + os.makedirs(playlists_dir, exist_ok=True) + log = log_callback or (lambda _m: None) + + used_names: set[str] = set() + out_paths: Dict[str, str] = {} + for genre in sorted(grouped.keys(), key=str.lower): + base = _safe_name(genre) + name = base + suffix = 2 + while name in used_names: + name = f"{base}_{suffix}" + suffix += 1 + used_names.add(name) + + outfile = os.path.join(playlists_dir, f"{name}.m3u") + write_playlist(grouped[genre], outfile) + out_paths[genre] = outfile + log(f"→ Wrote {outfile}") + + return out_paths diff --git a/main_gui.py b/main_gui.py index f353e59..ca0416b 100644 --- a/main_gui.py +++ b/main_gui.py @@ -44,6 +44,10 @@ from controllers.library_index_controller import generate_index from controllers.import_controller import import_new_files from controllers.genre_list_controller import list_unique_genres +from controllers.genre_playlist_controller import ( + group_tracks_by_genre, + write_genre_playlists, +) from controllers.highlight_controller import play_snippet, PYDUB_AVAILABLE from controllers.scan_progress_controller import ScanProgressController from gui.audio_preview import PreviewPlayer @@ -79,6 +83,7 @@ from controllers.cluster_controller import cluster_library from config import load_config, save_config, DEFAULT_FP_THRESHOLDS from playlist_engine import bucket_by_tempo_energy, more_like_this, autodj_playlist +from playlist_generator import write_playlist from controllers.cluster_controller import gather_tracks FilterFn = Callable[[FileRecord], bool] @@ -179,6 +184,66 @@ def km_func(X, p): "engine": engine, **extras, } + 
elif name == "Sort by Genre": + use_mapping_var = tk.BooleanVar(value=True) + split_multi_var = tk.BooleanVar(value=True) + include_unknown_var = tk.BooleanVar(value=True) + + opts = ttk.Frame(frame) + opts.pack(fill="x", padx=10, pady=10) + ttk.Checkbutton( + opts, + text="Normalize with genre mapping", + variable=use_mapping_var, + ).pack(anchor="w") + ttk.Checkbutton( + opts, + text="Split multiple genres (;,/)", + variable=split_multi_var, + ).pack(anchor="w", pady=(5, 0)) + ttk.Checkbutton( + opts, + text="Include Unknown playlist", + variable=include_unknown_var, + ).pack(anchor="w", pady=(5, 0)) + + def _run(): + path = app.require_library() + if not path: + return + + app.show_log_tab() + tracks = gather_tracks(path, getattr(app, "folder_filter", None)) + if not tracks: + messagebox.showinfo("No Tracks", "No audio files found in the library.") + return + + mapping = app.genre_mapping if use_mapping_var.get() else {} + grouped = group_tracks_by_genre( + tracks, + mapping=mapping, + include_unknown=include_unknown_var.get(), + split_multi=split_multi_var.get(), + log_callback=app._log, + ) + if not grouped: + messagebox.showinfo( + "No Genres", + "No genre tags found. 
Update your tags or adjust the options.", + ) + return + + playlists_dir = os.path.join(path, "Playlists") + write_genre_playlists(grouped, playlists_dir, log_callback=app._log) + messagebox.showinfo( + "Playlists", + f"Wrote {len(grouped)} genre playlists to {playlists_dir}", + ) + + ttk.Button(frame, text="Generate Playlists", command=_run).pack( + padx=10, pady=10 + ) + return frame elif name == "Tempo/Energy Buckets": def _run(): path = app.require_library() diff --git a/tests/test_genre_playlist_controller.py b/tests/test_genre_playlist_controller.py new file mode 100644 index 0000000..1d6cc0e --- /dev/null +++ b/tests/test_genre_playlist_controller.py @@ -0,0 +1,77 @@ +import importlib +import os +import sys +import types + + +def _load_controller(monkeypatch): + mutagen_stub = types.ModuleType("mutagen") + mutagen_stub.File = lambda *a, **k: None + monkeypatch.setitem(sys.modules, "mutagen", mutagen_stub) + + if "controllers.genre_playlist_controller" in sys.modules: + del sys.modules["controllers.genre_playlist_controller"] + + return importlib.import_module("controllers.genre_playlist_controller") + + +def test_group_tracks_by_genre_applies_mapping(monkeypatch, tmp_path): + gpc = _load_controller(monkeypatch) + track_a = tmp_path / "song_a.mp3" + track_b = tmp_path / "song_b.mp3" + track_a.write_text("a") + track_b.write_text("b") + + genre_tags = { + str(track_a): ["Alt;Rock"], + str(track_b): [], + } + + class DummyAudio: + def __init__(self, path): + tags = genre_tags.get(path, []) + self.tags = {"genre": tags} if tags is not None else None + + monkeypatch.setattr(gpc, "MutagenFile", lambda path, easy=True: DummyAudio(path)) + + grouped = gpc.group_tracks_by_genre( + [str(track_a), str(track_b)], + mapping={"Alt": "Alternative"}, + include_unknown=True, + ) + + assert grouped["Alternative"] == [str(track_a)] + assert grouped["Rock"] == [str(track_a)] + assert grouped["Unknown"] == [str(track_b)] + + +def 
test_write_genre_playlists_creates_files(monkeypatch, tmp_path): + gpc = _load_controller(monkeypatch) + track_a = tmp_path / "song_a.mp3" + track_b = tmp_path / "song_b.mp3" + track_a.write_text("a") + track_b.write_text("b") + + grouped = { + "Indie Rock": [str(track_a)], + "Chill/Lo-fi": [str(track_b)], + } + + playlists_dir = tmp_path / "Playlists" + paths = gpc.write_genre_playlists(grouped, str(playlists_dir)) + + indie = playlists_dir / "Indie Rock.m3u" + chill = playlists_dir / "Chill_Lo-fi.m3u" + + assert indie.exists() + assert chill.exists() + assert paths["Indie Rock"] == str(indie) + assert paths["Chill/Lo-fi"] == str(chill) + + with open(indie, "r", encoding="utf-8") as f: + lines = [line.strip() for line in f.readlines()] + with open(chill, "r", encoding="utf-8") as f: + chill_lines = [line.strip() for line in f.readlines()] + + assert lines == [os.path.relpath(track_a, playlists_dir)] + assert chill_lines == [os.path.relpath(track_b, playlists_dir)] From 91c46bcea686c4706480e27c6db97910adf8ca09 Mon Sep 17 00:00:00 2001 From: Greensand321 Date: Sat, 13 Dec 2025 08:54:27 -0500 Subject: [PATCH 3/3] Improve interactive HDBSCAN handling --- clustered_playlists.py | 2 +- main_gui.py | 80 +++++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/clustered_playlists.py b/clustered_playlists.py index b3e5193..efa95ee 100644 --- a/clustered_playlists.py +++ b/clustered_playlists.py @@ -150,4 +150,4 @@ def generate_clustered_playlists( log_callback(f"\u2717 Failed to write {outfile}: {e}") log_callback("✓ Clustered playlist generation finished") - return feats + return X diff --git a/main_gui.py b/main_gui.py index ca0416b..ea286b5 100644 --- a/main_gui.py +++ b/main_gui.py @@ -134,12 +134,9 @@ def create_panel_for_plugin(app, name: str, parent: tk.Widget) -> ttk.Frame: app._log(f"\u26a0 {exc}") return frame - cluster_data = getattr(app, "cluster_data", None) - cluster_cfg = getattr(app, "cluster_params", None) 
- if cluster_data is None: - tracks = features = None - else: - tracks, features = cluster_data + cluster_results = getattr(app, "cluster_results", {}) + cluster_cfgs = getattr(app, "cluster_params", {}) + tracks = features = None if name == "Interactive – KMeans": from sklearn.cluster import KMeans @@ -147,12 +144,13 @@ def create_panel_for_plugin(app, name: str, parent: tk.Widget) -> ttk.Frame: def km_func(X, p): return KMeans(n_clusters=p["n_clusters"]).fit_predict(X) - n_clusters = 5 - if cluster_cfg and cluster_cfg.get("method") == "kmeans": - n_clusters = int(cluster_cfg.get("num", 5)) - engine = "librosa" - if cluster_cfg and "engine" in cluster_cfg: - engine = cluster_cfg["engine"] + cfg = cluster_cfgs.get("kmeans", {}) + data = cluster_results.get("kmeans") + if data: + tracks, features = data + + n_clusters = int(cfg.get("n_clusters", cfg.get("num", 5))) + engine = cfg.get("engine", "librosa") params = {"n_clusters": n_clusters, "method": "kmeans", "engine": engine} elif name == "Interactive – HDBSCAN": from hdbscan import HDBSCAN @@ -165,19 +163,20 @@ def km_func(X, p): kwargs["cluster_selection_epsilon"] = p["cluster_selection_epsilon"] return HDBSCAN(**kwargs).fit_predict(X) - min_cs = 5 + cfg = cluster_cfgs.get("hdbscan", {}) + data = cluster_results.get("hdbscan") + if data: + tracks, features = data + + min_cs = int(cfg.get("min_cluster_size", cfg.get("num", 5))) extras = {} - if cluster_cfg and cluster_cfg.get("method") == "hdbscan": - min_cs = int(cluster_cfg.get("min_cluster_size", cluster_cfg.get("num", 5))) - if "min_samples" in cluster_cfg: - extras["min_samples"] = int(cluster_cfg["min_samples"]) - if "cluster_selection_epsilon" in cluster_cfg: - extras["cluster_selection_epsilon"] = float( - cluster_cfg["cluster_selection_epsilon"] - ) - engine = "librosa" - if cluster_cfg and "engine" in cluster_cfg: - engine = cluster_cfg["engine"] + if "min_samples" in cfg: + extras["min_samples"] = int(cfg["min_samples"]) + if "cluster_selection_epsilon" 
in cfg: + extras["cluster_selection_epsilon"] = float( + cfg["cluster_selection_epsilon"] + ) + engine = cfg.get("engine", "librosa") params = { "min_cluster_size": min_cs, "method": "hdbscan", @@ -353,6 +352,7 @@ def generate(): btn_frame = ttk.Frame(container) btn_frame.grid(row=2, column=0, sticky="ew", pady=5) + btn_frame.columnconfigure(6, weight=1) panel.lasso_var = tk.BooleanVar(value=False) @@ -362,7 +362,7 @@ def generate(): variable=panel.lasso_var, command=panel.toggle_lasso, ) - lasso_btn.pack(side="left") + lasso_btn.grid(row=0, column=0, padx=(0, 5)) panel.lasso_btn = lasso_btn panel.ok_btn = ttk.Button( @@ -371,7 +371,7 @@ def generate(): command=panel.finalize_lasso, state="disabled", ) - panel.ok_btn.pack(side="left", padx=(5, 0)) + panel.ok_btn.grid(row=0, column=1, padx=(0, 5)) panel.gen_btn = ttk.Button( btn_frame, @@ -379,7 +379,7 @@ def generate(): command=panel.create_playlist, state="disabled", ) - panel.gen_btn.pack(side="left", padx=(5, 0)) + panel.gen_btn.grid(row=0, column=2, padx=(0, 5)) def _auto_create_all(): method = panel.cluster_params.get("method") @@ -399,13 +399,13 @@ def _auto_create_all(): ).start() auto_btn = ttk.Button(btn_frame, text="Auto-Create", command=_auto_create_all) - auto_btn.pack(side="left", padx=(5, 0)) + auto_btn.grid(row=0, column=3, padx=(0, 5)) if name == "Interactive – HDBSCAN": redo_btn = ttk.Button( btn_frame, text="Redo Values", command=panel.open_param_dialog ) - redo_btn.pack(side="left", padx=(5, 0)) + redo_btn.grid(row=0, column=4, padx=(0, 5)) # ─── Hover Metadata Panel ──────────────────────────────────────────── hover_panel = ttk.Frame(panel, relief="solid", borderwidth=1) @@ -562,7 +562,8 @@ def __init__(self): self.assistant_plugin = AssistantPlugin() # Cached tracks and feature vectors for interactive clustering - self.cluster_data = None + self.cluster_results: dict[str, tuple[list[str], list]] = {} + self.cluster_params: dict[str, dict] = {} self.folder_filter = {"include": [], "exclude": []} 
# Library Sync state @@ -1161,7 +1162,8 @@ def select_library(self): self.mapping_path = os.path.join(self.library_path, ".genre_mapping.json") self._load_genre_mapping() # Clear any cached clustering data when switching libraries - self.cluster_data = None + self.cluster_results.clear() + self.cluster_params.clear() if hasattr(self, "scan_btn"): self.scan_btn.config(state="normal") self._validate_threshold() @@ -1658,7 +1660,8 @@ def _update_fields(*args): # KMeans params km_frame = ttk.Frame(params_frame) - km_var = tk.StringVar(value="5") + prev_km = self.cluster_params.get("kmeans", {}) + km_var = tk.StringVar(value=str(prev_km.get("n_clusters", 5))) ttk.Label(km_frame, text="Number of clusters:").pack(side="left") ttk.Entry(km_frame, textvariable=km_var, width=10).pack( side="left", padx=(5, 0) @@ -1666,18 +1669,21 @@ def _update_fields(*args): # HDBSCAN params hdb_frame = ttk.Frame(params_frame) - min_size_var = tk.StringVar(value="5") + prev_hdb = self.cluster_params.get("hdbscan", {}) + min_size_var = tk.StringVar(value=str(prev_hdb.get("min_cluster_size", 5))) ttk.Label(hdb_frame, text="Min cluster size:").grid(row=0, column=0, sticky="w") ttk.Entry(hdb_frame, textvariable=min_size_var, width=10).grid( row=0, column=1, sticky="w", padx=(5, 0) ) ttk.Label(hdb_frame, text="Min samples:").grid(row=1, column=0, sticky="w") - min_samples_var = tk.StringVar(value="") + min_samples_var = tk.StringVar(value=str(prev_hdb.get("min_samples", ""))) ttk.Entry(hdb_frame, textvariable=min_samples_var, width=10).grid( row=1, column=1, sticky="w", padx=(5, 0) ) ttk.Label(hdb_frame, text="Epsilon:").grid(row=2, column=0, sticky="w") - epsilon_var = tk.StringVar(value="") + epsilon_var = tk.StringVar( + value=str(prev_hdb.get("cluster_selection_epsilon", "")) + ) ttk.Entry(hdb_frame, textvariable=epsilon_var, width=10).grid( row=2, column=1, sticky="w", padx=(5, 0) ) @@ -1831,8 +1837,8 @@ def _run_cluster_generation( tracks, feats = cluster_library( path, method, params, 
self._log, self.folder_filter, engine ) - self.cluster_data = (tracks, feats) - self.cluster_params = {"method": method, "engine": engine, **params} + self.cluster_results[method] = (tracks, feats) + self.cluster_params[method] = {"method": method, "engine": engine, **params} def done(): messagebox.showinfo("Clustered Playlists", "Generation complete")