From 92acfaa85fafea03d6e7ddfb42af904480a80ba7 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 09:43:54 +0100 Subject: [PATCH 01/16] Added SSD implementation --- .gitignore | 2 + src/pyversity/__init__.py | 15 ++- src/pyversity/datatypes.py | 1 + src/pyversity/pyversity.py | 4 +- src/pyversity/strategies/__init__.py | 3 +- src/pyversity/strategies/cover.py | 2 +- src/pyversity/strategies/ssd.py | 181 +++++++++++++++++++++++++++ tests/test_strategies.py | 33 +++++ 8 files changed, 236 insertions(+), 5 deletions(-) create mode 100644 src/pyversity/strategies/ssd.py diff --git a/.gitignore b/.gitignore index a80c727..e9cb00a 100644 --- a/.gitignore +++ b/.gitignore @@ -207,3 +207,5 @@ marimo/_lsp/ __marimo__/ local + +.DS_Store diff --git a/src/pyversity/__init__.py b/src/pyversity/__init__.py index e1d697a..a4d94b2 100644 --- a/src/pyversity/__init__.py +++ b/src/pyversity/__init__.py @@ -1,6 +1,17 @@ from pyversity.datatypes import DiversificationResult, Metric, Strategy from pyversity.pyversity import diversify -from pyversity.strategies import cover, dpp, mmr, msd +from pyversity.strategies import cover, dpp, mmr, msd, ssd from pyversity.version import __version__ -__all__ = ["diversify", "Strategy", "Metric", "DiversificationResult", "mmr", "msd", "cover", "dpp", "__version__"] +__all__ = [ + "diversify", + "Strategy", + "Metric", + "DiversificationResult", + "mmr", + "msd", + "cover", + "dpp", + "ssd", + "__version__", +] diff --git a/src/pyversity/datatypes.py b/src/pyversity/datatypes.py index c74c1ae..d73df3f 100644 --- a/src/pyversity/datatypes.py +++ b/src/pyversity/datatypes.py @@ -11,6 +11,7 @@ class Strategy(str, Enum): MSD = "msd" COVER = "cover" DPP = "dpp" + SSD = "ssd" class Metric(str, Enum): diff --git a/src/pyversity/pyversity.py b/src/pyversity/pyversity.py index ab69efe..b543807 100644 --- a/src/pyversity/pyversity.py +++ b/src/pyversity/pyversity.py @@ -3,7 +3,7 @@ import numpy as np from pyversity.datatypes import 
DiversificationResult, Strategy -from pyversity.strategies import cover, dpp, mmr, msd +from pyversity.strategies import cover, dpp, mmr, msd, ssd def diversify( @@ -36,4 +36,6 @@ def diversify( return cover(embeddings, scores, k, diversity, **kwargs) if strategy == Strategy.DPP: return dpp(embeddings, scores, k, diversity, **kwargs) + if strategy == Strategy.SSD: + return ssd(embeddings, scores, k, diversity, **kwargs) raise ValueError(f"Unknown strategy: {strategy}") diff --git a/src/pyversity/strategies/__init__.py b/src/pyversity/strategies/__init__.py index 189088a..e5f9e9e 100644 --- a/src/pyversity/strategies/__init__.py +++ b/src/pyversity/strategies/__init__.py @@ -2,5 +2,6 @@ from pyversity.strategies.dpp import dpp from pyversity.strategies.mmr import mmr from pyversity.strategies.msd import msd +from pyversity.strategies.ssd import ssd -__all__ = ["mmr", "msd", "cover", "dpp"] +__all__ = ["mmr", "msd", "cover", "dpp", "ssd"] diff --git a/src/pyversity/strategies/cover.py b/src/pyversity/strategies/cover.py index 5e3f9e7..75412bd 100644 --- a/src/pyversity/strategies/cover.py +++ b/src/pyversity/strategies/cover.py @@ -14,7 +14,7 @@ def cover( normalize: bool = True, ) -> DiversificationResult: """ - Select a subset of items that balances relevance and coverage/diversity. + Cover (Facility Location) selection. This strategy chooses `k` items by combining pure relevance with diversity-driven coverage using a concave submodular formulation. 
diff --git a/src/pyversity/strategies/ssd.py b/src/pyversity/strategies/ssd.py new file mode 100644 index 0000000..6cfc4da --- /dev/null +++ b/src/pyversity/strategies/ssd.py @@ -0,0 +1,181 @@ +import numpy as np + +from pyversity.datatypes import DiversificationResult, Strategy +from pyversity.utils import EPS32, normalize_rows, prepare_inputs + + +def ssd( # noqa: C901 + embeddings: np.ndarray, + scores: np.ndarray, + k: int, + diversity: float = 0.5, + window: int = 10, + gamma: float = 1.0, + normalize: bool = True, + append_bias: bool = True, + normalize_scores: bool = True, +) -> DiversificationResult: + """ + Sliding Spectrum Decomposition (SSD) selection. + + This strategy performs greedy, sequence-aware diversification + that maintains a sliding window of orthogonal bases (modified Gram-Schmidt). + Each step picks the item that maximizes a combination of relevance and orthogonalized residual norm. + + :param embeddings: 2D array of shape (n_samples, n_features). + :param scores: 1D array of relevance scores for each item. + :param k: Number of items to select. + :param diversity: Trade-off between relevance and coverage/diversity in [0, 1] (inverse of theta parameter). + 1.0 = pure diversity, 0.0 = pure relevance. + :param window: Sliding window size (≥1) for Gram-Schmidt bases. + :param gamma: Diversity scale (>0). + :param normalize: Whether to L2-normalize embeddings before computing similarity (cosine geometry). + :param append_bias: Whether to append a constant-one bias dimension after normalization. + :param normalize_scores: Whether to z-score normalize relevance scores per request (stabilizes gamma). + :return: A DiversificationResult containing the selected item indices, + their selection scores, the strategy used, and the parameters. + :raises ValueError: If diversity is not in [0, 1]. + :raises ValueError: If window < 1. + :raises ValueError: If gamma ≤ 0. 
+ """ + # Validate parameters + if not (0.0 <= float(diversity) <= 1.0): + raise ValueError("diversity must be in [0, 1]") + if window < 1: + raise ValueError("window must be >= 1") + if gamma <= 0.0: + raise ValueError("gamma must be > 0") + + theta = 1.0 - float(diversity) + window_size = int(window) + + # Prepare inputs + feature_matrix, relevance_scores, top_k, early_exit = prepare_inputs(embeddings, scores, k) + if early_exit: + return DiversificationResult( + indices=np.empty(0, np.int32), + selection_scores=np.empty(0, np.float32), + strategy=Strategy.SSD, + diversity=diversity, + parameters={ + "variant": "SSD*", + "window": window_size, + "gamma": float(gamma), + "normalize": bool(normalize), + "append_bias": bool(append_bias), + "normalize_scores": bool(normalize_scores), + }, + ) + + if theta == 1.0: + # Pure relevance: select top-k by relevance scores + topk_indices = np.argsort(-relevance_scores)[:top_k].astype(np.int32) + topk_scores = relevance_scores[topk_indices].astype(np.float32, copy=False) + return DiversificationResult( + indices=topk_indices, + selection_scores=topk_scores, + strategy=Strategy.SSD, + diversity=diversity, + parameters={ + "window": window_size, + "gamma": float(gamma), + "normalize": bool(normalize), + "append_bias": bool(append_bias), + "normalize_scores": bool(normalize_scores), + }, + ) + + # Normalize feature vectors to unit length + if normalize: + feature_matrix = normalize_rows(feature_matrix) + + # Append a constant-one dimension for bias + if append_bias: + last_col_is_ones = feature_matrix.shape[1] > 0 and np.allclose(feature_matrix[:, -1], 1.0, atol=1e-6, rtol=0.0) + if not last_col_is_ones: + ones = np.ones((feature_matrix.shape[0], 1), dtype=feature_matrix.dtype) + feature_matrix = np.concatenate([feature_matrix, ones], axis=1) + + # Per-request z-score normalization of relevance to stabilize gamma + if normalize_scores: + mean = float(np.mean(relevance_scores)) + std = float(np.std(relevance_scores)) + 
relevance_scores = (relevance_scores - mean) / std if std > 0.0 else (relevance_scores - mean) + + num_items, _ = feature_matrix.shape + + # Initialize selection state + selected_mask = np.zeros(num_items, dtype=bool) + selected_indices = np.empty(top_k, dtype=np.int32) + selection_scores = np.empty(top_k, dtype=np.float32) + + # Residuals of all candidates under the current sliding window + residual_matrix = feature_matrix.astype(np.float32, copy=True) + + # Sliding window lists (oldest first) + basis_vectors: list[np.ndarray] = [] + projection_coeffs_per_basis: list[np.ndarray] = [] + + def push_new_basis(selected_index: int) -> None: + """Update the sliding window with the newly selected basis.""" + if len(basis_vectors) == window_size: + oldest_basis = basis_vectors.pop(0) + oldest_coeffs = projection_coeffs_per_basis.pop(0) + mask_unselected = ~selected_mask + if np.any(mask_unselected): + residual_matrix[mask_unselected] += oldest_coeffs[mask_unselected, None] * oldest_basis + + new_basis = residual_matrix[selected_index].copy() + denom = float(new_basis @ new_basis) + EPS32 + basis_vectors.append(new_basis) + + mask_unselected = ~selected_mask + coeffs = np.zeros(num_items, dtype=np.float32) + if np.any(mask_unselected): + proj = (residual_matrix[mask_unselected] @ new_basis) / denom + coeffs[mask_unselected] = proj + residual_matrix[mask_unselected] -= proj[:, None] * new_basis + projection_coeffs_per_basis.append(coeffs) + + # First selection: pick item with highest relevance score + first_index = int(np.argmax(relevance_scores)) + selected_mask[first_index] = True + selected_indices[0] = first_index + + # Compute selection score for the first item + first_norm = float(np.linalg.norm(feature_matrix[first_index])) + selection_scores[0] = float(theta * relevance_scores[first_index] + (1.0 - theta) * gamma * first_norm) + + push_new_basis(first_index) + + for step in range(1, top_k): + available_indices = np.where(~selected_mask)[0] + if 
available_indices.size == 0: + selected_indices = selected_indices[:step] + selection_scores = selection_scores[:step] + break + + residual_norms = np.linalg.norm(residual_matrix[available_indices], axis=1) + combined_scores = theta * relevance_scores[available_indices] + (1.0 - theta) * gamma * residual_norms + local_best = int(np.argmax(combined_scores)) + best_index = int(available_indices[local_best]) + best_score = float(combined_scores[local_best]) + + selected_mask[best_index] = True + selected_indices[step] = best_index + selection_scores[step] = best_score + push_new_basis(best_index) + + return DiversificationResult( + indices=selected_indices, + selection_scores=selection_scores.astype(np.float32, copy=False), + strategy=Strategy.SSD, + diversity=diversity, + parameters={ + "window": window_size, + "gamma": float(gamma), + "normalize": bool(normalize), + "append_bias": bool(append_bias), + "normalize_scores": bool(normalize_scores), + }, + ) diff --git a/tests/test_strategies.py b/tests/test_strategies.py index 0fd59de..3912122 100644 --- a/tests/test_strategies.py +++ b/tests/test_strategies.py @@ -4,6 +4,7 @@ import pytest from pyversity import Metric, Strategy, cover, diversify, dpp, mmr, msd from pyversity.datatypes import DiversificationResult +from pyversity.strategies import ssd def test_mmr() -> None: @@ -177,6 +178,38 @@ def test_dpp() -> None: assert np.all(res.selection_scores[:-1] + 1e-7 >= res.selection_scores[1:]) +def test_ssd() -> None: + """Test SSD strategy with various diversity settings (1=diverse, 0=relevance).""" + emb = np.eye(3, dtype=np.float32) + scores = np.array([0.1, 0.8, 0.3], dtype=np.float32) + + # Relevance-only (diversity=0): picks top-k by scores + res = ssd(emb, scores, k=2, diversity=0.0) + expected = np.array([1, 2], dtype=np.int32) + assert np.array_equal(res.indices, expected) + assert np.allclose(res.selection_scores, scores[expected]) + + # Balanced coverage (diversity=0.5, gamma=0.5): picks diverse set + res = 
ssd(emb, scores, k=2, diversity=0.5, gamma=0.5) + assert res.indices[0] == 1 and res.indices[1] in (0, 2) + + # Parameter validation + with pytest.raises(ValueError): + ssd(emb, scores, k=2, diversity=-0.01) + with pytest.raises(ValueError): + ssd(emb, scores, k=2, diversity=1.01) + with pytest.raises(ValueError): + ssd(emb, scores, k=2, gamma=0.0) + with pytest.raises(ValueError): + ssd(emb, scores, k=2, gamma=-0.5) + + # Early exit on empty input + emb = np.empty((0, 3), dtype=np.float32) + scores = np.array([], dtype=np.float32) + res = ssd(emb, scores, k=5) + assert res.indices.size == 0 and res.selection_scores.size == 0 + + @pytest.mark.parametrize( "strategy, fn, kwargs", [ From 4b8701dad5d5becd62feb4f139fd6f341c36c379 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 11:29:19 +0100 Subject: [PATCH 02/16] Added SSD implementation --- src/pyversity/strategies/ssd.py | 191 ++++++++++++++++++-------------- tests/test_strategies.py | 28 +++++ 2 files changed, 134 insertions(+), 85 deletions(-) diff --git a/src/pyversity/strategies/ssd.py b/src/pyversity/strategies/ssd.py index 6cfc4da..0c0a43f 100644 --- a/src/pyversity/strategies/ssd.py +++ b/src/pyversity/strategies/ssd.py @@ -9,6 +9,7 @@ def ssd( # noqa: C901 scores: np.ndarray, k: int, diversity: float = 0.5, + recent_embeddings: np.ndarray | None = None, window: int = 10, gamma: float = 1.0, normalize: bool = True, @@ -16,27 +17,28 @@ def ssd( # noqa: C901 normalize_scores: bool = True, ) -> DiversificationResult: """ - Sliding Spectrum Decomposition (SSD) selection. + Sliding Spectrum Decomposition (SSD*) selection. - This strategy performs greedy, sequence-aware diversification - that maintains a sliding window of orthogonal bases (modified Gram-Schmidt). - Each step picks the item that maximizes a combination of relevance and orthogonalized residual norm. 
+ This strategy selects `k` items using a greedy, sequence-aware approach that maintains a sliding window + of Gram-Schmidt bases to promote diversity while considering recent context. + If `recent_embeddings` are provided (oldest → newest), the window is seeded so the very first pick is + already novel relative to what the user just saw. - :param embeddings: 2D array of shape (n_samples, n_features). - :param scores: 1D array of relevance scores for each item. + :param embeddings: 2D array (n_items, n_dims) of candidate embeddings. + :param scores: 1D array (n_items,) of relevance scores. :param k: Number of items to select. - :param diversity: Trade-off between relevance and coverage/diversity in [0, 1] (inverse of theta parameter). - 1.0 = pure diversity, 0.0 = pure relevance. + :param diversity: Trade-off between relevance and diversity in [0, 1] (inverse of theta parameter). + 1.0 = pure diversity, 0.0 = pure relevance. + :param recent_embeddings: Optional 2D array (n_items, n_dims) with recent embeddings from oldest → newest. + seeds the sliding window so selection is aware of recent context. :param window: Sliding window size (≥1) for Gram-Schmidt bases. - :param gamma: Diversity scale (>0). - :param normalize: Whether to L2-normalize embeddings before computing similarity (cosine geometry). - :param append_bias: Whether to append a constant-one bias dimension after normalization. - :param normalize_scores: Whether to z-score normalize relevance scores per request (stabilizes gamma). - :return: A DiversificationResult containing the selected item indices, - their selection scores, the strategy used, and the parameters. - :raises ValueError: If diversity is not in [0, 1]. - :raises ValueError: If window < 1. - :raises ValueError: If gamma ≤ 0. + :param gamma: Diversity scale (> 0). + :param normalize: Whether to normalize embeddings before computing similarity. + :param append_bias: Append constant-one bias dim after normalization (paper §5.3). 
+ :param normalize_scores: Z-score scores per request (applied only if diversity > 0). + + :return: DiversificationResult with selected indices and their selection scores. + :raises ValueError: If diversity ∉ [0, 1], or window < 1, or gamma ≤ 0. """ # Validate parameters if not (0.0 <= float(diversity) <= 1.0): @@ -46,57 +48,50 @@ def ssd( # noqa: C901 if gamma <= 0.0: raise ValueError("gamma must be > 0") + # Theta parameter for trade‑off between relevance and diversity + # This is 1 - diversity to align with common notation theta = 1.0 - float(diversity) - window_size = int(window) # Prepare inputs feature_matrix, relevance_scores, top_k, early_exit = prepare_inputs(embeddings, scores, k) if early_exit: + # Nothing to select: return empty arrays return DiversificationResult( indices=np.empty(0, np.int32), selection_scores=np.empty(0, np.float32), strategy=Strategy.SSD, diversity=diversity, - parameters={ - "variant": "SSD*", - "window": window_size, - "gamma": float(gamma), - "normalize": bool(normalize), - "append_bias": bool(append_bias), - "normalize_scores": bool(normalize_scores), - }, + parameters={"gamma": gamma, "window": window}, ) - if theta == 1.0: - # Pure relevance: select top-k by relevance scores - topk_indices = np.argsort(-relevance_scores)[:top_k].astype(np.int32) - topk_scores = relevance_scores[topk_indices].astype(np.float32, copy=False) + # Pure relevance: select top‑k by raw scores + if float(theta) == 1.0: + topk = np.argsort(-relevance_scores)[:top_k].astype(np.int32) + gains = relevance_scores[topk].astype(np.float32, copy=False) return DiversificationResult( - indices=topk_indices, - selection_scores=topk_scores, + indices=topk, + selection_scores=gains, strategy=Strategy.SSD, diversity=diversity, - parameters={ - "window": window_size, - "gamma": float(gamma), - "normalize": bool(normalize), - "append_bias": bool(append_bias), - "normalize_scores": bool(normalize_scores), - }, + parameters={"gamma": gamma, "window": window}, ) - # 
Normalize feature vectors to unit length - if normalize: - feature_matrix = normalize_rows(feature_matrix) - - # Append a constant-one dimension for bias - if append_bias: - last_col_is_ones = feature_matrix.shape[1] > 0 and np.allclose(feature_matrix[:, -1], 1.0, atol=1e-6, rtol=0.0) - if not last_col_is_ones: - ones = np.ones((feature_matrix.shape[0], 1), dtype=feature_matrix.dtype) - feature_matrix = np.concatenate([feature_matrix, ones], axis=1) - - # Per-request z-score normalization of relevance to stabilize gamma + def _prepare_vectors(matrix: np.ndarray) -> np.ndarray: + """Prepare feature vectors with normalization and bias appending.""" + out = matrix + if normalize: + out = normalize_rows(out) + if append_bias: + # Append constant‑one bias dimension (see paper §5.3) + last_col_is_ones = (out.shape[1] > 0) and np.allclose(out[:, -1], 1.0, atol=1e-6, rtol=0.0) + if not last_col_is_ones: + out = np.concatenate([out, np.ones((out.shape[0], 1), dtype=out.dtype)], axis=1) + return out + + # Prepare feature vectors + feature_matrix = _prepare_vectors(feature_matrix) + + # Normalize scores per request (to stabilize gamma) if normalize_scores: mean = float(np.mean(relevance_scores)) std = float(np.std(relevance_scores)) @@ -109,73 +104,99 @@ def ssd( # noqa: C901 selected_indices = np.empty(top_k, dtype=np.int32) selection_scores = np.empty(top_k, dtype=np.float32) - # Residuals of all candidates under the current sliding window + # Current residuals under the sliding window residual_matrix = feature_matrix.astype(np.float32, copy=True) - # Sliding window lists (oldest first) + # Sliding window storage basis_vectors: list[np.ndarray] = [] projection_coeffs_per_basis: list[np.ndarray] = [] - def push_new_basis(selected_index: int) -> None: - """Update the sliding window with the newly selected basis.""" - if len(basis_vectors) == window_size: + def _push_basis_vector(basis_vec: np.ndarray) -> None: + """Add a new basis vector to the sliding window, updating 
residuals.""" + if len(basis_vectors) == window: + # If at capacity, remove oldest basis and restore its contribution to residuals oldest_basis = basis_vectors.pop(0) oldest_coeffs = projection_coeffs_per_basis.pop(0) mask_unselected = ~selected_mask if np.any(mask_unselected): residual_matrix[mask_unselected] += oldest_coeffs[mask_unselected, None] * oldest_basis - new_basis = residual_matrix[selected_index].copy() - denom = float(new_basis @ new_basis) + EPS32 - basis_vectors.append(new_basis) - + denom = float(basis_vec @ basis_vec) + EPS32 + basis_vectors.append(basis_vec.astype(np.float32, copy=False)) mask_unselected = ~selected_mask coeffs = np.zeros(num_items, dtype=np.float32) if np.any(mask_unselected): - proj = (residual_matrix[mask_unselected] @ new_basis) / denom + # Compute projection coefficients and update residuals if unselected items remain + proj = (residual_matrix[mask_unselected] @ basis_vec) / denom coeffs[mask_unselected] = proj - residual_matrix[mask_unselected] -= proj[:, None] * new_basis + residual_matrix[mask_unselected] -= proj[:, None] * basis_vec + # Store the projection coefficients for later restoration projection_coeffs_per_basis.append(coeffs) - # First selection: pick item with highest relevance score - first_index = int(np.argmax(relevance_scores)) + # Seed with recent context (oldest → newest) + context_seed = 0 + if recent_embeddings is not None and np.size(recent_embeddings) > 0: + if recent_embeddings.ndim != 2 or recent_embeddings.shape[1] != embeddings.shape[1]: + raise ValueError("recent_embeddings must have shape (m, n_dims) matching `embeddings` columns.") + ctx = _prepare_vectors(recent_embeddings.astype(feature_matrix.dtype, copy=False)) + ctx = ctx[-window:] # keep only the latest window items + for vec in ctx: + # Orthogonalize the context vector against current bases + residual_vec = vec.copy() + for basis in basis_vectors: + denom_b = float(basis @ basis) + EPS32 + residual_vec -= float(residual_vec @ basis) / 
denom_b * basis + _push_basis_vector(residual_vec) + context_seed += 1 + + # Decide what to select first + if context_seed > 0: + # If we seeded context, use the combined score immediately (novel vs. recent) + residual_norms = np.linalg.norm(residual_matrix, axis=1) + combined = theta * relevance_scores + (1.0 - theta) * gamma * residual_norms + combined[selected_mask] = -np.inf + first_index = int(np.argmax(combined)) + first_score = float(combined[first_index]) + else: + # No context yet: pick purely by relevance (then start residualization) + first_index = int(np.argmax(relevance_scores)) + first_score = float( + theta * relevance_scores[first_index] + + (1.0 - theta) * gamma * float(np.linalg.norm(feature_matrix[first_index])) + ) + + # Select the first item selected_mask[first_index] = True selected_indices[0] = first_index + selection_scores[0] = first_score + _push_basis_vector(residual_matrix[first_index]) - # Compute selection score for the first item - first_norm = float(np.linalg.norm(feature_matrix[first_index])) - selection_scores[0] = float(theta * relevance_scores[first_index] + (1.0 - theta) * gamma * first_norm) - - push_new_basis(first_index) - + # Main loop for step in range(1, top_k): - available_indices = np.where(~selected_mask)[0] - if available_indices.size == 0: + # Select next item by combined score + available = np.where(~selected_mask)[0] + if available.size == 0: + # No more items to select selected_indices = selected_indices[:step] selection_scores = selection_scores[:step] break - residual_norms = np.linalg.norm(residual_matrix[available_indices], axis=1) - combined_scores = theta * relevance_scores[available_indices] + (1.0 - theta) * gamma * residual_norms - local_best = int(np.argmax(combined_scores)) - best_index = int(available_indices[local_best]) - best_score = float(combined_scores[local_best]) + residual_norms = np.linalg.norm(residual_matrix[available], axis=1) + combined_scores = theta * relevance_scores[available] + (1.0 - 
theta) * gamma * residual_norms + best_local = int(np.argmax(combined_scores)) + best_index = int(available[best_local]) + best_score = float(combined_scores[best_local]) + # Select the best item selected_mask[best_index] = True selected_indices[step] = best_index selection_scores[step] = best_score - push_new_basis(best_index) + _push_basis_vector(residual_matrix[best_index]) return DiversificationResult( indices=selected_indices, selection_scores=selection_scores.astype(np.float32, copy=False), strategy=Strategy.SSD, diversity=diversity, - parameters={ - "window": window_size, - "gamma": float(gamma), - "normalize": bool(normalize), - "append_bias": bool(append_bias), - "normalize_scores": bool(normalize_scores), - }, + parameters={"gamma": gamma, "window": window}, ) diff --git a/tests/test_strategies.py b/tests/test_strategies.py index 3912122..0536e72 100644 --- a/tests/test_strategies.py +++ b/tests/test_strategies.py @@ -210,6 +210,34 @@ def test_ssd() -> None: assert res.indices.size == 0 and res.selection_scores.size == 0 +def test_ssd_recent_embeddings_avoids_recent_first_pick() -> None: + """With equal relevance, the first pick should avoid the most recent item when context is seeded.""" + # 3 orthogonal items (identity); equal scores + emb = np.eye(3, dtype=np.float32) + scores = np.array([0.5, 0.5, 0.5], dtype=np.float32) + + # Seed recent history with item 1 (oldest->newest) + recent = emb[[1]] # the user just saw item 1 + res = ssd(emb, scores, k=2, diversity=0.5, gamma=1.0, window=2, recent_embeddings=recent) + + # First selection should not be the recent item (index 1) because its residual vs. 
context is ~0 + assert res.indices[0] in (0, 2) + assert res.indices[0] != 1 + + +def test_ssd_recent_embeddings_window_blocks_multiple_recent() -> None: + """If the window contains two recent items, the first pick should avoid both when scores are tied.""" + emb = np.eye(4, dtype=np.float32) + scores = np.ones(4, dtype=np.float32) # tie + + # Seed with items 0 and 1 (oldest->newest), window=2 + recent = emb[[0, 1]] + res = ssd(emb, scores, k=3, diversity=0.6, gamma=1.0, window=2, recent_embeddings=recent) + + # The first pick should be from {2,3}, not {0,1} + assert res.indices[0] in (2, 3) + + @pytest.mark.parametrize( "strategy, fn, kwargs", [ From 45c2597cca2b158f1b871a9ef49f84a4b1b0545f Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 11:32:46 +0100 Subject: [PATCH 03/16] Updated readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4e2087d..21d0be6 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ The following table describes the supported strategies, how they work, their tim | **MSD** (Max Sum of Distances) | Prefers items that are both relevant and far from *all* previous selections. | **O(k · n · d)** | Use when you want stronger spread, i.e. results that cover a wider range of topics or styles. | | **DPP** (Determinantal Point Process) | Samples diverse yet relevant items using probabilistic “repulsion.” | **O(k · n · d + n · k²)** | Ideal when you want to eliminate redundancy or ensure diversity is built-in to selection. | | **COVER** (Facility-Location) | Ensures selected items collectively represent the full dataset’s structure. | **O(k · n²)** | Great for topic coverage or clustering scenarios, but slower for large `n`. | +| **SSD*** (Sliding Spectrum Decomposition) | Sequence‑aware diversification: rewards novelty relative to the last `w` picks via Gram–Schmidt residuals. | **O(k · n · d)** | Great for content feeds & infinite scroll, e.g. 
social/news/product feeds where users consume sequentially, as well as conversational RAG to avoid showing similar chunks within the recent window. ## Motivation From ffd53ec11f030fb418ace6a819e9a60ad14e4526 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 11:34:54 +0100 Subject: [PATCH 04/16] Updated readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 21d0be6..591ad64 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ The following table describes the supported strategies, how they work, their tim | **MSD** (Max Sum of Distances) | Prefers items that are both relevant and far from *all* previous selections. | **O(k · n · d)** | Use when you want stronger spread, i.e. results that cover a wider range of topics or styles. | | **DPP** (Determinantal Point Process) | Samples diverse yet relevant items using probabilistic “repulsion.” | **O(k · n · d + n · k²)** | Ideal when you want to eliminate redundancy or ensure diversity is built-in to selection. | | **COVER** (Facility-Location) | Ensures selected items collectively represent the full dataset’s structure. | **O(k · n²)** | Great for topic coverage or clustering scenarios, but slower for large `n`. | -| **SSD*** (Sliding Spectrum Decomposition) | Sequence‑aware diversification: rewards novelty relative to the last `w` picks via Gram–Schmidt residuals. | **O(k · n · d)** | Great for content feeds & infinite scroll, e.g. social/news/product feeds where users consume sequentially, as well as conversational RAG to avoid showing similar chunks within the recent window. +| **SSD** (Sliding Spectrum Decomposition) | Sequence‑aware diversification: rewards novelty relative to recently shown items. | **O(k · n · d)** | Great for content feeds & infinite scroll, e.g. social/news/product feeds where users consume sequentially, as well as conversational RAG to avoid showing similar chunks within the recent window. 
## Motivation @@ -103,6 +103,9 @@ The implementations in this package are based on the following research papers: - **DPP (efficient greedy implementation)**: Chen, L., Zhang, G., & Zhou, H. (2018). Fast greedy MAP inference for determinantal point process to improve recommendation diversity. [Link](https://arxiv.org/pdf/1709.05135) +- **SSD**: Huang, Y., Wang, W., Zhang, L., & Xu, R. (2021). Sliding Spectrum Decomposition for Diversified +Recommendation. [Link](https://arxiv.org/pdf/2107.05204) + ## Author Thomas van Dongen From 0655de40453961a9f53469e9d543bf0f38e23e45 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 12:08:03 +0100 Subject: [PATCH 05/16] Added examples --- README.md | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/README.md b/README.md index 591ad64..f0e6db1 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,135 @@ This improves exploration, user satisfaction, and coverage across many domains, - Academic retrieval: Surface papers from different subfields or methods. - RAG / LLM contexts: Avoid feeding the model near-duplicate passages. +## Examples + +The following examples illustrate how to apply different diversification strategies in various scenarios. + +
Product / Web Search — Simple diversification with MMR or DPP
+ +MMR and DPP are great general-purpose diversification strategies. They are fast, easy to use, and work well in many scenarios. +For example, in a product search setting where you want to show diverse items to a user, you can diversify the top results as follows: + +```python +from pyversity import diversify, Strategy + +# Suppose you have: +# - item_embeddings: embeddings of the retrieved products +# - item_scores: relevance scores for these products + +# Re-rank with MMR +result = diversify( + embeddings=item_embeddings, + scores=item_scores, + k=10, + strategy=Strategy.MMR, +) +``` +
+ +
Literature Search — Represent the full topic space with COVER
+ +COVER (Facility-Location) is well-suited for scenarios where you want to ensure that the selected items collectively represent the entire dataset’s structure. For instance, when searching for academic papers on a broad topic, you might want to cover various subfields and methodologies: + +```python +from pyversity import diversify, Strategy + +# Suppose you have: +# - paper_embeddings: embeddings of the retrieved papers +# - paper_scores: relevance scores for these papers + +# Re-rank with COVER +result = diversify( + embeddings=paper_embeddings, + scores=paper_scores, + k=10, + strategy=Strategy.COVER, +) +``` +
+ +
+Conversational RAG — Avoid redundant chunks with SSD +
+ +In retrieval-augmented generation (RAG) for conversational AI, it’s crucial to avoid feeding the model redundant or similar chunks of information within the recent conversation context. The SSD (Sliding Spectrum Decomposition) strategy is designed for sequence-aware diversification, making it ideal for this use case: + +```python +import numpy as np +from pyversity import diversify, Strategy + +# Suppose you have: +# - candidate_embeddings (for retrieved chunks this turn) +# - candidate_scores (relevance scores for these chunks) +# - recent_chunk_embeddings (chunks shown in the last few turns, oldest→newest) + +# Re-rank with SSD (sequence-aware) +result = diversify( + embeddings=candidate_embeddings, + scores=candidate_scores, + k=10, + strategy=Strategy.SSD, + recent_embeddings=recent_chunk_embeddings, +) + +picked_indices = result.indices +picked_embeddings = candidate_embeddings[picked_indices] + +# Maintain your rolling context window (keep oldest→newest) +recent_chunk_embeddings = np.vstack([recent_chunk_embeddings, picked_embeddings]) +``` +
+ + +
Infinite Scroll / Recommendation Feed — Sequence-aware novelty with SSD
+ +In content feeds or infinite scroll scenarios, users consume items sequentially. To keep the experience engaging, it’s important to introduce novelty relative to recently shown items. The SSD strategy is well-suited for this: + +```python +import numpy as np +from pyversity import diversify, Strategy + +# Suppose you have: +# - feed_embeddings: embeddings of candidate items for the feed +# - feed_scores: relevance scores for these items +# - recent_feed_embeddings: embeddings of recently shown items in the feed (oldest→newest) + +# Re-rank with SSD (sequence-aware) +res = diversify( + embeddings=feed_embeddings, + scores=feed_scores, + k=30, + strategy=Strategy.SSD, + recent_embeddings=recent_feed_embeddings, +) + +# Maintain the rolling context window (keep oldest→newest) +recent_feed_embeddings = np.vstack([recent_feed_embeddings, feed_embeddings[res.indices]]) +``` +
+ + +
Single Long Document — Pick diverse sections with MSD
+ +When summarizing or extracting information from a single long document, it’s beneficial to select sections that are both relevant and cover different parts of the document. The MSD strategy helps achieve this by preferring items that are far apart from each other: + +```python +from pyversity import diversify, Strategy + +# Suppose you have: +# - doc_chunk_embeddings: embeddings of document chunks +# - doc_chunk_scores: relevance scores for these chunks + +# Re-rank with MSD +result = diversify( + embeddings=doc_chunk_embeddings, + scores=doc_chunk_scores, + k=10, + strategy=Strategy.MSD, +) +``` + +
+ ## References The implementations in this package are based on the following research papers: From 4391c600240e93f2e70261f704d9de80ede73be8 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 12:11:38 +0100 Subject: [PATCH 06/16] Added examples --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f0e6db1..2da3e83 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,8 @@ [Quickstart](#quickstart) • [Supported Strategies](#supported-strategies) • -[Motivation](#motivation) +[Motivation](#motivation) • +[Examples](#examples) @@ -83,10 +84,11 @@ Each new item is chosen not only because it’s relevant, but also because it ad This improves exploration, user satisfaction, and coverage across many domains, for example: -- E-commerce: Show different product styles, not multiple copies of the same black pants. +- E-commerce: Show different product styles, not multiple copies of the same product. - News search: Highlight articles from different outlets or viewpoints. - Academic retrieval: Surface papers from different subfields or methods. - RAG / LLM contexts: Avoid feeding the model near-duplicate passages. +- Recommendation feeds: Keep content diverse and engaging over time. 
## Examples From 0f6ca576ba0e9779ca772287b57c93441d2aa682 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 12:12:04 +0100 Subject: [PATCH 07/16] Added examples --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2da3e83..af58cef 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,8 @@ [Quickstart](#quickstart) • [Supported Strategies](#supported-strategies) • [Motivation](#motivation) • -[Examples](#examples) +[Examples](#examples) • +[References](#references) • From 24bd3e1191b481f4706f43063cbfe1685304cb51 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 12:12:18 +0100 Subject: [PATCH 08/16] Added examples --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index af58cef..5af2aa5 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ [Supported Strategies](#supported-strategies) • [Motivation](#motivation) • [Examples](#examples) • -[References](#references) • +[References](#references) From d1869fdbb2c44e2cf0fef7a0b1dbbec85726dd98 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 12:16:48 +0100 Subject: [PATCH 09/16] Added examples --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5af2aa5..921cf93 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Package version Codecov + License - MIT @@ -206,7 +207,7 @@ When summarizing or extracting information from a single long document, it’s b from pyversity import diversify, Strategy # Suppose you have: -# - doc_chunk_embembeddings: embeddings of document chunks +# - doc_chunk_embbeddings: embeddings of document chunks # - doc_chunk_scores: relevance scores for these chunks # Re-rank with MSD From 5daabd9371ea68c4c7dca33ff5bc9f42cd242ef2 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 16:50:32 +0100 Subject: [PATCH 10/16] Added tests --- src/pyversity/strategies/ssd.py | 27 
++++++++++++++++++--------- tests/test_strategies.py | 8 ++++++++ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/pyversity/strategies/ssd.py b/src/pyversity/strategies/ssd.py index 0c0a43f..9d5dfb1 100644 --- a/src/pyversity/strategies/ssd.py +++ b/src/pyversity/strategies/ssd.py @@ -17,25 +17,27 @@ def ssd( # noqa: C901 normalize_scores: bool = True, ) -> DiversificationResult: """ - Sliding Spectrum Decomposition (SSD*) selection. + Sliding Spectrum Decomposition (SSD) selection. This strategy selects `k` items using a greedy, sequence-aware approach that maintains a sliding window of Gram-Schmidt bases to promote diversity while considering recent context. If `recent_embeddings` are provided (oldest → newest), the window is seeded so the very first pick is already novel relative to what the user just saw. + Note: this is the implementation proposed in Equation 12 of the SSD paper, called SSD*. + :param embeddings: 2D array (n_items, n_dims) of candidate embeddings. :param scores: 1D array (n_items,) of relevance scores. :param k: Number of items to select. :param diversity: Trade-off between relevance and diversity in [0, 1] (inverse of theta parameter). 1.0 = pure diversity, 0.0 = pure relevance. - :param recent_embeddings: Optional 2D array (n_items, n_dims) with recent embeddings from oldest → newest. + :param recent_embeddings: Optional 2D array (m, n_dims) with recent embeddings from oldest → newest. seeds the sliding window so selection is aware of recent context. :param window: Sliding window size (≥1) for Gram-Schmidt bases. :param gamma: Diversity scale (> 0). :param normalize: Whether to normalize embeddings before computing similarity. - :param append_bias: Append constant-one bias dim after normalization (paper §5.3). - :param normalize_scores: Z-score scores per request (applied only if diversity > 0). + :param append_bias: Append constant-one bias dim after normalization. + :param normalize_scores: Z-score scores per request. 
:return: DiversificationResult with selected indices and their selection scores. :raises ValueError: If diversity ∉ [0, 1], or window < 1, or gamma ≤ 0. @@ -63,6 +65,15 @@ def ssd( # noqa: C901 diversity=diversity, parameters={"gamma": gamma, "window": window}, ) + # Validate recent_embeddings + if recent_embeddings is not None and np.size(recent_embeddings) > 0: + if recent_embeddings.ndim != 2: + raise ValueError("recent_embeddings must be a 2D array of shape (n_items, n_dims).") + if recent_embeddings.shape[1] != feature_matrix.shape[1]: + raise ValueError( + f"recent_embeddings has {recent_embeddings.shape[1]} dims; " + f"expected {feature_matrix.shape[1]} to match `embeddings` columns." + ) # Pure relevance: select top‑k by raw scores if float(theta) == 1.0: @@ -136,11 +147,9 @@ def _push_basis_vector(basis_vec: np.ndarray) -> None: # Seed with recent context (oldest → newest) context_seed = 0 if recent_embeddings is not None and np.size(recent_embeddings) > 0: - if recent_embeddings.ndim != 2 or recent_embeddings.shape[1] != embeddings.shape[1]: - raise ValueError("recent_embeddings must have shape (m, n_dims) matching `embeddings` columns.") - ctx = _prepare_vectors(recent_embeddings.astype(feature_matrix.dtype, copy=False)) - ctx = ctx[-window:] # keep only the latest window items - for vec in ctx: + context = _prepare_vectors(recent_embeddings.astype(feature_matrix.dtype, copy=False)) + context = context[-window:] # keep only the latest window items + for vec in context: # Orthogonalize the context vector against current bases residual_vec = vec.copy() for basis in basis_vectors: diff --git a/tests/test_strategies.py b/tests/test_strategies.py index 0536e72..5661157 100644 --- a/tests/test_strategies.py +++ b/tests/test_strategies.py @@ -203,6 +203,14 @@ def test_ssd() -> None: with pytest.raises(ValueError): ssd(emb, scores, k=2, gamma=-0.5) + # recent_embeddings validation: must be 2D + with pytest.raises(ValueError): + ssd(emb, scores, k=2, 
diversity=0.5, recent_embeddings=np.array([0.0, 1.0], dtype=np.float32)) + + # recent_embeddings validation: dim mismatch with embeddings + with pytest.raises(ValueError): + ssd(emb, scores, k=2, diversity=0.5, recent_embeddings=np.ones((2, 4), dtype=np.float32)) # emb is (.,3) + # Early exit on empty input emb = np.empty((0, 3), dtype=np.float32) scores = np.array([], dtype=np.float32) From d42c8a22ded6dd91da5a707cde1b2c54d0dac6d0 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 17:07:11 +0100 Subject: [PATCH 11/16] Updated readme --- README.md | 2 +- src/pyversity/strategies/ssd.py | 127 ++++++++++++++++---------------- 2 files changed, 64 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index 921cf93..49f49ea 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,7 @@ from pyversity import diversify, Strategy res = diversify( embeddings=feed_embeddings, scores=feed_scores, - k=30, + k=10, strategy=Strategy.SSD, recent_embeddings=recent_feed_embeddings, ) diff --git a/src/pyversity/strategies/ssd.py b/src/pyversity/strategies/ssd.py index 9d5dfb1..28f1de2 100644 --- a/src/pyversity/strategies/ssd.py +++ b/src/pyversity/strategies/ssd.py @@ -24,21 +24,20 @@ def ssd( # noqa: C901 If `recent_embeddings` are provided (oldest → newest), the window is seeded so the very first pick is already novel relative to what the user just saw. - Note: this is the implementation proposed in Equation 12 of the SSD paper, called SSD*. + Note: this follows the stabilized SSD variant (“SSD*”) described in Eq. (12) in the paper. :param embeddings: 2D array (n_items, n_dims) of candidate embeddings. :param scores: 1D array (n_items,) of relevance scores. :param k: Number of items to select. :param diversity: Trade-off between relevance and diversity in [0, 1] (inverse of theta parameter). - 1.0 = pure diversity, 0.0 = pure relevance. - :param recent_embeddings: Optional 2D array (m, n_dims) with recent embeddings from oldest → newest. 
- seeds the sliding window so selection is aware of recent context. - :param window: Sliding window size (≥1) for Gram-Schmidt bases. + 1.0 = pure diversity, 0.0 = pure relevance. + :param recent_embeddings: Optional 2D array (m, n_dims), oldest → newest; seeds the sliding window so + selection is aware of what was recently shown. + :param window: Sliding window size (≥ 1) for Gram-Schmidt bases. :param gamma: Diversity scale (> 0). :param normalize: Whether to normalize embeddings before computing similarity. - :param append_bias: Append constant-one bias dim after normalization. - :param normalize_scores: Z-score scores per request. - + :param append_bias: Append a constant-one bias dimension after normalization. + :param normalize_scores: Z-score the scores per request (stabilizes the gamma trade-off). :return: DiversificationResult with selected indices and their selection scores. :raises ValueError: If diversity ∉ [0, 1], or window < 1, or gamma ≤ 0. """ @@ -50,8 +49,7 @@ def ssd( # noqa: C901 if gamma <= 0.0: raise ValueError("gamma must be > 0") - # Theta parameter for trade‑off between relevance and diversity - # This is 1 - diversity to align with common notation + # Theta parameter for trade-off between relevance and diversity (1 - diversity) theta = 1.0 - float(diversity) # Prepare inputs @@ -65,6 +63,7 @@ def ssd( # noqa: C901 diversity=diversity, parameters={"gamma": gamma, "window": window}, ) + # Validate recent_embeddings if recent_embeddings is not None and np.size(recent_embeddings) > 0: if recent_embeddings.ndim != 2: @@ -75,34 +74,35 @@ def ssd( # noqa: C901 f"expected {feature_matrix.shape[1]} to match `embeddings` columns." 
) - # Pure relevance: select top‑k by raw scores + # Pure relevance: select top-k by raw scores if float(theta) == 1.0: topk = np.argsort(-relevance_scores)[:top_k].astype(np.int32) - gains = relevance_scores[topk].astype(np.float32, copy=False) + selection_scores = relevance_scores[topk].astype(np.float32, copy=False) return DiversificationResult( indices=topk, - selection_scores=gains, + selection_scores=selection_scores, strategy=Strategy.SSD, diversity=diversity, parameters={"gamma": gamma, "window": window}, ) def _prepare_vectors(matrix: np.ndarray) -> np.ndarray: - """Prepare feature vectors with normalization and bias appending.""" - out = matrix + """Prepare feature vectors with normalization and (optionally) a bias dimension.""" + prepared = matrix if normalize: - out = normalize_rows(out) + prepared = normalize_rows(prepared) if append_bias: - # Append constant‑one bias dimension (see paper §5.3) - last_col_is_ones = (out.shape[1] > 0) and np.allclose(out[:, -1], 1.0, atol=1e-6, rtol=0.0) + # Bias trick described in the paper (Section 5.3) + last_col_is_ones = prepared.shape[1] > 0 and np.allclose(prepared[:, -1], 1.0, atol=1e-6, rtol=0.0) if not last_col_is_ones: - out = np.concatenate([out, np.ones((out.shape[0], 1), dtype=out.dtype)], axis=1) - return out + ones = np.ones((prepared.shape[0], 1), dtype=prepared.dtype) + prepared = np.concatenate([prepared, ones], axis=1) + return prepared # Prepare feature vectors feature_matrix = _prepare_vectors(feature_matrix) - # Normalize scores per request (to stabilize gamma) + # Per-request score normalization (stabilizes gamma) if normalize_scores: mean = float(np.mean(relevance_scores)) std = float(np.std(relevance_scores)) @@ -120,54 +120,53 @@ def _prepare_vectors(matrix: np.ndarray) -> np.ndarray: # Sliding window storage basis_vectors: list[np.ndarray] = [] - projection_coeffs_per_basis: list[np.ndarray] = [] + projection_coefficients_per_basis: list[np.ndarray] = [] - def _push_basis_vector(basis_vec: 
np.ndarray) -> None: - """Add a new basis vector to the sliding window, updating residuals.""" + def _push_basis_vector(basis_vector: np.ndarray) -> None: + """Add a new basis vector to the sliding window and update residuals/projections.""" if len(basis_vectors) == window: - # If at capacity, remove oldest basis and restore its contribution to residuals + # Remove oldest basis and restore its contribution to residuals oldest_basis = basis_vectors.pop(0) - oldest_coeffs = projection_coeffs_per_basis.pop(0) + oldest_coefficients = projection_coefficients_per_basis.pop(0) mask_unselected = ~selected_mask if np.any(mask_unselected): - residual_matrix[mask_unselected] += oldest_coeffs[mask_unselected, None] * oldest_basis + residual_matrix[mask_unselected] += oldest_coefficients[mask_unselected, None] * oldest_basis + + denominator = float(basis_vector @ basis_vector) + EPS32 + basis_vectors.append(basis_vector.astype(np.float32, copy=False)) - denom = float(basis_vec @ basis_vec) + EPS32 - basis_vectors.append(basis_vec.astype(np.float32, copy=False)) mask_unselected = ~selected_mask - coeffs = np.zeros(num_items, dtype=np.float32) + coefficients = np.zeros(num_items, dtype=np.float32) if np.any(mask_unselected): - # Compute projection coefficients and update residuals if unselected items remain - proj = (residual_matrix[mask_unselected] @ basis_vec) / denom - coeffs[mask_unselected] = proj - residual_matrix[mask_unselected] -= proj[:, None] * basis_vec - # Store the projection coefficients for later restoration - projection_coeffs_per_basis.append(coeffs) - - # Seed with recent context (oldest → newest) - context_seed = 0 + projections = (residual_matrix[mask_unselected] @ basis_vector) / denominator + coefficients[mask_unselected] = projections + residual_matrix[mask_unselected] -= projections[:, None] * basis_vector + + projection_coefficients_per_basis.append(coefficients) + + # Seed with recent context (oldest → newest) if provided + seeded_bases = 0 if 
recent_embeddings is not None and np.size(recent_embeddings) > 0: context = _prepare_vectors(recent_embeddings.astype(feature_matrix.dtype, copy=False)) - context = context[-window:] # keep only the latest window items - for vec in context: - # Orthogonalize the context vector against current bases - residual_vec = vec.copy() + context = context[-window:] # keep only the latest `window` items + for context_vector in context: + residual_context = context_vector.copy() for basis in basis_vectors: - denom_b = float(basis @ basis) + EPS32 - residual_vec -= float(residual_vec @ basis) / denom_b * basis - _push_basis_vector(residual_vec) - context_seed += 1 + denominator_b = float(basis @ basis) + EPS32 + residual_context -= float(residual_context @ basis) / denominator_b * basis + _push_basis_vector(residual_context) + seeded_bases += 1 # Decide what to select first - if context_seed > 0: - # If we seeded context, use the combined score immediately (novel vs. recent) + if seeded_bases > 0: + # Use combined scores with diversity from seeded context residual_norms = np.linalg.norm(residual_matrix, axis=1) - combined = theta * relevance_scores + (1.0 - theta) * gamma * residual_norms - combined[selected_mask] = -np.inf - first_index = int(np.argmax(combined)) - first_score = float(combined[first_index]) + combined_scores = theta * relevance_scores + (1.0 - theta) * gamma * residual_norms + combined_scores[selected_mask] = -np.inf + first_index = int(np.argmax(combined_scores)) + first_score = float(combined_scores[first_index]) else: - # No context yet: pick purely by relevance (then start residualization) + # No context yet: pick by highest relevance, then start residualization first_index = int(np.argmax(relevance_scores)) first_score = float( theta * relevance_scores[first_index] @@ -182,21 +181,21 @@ def _push_basis_vector(basis_vec: np.ndarray) -> None: # Main loop for step in range(1, top_k): - # Select next item by combined score - available = 
np.where(~selected_mask)[0] - if available.size == 0: - # No more items to select + # Find best candidate among unselected items + available_indices = np.where(~selected_mask)[0] + if available_indices.size == 0: selected_indices = selected_indices[:step] selection_scores = selection_scores[:step] break - residual_norms = np.linalg.norm(residual_matrix[available], axis=1) - combined_scores = theta * relevance_scores[available] + (1.0 - theta) * gamma * residual_norms - best_local = int(np.argmax(combined_scores)) - best_index = int(available[best_local]) - best_score = float(combined_scores[best_local]) + # Residual norms measure novelty relative to the last `window` selections/context + residual_norms = np.linalg.norm(residual_matrix[available_indices], axis=1) + combined_scores = theta * relevance_scores[available_indices] + (1.0 - theta) * gamma * residual_norms + local_best = int(np.argmax(combined_scores)) + best_index = int(available_indices[local_best]) + best_score = float(combined_scores[local_best]) - # Select the best item + # Update selection state selected_mask[best_index] = True selected_indices[step] = best_index selection_scores[step] = best_score From 1646846e3ddff652569237927e2904cbd9a5be46 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 19:19:27 +0100 Subject: [PATCH 12/16] Updated readme --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 49f49ea..dbcbcde 100644 --- a/README.md +++ b/README.md @@ -166,7 +166,7 @@ result = diversify( picked_indices = result.indices picked_embeddings = candidate_embeddings[picked_indices] -# Maintain your rolling context window (keep oldest→newest) +# Maintain the rolling context window for recent chunks recent_chunk_embeddings = np.vstack([recent_chunk_embeddings, picked_embeddings]) ``` @@ -177,6 +177,7 @@ recent_chunk_embeddings = np.vstack([recent_chunk_embeddings, picked_embeddings] In content feeds or infinite scroll 
scenarios, users consume items sequentially. To keep the experience engaging, it’s important to introduce novelty relative to recently shown items. The SSD strategy is well-suited for this: ```python +import numpy as np from pyversity import diversify, Strategy # Suppose you have: @@ -184,8 +185,8 @@ from pyversity import diversify, Strategy # - feed_scores: relevance scores for these items # - recent_feed_embeddings: embeddings of recently shown items in the feed (oldest→newest) -# Re-rank with SSD (sequence-aware) -res = diversify( +# Sequence-aware re-ranking with Sliding Spectrum Decomposition (SSD) +result = diversify( embeddings=feed_embeddings, scores=feed_scores, k=10, @@ -193,8 +194,8 @@ res = diversify( recent_embeddings=recent_feed_embeddings, ) -# Maintain the rolling context window (keep oldest→newest) -recent_feed_embeddings = np.vstack([recent_feed_embeddings, feed_embeddings[res.indices]]) +# Maintain the rolling context window for recent items +recent_feed_embeddings = np.vstack([recent_feed_embeddings, feed_embeddings[result.indices]]) ``` From 5ca5aedcac19fd838dc6469e2d9eb3178817f86f Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 14 Nov 2025 19:30:48 +0100 Subject: [PATCH 13/16] Updated tests --- src/pyversity/strategies/ssd.py | 5 ----- tests/test_strategies.py | 3 +++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/pyversity/strategies/ssd.py b/src/pyversity/strategies/ssd.py index 28f1de2..4a70f35 100644 --- a/src/pyversity/strategies/ssd.py +++ b/src/pyversity/strategies/ssd.py @@ -183,11 +183,6 @@ def _push_basis_vector(basis_vector: np.ndarray) -> None: for step in range(1, top_k): # Find best candidate among unselected items available_indices = np.where(~selected_mask)[0] - if available_indices.size == 0: - selected_indices = selected_indices[:step] - selection_scores = selection_scores[:step] - break - # Residual norms measure novelty relative to the last `window` selections/context residual_norms = 
np.linalg.norm(residual_matrix[available_indices], axis=1) combined_scores = theta * relevance_scores[available_indices] + (1.0 - theta) * gamma * residual_norms diff --git a/tests/test_strategies.py b/tests/test_strategies.py index 5661157..b1e1638 100644 --- a/tests/test_strategies.py +++ b/tests/test_strategies.py @@ -202,6 +202,8 @@ def test_ssd() -> None: ssd(emb, scores, k=2, gamma=0.0) with pytest.raises(ValueError): ssd(emb, scores, k=2, gamma=-0.5) + with pytest.raises(ValueError): + ssd(emb, scores, k=2, window=0) # recent_embeddings validation: must be 2D with pytest.raises(ValueError): @@ -253,6 +255,7 @@ def test_ssd_recent_embeddings_window_blocks_multiple_recent() -> None: (Strategy.MSD, msd, {"diversity": 0.5, "metric": Metric.COSINE, "normalize": True}), (Strategy.COVER, cover, {"diversity": 0.5, "gamma": 0.5}), (Strategy.DPP, dpp, {"diversity": 0.5}), + (Strategy.SSD, ssd, {"diversity": 0.5}), ], ) def test_diversify(strategy: Strategy, fn: Callable[..., DiversificationResult], kwargs: Any) -> None: From a9d3345762222a25a1d159d90365569c579c540d Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 15 Nov 2025 08:53:42 +0100 Subject: [PATCH 14/16] Updated readme --- README.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index dbcbcde..06377a9 100644 --- a/README.md +++ b/README.md @@ -150,24 +150,21 @@ import numpy as np from pyversity import diversify, Strategy # Suppose you have: -# - candidate_embeddings (for retrieved chunks this turn) -# - candidate_scores (relevance scores for these chunks) +# - chunk_embeddings (for retrieved chunks this turn) +# - chunk_scores (relevance scores for these chunks) # - recent_chunk_embeddings (chunks shown in the last few turns (oldest→newest) # Re-rank with SSD (sequence-aware) result = diversify( - embeddings=candidate_embeddings, - scores=candidate_scores, + embeddings=chunk_embeddings, + scores=chunk_scores, k=10, strategy=Strategy.SSD, 
recent_embeddings=recent_chunk_embeddings, ) -picked_indices = result.indices -picked_embeddings = candidate_embeddings[picked_indices] - # Maintain the rolling context window for recent chunks -recent_chunk_embeddings = np.vstack([recent_chunk_embeddings, picked_embeddings]) +recent_chunk_embeddings = np.vstack([recent_chunk_embeddings, chunk_embeddings[result.indices]]) ``` @@ -208,7 +205,7 @@ When summarizing or extracting information from a single long document, it’s b from pyversity import diversify, Strategy # Suppose you have: -# - doc_chunk_embbeddings: embeddings of document chunks +# - doc_chunk_embeddings: embeddings of document chunks # - doc_chunk_scores: relevance scores for these chunks # Re-rank with MSD From 1d0710fb0417982bcc5a1b1023568942fd08c154 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 15 Nov 2025 08:55:23 +0100 Subject: [PATCH 15/16] Updated tests --- tests/test_strategies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_strategies.py b/tests/test_strategies.py index b1e1638..11400fc 100644 --- a/tests/test_strategies.py +++ b/tests/test_strategies.py @@ -221,7 +221,7 @@ def test_ssd() -> None: def test_ssd_recent_embeddings_avoids_recent_first_pick() -> None: - """With equal relevance, the first pick should avoid the most recent item when context is seeded.""" + """Test that with equal relevance, the first pick should avoid the most recent item when context is seeded.""" # 3 orthogonal items (identity); equal scores emb = np.eye(3, dtype=np.float32) scores = np.array([0.5, 0.5, 0.5], dtype=np.float32) @@ -236,7 +236,7 @@ def test_ssd_recent_embeddings_avoids_recent_first_pick() -> None: def test_ssd_recent_embeddings_window_blocks_multiple_recent() -> None: - """If the window contains two recent items, the first pick should avoid both when scores are tied.""" + """Test that if the window contains two recent items, the first pick should avoid both when scores are tied.""" emb = np.eye(4, 
dtype=np.float32) scores = np.ones(4, dtype=np.float32) # tie From a629c438fdda55ec8db66498620485be16597430 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 15 Nov 2025 08:55:59 +0100 Subject: [PATCH 16/16] Updated tests --- tests/test_strategies.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_strategies.py b/tests/test_strategies.py index 11400fc..f306fbb 100644 --- a/tests/test_strategies.py +++ b/tests/test_strategies.py @@ -2,9 +2,8 @@ import numpy as np import pytest -from pyversity import Metric, Strategy, cover, diversify, dpp, mmr, msd +from pyversity import Metric, Strategy, cover, diversify, dpp, mmr, msd, ssd from pyversity.datatypes import DiversificationResult -from pyversity.strategies import ssd def test_mmr() -> None: