diff --git a/.gitignore b/.gitignore
index a80c727..e9cb00a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -207,3 +207,5 @@ marimo/_lsp/
__marimo__/
local
+
+.DS_Store
diff --git a/README.md b/README.md
index 4e2087d..06377a9 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@
+
@@ -17,7 +18,9 @@
[Quickstart](#quickstart) •
[Supported Strategies](#supported-strategies) •
-[Motivation](#motivation)
+[Motivation](#motivation) •
+[Examples](#examples) •
+[References](#references)
@@ -71,6 +74,7 @@ The following table describes the supported strategies, how they work, their tim
| **MSD** (Max Sum of Distances) | Prefers items that are both relevant and far from *all* previous selections. | **O(k · n · d)** | Use when you want stronger spread, i.e. results that cover a wider range of topics or styles. |
| **DPP** (Determinantal Point Process) | Samples diverse yet relevant items using probabilistic “repulsion.” | **O(k · n · d + n · k²)** | Ideal when you want to eliminate redundancy or ensure diversity is built-in to selection. |
| **COVER** (Facility-Location) | Ensures selected items collectively represent the full dataset’s structure. | **O(k · n²)** | Great for topic coverage or clustering scenarios, but slower for large `n`. |
+| **SSD** (Sliding Spectrum Decomposition) | Sequence‑aware diversification: rewards novelty relative to recently shown items. | **O(k · n · d)** | Great for content feeds & infinite scroll, e.g. social/news/product feeds where users consume sequentially, as well as conversational RAG to avoid showing similar chunks within the recent window. |
## Motivation
@@ -82,10 +86,138 @@ Each new item is chosen not only because it’s relevant, but also because it ad
This improves exploration, user satisfaction, and coverage across many domains, for example:
-- E-commerce: Show different product styles, not multiple copies of the same black pants.
+- E-commerce: Show different product styles, not multiple copies of the same product.
- News search: Highlight articles from different outlets or viewpoints.
- Academic retrieval: Surface papers from different subfields or methods.
- RAG / LLM contexts: Avoid feeding the model near-duplicate passages.
+- Recommendation feeds: Keep content diverse and engaging over time.
+
+## Examples
+
+The following examples illustrate how to apply different diversification strategies in various scenarios.
+
+ Product / Web Search — Simple diversification with MMR or DPP
+
+MMR and DPP are great general-purpose diversification strategies. They are fast, easy to use, and work well in many scenarios.
+For example, in a product search setting where you want to show diverse items to a user, you can diversify the top results as follows:
+
+```python
+from pyversity import diversify, Strategy
+
+# Suppose you have:
+# - item_embeddings: embeddings of the retrieved products
+# - item_scores: relevance scores for these products
+
+# Re-rank with MMR
+result = diversify(
+ embeddings=item_embeddings,
+ scores=item_scores,
+ k=10,
+ strategy=Strategy.MMR,
+)
+```
+
+
+ Literature Search — Represent the full topic space with COVER
+
+COVER (Facility-Location) is well-suited for scenarios where you want to ensure that the selected items collectively represent the entire dataset’s structure. For instance, when searching for academic papers on a broad topic, you might want to cover various subfields and methodologies:
+
+```python
+from pyversity import diversify, Strategy
+
+# Suppose you have:
+# - paper_embeddings: embeddings of the retrieved papers
+# - paper_scores: relevance scores for these papers
+
+# Re-rank with COVER
+result = diversify(
+ embeddings=paper_embeddings,
+ scores=paper_scores,
+ k=10,
+ strategy=Strategy.COVER,
+)
+```
+
+
+
+Conversational RAG — Avoid redundant chunks with SSD
+
+
+In retrieval-augmented generation (RAG) for conversational AI, it’s crucial to avoid feeding the model redundant or similar chunks of information within the recent conversation context. The SSD (Sliding Spectrum Decomposition) strategy is designed for sequence-aware diversification, making it ideal for this use case:
+
+```python
+import numpy as np
+from pyversity import diversify, Strategy
+
+# Suppose you have:
+# - chunk_embeddings (for retrieved chunks this turn)
+# - chunk_scores (relevance scores for these chunks)
+# - recent_chunk_embeddings (chunks shown in the last few turns, oldest→newest)
+
+# Re-rank with SSD (sequence-aware)
+result = diversify(
+ embeddings=chunk_embeddings,
+ scores=chunk_scores,
+ k=10,
+ strategy=Strategy.SSD,
+ recent_embeddings=recent_chunk_embeddings,
+)
+
+# Maintain the rolling context window for recent chunks
+recent_chunk_embeddings = np.vstack([recent_chunk_embeddings, chunk_embeddings[result.indices]])
+```
+
+
+
+ Infinite Scroll / Recommendation Feed — Sequence-aware novelty with SSD
+
+In content feeds or infinite scroll scenarios, users consume items sequentially. To keep the experience engaging, it’s important to introduce novelty relative to recently shown items. The SSD strategy is well-suited for this:
+
+```python
+import numpy as np
+from pyversity import diversify, Strategy
+
+# Suppose you have:
+# - feed_embeddings: embeddings of candidate items for the feed
+# - feed_scores: relevance scores for these items
+# - recent_feed_embeddings: embeddings of recently shown items in the feed (oldest→newest)
+
+# Sequence-aware re-ranking with Sliding Spectrum Decomposition (SSD)
+result = diversify(
+ embeddings=feed_embeddings,
+ scores=feed_scores,
+ k=10,
+ strategy=Strategy.SSD,
+ recent_embeddings=recent_feed_embeddings,
+)
+
+# Maintain the rolling context window for recent items
+recent_feed_embeddings = np.vstack([recent_feed_embeddings, feed_embeddings[result.indices]])
+```
+
+
+
+ Single Long Document — Pick diverse sections with MSD
+
+When summarizing or extracting information from a single long document, it’s beneficial to select sections that are both relevant and cover different parts of the document. The MSD strategy helps achieve this by preferring items that are far apart from each other:
+
+```python
+from pyversity import diversify, Strategy
+
+# Suppose you have:
+# - doc_chunk_embeddings: embeddings of document chunks
+# - doc_chunk_scores: relevance scores for these chunks
+
+# Re-rank with MSD
+result = diversify(
+ embeddings=doc_chunk_embeddings,
+ scores=doc_chunk_scores,
+ k=10,
+ strategy=Strategy.MSD,
+)
+```
+
+
## References
@@ -102,6 +234,9 @@ The implementations in this package are based on the following research papers:
- **DPP (efficient greedy implementation)**: Chen, L., Zhang, G., & Zhou, H. (2018). Fast greedy MAP inference for determinantal point process to improve recommendation diversity.
[Link](https://arxiv.org/pdf/1709.05135)
+- **SSD**: Huang, Y., Wang, W., Zhang, L., & Xu, R. (2021). Sliding Spectrum Decomposition for Diversified
+Recommendation. [Link](https://arxiv.org/pdf/2107.05204)
+
## Author
Thomas van Dongen
diff --git a/src/pyversity/__init__.py b/src/pyversity/__init__.py
index e1d697a..a4d94b2 100644
--- a/src/pyversity/__init__.py
+++ b/src/pyversity/__init__.py
@@ -1,6 +1,17 @@
from pyversity.datatypes import DiversificationResult, Metric, Strategy
from pyversity.pyversity import diversify
-from pyversity.strategies import cover, dpp, mmr, msd
+from pyversity.strategies import cover, dpp, mmr, msd, ssd
from pyversity.version import __version__
-__all__ = ["diversify", "Strategy", "Metric", "DiversificationResult", "mmr", "msd", "cover", "dpp", "__version__"]
+__all__ = [
+ "diversify",
+ "Strategy",
+ "Metric",
+ "DiversificationResult",
+ "mmr",
+ "msd",
+ "cover",
+ "dpp",
+ "ssd",
+ "__version__",
+]
diff --git a/src/pyversity/datatypes.py b/src/pyversity/datatypes.py
index c74c1ae..d73df3f 100644
--- a/src/pyversity/datatypes.py
+++ b/src/pyversity/datatypes.py
@@ -11,6 +11,7 @@ class Strategy(str, Enum):
MSD = "msd"
COVER = "cover"
DPP = "dpp"
+ SSD = "ssd"
class Metric(str, Enum):
diff --git a/src/pyversity/pyversity.py b/src/pyversity/pyversity.py
index ab69efe..b543807 100644
--- a/src/pyversity/pyversity.py
+++ b/src/pyversity/pyversity.py
@@ -3,7 +3,7 @@
import numpy as np
from pyversity.datatypes import DiversificationResult, Strategy
-from pyversity.strategies import cover, dpp, mmr, msd
+from pyversity.strategies import cover, dpp, mmr, msd, ssd
def diversify(
@@ -36,4 +36,6 @@ def diversify(
return cover(embeddings, scores, k, diversity, **kwargs)
if strategy == Strategy.DPP:
return dpp(embeddings, scores, k, diversity, **kwargs)
+ if strategy == Strategy.SSD:
+ return ssd(embeddings, scores, k, diversity, **kwargs)
raise ValueError(f"Unknown strategy: {strategy}")
diff --git a/src/pyversity/strategies/__init__.py b/src/pyversity/strategies/__init__.py
index 189088a..e5f9e9e 100644
--- a/src/pyversity/strategies/__init__.py
+++ b/src/pyversity/strategies/__init__.py
@@ -2,5 +2,6 @@
from pyversity.strategies.dpp import dpp
from pyversity.strategies.mmr import mmr
from pyversity.strategies.msd import msd
+from pyversity.strategies.ssd import ssd
-__all__ = ["mmr", "msd", "cover", "dpp"]
+__all__ = ["mmr", "msd", "cover", "dpp", "ssd"]
diff --git a/src/pyversity/strategies/cover.py b/src/pyversity/strategies/cover.py
index 5e3f9e7..75412bd 100644
--- a/src/pyversity/strategies/cover.py
+++ b/src/pyversity/strategies/cover.py
@@ -14,7 +14,7 @@ def cover(
normalize: bool = True,
) -> DiversificationResult:
"""
- Select a subset of items that balances relevance and coverage/diversity.
+ Cover (Facility Location) selection.
This strategy chooses `k` items by combining pure relevance with
diversity-driven coverage using a concave submodular formulation.
diff --git a/src/pyversity/strategies/ssd.py b/src/pyversity/strategies/ssd.py
new file mode 100644
index 0000000..4a70f35
--- /dev/null
+++ b/src/pyversity/strategies/ssd.py
@@ -0,0 +1,219 @@
+import numpy as np
+
+from pyversity.datatypes import DiversificationResult, Strategy
+from pyversity.utils import EPS32, normalize_rows, prepare_inputs
+
+
+def ssd(  # noqa: C901
+    embeddings: np.ndarray,
+    scores: np.ndarray,
+    k: int,
+    diversity: float = 0.5,
+    recent_embeddings: np.ndarray | None = None,
+    window: int = 10,
+    gamma: float = 1.0,
+    normalize: bool = True,
+    append_bias: bool = True,
+    normalize_scores: bool = True,
+) -> DiversificationResult:
+    """
+    Sliding Spectrum Decomposition (SSD) selection.
+
+    This strategy selects `k` items using a greedy, sequence-aware approach that maintains a sliding window
+    of Gram-Schmidt bases to promote diversity while considering recent context.
+    If `recent_embeddings` are provided (oldest → newest), the window is seeded so the very first pick is
+    already novel relative to what the user just saw.
+
+    Note: this follows the stabilized SSD variant (“SSD*”) described in Eq. (12) in the paper.
+
+    :param embeddings: 2D array (n_items, n_dims) of candidate embeddings.
+    :param scores: 1D array (n_items,) of relevance scores.
+    :param k: Number of items to select.
+    :param diversity: Trade-off between relevance and diversity in [0, 1] (theta = 1 - diversity).
+        1.0 = pure diversity, 0.0 = pure relevance.
+    :param recent_embeddings: Optional 2D array (m, n_dims), oldest → newest; seeds the sliding window so
+        selection is aware of what was recently shown. Only the last `window` rows are used.
+    :param window: Sliding window size (≥ 1) for Gram-Schmidt bases.
+    :param gamma: Diversity scale (> 0).
+    :param normalize: Whether to normalize embeddings before computing similarity.
+    :param append_bias: Append a constant-one bias dimension after normalization. The decision is made once
+        from the candidate matrix (skipped when its last column is already all ones) and applied
+        identically to `recent_embeddings`, so both always live in the same space.
+    :param normalize_scores: Z-score the scores per request (stabilizes the gamma trade-off).
+    :return: DiversificationResult with selected indices and their selection scores.
+    :raises ValueError: If diversity ∉ [0, 1], or window < 1, or gamma ≤ 0, or `recent_embeddings` is
+        non-empty but not 2D or its column count differs from `embeddings`.
+    """
+    # Validate parameters
+    if not (0.0 <= float(diversity) <= 1.0):
+        raise ValueError("diversity must be in [0, 1]")
+    if window < 1:
+        raise ValueError("window must be >= 1")
+    if gamma <= 0.0:
+        raise ValueError("gamma must be > 0")
+
+    # Theta parameter for trade-off between relevance and diversity (1 - diversity)
+    theta = 1.0 - float(diversity)
+
+    # Prepare inputs
+    feature_matrix, relevance_scores, top_k, early_exit = prepare_inputs(embeddings, scores, k)
+    if early_exit:
+        # Nothing to select: return empty arrays
+        return DiversificationResult(
+            indices=np.empty(0, np.int32),
+            selection_scores=np.empty(0, np.float32),
+            strategy=Strategy.SSD,
+            diversity=diversity,
+            parameters={"gamma": gamma, "window": window},
+        )
+
+    # Validate recent_embeddings; array-likes are accepted and an empty context is treated as absent
+    recent_matrix: np.ndarray | None = None
+    if recent_embeddings is not None and np.size(recent_embeddings) > 0:
+        recent_matrix = np.asarray(recent_embeddings)
+        if recent_matrix.ndim != 2:
+            raise ValueError("recent_embeddings must be a 2D array of shape (n_items, n_dims).")
+        if recent_matrix.shape[1] != feature_matrix.shape[1]:
+            raise ValueError(
+                f"recent_embeddings has {recent_matrix.shape[1]} dims; "
+                f"expected {feature_matrix.shape[1]} to match `embeddings` columns."
+            )
+
+    # Pure relevance: select top-k by raw scores
+    if float(theta) == 1.0:
+        topk = np.argsort(-relevance_scores)[:top_k].astype(np.int32)
+        selection_scores = relevance_scores[topk].astype(np.float32, copy=False)
+        return DiversificationResult(
+            indices=topk,
+            selection_scores=selection_scores,
+            strategy=Strategy.SSD,
+            diversity=diversity,
+            parameters={"gamma": gamma, "window": window},
+        )
+
+    def _append_bias_column(matrix: np.ndarray) -> np.ndarray:
+        """Append a constant-one column: the bias trick described in the paper (Section 5.3)."""
+        ones = np.ones((matrix.shape[0], 1), dtype=matrix.dtype)
+        return np.concatenate([matrix, ones], axis=1)
+
+    # Prepare candidate vectors
+    if normalize:
+        feature_matrix = normalize_rows(feature_matrix)
+
+    # Decide ONCE, from the candidates, whether the bias column gets appended, and reuse that decision
+    # for the recent context. Re-running the "last column is already ones" check on the context can
+    # give a different answer (e.g. a single recent item whose normalized last coordinate is 1), which
+    # would leave candidates and context with different widths and break the projections below.
+    add_bias = append_bias and not (
+        feature_matrix.shape[1] > 0 and np.allclose(feature_matrix[:, -1], 1.0, atol=1e-6, rtol=0.0)
+    )
+    if add_bias:
+        feature_matrix = _append_bias_column(feature_matrix)
+
+    # Per-request score normalization (stabilizes gamma)
+    if normalize_scores:
+        mean = float(np.mean(relevance_scores))
+        std = float(np.std(relevance_scores))
+        relevance_scores = (relevance_scores - mean) / std if std > 0.0 else (relevance_scores - mean)
+
+    num_items, _ = feature_matrix.shape
+
+    # Initialize selection state
+    selected_mask = np.zeros(num_items, dtype=bool)
+    selected_indices = np.empty(top_k, dtype=np.int32)
+    selection_scores = np.empty(top_k, dtype=np.float32)
+
+    # Current residuals under the sliding window
+    residual_matrix = feature_matrix.astype(np.float32, copy=True)
+
+    # Sliding window storage
+    basis_vectors: list[np.ndarray] = []
+    projection_coefficients_per_basis: list[np.ndarray] = []
+
+    def _push_basis_vector(basis_vector: np.ndarray) -> None:
+        """Add a new basis vector to the sliding window and update residuals/projections."""
+        if len(basis_vectors) == window:
+            # Remove oldest basis and restore its contribution to residuals
+            oldest_basis = basis_vectors.pop(0)
+            oldest_coefficients = projection_coefficients_per_basis.pop(0)
+            mask_unselected = ~selected_mask
+            if np.any(mask_unselected):
+                residual_matrix[mask_unselected] += oldest_coefficients[mask_unselected, None] * oldest_basis
+
+        denominator = float(basis_vector @ basis_vector) + EPS32
+        basis_vectors.append(basis_vector.astype(np.float32, copy=False))
+
+        # Only unselected rows are updated; selected rows keep the residual they were picked with
+        mask_unselected = ~selected_mask
+        coefficients = np.zeros(num_items, dtype=np.float32)
+        if np.any(mask_unselected):
+            projections = (residual_matrix[mask_unselected] @ basis_vector) / denominator
+            coefficients[mask_unselected] = projections
+            residual_matrix[mask_unselected] -= projections[:, None] * basis_vector
+
+        projection_coefficients_per_basis.append(coefficients)
+
+    # Seed with recent context (oldest → newest) if provided
+    seeded_bases = 0
+    if recent_matrix is not None:
+        context = recent_matrix.astype(feature_matrix.dtype, copy=False)
+        if normalize:
+            context = normalize_rows(context)
+        if add_bias:
+            context = _append_bias_column(context)
+        context = context[-window:]  # keep only the latest `window` items
+        for context_vector in context:
+            # Orthogonalize the context vector against the bases currently in the window
+            residual_context = context_vector.copy()
+            for basis in basis_vectors:
+                denominator_b = float(basis @ basis) + EPS32
+                residual_context -= float(residual_context @ basis) / denominator_b * basis
+            _push_basis_vector(residual_context)
+            seeded_bases += 1
+
+    # Decide what to select first
+    if seeded_bases > 0:
+        # Use combined scores with diversity from seeded context
+        residual_norms = np.linalg.norm(residual_matrix, axis=1)
+        combined_scores = theta * relevance_scores + (1.0 - theta) * gamma * residual_norms
+        combined_scores[selected_mask] = -np.inf
+        first_index = int(np.argmax(combined_scores))
+        first_score = float(combined_scores[first_index])
+    else:
+        # No context yet: pick by highest relevance, then start residualization
+        first_index = int(np.argmax(relevance_scores))
+        first_score = float(
+            theta * relevance_scores[first_index]
+            + (1.0 - theta) * gamma * float(np.linalg.norm(feature_matrix[first_index]))
+        )
+
+    # Select the first item
+    selected_mask[first_index] = True
+    selected_indices[0] = first_index
+    selection_scores[0] = first_score
+    _push_basis_vector(residual_matrix[first_index])
+
+    # Main loop
+    for step in range(1, top_k):
+        # Find best candidate among unselected items
+        available_indices = np.where(~selected_mask)[0]
+        # Residual norms measure novelty relative to the last `window` selections/context
+        residual_norms = np.linalg.norm(residual_matrix[available_indices], axis=1)
+        combined_scores = theta * relevance_scores[available_indices] + (1.0 - theta) * gamma * residual_norms
+        local_best = int(np.argmax(combined_scores))
+        best_index = int(available_indices[local_best])
+        best_score = float(combined_scores[local_best])
+
+        # Update selection state
+        selected_mask[best_index] = True
+        selected_indices[step] = best_index
+        selection_scores[step] = best_score
+        _push_basis_vector(residual_matrix[best_index])
+
+    return DiversificationResult(
+        indices=selected_indices,
+        selection_scores=selection_scores.astype(np.float32, copy=False),
+        strategy=Strategy.SSD,
+        diversity=diversity,
+        parameters={"gamma": gamma, "window": window},
+    )
diff --git a/tests/test_strategies.py b/tests/test_strategies.py
index 0fd59de..f306fbb 100644
--- a/tests/test_strategies.py
+++ b/tests/test_strategies.py
@@ -2,7 +2,7 @@
import numpy as np
import pytest
-from pyversity import Metric, Strategy, cover, diversify, dpp, mmr, msd
+from pyversity import Metric, Strategy, cover, diversify, dpp, mmr, msd, ssd
from pyversity.datatypes import DiversificationResult
@@ -177,6 +177,76 @@ def test_dpp() -> None:
assert np.all(res.selection_scores[:-1] + 1e-7 >= res.selection_scores[1:])
+def test_ssd() -> None:
+ """Test SSD relevance-only and balanced selection, parameter validation, and the empty-input early exit."""
+ emb = np.eye(3, dtype=np.float32)
+ scores = np.array([0.1, 0.8, 0.3], dtype=np.float32)
+
+ # Relevance-only (diversity=0): top-k by raw score, descending; selection scores equal the raw scores
+ res = ssd(emb, scores, k=2, diversity=0.0)
+ expected = np.array([1, 2], dtype=np.int32)
+ assert np.array_equal(res.indices, expected)
+ assert np.allclose(res.selection_scores, scores[expected])
+
+ # Balanced (diversity=0.5, gamma=0.5): expects top-scored item 1 first, then either remaining item
+ res = ssd(emb, scores, k=2, diversity=0.5, gamma=0.5)
+ assert res.indices[0] == 1 and res.indices[1] in (0, 2)
+
+ # Parameter validation: diversity outside [0, 1], gamma <= 0 and window < 1 must raise ValueError
+ with pytest.raises(ValueError):
+ ssd(emb, scores, k=2, diversity=-0.01)
+ with pytest.raises(ValueError):
+ ssd(emb, scores, k=2, diversity=1.01)
+ with pytest.raises(ValueError):
+ ssd(emb, scores, k=2, gamma=0.0)
+ with pytest.raises(ValueError):
+ ssd(emb, scores, k=2, gamma=-0.5)
+ with pytest.raises(ValueError):
+ ssd(emb, scores, k=2, window=0)
+
+ # recent_embeddings validation: must be 2D (a 1D array is rejected)
+ with pytest.raises(ValueError):
+ ssd(emb, scores, k=2, diversity=0.5, recent_embeddings=np.array([0.0, 1.0], dtype=np.float32))
+
+ # recent_embeddings validation: column count must match the candidate embeddings
+ with pytest.raises(ValueError):
+ ssd(emb, scores, k=2, diversity=0.5, recent_embeddings=np.ones((2, 4), dtype=np.float32)) # emb is (.,3)
+
+ # Early exit on empty input: zero candidates must yield empty indices and selection scores
+ emb = np.empty((0, 3), dtype=np.float32)
+ scores = np.array([], dtype=np.float32)
+ res = ssd(emb, scores, k=5)
+ assert res.indices.size == 0 and res.selection_scores.size == 0
+
+
+def test_ssd_recent_embeddings_avoids_recent_first_pick() -> None:
+ """Test that with equal relevance, the first pick should avoid the most recent item when context is seeded."""
+ # 3 orthogonal items (identity) with equal scores, so relevance cannot break the tie
+ emb = np.eye(3, dtype=np.float32)
+ scores = np.array([0.5, 0.5, 0.5], dtype=np.float32)
+
+ # Seed recent history with item 1 (oldest->newest)
+ recent = emb[[1]] # the user just saw item 1 (fancy indexing keeps the array 2D)
+ res = ssd(emb, scores, k=2, diversity=0.5, gamma=1.0, window=2, recent_embeddings=recent)
+
+ # First selection should not be the recent item (index 1) because its residual vs. context is ~0
+ assert res.indices[0] in (0, 2)
+ assert res.indices[0] != 1 # implied by the membership check above; kept as an explicit guard
+
+
+def test_ssd_recent_embeddings_window_blocks_multiple_recent() -> None:
+ """Test that if the window contains two recent items, the first pick should avoid both when scores are tied."""
+ emb = np.eye(4, dtype=np.float32)
+ scores = np.ones(4, dtype=np.float32) # all scores tied, so relevance cannot decide the pick
+
+ # Seed with items 0 and 1 (oldest->newest); window=2 is large enough to hold both seeded items
+ recent = emb[[0, 1]]
+ res = ssd(emb, scores, k=3, diversity=0.6, gamma=1.0, window=2, recent_embeddings=recent)
+
+ # The first pick should be from {2,3}, not {0,1}: only the unseen items are expected to win the tie
+ assert res.indices[0] in (2, 3)
+
+
@pytest.mark.parametrize(
"strategy, fn, kwargs",
[
@@ -184,6 +254,7 @@ def test_dpp() -> None:
(Strategy.MSD, msd, {"diversity": 0.5, "metric": Metric.COSINE, "normalize": True}),
(Strategy.COVER, cover, {"diversity": 0.5, "gamma": 0.5}),
(Strategy.DPP, dpp, {"diversity": 0.5}),
+ (Strategy.SSD, ssd, {"diversity": 0.5}),
],
)
def test_diversify(strategy: Strategy, fn: Callable[..., DiversificationResult], kwargs: Any) -> None: