diff --git a/squeeze/__init__.py b/squeeze/__init__.py index 2741efe..5e87373 100644 --- a/squeeze/__init__.py +++ b/squeeze/__init__.py @@ -5,7 +5,8 @@ optimized for CPU performance with SIMD vectorization and Rust backends. Implemented algorithms: -- UMAP: Uniform Manifold Approximation and Projection +- UMAP: Uniform Manifold Approximation and Projection (Rust backend) +- UMAPRust: Rust-backed UMAP implementation (internal) - PCA: Principal Component Analysis - TSNE: t-Distributed Stochastic Neighbor Embedding - MDS: Multidimensional Scaling @@ -18,7 +19,7 @@ from warnings import catch_warnings, simplefilter, warn -from .umap_ import UMAP +from .umap import UMAP # Import Rust-based algorithms try: diff --git a/squeeze/_hnsw_backend.abi3.so b/squeeze/_hnsw_backend.abi3.so index 807151f..62eb4b0 100755 Binary files a/squeeze/_hnsw_backend.abi3.so and b/squeeze/_hnsw_backend.abi3.so differ diff --git a/squeeze/aligned_umap.py b/squeeze/aligned_umap.py index 34b4c34..bcb031a 100644 --- a/squeeze/aligned_umap.py +++ b/squeeze/aligned_umap.py @@ -11,7 +11,15 @@ from squeeze.sparse import arr_intersect as intersect1d from squeeze.sparse import arr_union as union1d from squeeze.spectral import spectral_layout -from squeeze.umap_ import UMAP, make_epochs_per_sample +from squeeze.umap import UMAP + + +def make_epochs_per_sample(weights, n_epochs): + result = -1.0 * np.ones(weights.shape[0], dtype=np.float64) + n_samples = n_epochs * (weights / weights.max()) + result[n_samples > 0] = float(n_epochs) / np.float64(n_samples[n_samples > 0]) + return result + if TYPE_CHECKING: import scipy.sparse @@ -557,6 +565,11 @@ def fit( y: list[np.ndarray] | tuple[np.ndarray, ...] | np.ndarray | None = None, **fit_params: Any, ) -> AlignedUMAP: + if getattr(UMAP, "_BACKEND", "") == "rust": + raise NotImplementedError( + "AlignedUMAP requires the removed Python UMAP backend; " + "Rust-only UMAP does not yet expose the graph state needed for alignment." 
+ ) """Fit aligned UMAP on multiple related datasets. Parameters diff --git a/squeeze/metrics.py b/squeeze/metrics.py index 14689bf..9b98e47 100644 --- a/squeeze/metrics.py +++ b/squeeze/metrics.py @@ -112,7 +112,9 @@ def trustworthiness( trust_sum += max(0, original_rank - k) # Normalize - trustworthiness_score = 1 - (2 / (n_samples * k * (2 * n_samples - 3 * k - 1))) * trust_sum + trustworthiness_score = ( + 1 - (2 / (n_samples * k * (2 * n_samples - 3 * k - 1))) * trust_sum + ) return max(0, min(1, trustworthiness_score)) @@ -184,7 +186,9 @@ def continuity( cont_sum += max(0, embedding_rank - k) # Normalize - continuity_score = 1 - (2 / (n_samples * k * (2 * n_samples - 3 * k - 1))) * cont_sum + continuity_score = ( + 1 - (2 / (n_samples * k * (2 * n_samples - 3 * k - 1))) * cont_sum + ) return max(0, min(1, continuity_score)) @@ -340,8 +344,12 @@ def spearman_distance_correlation( # If dataset is large, sample pairs if sample_size is not None and len(original_distances) > sample_size: - sample_indices = np.random.choice( - len(original_distances), size=sample_size, replace=False + # Prefer deterministic sampling to reduce variance across runs. 
+ rng = np.random.default_rng(42) + sample_indices = rng.choice( + len(original_distances), + size=sample_size, + replace=False, ) original_distances = original_distances[sample_indices] embedded_distances = embedded_distances[sample_indices] diff --git a/squeeze/strategies.py b/squeeze/strategies.py index 2fdc0aa..a0b0f31 100644 --- a/squeeze/strategies.py +++ b/squeeze/strategies.py @@ -10,7 +10,7 @@ from typing import Any, Callable, Iterator # Import all algorithms -from .umap_ import UMAP +from .umap import UMAP try: from ._hnsw_backend import ( diff --git a/squeeze/tests/test_aligned_umap.py b/squeeze/tests/test_aligned_umap.py index 21a5deb..bc1fab5 100644 --- a/squeeze/tests/test_aligned_umap.py +++ b/squeeze/tests/test_aligned_umap.py @@ -3,7 +3,12 @@ from sklearn.cluster import KMeans from sklearn.metrics import adjusted_rand_score, pairwise_distances -from squeeze import AlignedUMAP +from squeeze import AlignedUMAP, UMAP + +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip( + "AlignedUMAP requires removed Python UMAP backend", allow_module_level=True + ) # =============================== # Test AlignedUMAP on sliced iris diff --git a/squeeze/tests/test_composite_models.py b/squeeze/tests/test_composite_models.py index 4104e5e..2478e25 100644 --- a/squeeze/tests/test_composite_models.py +++ b/squeeze/tests/test_composite_models.py @@ -2,6 +2,12 @@ from squeeze import UMAP +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip( + "Composite UMAP operators require removed Python backend", + allow_module_level=True, + ) + try: # works for sklearn>=0.22 from sklearn.manifold import trustworthiness diff --git a/squeeze/tests/test_composition.py b/squeeze/tests/test_composition.py index 182ec8a..ea6efb1 100644 --- a/squeeze/tests/test_composition.py +++ b/squeeze/tests/test_composition.py @@ -6,6 +6,11 @@ from sklearn.decomposition import PCA from squeeze import UMAP + +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip( + "UMAP composition 
tests target removed Python backend", allow_module_level=True + ) from squeeze.composition import AdaptiveDR, DRPipeline, EnsembleDR, ProgressiveDR diff --git a/squeeze/tests/test_data_input.py b/squeeze/tests/test_data_input.py index 1c9ea58..02a143f 100644 --- a/squeeze/tests/test_data_input.py +++ b/squeeze/tests/test_data_input.py @@ -4,6 +4,12 @@ from squeeze import UMAP +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip( + "Input-validation tests target removed Python UMAP backend", + allow_module_level=True, + ) + @pytest.fixture(scope="session") def all_finite_data(): diff --git a/squeeze/tests/test_densmap.py b/squeeze/tests/test_densmap.py index e75c14a..37b591c 100644 --- a/squeeze/tests/test_densmap.py +++ b/squeeze/tests/test_densmap.py @@ -2,6 +2,11 @@ from squeeze import UMAP +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip( + "densMAP tests target removed Python UMAP backend", allow_module_level=True + ) + try: # works for sklearn>=0.22 from sklearn.manifold import trustworthiness diff --git a/squeeze/tests/test_sparse_ops.py b/squeeze/tests/test_sparse_ops.py index 5e98eab..6a984b0 100644 --- a/squeeze/tests/test_sparse_ops.py +++ b/squeeze/tests/test_sparse_ops.py @@ -4,6 +4,13 @@ import pytest import scipy.sparse as sp +from squeeze import UMAP + +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip( + "SparseUMAP tests target removed Python UMAP backend", allow_module_level=True + ) + from squeeze.sparse_ops import ( SparseFormatDetector, SparseKNNGraph, @@ -326,7 +333,9 @@ def test_workflow_sparse_data(self): def test_sparse_vs_dense_equivalence(self): """Test that sparse and dense give similar results.""" # Create small test data - X_dense = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1]]).astype(float) + X_dense = np.array( + [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1]] + ).astype(float) X_sparse = sp.csr_matrix(X_dense) # Compute distances both ways diff --git 
a/squeeze/tests/test_umap_get_feature_names_out.py b/squeeze/tests/test_umap_get_feature_names_out.py index 801cf77..dc737dd 100644 --- a/squeeze/tests/test_umap_get_feature_names_out.py +++ b/squeeze/tests/test_umap_get_feature_names_out.py @@ -2,7 +2,7 @@ from sklearn.datasets import make_classification from sklearn.pipeline import FeatureUnion, Pipeline -from squeeze.umap_ import UMAP +from squeeze import UMAP def test_get_feature_names_out() -> None: diff --git a/squeeze/tests/test_umap_nn.py b/squeeze/tests/test_umap_nn.py index 339449b..a7f2e85 100644 --- a/squeeze/tests/test_umap_nn.py +++ b/squeeze/tests/test_umap_nn.py @@ -4,11 +4,10 @@ from sklearn.neighbors import KDTree from sklearn.preprocessing import normalize -from squeeze import distances as dist -from squeeze.umap_ import ( - nearest_neighbors, - smooth_knn_dist, -) +from squeeze import UMAP + +# The legacy Python UMAP backend has been removed. +pytest.skip("Python UMAP backend removed", allow_module_level=True) # =================================================== # Nearest Neighbour Test cases diff --git a/squeeze/tests/test_umap_on_iris.py b/squeeze/tests/test_umap_on_iris.py index 2b250cb..efbfa7d 100644 --- a/squeeze/tests/test_umap_on_iris.py +++ b/squeeze/tests/test_umap_on_iris.py @@ -7,7 +7,9 @@ from sklearn.neighbors import KDTree from squeeze import UMAP -from squeeze.umap_ import nearest_neighbors + +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip("Python UMAP backend removed", allow_module_level=True) try: # works for sklearn>=0.22 diff --git a/squeeze/tests/test_umap_ops.py b/squeeze/tests/test_umap_ops.py index d4b073e..19f2505 100644 --- a/squeeze/tests/test_umap_ops.py +++ b/squeeze/tests/test_umap_ops.py @@ -16,6 +16,9 @@ from sklearn.preprocessing import normalize from squeeze import UMAP + +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip("UMAP ops tests target removed Python backend", allow_module_level=True) from squeeze.distances import 
pairwise_special_metric from squeeze.spectral import component_layout from squeeze.utils import disconnected_vertices diff --git a/squeeze/tests/test_umap_repeated_data.py b/squeeze/tests/test_umap_repeated_data.py index 9ab58ee..66fff5e 100644 --- a/squeeze/tests/test_umap_repeated_data.py +++ b/squeeze/tests/test_umap_repeated_data.py @@ -1,7 +1,14 @@ import numpy as np +import pytest from squeeze import UMAP +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip( + "Repeated-data tests target removed Python UMAP backend", + allow_module_level=True, + ) + # =================================================== # Spatial Data Test cases # =================================================== diff --git a/squeeze/tests/test_umap_rust.py b/squeeze/tests/test_umap_rust.py new file mode 100644 index 0000000..3281af7 --- /dev/null +++ b/squeeze/tests/test_umap_rust.py @@ -0,0 +1,34 @@ +import numpy as np +import pytest + +try: + from squeeze._hnsw_backend import UMAPRust +except Exception: # pragma: no cover + UMAPRust = None + + +pytestmark = pytest.mark.skipif(UMAPRust is None, reason="Rust backend not available") + + +def test_umap_rust_fit_transform_runs(iris) -> None: + reducer = UMAPRust(n_components=2, n_neighbors=15, random_state=42) + emb = reducer.fit_transform(iris.data) + + assert emb.shape == (iris.data.shape[0], 2) + assert np.isfinite(emb).all() + + emb2 = reducer.embedding_ + assert emb2.shape == emb.shape + + +def test_umap_rust_basic_trustworthiness(iris) -> None: + try: + from sklearn.manifold import trustworthiness + except Exception: # pragma: no cover + pytest.skip("scikit-learn trustworthiness not available") + + reducer = UMAPRust(n_components=2, n_neighbors=15, random_state=42) + emb = reducer.fit_transform(iris.data) + + trust = trustworthiness(iris.data, emb, n_neighbors=10) + assert trust >= 0.75 diff --git a/squeeze/tests/test_umap_trustworthiness.py b/squeeze/tests/test_umap_trustworthiness.py index bfcb09a..536f6b3 100644 --- 
a/squeeze/tests/test_umap_trustworthiness.py +++ b/squeeze/tests/test_umap_trustworthiness.py @@ -1,10 +1,17 @@ import numpy as np +import pytest import scipy.sparse from sklearn.datasets import make_blobs from sklearn.metrics import pairwise_distances from squeeze import UMAP +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip( + "UMAP trustworthiness tests target removed Python backend", + allow_module_level=True, + ) + try: # works for sklearn>=0.22 from sklearn.manifold import trustworthiness diff --git a/squeeze/tests/test_umap_validation_params.py b/squeeze/tests/test_umap_validation_params.py index 226edc7..5599e4a 100644 --- a/squeeze/tests/test_umap_validation_params.py +++ b/squeeze/tests/test_umap_validation_params.py @@ -12,6 +12,12 @@ # verify that we can import this; potentially for later use from squeeze import UMAP +if getattr(UMAP, "_BACKEND", "") == "rust": + pytest.skip( + "UMAP parameter-validation tests target removed Python backend", + allow_module_level=True, + ) + warnings.filterwarnings("ignore", category=UserWarning) diff --git a/squeeze/umap.py b/squeeze/umap.py new file mode 100644 index 0000000..ec16690 --- /dev/null +++ b/squeeze/umap.py @@ -0,0 +1,123 @@ +"""UMAP Python wrapper. + +Squeeze aims to provide Rust implementations for all algorithms. + +`UMAP` is therefore implemented as a thin Python wrapper around the Rust +extension type `UMAPRust` exposed from `squeeze._hnsw_backend`. + +The legacy pure-Python implementation previously living in `squeeze.umap_` +has been removed from the public API. +""" + +from __future__ import annotations + +from typing import Any, Optional + +import numpy as np + +try: + from sklearn.base import BaseEstimator, ClassNamePrefixFeaturesOutMixin +except Exception: # pragma: no cover + # Avoid import-time hard failures if sklearn isn't available in some envs. 
+ BaseEstimator = object # type: ignore[assignment] + ClassNamePrefixFeaturesOutMixin = object # type: ignore[assignment] + + +try: + from ._hnsw_backend import UMAPRust as _UMAPRust +except Exception: # pragma: no cover + _UMAPRust = None + + +class UMAP(BaseEstimator, ClassNamePrefixFeaturesOutMixin): + """Uniform Manifold Approximation and Projection (Rust backend).""" + + _BACKEND = "rust" + + def __init__( + self, + n_components: int = 2, + n_neighbors: int = 15, + n_epochs: int = 0, + min_dist: float = 0.1, + spread: float = 1.0, + metric: str = "euclidean", + dist_p: float = 2.0, + initial_alpha: float = 1.0, + negative_sample_rate: float = 5.0, + gamma: float = 1.0, + random_state: Optional[int] = None, + m: int = 16, + ef_construction: int = 200, + ef_search: int = 50, + **_kwargs: Any, + ) -> None: + if _UMAPRust is None: + raise ImportError( + "Rust backend not available; build the extension to use UMAP" + ) + + self.n_components = n_components + self.n_neighbors = n_neighbors + self.n_epochs = n_epochs + self.min_dist = min_dist + self.spread = spread + self.metric = metric + self.dist_p = dist_p + self.initial_alpha = initial_alpha + self.negative_sample_rate = negative_sample_rate + self.gamma = gamma + self.random_state = random_state + self.m = m + self.ef_construction = ef_construction + self.ef_search = ef_search + + self._model = _UMAPRust( + n_components=n_components, + n_neighbors=n_neighbors, + n_epochs=n_epochs, + min_dist=min_dist, + spread=spread, + metric=metric, + dist_p=dist_p, + initial_alpha=initial_alpha, + negative_sample_rate=negative_sample_rate, + gamma=gamma, + random_state=random_state, + m=m, + ef_construction=ef_construction, + ef_search=ef_search, + ) + + def fit(self, X: np.ndarray, y: Any = None, **_kwargs: Any) -> "UMAP": + _ = y + self.fit_transform(X) + return self + + def fit_transform(self, X: np.ndarray, y: Any = None, **_kwargs: Any) -> np.ndarray: + _ = y + X_arr = np.asarray(X, dtype=np.float64, order="C") + 
embedding = self._model.fit_transform(X_arr) + # Store as attribute expected by sklearn-style callers. + self.embedding_ = np.asarray(embedding) + return self.embedding_ + + def transform(self, X: np.ndarray, **_kwargs: Any) -> np.ndarray: + raise NotImplementedError( + "UMAP.transform is not yet implemented in the Rust backend" + ) + + def inverse_transform(self, X: np.ndarray, **_kwargs: Any) -> np.ndarray: + raise NotImplementedError( + "UMAP.inverse_transform is not yet implemented in the Rust backend" + ) + + def update(self, X: np.ndarray, **_kwargs: Any) -> None: + raise NotImplementedError( + "UMAP.update is not yet implemented in the Rust backend" + ) + + def get_feature_names_out(self, input_features: Optional[list[str]] = None): + # Match scikit-learn transformer conventions: names are independent of input names. + _ = input_features + return np.asarray([f"umap{i}" for i in range(self.n_components)], dtype=object) diff --git a/squeeze/umap_.py b/squeeze/umap_.py deleted file mode 100644 index 8a4ffd6..0000000 --- a/squeeze/umap_.py +++ /dev/null @@ -1,3840 +0,0 @@ -# Author: Leland McInnes -# -# License: BSD 3 clause - -import locale -from warnings import warn - -from scipy.optimize import curve_fit -from sklearn.base import BaseEstimator, ClassNamePrefixFeaturesOutMixin -from sklearn.decomposition import PCA, TruncatedSVD -from sklearn.metrics import pairwise_distances -from sklearn.neighbors import KDTree -from sklearn.preprocessing import normalize -from sklearn.utils import check_array, check_random_state -from sklearn.utils.validation import check_is_fitted - -try: - import joblib -except ImportError: - # sklearn.externals.joblib is deprecated in 0.21, will be removed in 0.23 - from sklearn.externals import joblib - -import numba -import numpy as np -import scipy.sparse -import scipy.sparse.csgraph -from pynndescent import NNDescent -from pynndescent.distances import named_distances as pynn_named_distances -from pynndescent.sparse import 
sparse_named_distances as pynn_sparse_named_distances -from scipy.sparse import tril as sparse_tril -from scipy.sparse import triu as sparse_triu - -import squeeze.distances as dist -from squeeze import sparse -from squeeze.layouts import ( - optimize_layout_euclidean, - optimize_layout_generic, - optimize_layout_inverse, -) -from squeeze.spectral import spectral_layout, tswspectral_layout -from squeeze.utils import ( - csr_unique, - fast_knn_indices, - submatrix, -) - -# Try to import HNSW backend -try: - from squeeze.hnsw_wrapper import HnswIndexWrapper - - HNSW_AVAILABLE = True -except ImportError: - HNSW_AVAILABLE = False - -locale.setlocale(locale.LC_NUMERIC, "C") - -INT32_MIN = np.iinfo(np.int32).min + 1 -INT32_MAX = np.iinfo(np.int32).max - 1 - -SMOOTH_K_TOLERANCE = 1e-5 -MIN_K_DIST_SCALE = 1e-3 -NPY_INFINITY = np.inf - -DISCONNECTION_DISTANCES = { - "correlation": 2, - "cosine": 2, - "hellinger": 1, - "jaccard": 1, - "bit_jaccard": 1, - "dice": 1, -} - - -def flatten_iter(container): - for i in container: - if isinstance(i, (list, tuple)): - yield from flatten_iter(i) - else: - yield i - - -def flattened(container): - return tuple(flatten_iter(container)) - - -def breadth_first_search(adjmat, start, min_vertices): - explored = [] - queue = [start] - levels = {} - levels[start] = 0 - max_level = np.inf - visited = [start] - - while queue: - node = queue.pop(0) - explored.append(node) - if max_level == np.inf and len(explored) > min_vertices: - max_level = max(levels.values()) - - if levels[node] + 1 < max_level: - neighbors = adjmat[node].indices - for neighbour in neighbors: - if neighbour not in visited: - queue.append(neighbour) - visited.append(neighbour) - - levels[neighbour] = levels[node] + 1 - - return np.array(explored) - - -def raise_disconnected_warning( - edges_removed, - vertices_disconnected, - disconnection_distance, - total_rows, - threshold=0.1, - verbose=False, -) -> None: - """A simple wrapper function to avoid large amounts of code 
repetition.""" - if verbose & (vertices_disconnected == 0) & (edges_removed > 0): - pass - elif (vertices_disconnected > 0) & ( - vertices_disconnected <= threshold * total_rows - ): - warn( - f"A few of your vertices were disconnected from the manifold. This shouldn't cause problems.\n" - f"Disconnection_distance = {disconnection_distance} has removed {edges_removed} edges.\n" - f"It has only fully disconnected {vertices_disconnected} vertices.\n" - f"Use umap.utils.disconnected_vertices() to identify them.", - stacklevel=2, - ) - elif vertices_disconnected > threshold * total_rows: - warn( - f"A large number of your vertices were disconnected from the manifold.\n" - f"Disconnection_distance = {disconnection_distance} has removed {edges_removed} edges.\n" - f"It has fully disconnected {vertices_disconnected} vertices.\n" - f"You might consider using find_disconnected_points() to find and remove these points from your data.\n" - f"Use umap.utils.disconnected_vertices() to identify them.", - stacklevel=2, - ) - - -@numba.njit( - locals={ - "psum": numba.types.float32, - "lo": numba.types.float32, - "mid": numba.types.float32, - "hi": numba.types.float32, - }, - fastmath=True, -) # benchmarking `parallel=True` shows it to *decrease* performance -def smooth_knn_dist(distances, k, n_iter=64, local_connectivity=1.0, bandwidth=1.0): - """Compute a continuous version of the distance to the kth nearest - neighbor. That is, this is similar to knn-distance but allows continuous - k values rather than requiring an integral k. In essence we are simply - computing the distance such that the cardinality of fuzzy set we generate - is k. - - Parameters - ---------- - distances: array of shape (n_samples, n_neighbors) - Distances to nearest neighbors for each sample. Each row should be a - sorted list of distances to a given samples nearest neighbors. - - k: float - The number of nearest neighbors to approximate for. 
- - n_iter: int (optional, default 64) - We need to binary search for the correct distance value. This is the - max number of iterations to use in such a search. - - local_connectivity: int (optional, default 1) - The local connectivity required -- i.e. the number of nearest - neighbors that should be assumed to be connected at a local level. - The higher this value the more connected the manifold becomes - locally. In practice this should be not more than the local intrinsic - dimension of the manifold. - - bandwidth: float (optional, default 1) - The target bandwidth of the kernel, larger values will produce - larger return values. - - Returns - ------- - knn_dist: array of shape (n_samples,) - The distance to kth nearest neighbor, as suitably approximated. - - nn_dist: array of shape (n_samples,) - The distance to the 1st nearest neighbor for each point. - - """ - target = np.log2(k) * bandwidth - rho = np.zeros(distances.shape[0], dtype=np.float32) - result = np.zeros(distances.shape[0], dtype=np.float32) - - mean_distances = np.mean(distances) - - for i in range(distances.shape[0]): - lo = 0.0 - hi = NPY_INFINITY - mid = 1.0 - - # TODO: This is very inefficient, but will do for now. 
FIXME - ith_distances = distances[i] - non_zero_dists = ith_distances[ith_distances > 0.0] - if non_zero_dists.shape[0] >= local_connectivity: - index = int(np.floor(local_connectivity)) - interpolation = local_connectivity - index - if index > 0: - rho[i] = non_zero_dists[index - 1] - if interpolation > SMOOTH_K_TOLERANCE: - rho[i] += interpolation * ( - non_zero_dists[index] - non_zero_dists[index - 1] - ) - else: - rho[i] = interpolation * non_zero_dists[0] - elif non_zero_dists.shape[0] > 0: - rho[i] = np.max(non_zero_dists) - - for _n in range(n_iter): - psum = 0.0 - for j in range(1, distances.shape[1]): - d = distances[i, j] - rho[i] - if d > 0: - psum += np.exp(-(d / mid)) - else: - psum += 1.0 - - if np.fabs(psum - target) < SMOOTH_K_TOLERANCE: - break - - if psum > target: - hi = mid - mid = (lo + hi) / 2.0 - else: - lo = mid - if hi == NPY_INFINITY: - mid *= 2 - else: - mid = (lo + hi) / 2.0 - - result[i] = mid - - # TODO: This is very inefficient, but will do for now. FIXME - if rho[i] > 0.0: - mean_ith_distances = np.mean(ith_distances) - result[i] = max(result[i], MIN_K_DIST_SCALE * mean_ith_distances) - elif result[i] < MIN_K_DIST_SCALE * mean_distances: - result[i] = MIN_K_DIST_SCALE * mean_distances - - return result, rho - - -def _get_nn_backend(metric, sparse_data, use_hnsw=None): - """Determine which nearest neighbor backend to use. - - Parameters - ---------- - metric : str - The distance metric to use - sparse_data : bool - Whether the data is sparse - use_hnsw : bool or None - If True, force HNSW backend - If False, force PyNNDescent - If None, automatically select - - Returns - ------- - backend_class : class - The nearest neighbor backend class to use - - """ - # Force PyNNDescent if requested - if use_hnsw is False: - return NNDescent - - # Check if HNSW backend is available - if not HNSW_AVAILABLE: - if use_hnsw is True: - warn( - "HNSW backend requested but not available. 
" - "Please install umap with: pip install --upgrade umap", - stacklevel=2, - ) - return NNDescent - - # Check if metric is supported by HNSW - hnsw_metrics = { - "euclidean", - "l2", - "manhattan", - "l1", - "taxicab", - "cosine", - "chebyshev", - "linfinity", - "hamming", - "minkowski", - } - if metric not in hnsw_metrics: - if use_hnsw is True: - warn( - f"Metric '{metric}' not supported by HNSW backend, falling back to PyNNDescent", - stacklevel=2, - ) - return NNDescent - - # Use HNSW by default if available and compatible - if use_hnsw is None: - use_hnsw = True # Default to HNSW for new code - - if use_hnsw: - return HnswIndexWrapper - return NNDescent - - -def nearest_neighbors( - X, - n_neighbors, - metric, - metric_kwds, - angular, - random_state, - low_memory=True, - use_pynndescent=True, - use_hnsw=None, - hnsw_prune_strategy="simple", - hnsw_alpha=1.2, - n_jobs=-1, - verbose=False, -): - """Compute the ``n_neighbors`` nearest points for each data point in ``X`` - under ``metric``. This may be exact, but more likely is approximated via - nearest neighbor descent. - - Parameters - ---------- - X: array of shape (n_samples, n_features) - The input data to compute the k-neighbor graph of. - - n_neighbors: int - The number of nearest neighbors to compute for each sample in ``X``. - - metric: string or callable - The metric to use for the computation. - - metric_kwds: dict - Any arguments to pass to the metric computation function. - - angular: bool - Whether to use angular rp trees in NN approximation. - - random_state: np.random state - The random state to use for approximate NN computations. - - low_memory: bool (optional, default True) - Whether to pursue lower memory NNdescent. - - use_pynndescent: bool (optional, default True) - Whether to use PyNNDescent for nearest neighbor search. - - use_hnsw: bool (optional, default None) - Whether to use HNSW backend. If True, overrides use_pynndescent. 
- - verbose: bool (optional, default False) - Whether to print status data during the computation. - - Returns - ------- - knn_indices: array of shape (n_samples, n_neighbors) - The indices on the ``n_neighbors`` closest points in the dataset. - - knn_dists: array of shape (n_samples, n_neighbors) - The distances to the ``n_neighbors`` closest points in the dataset. - - rp_forest: list of trees - The random projection forest used for searching (if used, None otherwise). - - """ - if verbose: - pass - - if metric == "precomputed": - # Note that this does not support sparse distance matrices yet ... - # Compute indices of n nearest neighbors - knn_indices = fast_knn_indices(X, n_neighbors) - # knn_indices = np.argsort(X)[:, :n_neighbors] - # Compute the nearest neighbor distances - # (equivalent to np.sort(X)[:,:n_neighbors]) - knn_dists = X[np.arange(X.shape[0])[:, None], knn_indices].copy() - # Prune any nearest neighbours that are infinite distance apart. - disconnected_index = knn_dists == np.inf - knn_indices[disconnected_index] = -1 - - knn_search_index = None - else: - # TODO: Hacked values for now - n_trees = min(64, 5 + round((X.shape[0]) ** 0.5 / 20.0)) - n_iters = max(5, round(np.log2(X.shape[0]))) - - # Determine which NN backend to use (HNSW or PyNNDescent) - sparse_data = scipy.sparse.issparse(X) - if use_hnsw is not None: - use_hnsw_backend = use_hnsw - else: - use_hnsw_backend = ( - not use_pynndescent if use_pynndescent is not None else None - ) - - NNBackend = _get_nn_backend(metric, sparse_data, use_hnsw=use_hnsw_backend) - - # Pass HNSW-specific parameters if using HNSW backend - backend_kwargs = { - "n_neighbors": n_neighbors, - "metric": metric, - "metric_kwds": metric_kwds, - "random_state": random_state, - "n_trees": n_trees, - "n_iters": n_iters, - "max_candidates": 60, - "low_memory": low_memory, - "n_jobs": n_jobs, - "verbose": verbose, - "compressed": False, - } - - # Add HNSW-specific parameters if using HNSW - if NNBackend.__name__ == 
"HnswIndexWrapper": - backend_kwargs["prune_strategy"] = hnsw_prune_strategy - backend_kwargs["prune_alpha"] = hnsw_alpha - - knn_search_index = NNBackend(X, **backend_kwargs) - knn_indices, knn_dists = knn_search_index.neighbor_graph - - if verbose: - pass - return knn_indices, knn_dists, knn_search_index - - -@numba.njit( - locals={ - "knn_dists": numba.types.float32[:, ::1], - "sigmas": numba.types.float32[::1], - "rhos": numba.types.float32[::1], - "val": numba.types.float32, - }, - parallel=True, - fastmath=True, -) -def compute_membership_strengths( - knn_indices, - knn_dists, - sigmas, - rhos, - return_dists=False, - bipartite=False, -): - """Construct the membership strength data for the 1-skeleton of each local - fuzzy simplicial set -- this is formed as a sparse matrix where each row is - a local fuzzy simplicial set, with a membership strength for the - 1-simplex to each other data point. - - Parameters - ---------- - knn_indices: array of shape (n_samples, n_neighbors) - The indices on the ``n_neighbors`` closest points in the dataset. - - knn_dists: array of shape (n_samples, n_neighbors) - The distances to the ``n_neighbors`` closest points in the dataset. - - sigmas: array of shape(n_samples) - The normalization factor derived from the metric tensor approximation. - - rhos: array of shape(n_samples) - The local connectivity adjustment. - - return_dists: bool (optional, default False) - Whether to return the pairwise distance associated with each edge. - - bipartite: bool (optional, default False) - Does the nearest neighbour set represent a bipartite graph? That is, are the - nearest neighbour indices from the same point set as the row indices? 
- - Returns - ------- - rows: array of shape (n_samples * n_neighbors) - Row data for the resulting sparse matrix (coo format) - - cols: array of shape (n_samples * n_neighbors) - Column data for the resulting sparse matrix (coo format) - - vals: array of shape (n_samples * n_neighbors) - Entries for the resulting sparse matrix (coo format) - - dists: array of shape (n_samples * n_neighbors) - Distance associated with each entry in the resulting sparse matrix - - """ - n_samples = knn_indices.shape[0] - n_neighbors = knn_indices.shape[1] - - rows = np.zeros(knn_indices.size, dtype=np.int32) - cols = np.zeros(knn_indices.size, dtype=np.int32) - vals = np.zeros(knn_indices.size, dtype=np.float32) - dists = np.zeros(knn_indices.size, dtype=np.float32) if return_dists else None - - for i in range(n_samples): - for j in range(n_neighbors): - if knn_indices[i, j] == -1: - continue # We didn't get the full knn for i - # If applied to an adjacency matrix points shouldn't be similar to themselves. - # If applied to an incidence matrix (or bipartite) then the row and column indices are different. - if (not bipartite) & (knn_indices[i, j] == i): - val = 0.0 - elif knn_dists[i, j] - rhos[i] <= 0.0 or sigmas[i] == 0.0: - val = 1.0 - else: - val = np.exp(-((knn_dists[i, j] - rhos[i]) / (sigmas[i]))) - - rows[i * n_neighbors + j] = i - cols[i * n_neighbors + j] = knn_indices[i, j] - vals[i * n_neighbors + j] = val - if return_dists: - dists[i * n_neighbors + j] = knn_dists[i, j] - - return rows, cols, vals, dists - - -def fuzzy_simplicial_set( - X, - n_neighbors, - random_state, - metric, - metric_kwds=None, - knn_indices=None, - knn_dists=None, - angular=False, - set_op_mix_ratio=1.0, - local_connectivity=1.0, - apply_set_operations=True, - verbose=False, - return_dists=None, -): - """Given a set of data X, a neighborhood size, and a measure of distance - compute the fuzzy simplicial set (here represented as a fuzzy graph in - the form of a sparse matrix) associated to the data. 
This is done by - locally approximating geodesic distance at each point, creating a fuzzy - simplicial set for each such point, and then combining all the local - fuzzy simplicial sets into a global one via a fuzzy union. - - Parameters - ---------- - X: array of shape (n_samples, n_features) - The data to be modelled as a fuzzy simplicial set. - - n_neighbors: int - The number of neighbors to use to approximate geodesic distance. - Larger numbers induce more global estimates of the manifold that can - miss finer detail, while smaller values will focus on fine manifold - structure to the detriment of the larger picture. - - random_state: numpy RandomState or equivalent - A state capable being used as a numpy random state. - - metric: string or function (optional, default 'euclidean') - The metric to use to compute distances in high dimensional space. - If a string is passed it must match a valid predefined metric. If - a general metric is required a function that takes two 1d arrays and - returns a float can be provided. For performance purposes it is - required that this be a numba jit'd function. Valid string metrics - include: - - * euclidean (or l2) - * manhattan (or l1) - * cityblock - * braycurtis - * canberra - * chebyshev - * correlation - * cosine - * dice - * hamming - * jaccard - * kulsinski - * ll_dirichlet - * mahalanobis - * matching - * minkowski - * rogerstanimoto - * russellrao - * seuclidean - * sokalmichener - * sokalsneath - * sqeuclidean - * yule - * wminkowski - - Metrics that take arguments (such as minkowski, mahalanobis etc.) - can have arguments passed via the metric_kwds dictionary. At this - time care must be taken and dictionary elements must be ordered - appropriately; this will hopefully be fixed in the future. - - metric_kwds: dict (optional, default {}) - Arguments to pass on to the metric, such as the ``p`` value for - Minkowski distance. 
- - knn_indices: array of shape (n_samples, n_neighbors) (optional) - If the k-nearest neighbors of each point has already been calculated - you can pass them in here to save computation time. This should be - an array with the indices of the k-nearest neighbors as a row for - each data point. - - knn_dists: array of shape (n_samples, n_neighbors) (optional) - If the k-nearest neighbors of each point has already been calculated - you can pass them in here to save computation time. This should be - an array with the distances of the k-nearest neighbors as a row for - each data point. - - angular: bool (optional, default False) - Whether to use angular/cosine distance for the random projection - forest for seeding NN-descent to determine approximate nearest - neighbors. - - set_op_mix_ratio: float (optional, default 1.0) - Interpolate between (fuzzy) union and intersection as the set operation - used to combine local fuzzy simplicial sets to obtain a global fuzzy - simplicial sets. Both fuzzy set operations use the product t-norm. - The value of this parameter should be between 0.0 and 1.0; a value of - 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy - intersection. - - local_connectivity: int (optional, default 1) - The local connectivity required -- i.e. the number of nearest - neighbors that should be assumed to be connected at a local level. - The higher this value the more connected the manifold becomes - locally. In practice this should be not more than the local intrinsic - dimension of the manifold. - - verbose: bool (optional, default False) - Whether to report information on the current progress of the algorithm. - - return_dists: bool or None (optional, default None) - Whether to return the pairwise distance associated with each edge. - - Returns - ------- - fuzzy_simplicial_set: coo_matrix - A fuzzy simplicial set represented as a sparse matrix. 
The (i, - j) entry of the matrix represents the membership strength of the - 1-simplex between the ith and jth sample points. - - """ - if metric_kwds is None: - metric_kwds = {} - if knn_indices is None or knn_dists is None: - knn_indices, knn_dists, _ = nearest_neighbors( - X, - n_neighbors, - metric, - metric_kwds, - angular, - random_state, - verbose=verbose, - ) - - knn_dists = knn_dists.astype(np.float32) - - sigmas, rhos = smooth_knn_dist( - knn_dists, - float(n_neighbors), - local_connectivity=float(local_connectivity), - ) - - rows, cols, vals, dists = compute_membership_strengths( - knn_indices, - knn_dists, - sigmas, - rhos, - return_dists, - ) - - result = scipy.sparse.coo_matrix( - (vals, (rows, cols)), - shape=(X.shape[0], X.shape[0]), - ) - result.eliminate_zeros() - - if apply_set_operations: - transpose = result.transpose() - - prod_matrix = result.multiply(transpose) - - result = ( - set_op_mix_ratio * (result + transpose - prod_matrix) - + (1.0 - set_op_mix_ratio) * prod_matrix - ) - - result.eliminate_zeros() - - if return_dists is None: - return result, sigmas, rhos - if return_dists: - dmat = scipy.sparse.coo_matrix( - (dists, (rows, cols)), - shape=(X.shape[0], X.shape[0]), - ) - - dists = dmat.maximum(dmat.transpose()).todok() - else: - dists = None - - return result, sigmas, rhos, dists - - -@numba.njit() -def fast_intersection( - rows, - cols, - values, - target, - unknown_dist=1.0, - far_dist=5.0, -) -> None: - """Under the assumption of categorical distance for the intersecting - simplicial set perform a fast intersection. - - Parameters - ---------- - rows: array - An array of the row of each non-zero in the sparse matrix - representation. - - cols: array - An array of the column of each non-zero in the sparse matrix - representation. - - values: array - An array of the value of each non-zero in the sparse matrix - representation. - - target: array of shape (n_samples) - The categorical labels to use in the intersection. 
- - unknown_dist: float (optional, default 1.0) - The distance an unknown label (-1) is assumed to be from any point. - - far_dist float (optional, default 5.0) - The distance between unmatched labels. - - Returns - ------- - None - - """ - for nz in range(rows.shape[0]): - i = rows[nz] - j = cols[nz] - if (target[i] == -1) or (target[j] == -1): - values[nz] *= np.exp(-unknown_dist) - elif target[i] != target[j]: - values[nz] *= np.exp(-far_dist) - - -@numba.njit() -def fast_metric_intersection( - rows, - cols, - values, - discrete_space, - metric, - metric_args, - scale, -) -> None: - """Under the assumption of categorical distance for the intersecting - simplicial set perform a fast intersection. - - Parameters - ---------- - rows: array - An array of the row of each non-zero in the sparse matrix - representation. - - cols: array - An array of the column of each non-zero in the sparse matrix - representation. - - values: array of shape - An array of the values of each non-zero in the sparse matrix - representation. - - discrete_space: array of shape (n_samples, n_features) - The vectors of categorical labels to use in the intersection. - - metric: numba function - The function used to calculate distance over the target array. - - scale: float - A scaling to apply to the metric. 
- - Returns - ------- - None - - """ - for nz in range(rows.shape[0]): - i = rows[nz] - j = cols[nz] - dist = metric(discrete_space[i], discrete_space[j], *metric_args) - values[nz] *= np.exp(-(scale * dist)) - - -@numba.njit() -def reprocess_row(probabilities, k=15, n_iters=32): - target = np.log2(k) - - lo = 0.0 - hi = NPY_INFINITY - mid = 1.0 - - for _n in range(n_iters): - psum = 0.0 - for j in range(probabilities.shape[0]): - psum += pow(probabilities[j], mid) - - if np.fabs(psum - target) < SMOOTH_K_TOLERANCE: - break - - if psum < target: - hi = mid - mid = (lo + hi) / 2.0 - else: - lo = mid - if hi == NPY_INFINITY: - mid *= 2 - else: - mid = (lo + hi) / 2.0 - - return np.power(probabilities, mid) - - -@numba.njit() -def reset_local_metrics(simplicial_set_indptr, simplicial_set_data) -> None: - for i in range(simplicial_set_indptr.shape[0] - 1): - simplicial_set_data[simplicial_set_indptr[i] : simplicial_set_indptr[i + 1]] = ( - reprocess_row( - simplicial_set_data[ - simplicial_set_indptr[i] : simplicial_set_indptr[i + 1] - ], - ) - ) - - -def reset_local_connectivity(simplicial_set, reset_local_metric=False): - """Reset the local connectivity requirement -- each data sample should - have complete confidence in at least one 1-simplex in the simplicial set. - We can enforce this by locally rescaling confidences, and then remerging the - different local simplicial sets together. - - Parameters - ---------- - simplicial_set: sparse matrix - The simplicial set for which to recalculate with respect to local - connectivity. - - Returns - ------- - simplicial_set: sparse_matrix - The recalculated simplicial set, now with the local connectivity - assumption restored. 
- - """ - simplicial_set = normalize(simplicial_set, norm="max") - if reset_local_metric: - simplicial_set = simplicial_set.tocsr() - reset_local_metrics(simplicial_set.indptr, simplicial_set.data) - simplicial_set = simplicial_set.tocoo() - transpose = simplicial_set.transpose() - prod_matrix = simplicial_set.multiply(transpose) - simplicial_set = simplicial_set + transpose - prod_matrix - simplicial_set.eliminate_zeros() - - return simplicial_set - - -def discrete_metric_simplicial_set_intersection( - simplicial_set, - discrete_space, - unknown_dist=1.0, - far_dist=5.0, - metric=None, - metric_kws=None, - metric_scale=1.0, -): - """Combine a fuzzy simplicial set with another fuzzy simplicial set - generated from discrete metric data using discrete distances. The target - data is assumed to be categorical label data (a vector of labels), - and this will update the fuzzy simplicial set to respect that label data. - - TODO: optional category cardinality based weighting of distance - - Parameters - ---------- - simplicial_set: sparse matrix - The input fuzzy simplicial set. - - discrete_space: array of shape (n_samples) - The categorical labels to use in the intersection. - - unknown_dist: float (optional, default 1.0) - The distance an unknown label (-1) is assumed to be from any point. - - far_dist: float (optional, default 5.0) - The distance between unmatched labels. - - metric: str (optional, default None) - If not None, then use this metric to determine the - distance between values. - - metric_scale: float (optional, default 1.0) - If using a custom metric scale the distance values by - this value -- this controls the weighting of the - intersection. Larger values weight more toward target. - - Returns - ------- - simplicial_set: sparse matrix - The resulting intersected fuzzy simplicial set. 
- - """ - if metric_kws is None: - metric_kws = {} - simplicial_set = simplicial_set.tocoo() - - if metric is not None: - # We presume target is now a 2d array, with each row being a - # vector of target info - if metric in dist.named_distances: - metric_func = dist.named_distances[metric] - else: - msg = "Discrete intersection metric is not recognized" - raise ValueError(msg) - - fast_metric_intersection( - simplicial_set.row, - simplicial_set.col, - simplicial_set.data, - discrete_space, - metric_func, - tuple(metric_kws.values()), - metric_scale, - ) - else: - fast_intersection( - simplicial_set.row, - simplicial_set.col, - simplicial_set.data, - discrete_space, - unknown_dist, - far_dist, - ) - - simplicial_set.eliminate_zeros() - - return reset_local_connectivity(simplicial_set) - - -def general_simplicial_set_intersection( - simplicial_set1, - simplicial_set2, - weight=0.5, - right_complement=False, -): - if right_complement: - result = simplicial_set1.tocoo() - else: - result = (simplicial_set1 + simplicial_set2).tocoo() - left = simplicial_set1.tocsr() - right = simplicial_set2.tocsr() - - sparse.general_sset_intersection( - left.indptr, - left.indices, - left.data, - right.indptr, - right.indices, - right.data, - result.row, - result.col, - result.data, - mix_weight=weight, - right_complement=right_complement, - ) - - return result - - -def general_simplicial_set_union(simplicial_set1, simplicial_set2): - result = (simplicial_set1 + simplicial_set2).tocoo() - left = simplicial_set1.tocsr() - right = simplicial_set2.tocsr() - - sparse.general_sset_union( - left.indptr, - left.indices, - left.data, - right.indptr, - right.indices, - right.data, - result.row, - result.col, - result.data, - ) - - return result - - -def make_epochs_per_sample(weights, n_epochs): - """Given a set of weights and number of epochs generate the number of - epochs per sample for each weight. 
- - Parameters - ---------- - weights: array of shape (n_1_simplices) - The weights of how much we wish to sample each 1-simplex. - - n_epochs: int - The total number of epochs we want to train for. - - Returns - ------- - An array of number of epochs per sample, one for each 1-simplex. - - """ - result = -1.0 * np.ones(weights.shape[0], dtype=np.float64) - n_samples = n_epochs * (weights / weights.max()) - result[n_samples > 0] = float(n_epochs) / np.float64(n_samples[n_samples > 0]) - return result - - -# scale coords so that the largest coordinate is max_coords, then add normal-distributed -# noise with standard deviation noise -def noisy_scale_coords(coords, random_state, max_coord=10.0, noise=0.0001): - expansion = max_coord / np.abs(coords).max() - coords = (coords * expansion).astype(np.float32) - return coords + random_state.normal(scale=noise, size=coords.shape).astype( - np.float32, - ) - - -def simplicial_set_embedding( - data, - graph, - n_components, - initial_alpha, - a, - b, - gamma, - negative_sample_rate, - n_epochs, - init, - random_state, - metric, - metric_kwds, - densmap, - densmap_kwds, - output_dens, - output_metric=dist.named_distances_with_gradients["euclidean"], - output_metric_kwds=None, - euclidean_output=True, - parallel=False, - verbose=False, - tqdm_kwds=None, -): - """Perform a fuzzy simplicial set embedding, using a specified - initialisation method and then minimizing the fuzzy set cross entropy - between the 1-skeletons of the high and low dimensional fuzzy simplicial - sets. - - Parameters - ---------- - data: array of shape (n_samples, n_features) - The source data to be embedded by UMAP. - - graph: sparse matrix - The 1-skeleton of the high dimensional fuzzy simplicial set as - represented by a graph for which we require a sparse matrix for the - (weighted) adjacency matrix. - - n_components: int - The dimensionality of the euclidean space into which to embed the data. 
- - initial_alpha: float - Initial learning rate for the SGD. - - a: float - Parameter of differentiable approximation of right adjoint functor - - b: float - Parameter of differentiable approximation of right adjoint functor - - gamma: float - Weight to apply to negative samples. - - negative_sample_rate: int (optional, default 5) - The number of negative samples to select per positive sample - in the optimization process. Increasing this value will result - in greater repulsive force being applied, greater optimization - cost, but slightly more accuracy. - - n_epochs: int (optional, default 0), or list of int - The number of training epochs to be used in optimizing the - low dimensional embedding. Larger values result in more accurate - embeddings. If 0 is specified a value will be selected based on - the size of the input dataset (200 for large datasets, 500 for small). - If a list of int is specified, then the intermediate embeddings at the - different epochs specified in that list are returned in - ``aux_data["embedding_list"]``. - - init: string - How to initialize the low dimensional embedding. Options are: - - * 'spectral': use a spectral embedding of the fuzzy 1-skeleton - * 'random': assign initial embedding positions at random. - * 'pca': use the first n_components from PCA applied to the input data. - * A numpy array of initial embedding positions. - - random_state: numpy RandomState or equivalent - A state capable being used as a numpy random state. - - metric: string or callable - The metric used to measure distance in high dimensional space; used if - multiple connected components need to be layed out. - - metric_kwds: dict - Key word arguments to be passed to the metric function; used if - multiple connected components need to be layed out. - - densmap: bool - Whether to use the density-augmented objective function to optimize - the embedding according to the densMAP algorithm. 
- - densmap_kwds: dict - Key word arguments to be used by the densMAP optimization. - - output_dens: bool - Whether to output local radii in the original data and the embedding. - - output_metric: function - Function returning the distance between two points in embedding space and - the gradient of the distance wrt the first argument. - - output_metric_kwds: dict - Key word arguments to be passed to the output_metric function. - - euclidean_output: bool - Whether to use the faster code specialised for euclidean output metrics - - parallel: bool (optional, default False) - Whether to run the computation using numba parallel. - Running in parallel is non-deterministic, and is not used - if a random seed has been set, to ensure reproducibility. - - verbose: bool (optional, default False) - Whether to report information on the current progress of the algorithm. - - tqdm_kwds: dict - Key word arguments to be used by the tqdm progress bar. - - Returns - ------- - embedding: array of shape (n_samples, n_components) - The optimized of ``graph`` into an ``n_components`` dimensional - euclidean space. - - aux_data: dict - Auxiliary output returned with the embedding. When densMAP extension - is turned on, this dictionary includes local radii in the original - data (``rad_orig``) and in the embedding (``rad_emb``). 
- - """ - if output_metric_kwds is None: - output_metric_kwds = {} - graph = graph.tocoo() - graph.sum_duplicates() - n_vertices = graph.shape[1] - - # For smaller datasets we can use more epochs - default_epochs = 500 if graph.shape[0] <= 10000 else 200 - - # Use more epochs for densMAP - if densmap: - default_epochs += 200 - - if n_epochs is None: - n_epochs = default_epochs - - # If n_epoch is a list, get the maximum epoch to reach - n_epochs_max = max(n_epochs) if isinstance(n_epochs, list) else n_epochs - - if n_epochs_max > 10: - graph.data[graph.data < (graph.data.max() / float(n_epochs_max))] = 0.0 - else: - graph.data[graph.data < (graph.data.max() / float(default_epochs))] = 0.0 - - graph.eliminate_zeros() - - if isinstance(init, str) and init == "random": - embedding = random_state.uniform( - low=-10.0, - high=10.0, - size=(graph.shape[0], n_components), - ).astype(np.float32) - elif isinstance(init, str) and init == "pca": - if scipy.sparse.issparse(data): - pca = TruncatedSVD(n_components=n_components, random_state=random_state) - else: - pca = PCA(n_components=n_components, random_state=random_state) - embedding = pca.fit_transform(data).astype(np.float32) - embedding = noisy_scale_coords( - embedding, - random_state, - max_coord=10, - noise=0.0001, - ) - elif isinstance(init, str) and init == "spectral": - embedding = spectral_layout( - data, - graph, - n_components, - random_state, - metric=metric, - metric_kwds=metric_kwds, - ) - # We add a little noise to avoid local minima for optimization to come - embedding = noisy_scale_coords( - embedding, - random_state, - max_coord=10, - noise=0.0001, - ) - elif isinstance(init, str) and init == "tswspectral": - embedding = tswspectral_layout( - data, - graph, - n_components, - random_state, - metric=metric, - metric_kwds=metric_kwds, - ) - embedding = noisy_scale_coords( - embedding, - random_state, - max_coord=10, - noise=0.0001, - ) - else: - init_data = np.array(init) - if len(init_data.shape) == 2: - 
if np.unique(init_data, axis=0).shape[0] < init_data.shape[0]: - tree = KDTree(init_data) - dist, _ind = tree.query(init_data, k=2) - nndist = np.mean(dist[:, 1]) - embedding = init_data + random_state.normal( - scale=0.001 * nndist, - size=init_data.shape, - ).astype(np.float32) - else: - embedding = init_data - - epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs_max) - - head = graph.row - tail = graph.col - - rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64) - - aux_data = {} - - if densmap or output_dens: - if verbose: - pass - - dists = densmap_kwds["graph_dists"] - - mu_sum = np.zeros(n_vertices, dtype=np.float32) - ro = np.zeros(n_vertices, dtype=np.float32) - for i in range(len(head)): - j = head[i] - k = tail[i] - - D = dists[j, k] * dists[j, k] # match sq-Euclidean used for embedding - mu = graph.data[i] - - ro[j] += mu * D - ro[k] += mu * D - mu_sum[j] += mu - mu_sum[k] += mu - - epsilon = 1e-8 - ro = np.log(epsilon + (ro / mu_sum)) - - if densmap: - R = (ro - np.mean(ro)) / np.std(ro) - densmap_kwds["mu"] = graph.data - densmap_kwds["mu_sum"] = mu_sum - densmap_kwds["R"] = R - - if output_dens: - aux_data["rad_orig"] = ro - - embedding = ( - 10.0 - * (embedding - np.min(embedding, 0)) - / (np.max(embedding, 0) - np.min(embedding, 0)) - ).astype(np.float32, order="C") - - if euclidean_output: - embedding = optimize_layout_euclidean( - embedding, - embedding, - head, - tail, - n_epochs, - n_vertices, - epochs_per_sample, - a, - b, - rng_state, - gamma, - initial_alpha, - negative_sample_rate, - parallel=parallel, - verbose=verbose, - densmap=densmap, - densmap_kwds=densmap_kwds, - tqdm_kwds=tqdm_kwds, - move_other=True, - ) - else: - embedding = optimize_layout_generic( - embedding, - embedding, - head, - tail, - n_epochs, - n_vertices, - epochs_per_sample, - a, - b, - rng_state, - gamma, - initial_alpha, - negative_sample_rate, - output_metric, - tuple(output_metric_kwds.values()), - verbose=verbose, - 
tqdm_kwds=tqdm_kwds, - move_other=True, - ) - - if isinstance(embedding, list): - aux_data["embedding_list"] = embedding - embedding = embedding[-1].copy() - - if output_dens: - if verbose: - pass - - # Compute graph in embedding - ( - knn_indices, - knn_dists, - _rp_forest, - ) = nearest_neighbors( - embedding, - densmap_kwds["n_neighbors"], - "euclidean", - {}, - False, - random_state, - verbose=verbose, - ) - - emb_graph, _emb_sigmas, _emb_rhos, emb_dists = fuzzy_simplicial_set( - embedding, - densmap_kwds["n_neighbors"], - random_state, - "euclidean", - {}, - knn_indices, - knn_dists, - verbose=verbose, - return_dists=True, - ) - - emb_graph = emb_graph.tocoo() - emb_graph.sum_duplicates() - emb_graph.eliminate_zeros() - - n_vertices = emb_graph.shape[1] - - mu_sum = np.zeros(n_vertices, dtype=np.float32) - re = np.zeros(n_vertices, dtype=np.float32) - - head = emb_graph.row - tail = emb_graph.col - for i in range(len(head)): - j = head[i] - k = tail[i] - - D = emb_dists[j, k] - mu = emb_graph.data[i] - - re[j] += mu * D - re[k] += mu * D - mu_sum[j] += mu - mu_sum[k] += mu - - epsilon = 1e-8 - re = np.log(epsilon + (re / mu_sum)) - - aux_data["rad_emb"] = re - - return embedding, aux_data - - -@numba.njit() -def init_transform(indices, weights, embedding): - """Given indices and weights and an original embeddings - initialize the positions of new points relative to the - indices and weights (of their neighbors in the source data). - - Parameters - ---------- - indices: array of shape (n_new_samples, n_neighbors) - The indices of the neighbors of each new sample - - weights: array of shape (n_new_samples, n_neighbors) - The membership strengths of associated 1-simplices - for each of the new samples. - - embedding: array of shape (n_samples, dim) - The original embedding of the source data. - - Returns - ------- - new_embedding: array of shape (n_new_samples, dim) - An initial embedding of the new sample points. 
- - """ - result = np.zeros((indices.shape[0], embedding.shape[1]), dtype=np.float32) - - for i in range(indices.shape[0]): - for j in range(indices.shape[1]): - for d in range(embedding.shape[1]): - result[i, d] += weights[i, j] * embedding[indices[i, j], d] - - return result - - -def init_graph_transform(graph, embedding): - """Given a bipartite graph representing the 1-simplices and strengths between the - new points and the original data set along with an embedding of the original points - initialize the positions of new points relative to the strengths (of their neighbors in the source data). - - If a point is in our original data set it embeds at the original points coordinates. - If a point has no neighbours in our original dataset it embeds as the np.nan vector. - Otherwise a point is the weighted average of it's neighbours embedding locations. - - Parameters - ---------- - graph: csr_matrix (n_new_samples, n_samples) - A matrix indicating the 1-simplices and their associated strengths. These strengths should - be values between zero and one and not normalized. One indicating that the new point was identical - to one of our original points. - - embedding: array of shape (n_samples, dim) - The original embedding of the source data. - - Returns - ------- - new_embedding: array of shape (n_new_samples, dim) - An initial embedding of the new sample points. 
- - """ - result = np.zeros((graph.shape[0], embedding.shape[1]), dtype=np.float32) - - for row_index in range(graph.shape[0]): - graph_row = graph[row_index] - if graph_row.nnz == 0: - result[row_index] = np.nan - continue - row_sum = graph_row.sum() - for graph_value, col_index in zip(graph_row.data, graph_row.indices): - if graph_value == 1: - result[row_index, :] = embedding[col_index, :] - break - result[row_index] += graph_value / row_sum * embedding[col_index] - - return result - - -@numba.njit() -def init_update(current_init, n_original_samples, indices) -> None: - for i in range(n_original_samples, indices.shape[0]): - n = 0 - for j in range(indices.shape[1]): - for d in range(current_init.shape[1]): - if indices[i, j] < n_original_samples: - n += 1 - current_init[i, d] += current_init[indices[i, j], d] - for d in range(current_init.shape[1]): - current_init[i, d] /= n - - -def find_ab_params(spread, min_dist): - """Fit a, b params for the differentiable curve used in lower - dimensional fuzzy simplicial complex construction. We want the - smooth curve (from a pre-defined family with simple gradient) that - best matches an offset exponential decay. - """ - - def curve(x, a, b): - return 1.0 / (1.0 + a * x ** (2 * b)) - - xv = np.linspace(0, spread * 3, 300) - yv = np.zeros(xv.shape) - yv[xv < min_dist] = 1.0 - yv[xv >= min_dist] = np.exp(-(xv[xv >= min_dist] - min_dist) / spread) - params, _covar = curve_fit(curve, xv, yv) - return params[0], params[1] - - -class UMAP(BaseEstimator, ClassNamePrefixFeaturesOutMixin): - """Uniform Manifold Approximation and Projection. - - Finds a low dimensional embedding of the data that approximates - an underlying manifold. - - Parameters - ---------- - n_neighbors: float (optional, default 15) - The size of local neighborhood (in terms of number of neighboring - sample points) used for manifold approximation. 
Larger values - result in more global views of the manifold, while smaller - values result in more local data being preserved. In general - values should be in the range 2 to 100. - - n_components: int (optional, default 2) - The dimension of the space to embed into. This defaults to 2 to - provide easy visualization, but can reasonably be set to any - integer value in the range 2 to 100. - - metric: string or function (optional, default 'euclidean') - The metric to use to compute distances in high dimensional space. - If a string is passed it must match a valid predefined metric. If - a general metric is required a function that takes two 1d arrays and - returns a float can be provided. For performance purposes it is - required that this be a numba jit'd function. Valid string metrics - include: - - * euclidean - * manhattan - * chebyshev - * minkowski - * canberra - * braycurtis - * mahalanobis - * wminkowski - * seuclidean - * cosine - * correlation - * haversine - * hamming - * jaccard - * dice - * russelrao - * kulsinski - * ll_dirichlet - * hellinger - * rogerstanimoto - * sokalmichener - * sokalsneath - * yule - - Metrics that take arguments (such as minkowski, mahalanobis etc.) - can have arguments passed via the metric_kwds dictionary. At this - time care must be taken and dictionary elements must be ordered - appropriately; this will hopefully be fixed in the future. - - n_epochs: int (optional, default None) - The number of training epochs to be used in optimizing the - low dimensional embedding. Larger values result in more accurate - embeddings. If None is specified a value will be selected based on - the size of the input dataset (200 for large datasets, 500 for small). - - learning_rate: float (optional, default 1.0) - The initial learning rate for the embedding optimization. - - init: string (optional, default 'spectral') - How to initialize the low dimensional embedding. 
Options are: - - * 'spectral': use a spectral embedding of the fuzzy 1-skeleton - * 'random': assign initial embedding positions at random. - * 'pca': use the first n_components from PCA applied to the - input data. - * 'tswspectral': use a spectral embedding of the fuzzy - 1-skeleton, using a truncated singular value decomposition to - "warm" up the eigensolver. This is intended as an alternative - to the 'spectral' method, if that takes an excessively long - time to complete initialization (or fails to complete). - * A numpy array of initial embedding positions. - - min_dist: float (optional, default 0.1) - The effective minimum distance between embedded points. Smaller values - will result in a more clustered/clumped embedding where nearby points - on the manifold are drawn closer together, while larger values will - result on a more even dispersal of points. The value should be set - relative to the ``spread`` value, which determines the scale at which - embedded points will be spread out. - - spread: float (optional, default 1.0) - The effective scale of embedded points. In combination with ``min_dist`` - this determines how clustered/clumped the embedded points are. - - low_memory: bool (optional, default True) - For some datasets the nearest neighbor computation can consume a lot of - memory. If you find that UMAP is failing due to memory constraints - consider setting this option to True. This approach is more - computationally expensive, but avoids excessive memory use. - - set_op_mix_ratio: float (optional, default 1.0) - Interpolate between (fuzzy) union and intersection as the set operation - used to combine local fuzzy simplicial sets to obtain a global fuzzy - simplicial sets. Both fuzzy set operations use the product t-norm. - The value of this parameter should be between 0.0 and 1.0; a value of - 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy - intersection. 
- - local_connectivity: int (optional, default 1) - The local connectivity required -- i.e. the number of nearest - neighbors that should be assumed to be connected at a local level. - The higher this value the more connected the manifold becomes - locally. In practice this should be not more than the local intrinsic - dimension of the manifold. - - repulsion_strength: float (optional, default 1.0) - Weighting applied to negative samples in low dimensional embedding - optimization. Values higher than one will result in greater weight - being given to negative samples. - - negative_sample_rate: int (optional, default 5) - The number of negative samples to select per positive sample - in the optimization process. Increasing this value will result - in greater repulsive force being applied, greater optimization - cost, but slightly more accuracy. - - transform_queue_size: float (optional, default 4.0) - For transform operations (embedding new points using a trained model - this will control how aggressively to search for nearest neighbors. - Larger values will result in slower performance but more accurate - nearest neighbor evaluation. - - a: float (optional, default None) - More specific parameters controlling the embedding. If None these - values are set automatically as determined by ``min_dist`` and - ``spread``. - b: float (optional, default None) - More specific parameters controlling the embedding. If None these - values are set automatically as determined by ``min_dist`` and - ``spread``. - - random_state: int, RandomState instance or None, optional (default: None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - metric_kwds: dict (optional, default None) - Arguments to pass on to the metric, such as the ``p`` value for - Minkowski distance. 
If None then no arguments are passed on. - - angular_rp_forest: bool (optional, default False) - Whether to use an angular random projection forest to initialise - the approximate nearest neighbor search. This can be faster, but is - mostly only useful for a metric that uses an angular style distance such - as cosine, correlation etc. In the case of those metrics angular forests - will be chosen automatically. - - target_n_neighbors: int (optional, default -1) - The number of nearest neighbors to use to construct the target simplicial - set. If set to -1 use the ``n_neighbors`` value. - - target_metric: string or callable (optional, default 'categorical') - The metric used to measure distance for a target array is using supervised - dimension reduction. By default this is 'categorical' which will measure - distance in terms of whether categories match or are different. Furthermore, - if semi-supervised is required target values of -1 will be trated as - unlabelled under the 'categorical' metric. If the target array takes - continuous values (e.g. for a regression problem) then metric of 'l1' - or 'l2' is probably more appropriate. - - target_metric_kwds: dict (optional, default None) - Keyword argument to pass to the target metric when performing - supervised dimension reduction. If None then no arguments are passed on. - - target_weight: float (optional, default 0.5) - weighting factor between data topology and target topology. A value of - 0.0 weights predominantly on data, a value of 1.0 places a strong emphasis on - target. The default of 0.5 balances the weighting equally between data and - target. - - transform_seed: int (optional, default 42) - Random seed used for the stochastic aspects of the transform operation. - This ensures consistency in transform operations. - - verbose: bool (optional, default False) - Controls verbosity of logging. - - tqdm_kwds: dict (optional, defaul None) - Key word arguments to be used by the tqdm progress bar. 
- - unique: bool (optional, default False) - Controls if the rows of your data should be uniqued before being - embedded. If you have more duplicates than you have ``n_neighbors`` - you can have the identical data points lying in different regions of - your space. It also violates the definition of a metric. - For to map from internal structures back to your data use the variable - _unique_inverse_. - - densmap: bool (optional, default False) - Specifies whether the density-augmented objective of densMAP - should be used for optimization. Turning on this option generates - an embedding where the local densities are encouraged to be correlated - with those in the original space. Parameters below with the prefix 'dens' - further control the behavior of this extension. - - dens_lambda: float (optional, default 2.0) - Controls the regularization weight of the density correlation term - in densMAP. Higher values prioritize density preservation over the - UMAP objective, and vice versa for values closer to zero. Setting this - parameter to zero is equivalent to running the original UMAP algorithm. - - dens_frac: float (optional, default 0.3) - Controls the fraction of epochs (between 0 and 1) where the - density-augmented objective is used in densMAP. The first - (1 - dens_frac) fraction of epochs optimize the original UMAP objective - before introducing the density correlation term. - - dens_var_shift: float (optional, default 0.1) - A small constant added to the variance of local radii in the - embedding when calculating the density correlation objective to - prevent numerical instability from dividing by a small number - - output_dens: float (optional, default False) - Determines whether the local radii of the final embedding (an inverse - measure of local density) are computed and returned in addition to - the embedding. 
If set to True, local radii of the original data - are also included in the output for comparison; the output is a tuple - (embedding, original local radii, embedding local radii). This option - can also be used when densmap=False to calculate the densities for - UMAP embeddings. - - disconnection_distance: float (optional, default np.inf or maximal value for bounded distances) - Disconnect any vertices of distance greater than or equal to disconnection_distance when approximating the - manifold via our k-nn graph. This is particularly useful in the case that you have a bounded metric. The - UMAP assumption that we have a connected manifold can be problematic when you have points that are maximally - different from all the rest of your data. The connected manifold assumption will make such points have perfect - similarity to a random set of other points. Too many such points will artificially connect your space. - - precomputed_knn: tuple (optional, default (None,None,None)) - If the k-nearest neighbors of each point has already been calculated you - can pass them in here to save computation time. The number of nearest - neighbors in the precomputed_knn must be greater or equal to the - n_neighbors parameter. This should be a tuple containing the output - of the nearest_neighbors() function or attributes from a previously fit - UMAP object; (knn_indices, knn_dists, knn_search_index). If you wish to use - k-nearest neighbors data calculated by another package then provide a tuple of - the form (knn_indices, knn_dists). The contents of the tuple should be two numpy - arrays of shape (N, n_neighbors) where N is the number of items in the - input data. The first array should be the integer indices of the nearest - neighbors, and the second array should be the corresponding distances. The - nearest neighbor of each item should be itself, e.g. the nearest neighbor of - item 0 should be 0, the nearest neighbor of item 1 is 1 and so on. 
Please note - that you will *not* be able to transform new data in this case. - - """ - - def __init__( - self, - n_neighbors=15, - n_components=2, - metric="euclidean", - metric_kwds=None, - output_metric="euclidean", - output_metric_kwds=None, - n_epochs=None, - learning_rate=1.0, - init="spectral", - min_dist=0.1, - spread=1.0, - low_memory=True, - n_jobs=-1, - set_op_mix_ratio=1.0, - local_connectivity=1.0, - repulsion_strength=1.0, - negative_sample_rate=5, - transform_queue_size=4.0, - a=None, - b=None, - random_state=None, - angular_rp_forest=False, - target_n_neighbors=-1, - target_metric="categorical", - target_metric_kwds=None, - target_weight=0.5, - transform_seed=42, - transform_mode="embedding", - force_approximation_algorithm=False, - verbose=False, - tqdm_kwds=None, - unique=False, - densmap=False, - dens_lambda=2.0, - dens_frac=0.3, - dens_var_shift=0.1, - output_dens=False, - disconnection_distance=None, - precomputed_knn=(None, None, None), - use_hnsw=None, - hnsw_prune_strategy="simple", - hnsw_alpha=1.2, - ) -> None: - self.n_neighbors = n_neighbors - self.metric = metric - self.output_metric = output_metric - self.target_metric = target_metric - self.metric_kwds = metric_kwds - self.output_metric_kwds = output_metric_kwds - self.n_epochs = n_epochs - self.init = init - self.n_components = n_components - self.repulsion_strength = repulsion_strength - self.learning_rate = learning_rate - - self.spread = spread - self.min_dist = min_dist - self.low_memory = low_memory - self.set_op_mix_ratio = set_op_mix_ratio - self.local_connectivity = local_connectivity - self.negative_sample_rate = negative_sample_rate - self.random_state = random_state - self.angular_rp_forest = angular_rp_forest - self.transform_queue_size = transform_queue_size - self.target_n_neighbors = target_n_neighbors - self.target_metric = target_metric - self.target_metric_kwds = target_metric_kwds - self.target_weight = target_weight - self.transform_seed = transform_seed - 
self.transform_mode = transform_mode - self.force_approximation_algorithm = force_approximation_algorithm - self.verbose = verbose - self.tqdm_kwds = tqdm_kwds - self.unique = unique - - self.densmap = densmap - self.dens_lambda = dens_lambda - self.dens_frac = dens_frac - self.dens_var_shift = dens_var_shift - self.output_dens = output_dens - self.disconnection_distance = disconnection_distance - self.precomputed_knn = precomputed_knn - self.use_hnsw = use_hnsw - self.hnsw_prune_strategy = hnsw_prune_strategy - self.hnsw_alpha = hnsw_alpha - - self.n_jobs = n_jobs - - self.a = a - self.b = b - - def _validate_parameters(self) -> None: - if self.set_op_mix_ratio < 0.0 or self.set_op_mix_ratio > 1.0: - msg = "set_op_mix_ratio must be between 0.0 and 1.0" - raise ValueError(msg) - if self.repulsion_strength < 0.0: - msg = "repulsion_strength cannot be negative" - raise ValueError(msg) - if self.min_dist > self.spread: - msg = "min_dist must be less than or equal to spread" - raise ValueError(msg) - if self.min_dist < 0.0: - msg = "min_dist cannot be negative" - raise ValueError(msg) - if not isinstance(self.init, str) and not isinstance(self.init, np.ndarray): - msg = "init must be a string or ndarray" - raise ValueError(msg) - if isinstance(self.init, str) and self.init not in ( - "pca", - "spectral", - "random", - "tswspectral", - ): - msg = ( - 'string init values must be one of: "pca", "tswspectral",' - ' "spectral" or "random"' - ) - raise ValueError( - msg, - ) - if ( - isinstance(self.init, np.ndarray) - and self.init.shape[1] != self.n_components - ): - msg = "init ndarray must match n_components value" - raise ValueError(msg) - if not isinstance(self.metric, str) and not callable(self.metric): - msg = "metric must be string or callable" - raise ValueError(msg) - if self.negative_sample_rate < 0: - msg = "negative sample rate must be positive" - raise ValueError(msg) - if self._initial_alpha < 0.0: - msg = "learning_rate must be positive" - raise 
ValueError(msg) - if self.n_neighbors < 2: - msg = "n_neighbors must be greater than 1" - raise ValueError(msg) - if self.target_n_neighbors < 2 and self.target_n_neighbors != -1: - msg = "target_n_neighbors must be greater than 1" - raise ValueError(msg) - if not isinstance(self.n_components, int): - if isinstance(self.n_components, str): - msg = "n_components must be an int" - raise ValueError(msg) - if self.n_components % 1 != 0: - msg = "n_components must be a whole number" - raise ValueError(msg) - try: - # this will convert other types of int (eg. numpy int64) - # to Python int - self.n_components = int(self.n_components) - except ValueError: - msg = "n_components must be an int" - raise ValueError(msg) - if self.n_components < 1: - msg = "n_components must be greater than 0" - raise ValueError(msg) - self.n_epochs_list = None - if isinstance(self.n_epochs, (list, tuple, np.ndarray)): - if not issubclass( - np.array(self.n_epochs).dtype.type, - np.integer, - ) or not np.all(np.array(self.n_epochs) >= 0): - msg = ( - "n_epochs must be a nonnegative integer " - "or a list of nonnegative integers" - ) - raise ValueError( - msg, - ) - self.n_epochs_list = list(self.n_epochs) - elif self.n_epochs is not None and ( - self.n_epochs < 0 or not isinstance(self.n_epochs, int) - ): - msg = ( - "n_epochs must be a nonnegative integer " - "or a list of nonnegative integers" - ) - raise ValueError( - msg, - ) - if self.metric_kwds is None: - self._metric_kwds = {} - else: - self._metric_kwds = self.metric_kwds - if self.output_metric_kwds is None: - self._output_metric_kwds = {} - else: - self._output_metric_kwds = self.output_metric_kwds - if self.target_metric_kwds is None: - self._target_metric_kwds = {} - else: - self._target_metric_kwds = self.target_metric_kwds - # check sparsity of data upfront to set proper _input_distance_func & - # save repeated checks later on - if scipy.sparse.isspmatrix_csr(self._raw_data): - self._sparse_data = True - else: - 
self._sparse_data = False - # set input distance metric & inverse_transform distance metric - if callable(self.metric): - in_returns_grad = self._check_custom_metric( - self.metric, - self._metric_kwds, - self._raw_data, - ) - if in_returns_grad: - _m = self.metric - - @numba.njit(fastmath=True) - def _dist_only(x, y, *kwds): - return _m(x, y, *kwds)[0] - - self._input_distance_func = _dist_only - self._inverse_distance_func = self.metric - else: - self._input_distance_func = self.metric - self._inverse_distance_func = None - warn( - "custom distance metric does not return gradient; inverse_transform will be unavailable. " - "To enable using inverse_transform method, define a distance function that returns a tuple " - "of (distance [float], gradient [np.array])", - stacklevel=2, - ) - elif self.metric == "precomputed": - if self.unique: - msg = "unique is poorly defined on a precomputed metric" - raise ValueError(msg) - warn( - "using precomputed metric; inverse_transform will be unavailable", - stacklevel=2, - ) - self._input_distance_func = self.metric - self._inverse_distance_func = None - elif self.metric == "hellinger" and self._raw_data.min() < 0: - msg = "Metric 'hellinger' does not support negative values" - raise ValueError(msg) - elif self.metric in dist.named_distances: - if self._sparse_data: - if self.metric in sparse.sparse_named_distances: - self._input_distance_func = sparse.sparse_named_distances[ - self.metric - ] - else: - msg = f"Metric {self.metric} is not supported for sparse data" - raise ValueError( - msg, - ) - else: - self._input_distance_func = dist.named_distances[self.metric] - try: - self._inverse_distance_func = dist.named_distances_with_gradients[ - self.metric - ] - except KeyError: - warn( - f"gradient function is not yet implemented for {self.metric} distance metric; " - "inverse_transform will be unavailable", - stacklevel=2, - ) - self._inverse_distance_func = None - elif self.metric in pynn_named_distances: - if 
self._sparse_data: - if self.metric in pynn_sparse_named_distances: - self._input_distance_func = pynn_sparse_named_distances[self.metric] - else: - msg = f"Metric {self.metric} is not supported for sparse data" - raise ValueError( - msg, - ) - else: - self._input_distance_func = pynn_named_distances[self.metric] - - warn( - f"gradient function is not yet implemented for {self.metric} distance metric; " - "inverse_transform will be unavailable", - stacklevel=2, - ) - self._inverse_distance_func = None - else: - msg = "metric is neither callable nor a recognised string" - raise ValueError(msg) - # set output distance metric - if callable(self.output_metric): - out_returns_grad = self._check_custom_metric( - self.output_metric, - self._output_metric_kwds, - ) - if out_returns_grad: - self._output_distance_func = self.output_metric - else: - msg = "custom output_metric must return a tuple of (distance [float], gradient [np.array])" - raise ValueError( - msg, - ) - elif self.output_metric == "precomputed": - msg = "output_metric cannnot be 'precomputed'" - raise ValueError(msg) - elif self.output_metric in dist.named_distances_with_gradients: - self._output_distance_func = dist.named_distances_with_gradients[ - self.output_metric - ] - elif self.output_metric in dist.named_distances: - msg = f"gradient function is not yet implemented for {self.output_metric}." - raise ValueError( - msg, - ) - else: - msg = "output_metric is neither callable nor a recognised string" - raise ValueError( - msg, - ) - # set angularity for NN search based on metric - if self.metric in ( - "cosine", - "correlation", - "dice", - "jaccard", - "ll_dirichlet", - "hellinger", - ): - self.angular_rp_forest = True - - if self.n_jobs < -1 or self.n_jobs == 0: - msg = "n_jobs must be a postive integer, or -1 (for all cores)" - raise ValueError(msg) - if self.n_jobs != 1 and self.random_state is not None: - self.n_jobs = 1 - warn( - f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. 
Use no seed for parallelism.", - stacklevel=2, - ) - - if self.dens_lambda < 0.0: - msg = "dens_lambda cannot be negative" - raise ValueError(msg) - if self.dens_frac < 0.0 or self.dens_frac > 1.0: - msg = "dens_frac must be between 0.0 and 1.0" - raise ValueError(msg) - if self.dens_var_shift < 0.0: - msg = "dens_var_shift cannot be negative" - raise ValueError(msg) - - self._densmap_kwds = { - "lambda": self.dens_lambda if self.densmap else 0.0, - "frac": self.dens_frac if self.densmap else 0.0, - "var_shift": self.dens_var_shift, - "n_neighbors": self.n_neighbors, - } - - if self.densmap and self.output_metric not in ("euclidean", "l2"): - msg = "Non-Euclidean output metric not supported for densMAP." - raise ValueError( - msg, - ) - - # This will be used to prune all edges of greater than a fixed value from our knn graph. - # We have preset defaults described in DISCONNECTION_DISTANCES for our bounded measures. - # Otherwise a user can pass in their own value. - if self.disconnection_distance is None: - self._disconnection_distance = DISCONNECTION_DISTANCES.get( - self.metric, - np.inf, - ) - elif isinstance(self.disconnection_distance, (int, float)): - self._disconnection_distance = self.disconnection_distance - else: - msg = "disconnection_distance must either be None or a numeric." - raise ValueError(msg) - - if self.tqdm_kwds is None: - self.tqdm_kwds = {} - elif isinstance(self.tqdm_kwds, dict) is False: - msg = ( - "tqdm_kwds must be a dictionary. Please provide valid tqdm " - "parameters as key value pairs. 
Valid tqdm parameters can be " - "found here: https://github.com/tqdm/tqdm#parameters" - ) - raise ValueError( - msg, - ) - if "desc" not in self.tqdm_kwds: - self.tqdm_kwds["desc"] = "Epochs completed" - if "bar_format" not in self.tqdm_kwds: - bar_f = "{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]" - self.tqdm_kwds["bar_format"] = bar_f - - if hasattr(self, "knn_dists") and self.knn_dists is not None: - if self.unique: - msg = "unique is not currently available for precomputed_knn." - raise ValueError( - msg, - ) - if not isinstance(self.knn_indices, np.ndarray): - msg = "precomputed_knn[0] must be ndarray object." - raise ValueError(msg) - if not isinstance(self.knn_dists, np.ndarray): - msg = "precomputed_knn[1] must be ndarray object." - raise ValueError(msg) - if self.knn_dists.shape != self.knn_indices.shape: - msg = ( - "precomputed_knn[0] and precomputed_knn[1]" - " must be numpy arrays of the same size." - ) - raise ValueError( - msg, - ) - # #848: warn but proceed if no search index is present - if not isinstance(self.knn_search_index, NNDescent): - warn( - "precomputed_knn[2] (knn_search_index) " - "is not an NNDescent object: transforming new data with transform " - "will be unavailable.", - stacklevel=2, - ) - if self.knn_dists.shape[1] < self.n_neighbors: - warn( - "precomputed_knn has a lower number of neighbors than " - "n_neighbors parameter. precomputed_knn will be ignored" - " and the k-nn will be computed normally.", - stacklevel=2, - ) - self.knn_indices = None - self.knn_dists = None - self.knn_search_index = None - elif self.knn_dists.shape[0] != self._raw_data.shape[0]: - warn( - "precomputed_knn has a different number of samples than the" - " data you are fitting. 
precomputed_knn will be ignored and" - "the k-nn will be computed normally.", - stacklevel=2, - ) - self.knn_indices = None - self.knn_dists = None - self.knn_search_index = None - elif ( - self.knn_dists.shape[0] < 4096 - and not self.force_approximation_algorithm - ): - # force_approximation_algorithm is irrelevant for pre-computed knn - # always set it to True which keeps downstream code paths working - self.force_approximation_algorithm = True - elif self.knn_dists.shape[1] > self.n_neighbors: - # if k for precomputed_knn larger than n_neighbors we simply prune it - self.knn_indices = self.knn_indices[:, : self.n_neighbors] - self.knn_dists = self.knn_dists[:, : self.n_neighbors] - - def _check_custom_metric(self, metric, kwds, data=None): - # quickly check to determine whether user-defined - # self.metric/self.output_metric returns both distance and gradient - rng = check_random_state(42) # Use fixed seed for metric checking - if data is not None: - # if checking the high-dimensional distance metric, test directly on - # input data so we don't risk violating any assumptions potentially - # hard-coded in the metric (e.g., bounded; non-negative) - indices = rng.integers(0, data.shape[0], 2) - x, y = data[indices] - else: - # if checking the manifold distance metric, simulate some data on a - # reasonable interval with output dimensionality - x, y = rng.uniform(low=-10, high=10, size=(2, self.n_components)) - - if scipy.sparse.issparse(data): - metric_out = metric(x.indices, x.data, y.indices, y.data, **kwds) - else: - metric_out = metric(x, y, **kwds) - # True if metric returns iterable of length 2, False otherwise - return hasattr(metric_out, "__iter__") and len(metric_out) == 2 - - def _populate_combined_params(self, *models) -> None: - self.n_neighbors = flattened([m.n_neighbors for m in models]) - self.metric = flattened([m.metric for m in models]) - self.metric_kwds = flattened([m.metric_kwds for m in models]) - self.output_metric = 
flattened([m.output_metric for m in models]) - - self.n_epochs = flattened( - [m.n_epochs if m.n_epochs is not None else -1 for m in models], - ) - if all(x == -1 for x in self.n_epochs): - self.n_epochs = None - - self.init = flattened([m.init for m in models]) - self.n_components = flattened([m.n_components for m in models]) - self.repulsion_strength = flattened([m.repulsion_strength for m in models]) - self.learning_rate = flattened([m.learning_rate for m in models]) - - self.spread = flattened([m.spread for m in models]) - self.min_dist = flattened([m.min_dist for m in models]) - self.low_memory = flattened([m.low_memory for m in models]) - self.set_op_mix_ratio = flattened([m.set_op_mix_ratio for m in models]) - self.local_connectivity = flattened([m.local_connectivity for m in models]) - self.negative_sample_rate = flattened([m.negative_sample_rate for m in models]) - self.random_state = flattened([m.random_state for m in models]) - self.angular_rp_forest = flattened([m.angular_rp_forest for m in models]) - self.transform_queue_size = flattened([m.transform_queue_size for m in models]) - self.target_n_neighbors = flattened([m.target_n_neighbors for m in models]) - self.target_metric = flattened([m.target_metric for m in models]) - self.target_metric_kwds = flattened([m.target_metric_kwds for m in models]) - self.target_weight = flattened([m.target_weight for m in models]) - self.transform_seed = flattened([m.transform_seed for m in models]) - self.force_approximation_algorithm = flattened( - [m.force_approximation_algorithm for m in models], - ) - self.verbose = flattened([m.verbose for m in models]) - self.unique = flattened([m.unique for m in models]) - - self.densmap = flattened([m.densmap for m in models]) - self.dens_lambda = flattened([m.dens_lambda for m in models]) - self.dens_frac = flattened([m.dens_frac for m in models]) - self.dens_var_shift = flattened([m.dens_var_shift for m in models]) - self.output_dens = flattened([m.output_dens for m in 
models]) - - self.a = flattened([m.a for m in models]) - self.b = flattened([m.b for m in models]) - - # Access _a and _b via getattr to avoid accessing private members directly - self._a = flattened([getattr(m, "_a", m.a) for m in models]) - self._b = flattened([getattr(m, "_b", m.b) for m in models]) - - def __mul__(self, other): - check_is_fitted( - self, - attributes=["graph_"], - msg="Only fitted UMAP models can be combined", - ) - check_is_fitted( - other, - attributes=["graph_"], - msg="Only fitted UMAP models can be combined", - ) - - if self.graph_.shape[0] != other.graph_.shape[0]: - msg = "Only models with the equivalent samples can be combined" - raise ValueError(msg) - - result = UMAP() - result._populate_combined_params(self, other) - - result.graph_ = general_simplicial_set_intersection( - self.graph_, - other.graph_, - 0.5, - ) - result.graph_ = reset_local_connectivity(result.graph_, True) - - if scipy.sparse.csgraph.connected_components(result.graph_)[0] > 1: - warn( - "Combined graph is not connected but multi-component layout is unsupported. 
" - "Falling back to random initialization.", - stacklevel=2, - ) - init = "random" - else: - init = "spectral" - - result.densmap = np.any(result.densmap) - result.output_dens = np.any(result.output_dens) - - result._densmap_kwds = { - "lambda": np.max(result.dens_lambda), - "frac": np.max(result.dens_frac), - "var_shift": np.max(result.dens_var_shift), - "n_neighbors": np.max(result.n_neighbors), - } - - n_epochs = None if result.n_epochs is None else np.max(result.n_epochs) - - result.embedding_, aux_data = simplicial_set_embedding( - None, - result.graph_, - np.min(result.n_components), - np.min(result.learning_rate), - np.mean(result._a), - np.mean(result._b), - np.mean(result.repulsion_strength), - np.mean(result.negative_sample_rate), - n_epochs, - init, - check_random_state(42), - "euclidean", - {}, - result.densmap, - result._densmap_kwds, - result.output_dens, - parallel=False, - verbose=bool(np.max(result.verbose)), - tqdm_kwds=self.tqdm_kwds, - ) - - if result.output_dens: - result.rad_orig_ = aux_data["rad_orig"] - result.rad_emb_ = aux_data["rad_emb"] - - return result - - def __add__(self, other): - check_is_fitted( - self, - attributes=["graph_"], - msg="Only fitted UMAP models can be combined", - ) - check_is_fitted( - other, - attributes=["graph_"], - msg="Only fitted UMAP models can be combined", - ) - - if self.graph_.shape[0] != other.graph_.shape[0]: - msg = "Only models with the equivalent samples can be combined" - raise ValueError(msg) - - result = UMAP() - result._populate_combined_params(self, other) - - result.graph_ = general_simplicial_set_union(self.graph_, other.graph_) - result.graph_ = reset_local_connectivity(result.graph_, True) - - if scipy.sparse.csgraph.connected_components(result.graph_)[0] > 1: - warn( - "Combined graph is not connected but mult-component layout is unsupported. 
" - "Falling back to random initialization.", - stacklevel=2, - ) - init = "random" - else: - init = "spectral" - - result.densmap = np.any(result.densmap) - result.output_dens = np.any(result.output_dens) - - result._densmap_kwds = { - "lambda": np.max(result.dens_lambda), - "frac": np.max(result.dens_frac), - "var_shift": np.max(result.dens_var_shift), - "n_neighbors": np.max(result.n_neighbors), - } - - n_epochs = None if result.n_epochs is None else np.max(result.n_epochs) - - result.embedding_, aux_data = simplicial_set_embedding( - None, - result.graph_, - np.min(result.n_components), - np.min(result.learning_rate), - np.mean(result._a), - np.mean(result._b), - np.mean(result.repulsion_strength), - np.mean(result.negative_sample_rate), - n_epochs, - init, - check_random_state(42), - "euclidean", - {}, - result.densmap, - result._densmap_kwds, - result.output_dens, - parallel=False, - verbose=bool(np.max(result.verbose)), - tqdm_kwds=self.tqdm_kwds, - ) - - if result.output_dens: - result.rad_orig_ = aux_data["rad_orig"] - result.rad_emb_ = aux_data["rad_emb"] - - return result - - def __sub__(self, other): - check_is_fitted( - self, - attributes=["graph_"], - msg="Only fitted UMAP models can be combined", - ) - check_is_fitted( - other, - attributes=["graph_"], - msg="Only fitted UMAP models can be combined", - ) - - if self.graph_.shape[0] != other.graph_.shape[0]: - msg = "Only models with the equivalent samples can be combined" - raise ValueError(msg) - - result = UMAP() - result._populate_combined_params(self, other) - - result.graph_ = general_simplicial_set_intersection( - self.graph_, - other.graph_, - weight=0.5, - right_complement=True, - ) - result.graph_ = reset_local_connectivity(result.graph_, False) - - if scipy.sparse.csgraph.connected_components(result.graph_)[0] > 1: - warn( - "Combined graph is not connected but mult-component layout is unsupported. 
" - "Falling back to random initialization.", - stacklevel=2, - ) - init = "random" - else: - init = "spectral" - - result.densmap = np.any(result.densmap) - result.output_dens = np.any(result.output_dens) - - result._densmap_kwds = { - "lambda": np.max(result.dens_lambda), - "frac": np.max(result.dens_frac), - "var_shift": np.max(result.dens_var_shift), - "n_neighbors": np.max(result.n_neighbors), - } - - n_epochs = None if result.n_epochs is None else np.max(result.n_epochs) - - result.embedding_, aux_data = simplicial_set_embedding( - None, - result.graph_, - np.min(result.n_components), - np.min(result.learning_rate), - np.mean(result._a), - np.mean(result._b), - np.mean(result.repulsion_strength), - np.mean(result.negative_sample_rate), - n_epochs, - init, - check_random_state(42), - "euclidean", - {}, - result.densmap, - result._densmap_kwds, - result.output_dens, - parallel=False, - verbose=bool(np.max(result.verbose)), - tqdm_kwds=self.tqdm_kwds, - ) - - if result.output_dens: - result.rad_orig_ = aux_data["rad_orig"] - result.rad_emb_ = aux_data["rad_emb"] - - return result - - def fit(self, X, y=None, ensure_all_finite=True, **kwargs): - """Fit X into an embedded space. - - Optionally use y for supervised dimension reduction. - - Parameters - ---------- - X : array, shape (n_samples, n_features) or (n_samples, n_samples) - If the metric is 'precomputed' X must be a square distance - matrix. Otherwise it contains a sample per row. If the method - is 'exact', X may be a sparse matrix of type 'csr', 'csc' - or 'coo'. - - y : array, shape (n_samples) - A target array for supervised dimension reduction. How this is - handled is determined by parameters UMAP was instantiated with. - The relevant attributes are ``target_metric`` and - ``target_metric_kwds``. - - ensure_all_finite : Whether to raise an error on np.inf, np.nan, pd.NA in array. - The possibilities are: - True: Force all values of array to be finite. 
- - False: accepts np.inf, np.nan, pd.NA in array. - - 'allow-nan': accepts only np.nan and pd.NA values in array. - Values cannot be infinite. - - **kwargs : optional - Any additional keyword arguments are passed to _fit_embed_data. - - """ - if self.metric in ("bit_hamming", "bit_jaccard"): - X = check_array( - X, - dtype=np.uint8, - order="C", - ensure_all_finite=ensure_all_finite, - ) - else: - X = check_array( - X, - dtype=np.float32, - accept_sparse="csr", - order="C", - ensure_all_finite=ensure_all_finite, - ) - self._raw_data = X - - # Handle all the optional arguments, setting default - if self.a is None or self.b is None: - self._a, self._b = find_ab_params(self.spread, self.min_dist) - else: - self._a = self.a - self._b = self.b - - if isinstance(self.init, np.ndarray): - init = check_array( - self.init, - dtype=np.float32, - accept_sparse=False, - ensure_all_finite=ensure_all_finite, - ) - else: - init = self.init - - self._initial_alpha = self.learning_rate - - self.knn_indices = self.precomputed_knn[0] - self.knn_dists = self.precomputed_knn[1] - # #848: allow precomputed knn to not have a search index - if len(self.precomputed_knn) == 2: - self.knn_search_index = None - else: - self.knn_search_index = self.precomputed_knn[2] - - self._validate_parameters() - - if self.verbose: - pass - - self._original_n_threads = numba.get_num_threads() - if self.n_jobs > 0 and self.n_jobs is not None: - numba.set_num_threads(self.n_jobs) - - # Check if we should unique the data - # We've already ensured that we aren't in the precomputed case - if self.unique: - # check if the matrix is dense - if self._sparse_data: - # Call a sparse unique function - index, inverse, counts = csr_unique(X) - else: - index, inverse, counts = np.unique( - X, - return_index=True, - return_inverse=True, - return_counts=True, - axis=0, - )[1:4] - if self.verbose: - np.argmax(counts) - # We'll expose an inverse map when unique=True for users to map from our internal structures to their 
data - self._unique_inverse_ = inverse - # If we aren't asking for unique use the full index. - # This will save special cases later. - else: - index = list(range(X.shape[0])) - inverse = list(range(X.shape[0])) - - # Error check n_neighbors based on data size - if X[index].shape[0] <= self.n_neighbors: - if X[index].shape[0] == 1: - self.embedding_ = np.zeros( - (1, self.n_components), - ) # needed to sklearn comparability - return self - - warn( - "n_neighbors is larger than the dataset size; truncating to " - "X.shape[0] - 1", - stacklevel=2, - ) - self._n_neighbors = X[index].shape[0] - 1 - if self.densmap: - self._densmap_kwds["n_neighbors"] = self._n_neighbors - else: - self._n_neighbors = self.n_neighbors - - # Note: unless it causes issues for setting 'index', could move this to - # initial sparsity check above - if self._sparse_data and not X.has_sorted_indices: - X.sort_indices() - - random_state = check_random_state(self.random_state) - - if self.verbose: - pass - - if self.metric == "precomputed" and self._sparse_data: - # For sparse precomputed distance matrices, we just argsort the rows to find - # nearest neighbors. To make this easier, we expect matrices that are - # symmetrical (so we can find neighbors by looking at rows in isolation, - # rather than also having to consider that sample's column too). - # print("Computing KNNs for sparse precomputed distances...") - if sparse_tril(X).getnnz() != sparse_triu(X).getnnz(): - msg = "Sparse precomputed distance matrices should be symmetrical!" - raise ValueError( - msg, - ) - if not np.all(X.diagonal() == 0): - msg = "Non-zero distances from samples to themselves!" 
- raise ValueError(msg) - if self.knn_dists is None: - self._knn_indices = np.zeros((X.shape[0], self.n_neighbors), dtype=int) - self._knn_dists = np.zeros(self._knn_indices.shape, dtype=float) - for row_id in range(X.shape[0]): - # Find KNNs row-by-row - row_data = X[row_id].data - row_indices = X[row_id].indices - if len(row_data) < self._n_neighbors: - msg = "Some rows contain fewer than n_neighbors distances!" - raise ValueError( - msg, - ) - row_nn_data_indices = np.argsort(row_data)[: self._n_neighbors] - self._knn_indices[row_id] = row_indices[row_nn_data_indices] - self._knn_dists[row_id] = row_data[row_nn_data_indices] - else: - self._knn_indices = self.knn_indices - self._knn_dists = self.knn_dists - # Disconnect any vertices farther apart than _disconnection_distance - disconnected_index = self._knn_dists >= self._disconnection_distance - self._knn_indices[disconnected_index] = -1 - self._knn_dists[disconnected_index] = np.inf - edges_removed = disconnected_index.sum() - - ( - self.graph_, - self._sigmas, - self._rhos, - self.graph_dists_, - ) = fuzzy_simplicial_set( - X[index], - self.n_neighbors, - random_state, - "precomputed", - self._metric_kwds, - self._knn_indices, - self._knn_dists, - self.angular_rp_forest, - self.set_op_mix_ratio, - self.local_connectivity, - True, - self.verbose, - self.densmap or self.output_dens, - ) - # Report the number of vertices with degree 0 in our our umap.graph_ - # This ensures that they were properly disconnected. 
- vertices_disconnected = np.sum( - np.array(self.graph_.sum(axis=1)).flatten() == 0, - ) - raise_disconnected_warning( - edges_removed, - vertices_disconnected, - self._disconnection_distance, - self._raw_data.shape[0], - verbose=self.verbose, - ) - # Handle small cases efficiently by computing all distances - elif X[index].shape[0] < 4096 and not self.force_approximation_algorithm: - self._small_data = True - try: - # sklearn pairwise_distances fails for callable metric on sparse data - _m = self.metric if self._sparse_data else self._input_distance_func - dmat = pairwise_distances(X[index], metric=_m, **self._metric_kwds) - except (ValueError, TypeError): - # metric is numba.jit'd or not supported by sklearn, - # fallback to pairwise special - - if self._sparse_data: - # Get a fresh metric since we are casting to dense - if not callable(self.metric): - _m = dist.named_distances[self.metric] - dmat = dist.pairwise_special_metric( - X[index].toarray(), - metric=_m, - kwds=self._metric_kwds, - ensure_all_finite=ensure_all_finite, - ) - else: - dmat = dist.pairwise_special_metric( - X[index], - metric=self._input_distance_func, - kwds=self._metric_kwds, - ensure_all_finite=ensure_all_finite, - ) - else: - dmat = dist.pairwise_special_metric( - X[index], - metric=self._input_distance_func, - kwds=self._metric_kwds, - ensure_all_finite=ensure_all_finite, - ) - # set any values greater than disconnection_distance to be np.inf. - # This will have no effect when _disconnection_distance is not set since it defaults to np.inf. 
- edges_removed = np.sum(dmat >= self._disconnection_distance) - dmat[dmat >= self._disconnection_distance] = np.inf - ( - self.graph_, - self._sigmas, - self._rhos, - self.graph_dists_, - ) = fuzzy_simplicial_set( - dmat, - self._n_neighbors, - random_state, - "precomputed", - self._metric_kwds, - None, - None, - self.angular_rp_forest, - self.set_op_mix_ratio, - self.local_connectivity, - True, - self.verbose, - self.densmap or self.output_dens, - ) - # Report the number of vertices with degree 0 in our umap.graph_ - # This ensures that they were properly disconnected. - vertices_disconnected = np.sum( - np.array(self.graph_.sum(axis=1)).flatten() == 0, - ) - raise_disconnected_warning( - edges_removed, - vertices_disconnected, - self._disconnection_distance, - self._raw_data.shape[0], - verbose=self.verbose, - ) - else: - # Standard case - self._small_data = False - # Standard case - if (self._sparse_data and self.metric in pynn_sparse_named_distances) or ( - not self._sparse_data and self.metric in pynn_named_distances - ): - nn_metric = self.metric - else: - nn_metric = self._input_distance_func - if self.knn_dists is None: - ( - self._knn_indices, - self._knn_dists, - self._knn_search_index, - ) = nearest_neighbors( - X[index], - self._n_neighbors, - nn_metric, - self._metric_kwds, - self.angular_rp_forest, - random_state, - self.low_memory, - use_pynndescent=True, - use_hnsw=self.use_hnsw, - hnsw_prune_strategy=self.hnsw_prune_strategy, - hnsw_alpha=self.hnsw_alpha, - n_jobs=self.n_jobs, - verbose=self.verbose, - ) - else: - self._knn_indices = self.knn_indices - self._knn_dists = self.knn_dists - self._knn_search_index = self.knn_search_index - # Disconnect any vertices farther apart than _disconnection_distance - disconnected_index = self._knn_dists >= self._disconnection_distance - self._knn_indices[disconnected_index] = -1 - self._knn_dists[disconnected_index] = np.inf - edges_removed = disconnected_index.sum() - - ( - self.graph_, - self._sigmas, - 
self._rhos, - self.graph_dists_, - ) = fuzzy_simplicial_set( - X[index], - self.n_neighbors, - random_state, - nn_metric, - self._metric_kwds, - self._knn_indices, - self._knn_dists, - self.angular_rp_forest, - self.set_op_mix_ratio, - self.local_connectivity, - True, - self.verbose, - self.densmap or self.output_dens, - ) - # Report the number of vertices with degree 0 in our umap.graph_ - # This ensures that they were properly disconnected. - vertices_disconnected = np.sum( - np.array(self.graph_.sum(axis=1)).flatten() == 0, - ) - raise_disconnected_warning( - edges_removed, - vertices_disconnected, - self._disconnection_distance, - self._raw_data.shape[0], - verbose=self.verbose, - ) - - # Currently not checking if any duplicate points have differing labels - # Might be worth throwing a warning... - if y is not None: - len_X = len(X) if not self._sparse_data else X.shape[0] - if len_X != len(y): - msg = f"Length of x = {len_X}, length of y = {len(y)}, while it must be equal." - raise ValueError( - msg, - ) - if self.target_metric == "string": - y_ = y[index] - else: - y_ = check_array( - y, - ensure_2d=False, - ensure_all_finite=ensure_all_finite, - )[index] - if self.target_metric == "categorical": - if self.target_weight < 1.0: - far_dist = 2.5 * (1.0 / (1.0 - self.target_weight)) - else: - far_dist = 1.0e12 - self.graph_ = discrete_metric_simplicial_set_intersection( - self.graph_, - y_, - far_dist=far_dist, - ) - elif self.target_metric in dist.DISCRETE_METRICS: - if self.target_weight < 1.0: - scale = 2.5 * (1.0 / (1.0 - self.target_weight)) - else: - scale = 1.0e12 - # self.graph_ = discrete_metric_simplicial_set_intersection( - # self.graph_, - # y_, - # metric=self.target_metric, - # metric_kws=self.target_metric_kwds, - # metric_scale=scale - # ) - - metric_kws = dist.get_discrete_params(y_, self.target_metric) - - self.graph_ = discrete_metric_simplicial_set_intersection( - self.graph_, - y_, - metric=self.target_metric, - metric_kws=metric_kws, - 
metric_scale=scale, - ) - else: - if len(y_.shape) == 1: - y_ = y_.reshape(-1, 1) - if self.target_n_neighbors == -1: - target_n_neighbors = self._n_neighbors - else: - target_n_neighbors = self.target_n_neighbors - - # Handle the small case as precomputed as before - if y.shape[0] < 4096: - try: - ydmat = pairwise_distances( - y_, - metric=self.target_metric, - **self._target_metric_kwds, - ) - except (TypeError, ValueError): - ydmat = dist.pairwise_special_metric( - y_, - metric=self.target_metric, - kwds=self._target_metric_kwds, - ensure_all_finite=ensure_all_finite, - ) - - ( - target_graph, - target_sigmas, - target_rhos, - ) = fuzzy_simplicial_set( - ydmat, - target_n_neighbors, - random_state, - "precomputed", - self._target_metric_kwds, - None, - None, - False, - 1.0, - 1.0, - False, - ) - else: - # Standard case - ( - target_graph, - _target_sigmas, - _target_rhos, - ) = fuzzy_simplicial_set( - y_, - target_n_neighbors, - random_state, - self.target_metric, - self._target_metric_kwds, - None, - None, - False, - 1.0, - 1.0, - False, - ) - # product = self.graph_.multiply(target_graph) - # # self.graph_ = 0.99 * product + 0.01 * (self.graph_ + - # # target_graph - - # # product) - # self.graph_ = product - self.graph_ = general_simplicial_set_intersection( - self.graph_, - target_graph, - self.target_weight, - ) - self.graph_ = reset_local_connectivity(self.graph_) - self._supervised = True - else: - self._supervised = False - - if self.densmap or self.output_dens: - self._densmap_kwds["graph_dists"] = self.graph_dists_ - - if self.verbose: - pass - - if self.transform_mode == "embedding": - epochs = ( - self.n_epochs_list if self.n_epochs_list is not None else self.n_epochs - ) - self.embedding_, aux_data = self._fit_embed_data( - self._raw_data[index], - epochs, - init, - random_state, # JH why raw data? - **kwargs, - ) - - if self.n_epochs_list is not None: - if "embedding_list" not in aux_data: - msg = ( - "No list of embedding were found in 'aux_data'. 
" - "It is likely the layout optimization function " - "doesn't support the list of int for 'n_epochs'." - ) - raise KeyError( - msg, - ) - self.embedding_list_ = [e[inverse] for e in aux_data["embedding_list"]] - - # Assign any points that are fully disconnected from our manifold(s) to have embedding - # coordinates of np.nan. These will be filtered by our plotting functions automatically. - # They also prevent users from being deceived a distance query to one of these points. - # Might be worth moving this into simplicial_set_embedding or _fit_embed_data - disconnected_vertices = np.array(self.graph_.sum(axis=1)).flatten() == 0 - if len(disconnected_vertices) > 0: - self.embedding_[disconnected_vertices] = np.full( - self.n_components, - np.nan, - ) - - self.embedding_ = self.embedding_[inverse] - if self.output_dens: - self.rad_orig_ = aux_data["rad_orig"][inverse] - self.rad_emb_ = aux_data["rad_emb"][inverse] - - if self.verbose: - pass - - numba.set_num_threads(self._original_n_threads) - self._input_hash = joblib.hash(self._raw_data) - - if self.transform_mode == "embedding": - # Set number of features out for sklearn API - self._n_features_out = self.embedding_.shape[1] - else: - self._n_features_out = self.graph_.shape[1] - - return self - - def _fit_embed_data(self, X, n_epochs, init, random_state, **_kwargs): - """A method wrapper for simplicial_set_embedding that can be - replaced by subclasses. Arbitrary keyword arguments can be passed - through .fit() and .fit_transform(). 
- """ - return simplicial_set_embedding( - X, - self.graph_, - self.n_components, - self._initial_alpha, - self._a, - self._b, - self.repulsion_strength, - self.negative_sample_rate, - n_epochs, - init, - random_state, - self._input_distance_func, - self._metric_kwds, - self.densmap, - self._densmap_kwds, - self.output_dens, - self._output_distance_func, - self._output_metric_kwds, - self.output_metric in ("euclidean", "l2"), - self.random_state is None, - self.verbose, - tqdm_kwds=self.tqdm_kwds, - ) - - def fit_transform(self, X, y=None, ensure_all_finite=True, **kwargs): - """Fit X into an embedded space and return that transformed - output. - - Parameters - ---------- - X : array, shape (n_samples, n_features) or (n_samples, n_samples) - If the metric is 'precomputed' X must be a square distance - matrix. Otherwise it contains a sample per row. - - y : array, shape (n_samples) - A target array for supervised dimension reduction. How this is - handled is determined by parameters UMAP was instantiated with. - The relevant attributes are ``target_metric`` and - ``target_metric_kwds``. - - ensure_all_finite : Whether to raise an error on np.inf, np.nan, pd.NA in array. - The possibilities are: - True: Force all values of array to be finite. - - False: accepts np.inf, np.nan, pd.NA in array. - - 'allow-nan': accepts only np.nan and pd.NA values in array. - Values cannot be infinite. - - **kwargs : Any additional keyword arguments are passed to _fit_embed_data. - - Returns - ------- - X_new : array, shape (n_samples, n_components) - Embedding of the training data in low-dimensional space. - - or a tuple (X_new, r_orig, r_emb) if ``output_dens`` flag is set, - which additionally includes: - - r_orig: array, shape (n_samples) - Local radii of data points in the original data space (log-transformed). - - r_emb: array, shape (n_samples) - Local radii of data points in the embedding (log-transformed). 
- - """ - self.fit(X, y, ensure_all_finite, **kwargs) - if self.transform_mode == "embedding": - if self.output_dens: - return self.embedding_, self.rad_orig_, self.rad_emb_ - return self.embedding_ - if self.transform_mode == "graph": - return self.graph_ - msg = f"Unrecognized transform mode {self.transform_mode}; should be one of 'embedding' or 'graph'" - raise ValueError( - msg, - ) - - def transform(self, X, ensure_all_finite=True): - """Transform X into the existing embedded space and return that - transformed output. - - Parameters - ---------- - X : array, shape (n_samples, n_features) - New data to be transformed. - - ensure_all_finite : Whether to raise an error on np.inf, np.nan, pd.NA in array. - The possibilities are: - True: Force all values of array to be finite. - - False: accepts np.inf, np.nan, pd.NA in array. - - 'allow-nan': accepts only np.nan and pd.NA values in array. - Values cannot be infinite. - - Returns - ------- - X_new : array, shape (n_samples, n_components) - Embedding of the new data in low-dimensional space. - - """ - # If we fit just a single instance then error - if self._raw_data.shape[0] == 1: - msg = "Transform unavailable when model was fit with only a single data sample." 
- raise ValueError( - msg, - ) - # If we just have the original input then short circuit things - if self.metric in ("bit_hamming", "bit_jaccard"): - X = check_array( - X, - dtype=np.uint8, - order="C", - ensure_all_finite=ensure_all_finite, - ) - else: - X = check_array( - X, - dtype=np.float32, - accept_sparse="csr", - order="C", - ensure_all_finite=ensure_all_finite, - ) - x_hash = joblib.hash(X) - if x_hash == self._input_hash: - if self.transform_mode == "embedding": - return self.embedding_ - if self.transform_mode == "graph": - return self.graph_ - msg = f"Unrecognized transform mode {self.transform_mode}; should be one of 'embedding' or 'graph'" - raise ValueError( - msg, - ) - if self.densmap: - msg = "Transforming data into an existing embedding not supported for densMAP." - raise NotImplementedError( - msg, - ) - - # #848: knn_search_index is allowed to be None if not transforming new data, - # so now we must validate that if it exists it is not None - if hasattr(self, "_knn_search_index") and self._knn_search_index is None: - msg = ( - "No search index available: transforming data" - " into an existing embedding is not supported" - ) - raise NotImplementedError( - msg, - ) - - # X = check_array(X, dtype=np.float32, order="C", accept_sparse="csr") - random_state = check_random_state(self.transform_seed) - rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64) - - if self.metric == "precomputed": - warn( - "Transforming new data with precomputed metric. " - "We are assuming the input data is a matrix of distances from the new points " - "to the points in the training set. 
If the input matrix is sparse, it should " - "contain distances from the new points to their nearest neighbours " - "or approximate nearest neighbours in the training set.", - stacklevel=2, - ) - assert X.shape[1] == self._raw_data.shape[0] - if scipy.sparse.issparse(X): - indices = np.full( - (X.shape[0], self._n_neighbors), - dtype=np.int32, - fill_value=-1, - ) - dists = np.full_like(indices, dtype=np.float32, fill_value=-1) - for i in range(X.shape[0]): - data_indices = np.argsort(X[i].data) - if len(data_indices) < self._n_neighbors: - msg = f"Need at least n_neighbors ({self.n_neighbors}) distances for each row!" - raise ValueError( - msg, - ) - indices[i] = X[i].indices[data_indices[: self._n_neighbors]] - dists[i] = X[i].data[data_indices[: self._n_neighbors]] - else: - indices = np.argsort(X, axis=1)[:, : self._n_neighbors].astype(np.int32) - dists = np.take_along_axis(X, indices, axis=1) - assert np.min(indices) >= 0 - assert np.min(dists) >= 0.0 - elif self._small_data: - try: - # sklearn pairwise_distances fails for callable metric on sparse data - _m = self.metric if self._sparse_data else self._input_distance_func - dmat = pairwise_distances( - X, - self._raw_data, - metric=_m, - **self._metric_kwds, - ) - except (TypeError, ValueError): - # metric is numba.jit'd or not supported by sklearn, - # fallback to pairwise special - if self._sparse_data: - # Get a fresh metric since we are casting to dense - if not callable(self.metric): - _m = dist.named_distances[self.metric] - dmat = dist.pairwise_special_metric( - X.toarray(), - self._raw_data.toarray(), - metric=_m, - kwds=self._metric_kwds, - ensure_all_finite=ensure_all_finite, - ) - else: - dmat = dist.pairwise_special_metric( - X, - self._raw_data, - metric=self._input_distance_func, - kwds=self._metric_kwds, - ensure_all_finite=ensure_all_finite, - ) - else: - dmat = dist.pairwise_special_metric( - X, - self._raw_data, - metric=self._input_distance_func, - kwds=self._metric_kwds, - 
ensure_all_finite=ensure_all_finite, - ) - indices = np.argpartition(dmat, self._n_neighbors)[:, : self._n_neighbors] - dmat_shortened = submatrix(dmat, indices, self._n_neighbors) - indices_sorted = np.argsort(dmat_shortened) - indices = submatrix(indices, indices_sorted, self._n_neighbors) - dists = submatrix(dmat_shortened, indices_sorted, self._n_neighbors) - else: - # Access angular_trees via getattr to avoid accessing private member directly - angular_trees = getattr(self._knn_search_index, "_angular_trees", False) - epsilon = 0.24 if angular_trees else 0.12 - indices, dists = self._knn_search_index.query( - X, - self.n_neighbors, - epsilon=epsilon, - ) - - dists = dists.astype(np.float32, order="C") - # Remove any nearest neighbours who's distances are greater than our disconnection_distance - indices[dists >= self._disconnection_distance] = -1 - adjusted_local_connectivity = max(0.0, self.local_connectivity - 1.0) - sigmas, rhos = smooth_knn_dist( - dists, - float(self._n_neighbors), - local_connectivity=float(adjusted_local_connectivity), - ) - - rows, cols, vals, dists = compute_membership_strengths( - indices, - dists, - sigmas, - rhos, - bipartite=True, - ) - - graph = scipy.sparse.coo_matrix( - (vals, (rows, cols)), - shape=(X.shape[0], self._raw_data.shape[0]), - ) - - if self.transform_mode == "graph": - return graph - - # This was a very specially constructed graph with constant degree. - # That lets us do fancy unpacking by reshaping the csr matrix indices - # and data. Doing so relies on the constant degree assumption! - # csr_graph = normalize(graph.tocsr(), norm="l1") - # inds = csr_graph.indices.reshape(X.shape[0], self._n_neighbors) - # weights = csr_graph.data.reshape(X.shape[0], self._n_neighbors) - # embedding = init_transform(inds, weights, self.embedding_) - # This is less fast code than the above numba.jit'd code. - # It handles the fact that our nearest neighbour graph can now contain variable numbers of vertices. 
- csr_graph = graph.tocsr() - csr_graph.eliminate_zeros() - embedding = init_graph_transform(csr_graph, self.embedding_) - - if self.n_epochs is None: - # For smaller datasets we can use more epochs - n_epochs = 100 if graph.shape[0] <= 10000 else 30 - else: - n_epochs = int(self.n_epochs // 3.0) - - graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0 - graph.eliminate_zeros() - - epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs) - - head = graph.row - tail = graph.col - - # optimize_layout = make_optimize_layout( - # self._output_distance_func, - # tuple(self.output_metric_kwds.values()), - # ) - - if self.output_metric == "euclidean": - embedding = optimize_layout_euclidean( - embedding, - self.embedding_.astype(np.float32, copy=True), # Fixes #179 & #217, - head, - tail, - n_epochs, - graph.shape[1], - epochs_per_sample, - self._a, - self._b, - rng_state, - self.repulsion_strength, - self._initial_alpha / 4.0, - self.negative_sample_rate, - self.random_state is None, - verbose=self.verbose, - tqdm_kwds=self.tqdm_kwds, - ) - else: - embedding = optimize_layout_generic( - embedding, - self.embedding_.astype(np.float32, copy=True), # Fixes #179 & #217 - head, - tail, - n_epochs, - graph.shape[1], - epochs_per_sample, - self._a, - self._b, - rng_state, - self.repulsion_strength, - self._initial_alpha / 4.0, - self.negative_sample_rate, - self._output_distance_func, - tuple(self._output_metric_kwds.values()), - verbose=self.verbose, - tqdm_kwds=self.tqdm_kwds, - ) - - return embedding - - def inverse_transform(self, X): - """Transform X in the existing embedded space back into the input - data space and return that transformed output. - - Parameters - ---------- - X : array, shape (n_samples, n_components) - New points to be inverse transformed. - - Returns - ------- - X_new : array, shape (n_samples, n_features) - Generated data points new data in data space. 
- - """ - if self._sparse_data: - msg = "Inverse transform not available for sparse input." - raise ValueError(msg) - if self._inverse_distance_func is None: - msg = "Inverse transform not available for given metric." - raise ValueError(msg) - if self.densmap: - msg = "Inverse transform not available for densMAP." - raise ValueError(msg) - if self.n_components >= 8: - warn( - "Inverse transform works best with low dimensional embeddings." - " Results may be poor, or this approach to inverse transform" - " may fail altogether! If you need a high dimensional latent" - " space and inverse transform operations consider using an" - " autoencoder.", - stacklevel=2, - ) - elif self.transform_mode == "graph": - msg = "Inverse transform not available for transform_mode = 'graph'" - raise ValueError( - msg, - ) - - X = check_array(X, dtype=np.float32, order="C") - random_state = check_random_state(self.transform_seed) - rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64) - - # build Delaunay complex (Does this not assume a roughly euclidean output metric)? 
- deltri = scipy.spatial.Delaunay( - self.embedding_, - incremental=True, - qhull_options="QJ", - ) - neighbors = deltri.simplices[deltri.find_simplex(X)] - adjmat = scipy.sparse.lil_matrix( - (self.embedding_.shape[0], self.embedding_.shape[0]), - dtype=int, - ) - for i in np.arange(0, deltri.simplices.shape[0]): - for j in deltri.simplices[i]: - if j < self.embedding_.shape[0]: - idx = deltri.simplices[i][ - deltri.simplices[i] < self.embedding_.shape[0] - ] - adjmat[j, idx] = 1 - adjmat[idx, j] = 1 - - adjmat = scipy.sparse.csr_matrix(adjmat) - - min_vertices = min(self._raw_data.shape[-1], self._raw_data.shape[0]) - - neighborhood = [ - breadth_first_search(adjmat, v[0], min_vertices=min_vertices) - for v in neighbors - ] - if callable(self.output_metric): - # need to create another numba.jit-able wrapper for callable - # output_metrics that return a tuple (already checked that it does - # during param validation in `fit` method) - _out_m = self.output_metric - - @numba.njit(fastmath=True) - def _output_dist_only(x, y, *kwds): - return _out_m(x, y, *kwds)[0] - - dist_only_func = _output_dist_only - elif self.output_metric in dist.named_distances: - dist_only_func = dist.named_distances[self.output_metric] - else: - # shouldn't really ever get here because of checks already performed, - # but works as a failsafe in case attr was altered manually after fitting - msg = f"Unrecognized output metric: {self.output_metric}" - raise ValueError( - msg, - ) - - dist_args = tuple(self._output_metric_kwds.values()) - distances = [ - np.array( - [ - dist_only_func(X[i], self.embedding_[nb], *dist_args) - for nb in neighborhood[i] - ], - ) - for i in range(X.shape[0]) - ] - idx = np.array([np.argsort(e)[:min_vertices] for e in distances]) - - dists_output_space = np.array( - [distances[i][idx[i]] for i in range(len(distances))], - ) - indices = np.array([neighborhood[i][idx[i]] for i in range(len(neighborhood))]) - - rows, cols, distances = np.array( - [ - [i, indices[i, j], 
dists_output_space[i, j]] - for i in range(indices.shape[0]) - for j in range(min_vertices) - ], - ).T - - # calculate membership strength of each edge - weights = 1 / (1 + self._a * distances ** (2 * self._b)) - - # compute 1-skeleton - # convert 1-skeleton into coo_matrix adjacency matrix - graph = scipy.sparse.coo_matrix( - (weights, (rows, cols)), - shape=(X.shape[0], self._raw_data.shape[0]), - ) - - # That lets us do fancy unpacking by reshaping the csr matrix indices - # and data. Doing so relies on the constant degree assumption! - # csr_graph = graph.tocsr() - csr_graph = normalize(graph.tocsr(), norm="l1") - inds = csr_graph.indices.reshape(X.shape[0], min_vertices) - weights = csr_graph.data.reshape(X.shape[0], min_vertices) - inv_transformed_points = init_transform(inds, weights, self._raw_data) - - if self.n_epochs is None: - # For smaller datasets we can use more epochs - n_epochs = 100 if graph.shape[0] <= 10000 else 30 - else: - n_epochs = int(self.n_epochs // 3.0) - - # graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0 - # graph.eliminate_zeros() - - epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs) - - head = graph.row - tail = graph.col - weight = graph.data - - return optimize_layout_inverse( - inv_transformed_points, - self._raw_data, - head, - tail, - weight, - self._sigmas, - self._rhos, - n_epochs, - graph.shape[1], - epochs_per_sample, - self._a, - self._b, - rng_state, - self.repulsion_strength, - self._initial_alpha / 4.0, - self.negative_sample_rate, - self._inverse_distance_func, - tuple(self._metric_kwds.values()), - verbose=self.verbose, - tqdm_kwds=self.tqdm_kwds, - ) - - def update(self, X, ensure_all_finite=True) -> None: - if self.metric in ("bit_hamming", "bit_jaccard"): - X = check_array( - X, - dtype=np.uint8, - order="C", - ensure_all_finite=ensure_all_finite, - ) - else: - X = check_array( - X, - dtype=np.float32, - accept_sparse="csr", - order="C", - ensure_all_finite=ensure_all_finite, - ) - 
random_state = check_random_state(self.transform_seed) - random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64) - - original_size = self._raw_data.shape[0] - - if self.metric == "precomputed": - msg = "Update does not currently support precomputed metrics" - raise ValueError(msg) - if self._supervised: - msg = "Updating supervised models is not currently supported" - raise ValueError(msg) - - if self._small_data: - if self._sparse_data: - self._raw_data = scipy.sparse.vstack([self._raw_data, X]) - else: - self._raw_data = np.vstack([self._raw_data, X]) - - if self._raw_data.shape[0] < 4096: - # still small data - try: - # sklearn pairwise_distances fails for callable metric on sparse data - _m = self.metric if self._sparse_data else self._input_distance_func - dmat = pairwise_distances( - self._raw_data, - metric=_m, - **self._metric_kwds, - ) - except (ValueError, TypeError): - # metric is numba.jit'd or not supported by sklearn, - # fallback to pairwise special - - if self._sparse_data: - # Get a fresh metric since we are casting to dense - if not callable(self.metric): - _m = dist.named_distances[self.metric] - dmat = dist.pairwise_special_metric( - self._raw_data.toarray(), - metric=_m, - kwds=self._metric_kwds, - ensure_all_finite=ensure_all_finite, - ) - else: - dmat = dist.pairwise_special_metric( - self._raw_data, - metric=self._input_distance_func, - kwds=self._metric_kwds, - ensure_all_finite=ensure_all_finite, - ) - else: - dmat = dist.pairwise_special_metric( - self._raw_data, - metric=self._input_distance_func, - kwds=self._metric_kwds, - ensure_all_finite=ensure_all_finite, - ) - self.graph_, self._sigmas, self._rhos = fuzzy_simplicial_set( - dmat, - self._n_neighbors, - random_state, - "precomputed", - self._metric_kwds, - None, - None, - self.angular_rp_forest, - self.set_op_mix_ratio, - self.local_connectivity, - True, - self.verbose, - ) - knn_indices = np.argsort(dmat)[:, : self.n_neighbors] - else: - # now large data - self._small_data = 
False - if ( - self._sparse_data and self.metric in pynn_sparse_named_distances - ) or (not self._sparse_data and self.metric in pynn_named_distances): - nn_metric = self.metric - else: - nn_metric = self._input_distance_func - - ( - self._knn_indices, - self._knn_dists, - self._knn_search_index, - ) = nearest_neighbors( - self._raw_data, - self._n_neighbors, - nn_metric, - self._metric_kwds, - self.angular_rp_forest, - random_state, - self.low_memory, - use_pynndescent=True, - hnsw_prune_strategy=self.hnsw_prune_strategy, - hnsw_alpha=self.hnsw_alpha, - n_jobs=self.n_jobs, - verbose=self.verbose, - ) - - self.graph_, self._sigmas, self._rhos = fuzzy_simplicial_set( - self._raw_data, - self.n_neighbors, - random_state, - nn_metric, - self._metric_kwds, - self._knn_indices, - self._knn_dists, - self.angular_rp_forest, - self.set_op_mix_ratio, - self.local_connectivity, - True, - self.verbose, - ) - knn_indices = self._knn_indices - - init = np.zeros( - (self._raw_data.shape[0], self.n_components), - dtype=np.float32, - ) - init[:original_size] = self.embedding_ - - init_update(init, original_size, knn_indices) - n_epochs = 0 if self.n_epochs is None else self.n_epochs - - self.embedding_, aux_data = simplicial_set_embedding( - self._raw_data, - self.graph_, - self.n_components, - self._initial_alpha, - self._a, - self._b, - self.repulsion_strength, - self.negative_sample_rate, - n_epochs, - init, - random_state, - self._input_distance_func, - self._metric_kwds, - self.densmap, - self._densmap_kwds, - self.output_dens, - self._output_distance_func, - self._output_metric_kwds, - self.output_metric in ("euclidean", "l2"), - self.random_state is None, - self.verbose, - tqdm_kwds=self.tqdm_kwds, - ) - - else: - self._knn_search_index.prepare() - self._knn_search_index.update(X) - # Access raw_data via getattr to avoid accessing private member directly - self._raw_data = getattr(self._knn_search_index, "_raw_data", X) - ( - self._knn_indices, - self._knn_dists, - ) = 
self._knn_search_index.neighbor_graph - - if (self._sparse_data and self.metric in pynn_sparse_named_distances) or ( - not self._sparse_data and self.metric in pynn_named_distances - ): - nn_metric = self.metric - else: - nn_metric = self._input_distance_func - - self.graph_, self._sigmas, self._rhos = fuzzy_simplicial_set( - self._raw_data, - self.n_neighbors, - random_state, - nn_metric, - self._metric_kwds, - self._knn_indices, - self._knn_dists, - self.angular_rp_forest, - self.set_op_mix_ratio, - self.local_connectivity, - True, - self.verbose, - ) - - init = np.zeros( - (self._raw_data.shape[0], self.n_components), - dtype=np.float32, - ) - init[:original_size] = self.embedding_ - init_update(init, original_size, self._knn_indices) - - n_epochs = 0 if self.n_epochs is None else self.n_epochs - - self.embedding_, aux_data = simplicial_set_embedding( - self._raw_data, - self.graph_, - self.n_components, - self._initial_alpha, - self._a, - self._b, - self.repulsion_strength, - self.negative_sample_rate, - n_epochs, - init, - random_state, - self._input_distance_func, - self._metric_kwds, - self.densmap, - self._densmap_kwds, - self.output_dens, - self._output_distance_func, - self._output_metric_kwds, - self.output_metric in ("euclidean", "l2"), - self.random_state is None, - self.verbose, - tqdm_kwds=self.tqdm_kwds, - ) - - if self.output_dens: - self.rad_orig_ = aux_data["rad_orig"] - self.rad_emb_ = aux_data["rad_emb"] - - def __repr__(self) -> str: - import re - - from sklearn.utils._pprint import _EstimatorPrettyPrinter - - pp = _EstimatorPrettyPrinter( - compact=True, - indent=1, - indent_at_name=True, - n_max_elements_to_show=50, - ) - # Set changed_only via setattr to avoid accessing private member directly - pp._changed_only = True - repr_ = pp.pformat(self) - repr_ = re.sub("tqdm_kwds={.*},", "", repr_, flags=re.DOTALL) - # remove empty lines - repr_ = re.sub("\n *\n", "\n", repr_, flags=re.DOTALL) - # remove extra whitespaces after a comma - return 
re.sub(", +", ", ", repr_) diff --git a/squeeze/utils.py b/squeeze/utils.py index 57b04fc..2fedcad 100644 --- a/squeeze/utils.py +++ b/squeeze/utils.py @@ -15,7 +15,7 @@ from sklearn.utils.validation import check_is_fitted if TYPE_CHECKING: - from squeeze.umap_ import UMAP + from squeeze.umap import UMAP @numba.njit(parallel=True) diff --git a/src/lib.rs b/src/lib.rs index a6fcfce..0e071fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,6 +18,7 @@ pub mod lle; pub mod phate; pub mod trimap; pub mod pacmap; +pub mod umap; #[cfg(not(test))] #[pyo3::pymodule] @@ -35,6 +36,7 @@ fn _hnsw_backend(_py: pyo3::Python, m: &pyo3::Bound<'_, pyo3::types::PyModule>) m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/src/umap.rs b/src/umap.rs new file mode 100644 index 0000000..69d95fe --- /dev/null +++ b/src/umap.rs @@ -0,0 +1,779 @@ +//! UMAP (Uniform Manifold Approximation and Projection) implementation. +//! +//! This provides a Rust-backed UMAP variant intended for fast CPU execution. +//! The implementation focuses on the standard Euclidean-output UMAP objective +//! (fuzzy set cross-entropy with negative sampling). 
+
+use std::collections::BinaryHeap;
+
+use ndarray::Array2;
+use numpy::{IntoPyArray, PyArray2, PyReadonlyArray2};
+use ordered_float::OrderedFloat;
+use pyo3::exceptions::PyValueError;
+use pyo3::prelude::*;
+use rand::prelude::*;
+use rand::SeedableRng;
+use rand_distr::Normal;
+use rayon::prelude::*;
+
+use crate::hnsw_algo::Hnsw;
+use crate::metrics;
+use crate::metrics_simd;
+
+const GRAD_CLIP: f32 = 4.0;
+
+#[inline]
+fn clip(x: f32) -> f32 {
+    x.clamp(-GRAD_CLIP, GRAD_CLIP)
+}
+
+#[derive(Clone, Copy)]
+enum DistanceMetric {
+    Euclidean,
+    Manhattan,
+    Cosine,
+    Chebyshev,
+    Minkowski { p: f32 },
+    Hamming,
+}
+
+impl DistanceMetric {
+    fn parse(name: &str, p: f32) -> Option<Self> {
+        match name {
+            "euclidean" | "l2" => Some(Self::Euclidean),
+            "manhattan" | "l1" | "taxicab" => Some(Self::Manhattan),
+            "cosine" | "correlation" => Some(Self::Cosine),
+            "chebyshev" | "linfinity" => Some(Self::Chebyshev),
+            "minkowski" => Some(Self::Minkowski { p }),
+            "hamming" => Some(Self::Hamming),
+            _ => None,
+        }
+    }
+
+    #[inline]
+    fn dist(self, a: &[f32], b: &[f32]) -> f32 {
+        match self {
+            Self::Euclidean => metrics_simd::euclidean(a, b).unwrap_or(f32::MAX),
+            Self::Manhattan => metrics_simd::manhattan(a, b).unwrap_or(f32::MAX),
+            Self::Cosine => metrics_simd::cosine(a, b).unwrap_or(f32::MAX),
+            Self::Chebyshev => metrics::chebyshev(a, b).unwrap_or(f32::MAX),
+            Self::Minkowski { p } => metrics::minkowski(a, b, p).unwrap_or(f32::MAX),
+            Self::Hamming => metrics::hamming(a, b).unwrap_or(f32::MAX),
+        }
+    }
+}
+
+/// Rust-backed UMAP implementation.
+/// +/// This is intentionally scoped to the most common UMAP configuration: +/// - kNN graph in input space +/// - Fuzzy simplicial set construction +/// - Euclidean output with SGD + negative sampling +#[pyclass(module = "squeeze._hnsw_backend")] +pub struct UMAPRust { + n_components: usize, + n_neighbors: usize, + n_epochs: usize, + min_dist: f32, + spread: f32, + metric: String, + dist_p: f32, + initial_alpha: f32, + negative_sample_rate: f32, + gamma: f32, + random_state: Option, + + // HNSW tuning (used when n_samples is large) + m: usize, + ef_construction: usize, + ef_search: usize, + + embedding: Option>, +} + +#[pymethods] +impl UMAPRust { + #[new] + #[pyo3( + signature = ( + n_components=2, + n_neighbors=15, + n_epochs=0, + min_dist=0.1, + spread=1.0, + metric="euclidean".to_string(), + dist_p=2.0, + initial_alpha=1.0, + negative_sample_rate=5.0, + gamma=1.0, + random_state=None, + m=16, + ef_construction=200, + ef_search=50, + ) + )] + pub fn new( + n_components: usize, + n_neighbors: usize, + n_epochs: usize, + min_dist: f32, + spread: f32, + metric: String, + dist_p: f32, + initial_alpha: f32, + negative_sample_rate: f32, + gamma: f32, + random_state: Option, + m: usize, + ef_construction: usize, + ef_search: usize, + ) -> Self { + Self { + n_components, + n_neighbors, + n_epochs, + min_dist, + spread, + metric, + dist_p, + initial_alpha, + negative_sample_rate, + gamma, + random_state, + m, + ef_construction, + ef_search, + embedding: None, + } + } + + /// Fit the model and return the embedding. 
+ pub fn fit_transform<'py>( + &mut self, + py: Python<'py>, + data: PyReadonlyArray2, + ) -> PyResult>> { + let x = data.as_array(); + let n_samples = x.nrows(); + let n_features = x.ncols(); + + if n_samples < 2 { + return Err(PyValueError::new_err("UMAP requires at least 2 samples")); + } + if self.n_neighbors == 0 { + return Err(PyValueError::new_err("n_neighbors must be at least 1")); + } + if self.n_neighbors >= n_samples { + return Err(PyValueError::new_err(format!( + "n_neighbors ({}) must be less than n_samples ({})", + self.n_neighbors, n_samples + ))); + } + if self.n_components == 0 { + return Err(PyValueError::new_err("n_components must be at least 1")); + } + if self.min_dist < 0.0 { + return Err(PyValueError::new_err("min_dist must be >= 0")); + } + if self.spread <= 0.0 { + return Err(PyValueError::new_err("spread must be > 0")); + } + + let metric = DistanceMetric::parse(self.metric.as_str(), self.dist_p).ok_or_else(|| { + PyValueError::new_err(format!( + "Unknown metric '{}'. Supported metrics: euclidean, manhattan, cosine, correlation, chebyshev, minkowski, hamming", + self.metric + )) + })?; + + // Copy input to a contiguous Vec> for fast distance computations. 
+ let mut data_f32 = Vec::with_capacity(n_samples); + for row in x.rows() { + let mut v = Vec::with_capacity(n_features); + for &val in row.iter() { + v.push(val as f32); + } + data_f32.push(v); + } + + let (knn_indices, knn_dists) = self.compute_knn(&data_f32, metric); + let (sigmas, rhos) = smooth_knn_dist(&knn_dists, self.n_neighbors, 1.0, 1.0); + let (head, tail, weights) = compute_membership_strengths( + &knn_indices, + &knn_dists, + &sigmas, + &rhos, + n_samples, + ); + + let n_epochs = if self.n_epochs == 0 { + if n_samples <= 10_000 { 500 } else { 200 } + } else { + self.n_epochs + }; + + let (a, b) = find_ab_params(self.spread, self.min_dist); + + // Initialize embedding + let mut embedding = self.initialize_embedding(n_samples); + scale_to_10(&mut embedding, n_samples, self.n_components); + + optimize_embedding( + &mut embedding, + n_samples, + self.n_components, + &head, + &tail, + &weights, + n_epochs, + a, + b, + self.gamma, + self.initial_alpha, + self.negative_sample_rate, + self.random_state, + ); + + let embedding_arr = Array2::from_shape_vec((n_samples, self.n_components), embedding) + .map_err(|e| PyValueError::new_err(format!("Failed to shape embedding: {}", e)))?; + + self.embedding = Some(embedding_arr.clone()); + Ok(embedding_arr.into_pyarray_bound(py)) + } + + /// Fit the model, storing the embedding in `embedding_`. 
+ pub fn fit(&mut self, py: Python<'_>, data: PyReadonlyArray2) -> PyResult<()> { + self.fit_transform(py, data)?; + Ok(()) + } + + #[getter] + pub fn embedding_<'py>(&self, py: Python<'py>) -> PyResult>> { + let emb = self + .embedding + .as_ref() + .ok_or_else(|| PyValueError::new_err("UMAPRust not fitted"))?; + Ok(emb.clone().into_pyarray_bound(py)) + } +} + +impl UMAPRust { + fn initialize_embedding(&self, n_samples: usize) -> Vec { + let mut rng: StdRng = match self.random_state { + Some(seed) => StdRng::seed_from_u64(seed), + None => StdRng::from_seed(rand::random()), + }; + + let normal = Normal::new(0.0, 1e-4).unwrap(); + let mut embedding = vec![0.0f32; n_samples * self.n_components]; + for v in embedding.iter_mut() { + *v = normal.sample(&mut rng); + } + embedding + } + + fn compute_knn( + &self, + data: &[Vec], + metric: DistanceMetric, + ) -> (Vec>, Vec>) { + let n_samples = data.len(); + + // Exact kNN is fast enough for typical datasets used in tests/benchmarks + // (e.g. sklearn Digits), and provides better determinism and quality. 
+ if n_samples <= 4096 { + return exact_knn(data, self.n_neighbors, metric); + } + + hnsw_knn( + data, + self.n_neighbors, + metric, + self.m, + self.ef_construction, + self.ef_search.max(self.n_neighbors), + self.random_state.unwrap_or(42), + ) + } +} + +fn exact_knn( + data: &[Vec], + k: usize, + metric: DistanceMetric, +) -> (Vec>, Vec>) { + let n_samples = data.len(); + + let results: Vec<(Vec, Vec)> = (0..n_samples) + .into_par_iter() + .map(|i| { + let mut heap: BinaryHeap<(OrderedFloat, usize)> = BinaryHeap::with_capacity(k + 1); + for j in 0..n_samples { + if i == j { + continue; + } + let d = metric.dist(&data[i], &data[j]); + if heap.len() < k { + heap.push((OrderedFloat(d), j)); + } else if let Some(&(OrderedFloat(max_d), _)) = heap.peek() { + if d < max_d { + heap.pop(); + heap.push((OrderedFloat(d), j)); + } + } + } + + let mut pairs: Vec<(f32, usize)> = heap + .into_iter() + .map(|(OrderedFloat(d), j)| (d, j)) + .collect(); + pairs.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); + + let mut indices = Vec::with_capacity(k); + let mut dists = Vec::with_capacity(k); + for (d, j) in pairs { + indices.push(j); + dists.push(d); + } + + (indices, dists) + }) + .collect(); + + let mut indices = Vec::with_capacity(n_samples); + let mut dists = Vec::with_capacity(n_samples); + for (idx, dist) in results { + indices.push(idx); + dists.push(dist); + } + + (indices, dists) +} + +fn hnsw_knn( + data: &[Vec], + k: usize, + metric: DistanceMetric, + m: usize, + ef_construction: usize, + ef_search: usize, + seed: u64, +) -> (Vec>, Vec>) { + let n_samples = data.len(); + + let mut hnsw = Hnsw::new(m, ef_construction, n_samples, seed); + + // Distance function used for building the index. 
+ { + let dist_fn = |i: usize, j: usize| -> f32 { metric.dist(&data[i], &data[j]) }; + for i in 0..n_samples { + hnsw.insert(i, &dist_fn); + } + } + + let results: Vec<(Vec, Vec)> = (0..n_samples) + .into_par_iter() + .map(|i| { + let query = &data[i]; + let dist_query = |node_idx: usize| -> f32 { metric.dist(query, &data[node_idx]) }; + let found = hnsw.search(None, k + 1, ef_search, dist_query); + + let mut indices = Vec::with_capacity(k); + let mut dists = Vec::with_capacity(k); + for (j, d) in found { + if j == i { + continue; + } + indices.push(j); + dists.push(d); + if indices.len() >= k { + break; + } + } + + // Padding if needed + while indices.len() < k { + indices.push(i); + dists.push(f32::INFINITY); + } + + (indices, dists) + }) + .collect(); + + let mut indices_all = Vec::with_capacity(n_samples); + let mut dists_all = Vec::with_capacity(n_samples); + for (idx, dist) in results { + indices_all.push(idx); + dists_all.push(dist); + } + + (indices_all, dists_all) +} + +fn smooth_knn_dist( + distances: &[Vec], + k: usize, + local_connectivity: f32, + bandwidth: f32, +) -> (Vec, Vec) { + let n_samples = distances.len(); + let target = (k as f32).log2() * bandwidth; + + let mut sigmas = vec![1.0f32; n_samples]; + let mut rhos = vec![0.0f32; n_samples]; + + for i in 0..n_samples { + let mut non_zero: Vec = distances[i] + .iter() + .copied() + .filter(|&d| d > 0.0 && d.is_finite()) + .collect(); + non_zero.sort_by(|a, b| a.partial_cmp(b).unwrap()); + + if !non_zero.is_empty() { + if (non_zero.len() as f32) >= local_connectivity { + let index = local_connectivity.floor() as usize; + let interpolation = local_connectivity - index as f32; + + if index > 0 { + let mut rho = non_zero[index - 1]; + if interpolation > 1e-5 && index < non_zero.len() { + rho += interpolation * (non_zero[index] - non_zero[index - 1]); + } + rhos[i] = rho; + } else { + rhos[i] = interpolation * non_zero[0]; + } + } else { + rhos[i] = *non_zero.last().unwrap(); + } + } + + let mut lo = 
0.0f32; + let mut hi = f32::INFINITY; + let mut mid = 1.0f32; + + for _ in 0..64 { + let mut psum = 0.0f32; + for &d in distances[i].iter() { + if !d.is_finite() { + continue; + } + let mut v = d - rhos[i]; + if v > 0.0 { + v = (-v / mid).exp(); + } else { + v = 1.0; + } + psum += v; + } + + if (psum - target).abs() < 1e-5 { + break; + } + + if psum > target { + hi = mid; + mid = (lo + hi) / 2.0; + } else { + lo = mid; + if hi.is_infinite() { + mid *= 2.0; + } else { + mid = (lo + hi) / 2.0; + } + } + + if mid < 1e-6 { + mid = 1e-6; + break; + } + } + + sigmas[i] = mid; + } + + (sigmas, rhos) +} + +fn compute_membership_strengths( + knn_indices: &[Vec], + knn_dists: &[Vec], + sigmas: &[f32], + rhos: &[f32], + n_vertices: usize, +) -> (Vec, Vec, Vec) { + let n_samples = knn_indices.len(); + + // First build directed weights. + let mut directed: Vec<(usize, usize, f32)> = Vec::with_capacity(n_samples * knn_indices[0].len()); + for i in 0..n_samples { + for (j, &nbr) in knn_indices[i].iter().enumerate() { + if nbr >= n_vertices || nbr == i { + continue; + } + + let d = knn_dists[i][j]; + if !d.is_finite() { + continue; + } + + let rho = rhos[i]; + let sigma = sigmas[i].max(1e-6); + let w = if d - rho <= 0.0 { + 1.0 + } else { + (-(d - rho) / sigma).exp() + }; + + if w > 0.0 { + directed.push((i, nbr, w)); + } + } + } + + // Map directed weights for symmetric union. 
+ let mut weight_map = std::collections::HashMap::::with_capacity(directed.len() * 2); + for &(i, j, w) in directed.iter() { + let key = (i as u64) * (n_vertices as u64) + (j as u64); + weight_map.insert(key, w); + } + + let mut head = Vec::with_capacity(directed.len()); + let mut tail = Vec::with_capacity(directed.len()); + let mut weights = Vec::with_capacity(directed.len()); + + for (i, j, w_ij) in directed { + let rev_key = (j as u64) * (n_vertices as u64) + (i as u64); + let w_ji = weight_map.get(&rev_key).copied().unwrap_or(0.0); + let w = w_ij + w_ji - w_ij * w_ji; + if w > 0.0 { + head.push(i); + tail.push(j); + weights.push(w); + } + } + + (head, tail, weights) +} + +fn find_ab_params(spread: f32, min_dist: f32) -> (f32, f32) { + // UMAP-learn typically fits these parameters with scipy curve_fit. + // Here we do a lightweight grid search over plausible values. + let n = 300; + let max_x = spread * 3.0; + + let mut xs = Vec::with_capacity(n); + let mut ys = Vec::with_capacity(n); + + for i in 0..n { + let x = (i as f32) * max_x / ((n - 1) as f32); + let y = if x < min_dist { + 1.0 + } else { + (-(x - min_dist) / spread).exp() + }; + xs.push(x); + ys.push(y); + } + + let mut best_a = 1.5769435; + let mut best_b = 0.8950609; + let mut best_err = f32::INFINITY; + + // Search b in [0.5, 2.0] and a in logspace [0.1, 10] + for bi in 0..40 { + let b = 0.5 + (bi as f32) * (1.5 / 39.0); + for ai in 0..40 { + let log_a = -1.0 + (ai as f32) * (2.0 / 39.0); // [-1, 1] + let a = 10f32.powf(log_a); + + let mut err = 0.0f32; + for (&x, &y) in xs.iter().zip(ys.iter()) { + let denom = 1.0 + a * x.powf(2.0 * b); + let f = 1.0 / denom; + let diff = f - y; + err += diff * diff; + } + + if err < best_err { + best_err = err; + best_a = a; + best_b = b; + } + } + } + + (best_a, best_b) +} + +fn scale_to_10(embedding: &mut [f32], n_samples: usize, dim: usize) { + for d in 0..dim { + let mut min_v = f32::INFINITY; + let mut max_v = f32::NEG_INFINITY; + for i in 0..n_samples { + let 
v = embedding[i * dim + d]; + min_v = min_v.min(v); + max_v = max_v.max(v); + } + + let range = (max_v - min_v).max(1e-6); + for i in 0..n_samples { + let idx = i * dim + d; + embedding[idx] = 10.0 * (embedding[idx] - min_v) / range; + } + } +} + +fn make_epochs_per_sample(weights: &[f32], n_epochs: usize) -> Vec { + let mut result = vec![-1.0f32; weights.len()]; + let max_w = weights + .iter() + .copied() + .fold(0.0f32, |a, b| a.max(b)); + + if max_w <= 0.0 { + return result; + } + + for (i, &w) in weights.iter().enumerate() { + let n_samples = (n_epochs as f32) * (w / max_w); + if n_samples > 0.0 { + result[i] = (n_epochs as f32) / n_samples; + } + } + + result +} + +fn optimize_embedding( + embedding: &mut [f32], + n_vertices: usize, + dim: usize, + head: &[usize], + tail: &[usize], + weights: &[f32], + n_epochs: usize, + a: f32, + b: f32, + gamma: f32, + initial_alpha: f32, + negative_sample_rate: f32, + random_state: Option, +) { + let epochs_per_sample = make_epochs_per_sample(weights, n_epochs); + let mut epoch_of_next_sample = epochs_per_sample.clone(); + + let epochs_per_negative_sample: Vec = epochs_per_sample + .iter() + .map(|&e| if e > 0.0 { e / negative_sample_rate } else { -1.0 }) + .collect(); + let mut epoch_of_next_negative_sample = epochs_per_negative_sample.clone(); + + let mut rng: StdRng = match random_state { + Some(seed) => StdRng::seed_from_u64(seed), + None => StdRng::from_seed(rand::random()), + }; + + for n in 0..n_epochs { + let alpha = initial_alpha * (1.0 - (n as f32) / (n_epochs as f32)); + + for i in 0..epochs_per_sample.len() { + if epochs_per_sample[i] <= 0.0 { + continue; + } + if epoch_of_next_sample[i] > (n as f32) { + continue; + } + + let j = head[i]; + let k = tail[i]; + + // Attractive update (move both points) + let mut dist2 = 0.0f32; + for d in 0..dim { + let diff = embedding[j * dim + d] - embedding[k * dim + d]; + dist2 += diff * diff; + } + + let mut grad_coeff = 0.0f32; + if dist2 > 0.0 { + grad_coeff = -2.0 * a * b 
* dist2.powf(b - 1.0); + grad_coeff /= a * dist2.powf(b) + 1.0; + } + + for d in 0..dim { + let diff = embedding[j * dim + d] - embedding[k * dim + d]; + let grad_d = clip(grad_coeff * diff); + embedding[j * dim + d] += grad_d * alpha; + embedding[k * dim + d] -= grad_d * alpha; + } + + epoch_of_next_sample[i] += epochs_per_sample[i]; + + // Negative sampling + let n_neg_samples = if epochs_per_negative_sample[i] > 0.0 { + (((n as f32) - epoch_of_next_negative_sample[i]) / epochs_per_negative_sample[i]) + .floor() + .max(0.0) as usize + } else { + 0 + }; + + for _ in 0..n_neg_samples { + let kk = rng.gen_range(0..n_vertices); + if kk == j { + continue; + } + + let mut dist2 = 0.0f32; + for d in 0..dim { + let diff = embedding[j * dim + d] - embedding[kk * dim + d]; + dist2 += diff * diff; + } + + let mut grad_coeff = 0.0f32; + if dist2 > 0.0 { + grad_coeff = 2.0 * gamma * b; + grad_coeff /= (0.001 + dist2) * (a * dist2.powf(b) + 1.0); + } + + if grad_coeff > 0.0 { + for d in 0..dim { + let diff = embedding[j * dim + d] - embedding[kk * dim + d]; + let grad_d = clip(grad_coeff * diff); + embedding[j * dim + d] += grad_d * alpha; + } + } + } + + epoch_of_next_negative_sample[i] += (n_neg_samples as f32) * epochs_per_negative_sample[i]; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_find_ab_params_returns_finite() { + let (a, b) = find_ab_params(1.0, 0.1); + assert!(a.is_finite()); + assert!(b.is_finite()); + assert!(a > 0.0); + assert!(b > 0.0); + } + + #[test] + fn test_make_epochs_per_sample_basic() { + let weights = vec![0.0, 0.5, 1.0]; + let eps = make_epochs_per_sample(&weights, 100); + assert_eq!(eps.len(), 3); + assert!(eps[0] < 0.0); + assert!(eps[1] > 0.0); + assert!(eps[2] > 0.0); + assert!(eps[2] <= eps[1]); + } +}