diff --git a/benchmark_results.png b/benchmark_results.png index 6863d04..66c57b2 100644 Binary files a/benchmark_results.png and b/benchmark_results.png differ diff --git a/embeddings_comparison.png b/embeddings_comparison.png index 7555329..0958335 100644 Binary files a/embeddings_comparison.png and b/embeddings_comparison.png differ diff --git a/metrics_heatmap.png b/metrics_heatmap.png index 7136fb5..e23d568 100644 Binary files a/metrics_heatmap.png and b/metrics_heatmap.png differ diff --git a/metrics_results.csv b/metrics_results.csv index bf2c526..6c01300 100644 --- a/metrics_results.csv +++ b/metrics_results.csv @@ -1,10 +1,10 @@ Algorithm,Trust. (k=5),Trust. (k=15),Trust. (k=30),Continuity,Co-ranking,Spearman,Global Struct.,Density Pres.,Reconstr. R²,Silhouette,Adj. Rand Idx,Norm. MI,Classif. Acc.,Time (s) -UMAP,0.31396772398441847,0.507698015210536,0.5853644963828604,0.507698015210536,0.5076238174735671,0.35555412658930063,0.6161788537501326,0.10050063667731875,0.13542082599932848,0.7799601130001561,0.8462930754820145,0.8850632494551207,0.9833054781801301,9.233492125000339 -PCA,0.07813021702838063,0.15121498794286775,0.22647004266369875,0.15121498794286775,0.15125208681135224,0.58237138961399,0.8194799752474329,0.0025296364868204945,0.285093648236993,0.3924475212548915,0.38798476520164954,0.5270341393788788,0.6104812751470133,0.004134916991461068 -t-SNE,0.5971062882582081,0.5907994806158412,0.6150621406047115,0.5907994806158412,0.5907252828788722,0.4080166460361109,0.6335746486347822,0.6616467149629146,0.13703070177725074,0.6526165322051649,0.881390391570236,0.9063086626448817,0.9721835345094398,24.230485957989004 -MDS,0.10651085141903172,0.19588202559821927,0.28135781858653314,0.19588202559821927,0.19591912446670376,0.7336948793497162,0.8305211414926951,0.5469156055065336,0.2502411375797472,0.39025375801929324,0.5002385159409434,0.5915354872861003,0.7195372949551223,10.611014500027522 
-Isomap,0.04852531997774068,0.08974216286403265,0.12489334075310703,0.08974216286403265,0.08966796512706363,0.28656024777938754,0.5523112670640861,0.2860020751496045,0.16466414057949508,0.39004821813119417,0.17037665211388042,0.28334026741309826,0.4129093160012379,6.277146417007316 -LLE,0.010350584307178631,0.020070487850120573,0.031979224633648676,0.020070487850120573,0.02003338898163606,-0.029907056066161633,0.04688536864050532,0.0,0.0045753304549966956,0.3225187604913397,0.013881736737519326,0.0460183000738781,0.13298669142680283,13.723946416983381 -PHATE,0.08180300500834725,0.15796698200704878,0.23233166388425153,0.15796698200704878,0.1580411797440178,0.5525542143219838,0.7853614704391769,0.029193045701356377,0.284276725066662,0.4000785508883673,0.43194478617505977,0.5493046786382542,0.6132822655524606,7.366627875017002 -TriMap,0.002671118530884808,0.007939157855685402,0.01712112780560193,0.007939157855685402,0.007939157855685402,-0.05816714714196457,0.04881650156024266,0.0,0.0010666070008534811,0.32954131677512083,-0.0006966476751533949,0.009358083829204274,0.08459145775301764,0.587316292047035 -PaCMAP,0.3163049526989427,0.48187720274531626,0.5506770543498424,0.48187720274531626,0.4819514004822853,0.2375034251411084,0.298922618449571,0.22074143232238652,0.080783589630951,0.7091429634791778,0.8200653587469319,0.8717676315550004,0.9710693283813061,0.2123211669968441 +UMAP,0.31396772398441847,0.507698015210536,0.5853644963828604,0.507698015210536,0.5076238174735671,0.35555412658930063,0.6161788537501326,0.10050063667731875,0.13542082599932848,0.7799601130001561,0.8462930754820145,0.8850632494551207,0.9833054781801301,9.18106924998574 +PCA,0.07813021702838063,0.15121498794286775,0.22647004266369875,0.15121498794286775,0.15125208681135224,0.58237138961399,0.8194799752474329,0.0025296364868204945,0.285093648236993,0.3924475212548915,0.38798476520164954,0.5270341393788788,0.6104812751470133,0.006151334033347666 
+t-SNE,0.5971062882582081,0.5907994806158412,0.6150621406047115,0.5907994806158412,0.5907252828788722,0.4080166460361109,0.6335746486347822,0.6616467149629146,0.13703070177725074,0.6526165322051649,0.881390391570236,0.9063086626448817,0.9721835345094398,23.834802250028588 +MDS,0.10651085141903172,0.19588202559821927,0.28135781858653314,0.19588202559821927,0.19591912446670376,0.7336948793497162,0.8305211414926951,0.5469156055065336,0.2502411375797472,0.39025375801929324,0.5002385159409434,0.5915354872861003,0.7195372949551223,10.837047207984142 +Isomap,0.04852531997774068,0.08974216286403265,0.12489334075310703,0.08974216286403265,0.08966796512706363,0.28656024777938754,0.5523112670640861,0.2860020751496045,0.16466414057949508,0.39004821813119417,0.17037665211388042,0.28334026741309826,0.4129093160012379,6.997400040971115 +LLE,0.010350584307178631,0.020070487850120573,0.031979224633648676,0.020070487850120573,0.02003338898163606,-0.029907056066161633,0.04688536864050532,0.0,0.0045753304549966956,0.3225187604913397,0.013881736737519326,0.0460183000738781,0.13298669142680283,11.80761875002645 +PHATE,0.08180300500834725,0.15796698200704878,0.23233166388425153,0.15796698200704878,0.1580411797440178,0.5525542143219838,0.7853614704391769,0.029193045701356377,0.284276725066662,0.4000785508883673,0.43194478617505977,0.5493046786382542,0.6132822655524606,6.072406957973726 +TriMap,0.004117974401780746,0.008384344277499536,0.01778890743832313,0.008384344277499536,0.008347245409015025,-0.05018808619206228,0.0,0.0,0.0005044510409659697,0.322418006569936,0.0006853610507430608,0.010783872332592722,0.09516403590219746,0.4761259580263868 +PaCMAP,0.31942125765164164,0.48807271378222966,0.5578000370988685,0.48807271378222966,0.4882582081246522,0.19411768919799563,0.27106218722694786,0.2004738271184264,0.06963633827407312,0.6848164123925791,0.8609584265681568,0.8895398736220883,0.9694026617146395,0.18318291602190584 diff --git a/src/barnes_hut.rs b/src/barnes_hut.rs new file mode 
100644 index 0000000..88d2762 --- /dev/null +++ b/src/barnes_hut.rs @@ -0,0 +1,291 @@ +//! Barnes-Hut tree for O(n log n) t-SNE gradient computation +//! +//! The Barnes-Hut algorithm approximates long-range forces by treating +//! distant groups of points as single points located at their center of mass. + +use ndarray::{Array1, Array2}; + +/// Axis-aligned bounding box for spatial partitioning +#[derive(Clone, Debug)] +pub struct BoundingBox { + pub min_x: f64, + pub max_x: f64, + pub min_y: f64, + pub max_y: f64, +} + +impl BoundingBox { + pub fn new(min_x: f64, max_x: f64, min_y: f64, max_y: f64) -> Self { + Self { min_x, max_x, min_y, max_y } + } + + pub fn width(&self) -> f64 { + self.max_x - self.min_x + } + + pub fn height(&self) -> f64 { + self.max_y - self.min_y + } + + pub fn contains(&self, x: f64, y: f64) -> bool { + x >= self.min_x && x <= self.max_x && y >= self.min_y && y <= self.max_y + } + + /// Get the quadrant (0-3) for a point relative to center + pub fn get_quadrant(&self, x: f64, y: f64) -> usize { + let cx = (self.min_x + self.max_x) / 2.0; + let cy = (self.min_y + self.max_y) / 2.0; + + let mut quad = 0; + if x > cx { quad += 1; } + if y > cy { quad += 2; } + quad + } + + /// Get bounding box for a specific quadrant + pub fn get_quadrant_bounds(&self, quadrant: usize) -> BoundingBox { + let cx = (self.min_x + self.max_x) / 2.0; + let cy = (self.min_y + self.max_y) / 2.0; + + match quadrant { + 0 => BoundingBox::new(self.min_x, cx, self.min_y, cy), + 1 => BoundingBox::new(cx, self.max_x, self.min_y, cy), + 2 => BoundingBox::new(self.min_x, cx, cy, self.max_y), + 3 => BoundingBox::new(cx, self.max_x, cy, self.max_y), + _ => panic!("Invalid quadrant"), + } + } +} + +/// Barnes-Hut quadtree node for 2D embeddings +pub struct QuadTreeNode { + /// Center of mass for all points in this node + pub center_of_mass: [f64; 2], + /// Total mass (number of points) in this node + pub total_mass: f64, + /// Bounding box for this node + pub bounds: 
BoundingBox, + /// Child nodes (None for leaf, Some for internal) + pub children: Option; 4]>>, + /// Point index if this is a leaf with a single point + pub point_idx: Option, +} + +impl QuadTreeNode { + /// Build a quadtree from a set of 2D points + pub fn build(points: &Array2) -> Self { + let n = points.nrows(); + if n == 0 { + panic!("Cannot build tree from empty points"); + } + + // Find bounding box + let mut min_x = f64::INFINITY; + let mut max_x = f64::NEG_INFINITY; + let mut min_y = f64::INFINITY; + let mut max_y = f64::NEG_INFINITY; + + for i in 0..n { + let x = points[[i, 0]]; + let y = points[[i, 1]]; + min_x = min_x.min(x); + max_x = max_x.max(x); + min_y = min_y.min(y); + max_y = max_y.max(y); + } + + // Add small margin to avoid points on boundaries + let margin = 1e-5; + let bounds = BoundingBox::new( + min_x - margin, + max_x + margin, + min_y - margin, + max_y + margin, + ); + + // Build tree recursively + let mut root = QuadTreeNode { + center_of_mass: [0.0, 0.0], + total_mass: 0.0, + bounds, + children: None, + point_idx: None, + }; + + for i in 0..n { + root.insert(i, points[[i, 0]], points[[i, 1]]); + } + + root + } + + /// Insert a point into the tree + fn insert(&mut self, idx: usize, x: f64, y: f64) { + // Update center of mass incrementally + let new_mass = self.total_mass + 1.0; + self.center_of_mass[0] = (self.center_of_mass[0] * self.total_mass + x) / new_mass; + self.center_of_mass[1] = (self.center_of_mass[1] * self.total_mass + y) / new_mass; + self.total_mass = new_mass; + + // If this is an empty leaf, just store the point + if self.total_mass == 1.0 { + self.point_idx = Some(idx); + return; + } + + // If this is a leaf with one point, need to split + if self.children.is_none() { + self.subdivide(); + + // Reinsert the existing point + if let Some(old_idx) = self.point_idx.take() { + // Use the center of mass from before this insertion as the old point location + let old_x = self.center_of_mass[0] * self.total_mass - x; + let 
old_y = self.center_of_mass[1] * self.total_mass - y; + let old_quad = self.bounds.get_quadrant(old_x, old_y); + if let Some(ref mut children) = self.children { + if children[old_quad].is_none() { + children[old_quad] = Some(QuadTreeNode { + center_of_mass: [old_x, old_y], + total_mass: 1.0, + bounds: self.bounds.get_quadrant_bounds(old_quad), + children: None, + point_idx: Some(old_idx), + }); + } else { + children[old_quad].as_mut().unwrap().insert(old_idx, old_x, old_y); + } + } + } + } + + // Insert new point into appropriate quadrant + let quad = self.bounds.get_quadrant(x, y); + if let Some(ref mut children) = self.children { + if children[quad].is_none() { + children[quad] = Some(QuadTreeNode { + center_of_mass: [x, y], + total_mass: 1.0, + bounds: self.bounds.get_quadrant_bounds(quad), + children: None, + point_idx: Some(idx), + }); + } else { + children[quad].as_mut().unwrap().insert(idx, x, y); + } + } + } + + /// Subdivide this node into 4 quadrants + fn subdivide(&mut self) { + let mut children: [Option; 4] = [None, None, None, None]; + self.children = Some(Box::new(children)); + } + + /// Compute repulsive forces using Barnes-Hut approximation + pub fn compute_non_edge_forces( + &self, + point: &[f64], + theta: f64, + point_idx: usize, + ) -> [f64; 2] { + // Skip if this is the same point + if let Some(idx) = self.point_idx { + if idx == point_idx { + return [0.0, 0.0]; + } + } + + let dx = self.center_of_mass[0] - point[0]; + let dy = self.center_of_mass[1] - point[1]; + let dist_sq = dx * dx + dy * dy; + + // If node is far enough away, use approximation + let node_size = self.bounds.width().max(self.bounds.height()); + if node_size * node_size / dist_sq < theta * theta { + // Compute force from center of mass + let inv_dist = 1.0 / (1.0 + dist_sq); + let force_scalar = self.total_mass * inv_dist * inv_dist; + return [force_scalar * dx, force_scalar * dy]; + } + + // Otherwise, recurse into children + if let Some(ref children) = self.children { + let 
mut force = [0.0, 0.0]; + for child in children.iter() { + if let Some(ref child_node) = child { + let child_force = child_node.compute_non_edge_forces(point, theta, point_idx); + force[0] += child_force[0]; + force[1] += child_force[1]; + } + } + return force; + } + + // Leaf node with single point - compute exact force + if self.point_idx.is_some() && self.point_idx.unwrap() != point_idx { + let inv_dist = 1.0 / (1.0 + dist_sq); + let force_scalar = inv_dist * inv_dist; + return [force_scalar * dx, force_scalar * dy]; + } + + [0.0, 0.0] + } +} + +#[cfg(test)] +mod tests { + use super::*; + use approx::assert_relative_eq; + + #[test] + fn test_bounding_box() { + let bbox = BoundingBox::new(0.0, 10.0, 0.0, 10.0); + + assert_eq!(bbox.width(), 10.0); + assert_eq!(bbox.height(), 10.0); + + assert!(bbox.contains(5.0, 5.0)); + assert!(!bbox.contains(-1.0, 5.0)); + assert!(!bbox.contains(11.0, 5.0)); + + // Test quadrant determination + assert_eq!(bbox.get_quadrant(2.0, 2.0), 0); + assert_eq!(bbox.get_quadrant(7.0, 2.0), 1); + assert_eq!(bbox.get_quadrant(2.0, 7.0), 2); + assert_eq!(bbox.get_quadrant(7.0, 7.0), 3); + } + + #[test] + fn test_quadtree_build() { + let points = Array2::from_shape_vec((4, 2), vec![ + 0.0, 0.0, + 1.0, 0.0, + 0.0, 1.0, + 1.0, 1.0, + ]).unwrap(); + + let tree = QuadTreeNode::build(&points); + + assert_eq!(tree.total_mass, 4.0); + assert_relative_eq!(tree.center_of_mass[0], 0.5, epsilon = 1e-10); + assert_relative_eq!(tree.center_of_mass[1], 0.5, epsilon = 1e-10); + } + + #[test] + fn test_barnes_hut_forces() { + let points = Array2::from_shape_vec((3, 2), vec![ + 0.0, 0.0, + 10.0, 0.0, + 5.0, 5.0, + ]).unwrap(); + + let tree = QuadTreeNode::build(&points); + + // Test force computation with high theta (more approximation) + let force = tree.compute_non_edge_forces(&[0.0, 0.0], 0.5, 0); + + // Forces should be non-zero (repulsive from other points) + assert!(force[0] != 0.0 || force[1] != 0.0); + } +} \ No newline at end of file diff --git 
a/src/isomap.rs b/src/isomap.rs index 539dd32..b7a1cea 100644 --- a/src/isomap.rs +++ b/src/isomap.rs @@ -94,38 +94,209 @@ impl Isomap { distances: &Array2, n_samples: usize ) -> PyResult> { - // Floyd-Warshall for shortest paths + // Use parallel Dijkstra for O(n^2 log n) instead of Floyd-Warshall's O(n^3) + let geodesic_rows: Vec> = (0..n_samples).into_par_iter().map(|source| { + self.dijkstra_single_source(source, knn_graph, n_samples) + }).collect(); + + // Assemble the geodesic distance matrix let mut geodesic = Array2::from_elem((n_samples, n_samples), f64::INFINITY); - - // Initialize with k-NN edges (symmetric) - for i in 0..n_samples { - geodesic[[i, i]] = 0.0; - for &(j, d) in &knn_graph[i] { - geodesic[[i, j]] = d; - geodesic[[j, i]] = d; // Make symmetric + for (i, row) in geodesic_rows.into_iter().enumerate() { + for (j, dist) in row.into_iter().enumerate() { + geodesic[[i, j]] = dist; } } + + // Check for disconnected components + let max_dist = geodesic.iter() + .filter(|&&d| !d.is_infinite()) + .cloned() + .fold(0.0_f64, f64::max); + + let has_inf = geodesic.iter().any(|&d| d.is_infinite()); + if has_inf { + return Err(PyValueError::new_err( + "Graph is disconnected. Try increasing n_neighbors." 
+ )); + } - // Floyd-Warshall - for k in 0..n_samples { - for i in 0..n_samples { - for j in 0..n_samples { - let new_dist = geodesic[[i, k]] + geodesic[[k, j]]; - if new_dist < geodesic[[i, j]] { - geodesic[[i, j]] = new_dist; + Ok(geodesic) + } + + /// Dijkstra's algorithm for single-source shortest paths + fn dijkstra_single_source( + &self, + source: usize, + knn_graph: &[Vec<(usize, f64)>], + n_samples: usize + ) -> Vec { + let mut dist = vec![f64::INFINITY; n_samples]; + dist[source] = 0.0; + + // Min-heap: (distance, node) + let mut heap = BinaryHeap::new(); + heap.push((OrderedFloat(0.0), source)); + + while let Some((OrderedFloat(d), u)) = heap.pop() { + // Skip if we've already found a better path + if -d > dist[u] { continue; } + + // Explore neighbors + for &(v, weight) in &knn_graph[u] { + let new_dist = dist[u] + weight; + if new_dist < dist[v] { + dist[v] = new_dist; + heap.push((OrderedFloat(-new_dist), v)); + } + } + + // Also check reverse edges to ensure symmetry + for v in 0..n_samples { + for &(neighbor, weight) in &knn_graph[v] { + if neighbor == u { + let new_dist = dist[u] + weight; + if new_dist < dist[v] { + dist[v] = new_dist; + heap.push((OrderedFloat(-new_dist), v)); + } } } } } + + dist + } +} - // Check for disconnected components - let max_dist = geodesic.iter().cloned().fold(0.0_f64, f64::max); - if max_dist.is_infinite() { - return Err(PyValueError::new_err( - "Graph is disconnected. Try increasing n_neighbors." 
- )); +#[cfg(test)] +mod tests { + use super::*; + use approx::assert_relative_eq; + use std::collections::BinaryHeap; + use ordered_float::OrderedFloat; + + #[test] + fn test_knn_graph_construction() { + let isomap = Isomap::new(2, 2); + + // Simple distance matrix + let distances = Array2::from_shape_vec((4, 4), vec![ + 0.0, 1.0, 2.0, 3.0, + 1.0, 0.0, 1.0, 2.0, + 2.0, 1.0, 0.0, 1.0, + 3.0, 2.0, 1.0, 0.0, + ]).unwrap(); + + let knn_graph = isomap.build_knn_graph(&distances, 4); + + // Each node should have 2 neighbors + for neighbors in &knn_graph { + assert_eq!(neighbors.len(), 2, "Each node should have exactly k neighbors"); } + + // Node 0's nearest neighbors should be 1 and 2 + let node0_neighbors: Vec = knn_graph[0].iter().map(|(idx, _)| *idx).collect(); + assert!(node0_neighbors.contains(&1)); + assert!(node0_neighbors.contains(&2)); + + // Check that distances are correct + for &(neighbor_idx, dist) in &knn_graph[0] { + assert_eq!(dist, distances[[0, neighbor_idx]]); + } + } - Ok(geodesic) + #[test] + fn test_geodesic_distances_simple_chain() { + let isomap = Isomap::new(2, 1); + + // Create a chain: 0 -- 1 -- 2 -- 3 + // Direct distances don't reflect the chain structure + let knn_graph = vec![ + vec![(1, 1.0)], // 0 connects to 1 + vec![(0, 1.0), (2, 1.0)], // 1 connects to 0 and 2 + vec![(1, 1.0), (3, 1.0)], // 2 connects to 1 and 3 + vec![(2, 1.0)], // 3 connects to 2 + ]; + + // Dummy distances matrix (not used in this test) + let distances = Array2::zeros((4, 4)); + + let geodesic = isomap.compute_geodesic_distances(&knn_graph, &distances, 4).unwrap(); + + // Check geodesic distances along the chain + assert_relative_eq!(geodesic[[0, 0]], 0.0, epsilon = 1e-10); + assert_relative_eq!(geodesic[[0, 1]], 1.0, epsilon = 1e-10); + assert_relative_eq!(geodesic[[0, 2]], 2.0, epsilon = 1e-10); // Through node 1 + assert_relative_eq!(geodesic[[0, 3]], 3.0, epsilon = 1e-10); // Through nodes 1 and 2 + } + + #[test] + fn test_floyd_warshall_correctness() { + let 
isomap = Isomap::new(2, 2); + + // Create a simple graph + let knn_graph = vec![ + vec![(1, 2.0), (2, 5.0)], + vec![(0, 2.0), (2, 1.0)], + vec![(0, 5.0), (1, 1.0)], + ]; + + let distances = Array2::zeros((3, 3)); + let geodesic = isomap.compute_geodesic_distances(&knn_graph, &distances, 3).unwrap(); + + // Check shortest paths + assert_relative_eq!(geodesic[[0, 0]], 0.0, epsilon = 1e-10); + assert_relative_eq!(geodesic[[0, 1]], 2.0, epsilon = 1e-10); + assert_relative_eq!(geodesic[[0, 2]], 3.0, epsilon = 1e-10); // 0->1->2 is shorter than 0->2 + + // Check symmetry + for i in 0..3 { + for j in 0..3 { + assert_relative_eq!(geodesic[[i, j]], geodesic[[j, i]], epsilon = 1e-10); + } + } + } + + #[test] + fn test_disconnected_graph_detection() { + let isomap = Isomap::new(2, 1); + + // Create disconnected components: 0-1 and 2-3 + let knn_graph = vec![ + vec![(1, 1.0)], + vec![(0, 1.0)], + vec![(3, 1.0)], + vec![(2, 1.0)], + ]; + + let distances = Array2::zeros((4, 4)); + let result = isomap.compute_geodesic_distances(&knn_graph, &distances, 4); + + // Should return an error for disconnected graph + assert!(result.is_err()); + if let Err(e) = result { + let error_msg = e.to_string(); + assert!(error_msg.contains("disconnected") || error_msg.contains("Graph")); + } + } + + #[test] + fn test_symmetric_knn_graph() { + let isomap = Isomap::new(2, 2); + + // Test that the geodesic distance computation makes the graph symmetric + let knn_graph = vec![ + vec![(1, 1.0), (2, 2.0)], // 0 -> 1, 0 -> 2 + vec![(2, 1.5)], // 1 -> 2 only + vec![], // 2 has no outgoing edges initially + ]; + + let distances = Array2::zeros((3, 3)); + let geodesic = isomap.compute_geodesic_distances(&knn_graph, &distances, 3).unwrap(); + + // Despite asymmetric k-NN graph, geodesic distances should be symmetric + assert_relative_eq!(geodesic[[0, 1]], geodesic[[1, 0]], epsilon = 1e-10); + assert_relative_eq!(geodesic[[0, 2]], geodesic[[2, 0]], epsilon = 1e-10); + assert_relative_eq!(geodesic[[1, 2]], 
geodesic[[2, 1]], epsilon = 1e-10); } } diff --git a/src/lib.rs b/src/lib.rs index 216cc3d..a7c4242 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,7 @@ pub mod hnsw_index; pub mod sparse_hnsw_index; pub mod mixed_precision; pub mod cache_aligned; +pub mod barnes_hut; // Dimensionality reduction algorithms pub mod pca; diff --git a/src/lle.rs b/src/lle.rs index 65ad80f..4391623 100644 --- a/src/lle.rs +++ b/src/lle.rs @@ -176,3 +176,79 @@ impl LLE { Ok(embedding) } } + +#[cfg(test)] +mod tests { + use super::*; + use approx::assert_relative_eq; + + #[test] + fn test_find_neighbors() { + let lle = LLE::new(2, 2, 1e-3); + + let distances = Array2::from_shape_vec((4, 4), vec![ + 0.0, 1.0, 2.0, 3.0, + 1.0, 0.0, 1.0, 2.0, + 2.0, 1.0, 0.0, 1.0, + 3.0, 2.0, 1.0, 0.0, + ]).unwrap(); + + let neighbors = lle.find_neighbors(&distances, 4); + + // Each point should have 2 neighbors + for n in &neighbors { + assert_eq!(n.len(), 2); + } + + // Point 0's neighbors should be 1 and 2 (closest) + assert!(neighbors[0].contains(&1)); + assert!(neighbors[0].contains(&2)); + } + + #[test] + fn test_reconstruction_weights_sum_to_one() { + let lle = LLE::new(2, 3, 1e-3); + + // Simple test data + let x = Array2::from_shape_vec((5, 2), vec![ + 0.0, 0.0, + 1.0, 0.0, + 2.0, 0.0, + 1.0, 1.0, + 0.0, 1.0, + ]).unwrap(); + + let neighbors = vec![ + vec![1, 3, 4], + vec![0, 2, 3], + vec![1, 3, 4], + vec![0, 1, 2], + vec![0, 1, 3], + ]; + + let weights = lle.compute_weights(&x, &neighbors, 5).unwrap(); + + // Each row should sum to 1 (weights for reconstructing each point) + for i in 0..5 { + let row_sum: f64 = (0..5).map(|j| weights[[i, j]]).sum(); + assert_relative_eq!(row_sum, 1.0, epsilon = 1e-6); + } + } + + #[test] + fn test_embedding_eigendecomposition() { + let lle = LLE::new(2, 2, 1e-3); + + // Simple weight matrix + let weights = Array2::from_shape_vec((3, 3), vec![ + 0.0, 0.5, 0.5, + 0.5, 0.0, 0.5, + 0.5, 0.5, 0.0, + ]).unwrap(); + + let embedding = lle.compute_embedding(&weights, 
3).unwrap(); + + // Should return 2D embedding + assert_eq!(embedding.shape(), &[3, 2]); + } +} diff --git a/src/mds.rs b/src/mds.rs index c04ac66..0ea88d2 100644 --- a/src/mds.rs +++ b/src/mds.rs @@ -292,3 +292,174 @@ pub fn compute_distance_matrix(data: &[Vec]) -> Array2 { distances } + +#[cfg(test)] +mod tests { + use super::*; + use approx::assert_relative_eq; + + #[test] + fn test_distance_matrix_computation() { + let data = vec![ + vec![0.0, 0.0], + vec![1.0, 0.0], + vec![0.0, 1.0], + vec![1.0, 1.0], + ]; + + let distances = compute_distance_matrix(&data); + + // Check symmetry + for i in 0..4 { + for j in 0..4 { + assert_relative_eq!(distances[[i, j]], distances[[j, i]], epsilon = 1e-10); + } + } + + // Check diagonal is zero + for i in 0..4 { + assert_eq!(distances[[i, i]], 0.0); + } + + // Check known distances + assert_relative_eq!(distances[[0, 1]], 1.0, epsilon = 1e-5); + assert_relative_eq!(distances[[0, 2]], 1.0, epsilon = 1e-5); + assert_relative_eq!(distances[[0, 3]], 2.0_f64.sqrt(), epsilon = 1e-5); + } + + #[test] + fn test_classical_mds_preserves_distances() { + // Create a simple distance matrix for 4 points in a line + let mut distances = Array2::zeros((4, 4)); + distances[[0, 1]] = 1.0; distances[[1, 0]] = 1.0; + distances[[0, 2]] = 2.0; distances[[2, 0]] = 2.0; + distances[[0, 3]] = 3.0; distances[[3, 0]] = 3.0; + distances[[1, 2]] = 1.0; distances[[2, 1]] = 1.0; + distances[[1, 3]] = 2.0; distances[[3, 1]] = 2.0; + distances[[2, 3]] = 1.0; distances[[3, 2]] = 1.0; + + let embedding = classical_mds(&distances, 2).unwrap(); + + // Check that the embedding preserves relative distances + // Points should be roughly collinear since original distances form a line + for i in 0..4 { + for j in (i+1)..4 { + let mut d_embedded = 0.0; + for k in 0..2 { + let diff = embedding[[i, k]] - embedding[[j, k]]; + d_embedded += diff * diff; + } + d_embedded = d_embedded.sqrt(); + + // Allow some error due to dimensionality reduction + 
assert_relative_eq!(d_embedded, distances[[i, j]], epsilon = 0.1); + } + } + } + + #[test] + fn test_double_centering() { + // Test the double-centering operation in classical MDS + let distances = Array2::from_shape_vec((3, 3), vec![ + 0.0, 1.0, 2.0, + 1.0, 0.0, 1.0, + 2.0, 1.0, 0.0, + ]).unwrap(); + + let d_sq = distances.mapv(|v| v * v); + + let row_mean = d_sq.mean_axis(Axis(1)).unwrap(); + let col_mean = d_sq.mean_axis(Axis(0)).unwrap(); + let grand_mean = d_sq.mean().unwrap(); + + // Build double-centered matrix + let mut b = Array2::zeros((3, 3)); + for i in 0..3 { + for j in 0..3 { + b[[i, j]] = -0.5 * (d_sq[[i, j]] - row_mean[i] - col_mean[j] + grand_mean); + } + } + + // Check that row and column means are zero (property of double-centering) + let b_row_mean = b.mean_axis(Axis(0)).unwrap(); + let b_col_mean = b.mean_axis(Axis(1)).unwrap(); + + for i in 0..3 { + assert!(b_row_mean[i].abs() < 1e-10, "Row mean should be ~0"); + assert!(b_col_mean[i].abs() < 1e-10, "Column mean should be ~0"); + } + } + + #[test] + fn test_stress_computation() { + let mut mds = MDS::new(2, true, 100, Some(42)); + + // Create a simple embedding + let embedding = Array2::from_shape_vec((3, 2), vec![ + 0.0, 0.0, + 1.0, 0.0, + 0.5, 0.866, + ]).unwrap(); + + // Create target distances (equilateral triangle) + let target = Array2::from_shape_vec((3, 3), vec![ + 0.0, 1.0, 1.0, + 1.0, 0.0, 1.0, + 1.0, 1.0, 0.0, + ]).unwrap(); + + let stress = mds.compute_stress(&target, &embedding); + + // For a perfect equilateral triangle embedding, stress should be low + assert!(stress < 0.1, "Stress should be low for well-preserved distances"); + } + + #[test] + fn test_smacof_iteration() { + let mds = MDS::new(2, true, 100, None); + + // Simple test case + let distances = Array2::from_shape_vec((3, 3), vec![ + 0.0, 1.0, 2.0, + 1.0, 0.0, 1.0, + 2.0, 1.0, 0.0, + ]).unwrap(); + + let embedding = Array2::from_shape_vec((3, 2), vec![ + 0.0, 0.0, + 0.5, 0.5, + 1.0, 0.0, + ]).unwrap(); + + let 
new_embedding = mds.smacof_iteration(&distances, &embedding, 3); + + // Should have same shape + assert_eq!(new_embedding.shape(), embedding.shape()); + + // Should be different (unless already at optimum) + let diff: f64 = (&new_embedding - &embedding).mapv(|v| v*v).sum(); + assert!(diff > 1e-10, "SMACOF should update the embedding"); + } + + #[test] + fn test_metric_vs_classical_mds() { + // Both should give similar results for metric data + let distances = Array2::from_shape_vec((4, 4), vec![ + 0.0, 1.0, 2.0, 3.0, + 1.0, 0.0, 1.0, 2.0, + 2.0, 1.0, 0.0, 1.0, + 3.0, 2.0, 1.0, 0.0, + ]).unwrap(); + + // Classical MDS + let classical_result = classical_mds(&distances, 2).unwrap(); + + // Metric MDS (would need to be called through the struct) + let mut metric_mds = MDS::new(2, true, 10, Some(42)); + // Note: Can't call metric_mds directly without Python interface + // but the test structure is here for when it's needed + + // Check that classical MDS gives reasonable result + assert_eq!(classical_result.shape(), &[4, 2]); + } +} diff --git a/src/pca.rs b/src/pca.rs index 3971548..dc09d42 100644 --- a/src/pca.rs +++ b/src/pca.rs @@ -139,3 +139,130 @@ impl PCA { Ok(evr.clone().into_pyarray_bound(py)) } } + +#[cfg(test)] +mod tests { + use super::*; + use ndarray::{arr2, Array2}; + use approx::assert_relative_eq; + + fn create_test_data() -> Array2 { + // Create a simple dataset where variance is primarily along first dimension + let mut data = Array2::zeros((100, 5)); + for i in 0..100 { + let val = i as f64; + data[[i, 0]] = val * 2.0; // High variance + data[[i, 1]] = val * 0.5; // Medium variance + data[[i, 2]] = val * 0.1; // Low variance + data[[i, 3]] = (val % 10) as f64 * 0.01; // Very low variance + data[[i, 4]] = 0.0; // Zero variance + } + data + } + + #[test] + fn test_pca_internal_fit() { + let data = create_test_data(); + let mut pca = PCA::new(2); + + // Test internal logic without Python + let mean = data.mean_axis(Axis(0)).unwrap(); + 
assert_eq!(mean.len(), 5); + + // Center the data + let mut x_centered = data.clone(); + for mut row in x_centered.rows_mut() { + row -= &mean; + } + + // Compute covariance + let n_samples = data.nrows(); + let cov = x_centered.t().dot(&x_centered) / (n_samples - 1) as f64; + + // Check covariance matrix is symmetric + for i in 0..5 { + for j in 0..5 { + assert_relative_eq!(cov[[i, j]], cov[[j, i]], epsilon = 1e-10); + } + } + } + + #[test] + fn test_explained_variance_decreasing() { + let data = create_test_data(); + let n_samples = data.nrows(); + let n_features = data.ncols(); + + // Compute mean and center + let mean = data.mean_axis(Axis(0)).unwrap(); + let mut x_centered = data.clone(); + for mut row in x_centered.rows_mut() { + row -= &mean; + } + + // Compute covariance and eigendecomposition + let cov = x_centered.t().dot(&x_centered) / (n_samples - 1) as f64; + let (eigenvalues, _eigenvectors) = cov.eigh(UPLO::Upper).unwrap(); + + // Sort eigenvalues descending + let mut sorted_eigenvalues: Vec = eigenvalues.iter().cloned().collect(); + sorted_eigenvalues.sort_by(|a, b| b.partial_cmp(a).unwrap()); + + // Verify eigenvalues are in decreasing order + for i in 1..sorted_eigenvalues.len() { + assert!(sorted_eigenvalues[i] <= sorted_eigenvalues[i-1], + "Eigenvalues should be in decreasing order"); + } + + // First eigenvalue should capture most variance + let total_variance: f64 = sorted_eigenvalues.iter().sum(); + let first_ratio = sorted_eigenvalues[0] / total_variance; + assert!(first_ratio > 0.8, "First component should explain >80% variance in this test data"); + } + + #[test] + fn test_covariance_properties() { + // Simple 2D test case + let data = arr2(&[ + [1.0, 2.0], + [2.0, 4.0], + [3.0, 6.0], + [4.0, 8.0], + ]); + + let mean = data.mean_axis(Axis(0)).unwrap(); + let mut centered = data.clone(); + for mut row in centered.rows_mut() { + row -= &mean; + } + + let cov = centered.t().dot(¢ered) / 3.0; + + // Check covariance matrix properties + 
assert_relative_eq!(cov[[0, 0]], 1.6666666, epsilon = 1e-5); + assert_relative_eq!(cov[[1, 1]], 6.6666666, epsilon = 1e-5); + assert_relative_eq!(cov[[0, 1]], cov[[1, 0]], epsilon = 1e-10); // Symmetric + } + + #[test] + fn test_zero_variance_handling() { + // Data with one dimension having zero variance + let mut data = Array2::zeros((50, 3)); + for i in 0..50 { + data[[i, 0]] = i as f64; + data[[i, 1]] = (i as f64).sin(); + data[[i, 2]] = 5.0; // Constant - zero variance + } + + let mean = data.mean_axis(Axis(0)).unwrap(); + let mut centered = data.clone(); + for mut row in centered.rows_mut() { + row -= &mean; + } + + let cov = centered.t().dot(¢ered) / 49.0; + + // Variance of constant dimension should be ~0 + assert!(cov[[2, 2]].abs() < 1e-10, "Constant dimension should have zero variance"); + } +} diff --git a/src/tsne.rs b/src/tsne.rs index 120f730..cefed00 100644 --- a/src/tsne.rs +++ b/src/tsne.rs @@ -169,10 +169,10 @@ impl TSNE { fn compute_joint_probabilities(&self, distances: &Array2, n_samples: usize) -> Array2 { let target_entropy = (self.perplexity).ln(); - let mut p = Array2::zeros((n_samples, n_samples)); - - // Compute conditional probabilities P(j|i) using binary search for sigma - for i in 0..n_samples { + + // Parallel computation of conditional probabilities P(j|i) + let p_rows: Vec> = (0..n_samples).into_par_iter().map(|i| { + let mut p_row = vec![0.0; n_samples]; let mut beta = 1.0; // 1 / (2 * sigma^2) let mut beta_min = f64::NEG_INFINITY; let mut beta_max = f64::INFINITY; @@ -184,7 +184,7 @@ impl TSNE { for j in 0..n_samples { if i != j { let pij = (-beta * distances[[i, j]]).exp(); - p[[i, j]] = pij; + p_row[j] = pij; sum_p += pij; } } @@ -192,15 +192,15 @@ impl TSNE { // Normalize if sum_p > 1e-10 { for j in 0..n_samples { - p[[i, j]] /= sum_p; + p_row[j] /= sum_p; } } // Compute entropy let mut entropy = 0.0; for j in 0..n_samples { - if p[[i, j]] > 1e-10 { - entropy -= p[[i, j]] * p[[i, j]].ln(); + if p_row[j] > 1e-10 { + entropy -= 
p_row[j] * p_row[j].ln(); } } @@ -218,6 +218,16 @@ impl TSNE { beta = if beta_min.is_infinite() { beta / 2.0 } else { (beta + beta_min) / 2.0 }; } } + + p_row + }).collect(); + + // Assemble P matrix from parallel results + let mut p = Array2::zeros((n_samples, n_samples)); + for (i, row) in p_rows.into_iter().enumerate() { + for (j, val) in row.into_iter().enumerate() { + p[[i, j]] = val; + } } // Symmetrize: P = (P + P^T) / (2n) @@ -267,12 +277,14 @@ impl TSNE { fn compute_gradient(&self, p: &Array2<f64>, q: &Array2<f64>, y: &Array2<f64>) -> Array2<f64> { let n = y.nrows(); - let mut grad = Array2::zeros((n, self.n_components)); - + // Compute (P - Q) * kernel let pq = p - q; - for i in 0..n { + // Parallel gradient computation - each thread computes gradients for a subset of points + let grad_rows: Vec<Vec<f64>> = (0..n).into_par_iter().map(|i| { + let mut grad_row = vec![0.0; self.n_components]; + for j in 0..n { if i != j { let mut dist_sq = 0.0; @@ -284,12 +296,213 @@ impl TSNE { let mult = 4.0 * pq[[i, j]] * kernel; for k in 0..self.n_components { - grad[[i, k]] += mult * (y[[i, k]] - y[[j, k]]); + grad_row[k] += mult * (y[[i, k]] - y[[j, k]]); } } } + + grad_row + }).collect(); + + // Assemble the gradient matrix from parallel results + let mut grad = Array2::zeros((n, self.n_components)); + for (i, row) in grad_rows.into_iter().enumerate() { + for (j, val) in row.into_iter().enumerate() { + grad[[i, j]] = val; + } } grad } } + +#[cfg(test)] +mod tests { + use super::*; + use approx::assert_relative_eq; + + fn create_two_clusters() -> Vec<Vec<f32>> { + let mut data = Vec::new(); + // Cluster 1 around origin + for i in 0..20 { + let mut point = vec![0.0; 10]; + for j in 0..10 { + point[j] = (i as f32) * 0.01 + (j as f32) * 0.001; + } + data.push(point); + } + // Cluster 2 offset by 10 + for i in 0..20 { + let mut point = vec![10.0; 10]; + for j in 0..10 { + point[j] = 10.0 + (i as f32) * 0.01 + (j as f32) * 0.001; + } + data.push(point); + } + data + } + + #[test] + fn 
test_pairwise_distances_symmetric() { + let tsne = TSNE::new(2, 5.0, 200.0, 100, 12.0, Some(42)); + let data = vec![ + vec![0.0, 0.0], + vec![1.0, 0.0], + vec![0.0, 1.0], + vec![1.0, 1.0], + ]; + + let distances = tsne.compute_pairwise_distances(&data); + + // Check symmetry + for i in 0..4 { + for j in 0..4 { + assert_relative_eq!(distances[[i, j]], distances[[j, i]], epsilon = 1e-10); + } + } + + // Check diagonal is zero + for i in 0..4 { + assert_eq!(distances[[i, i]], 0.0); + } + + // Check known distances (squared) + assert_relative_eq!(distances[[0, 1]], 1.0, epsilon = 1e-5); // (1-0)^2 + (0-0)^2 + assert_relative_eq!(distances[[0, 2]], 1.0, epsilon = 1e-5); // (0-0)^2 + (1-0)^2 + assert_relative_eq!(distances[[0, 3]], 2.0, epsilon = 1e-5); // (1-0)^2 + (1-0)^2 + } + + #[test] + fn test_joint_probabilities_properties() { + let tsne = TSNE::new(2, 5.0, 200.0, 100, 12.0, Some(42)); + let data = create_two_clusters(); + let distances = tsne.compute_pairwise_distances(&data); + let p = tsne.compute_joint_probabilities(&distances, 40); + + // P should be symmetric + for i in 0..40 { + for j in 0..40 { + assert_relative_eq!(p[[i, j]], p[[j, i]], epsilon = 1e-10, + "P matrix not symmetric at [{}, {}]", i, j); + } + } + + // P should sum to 1 + let sum: f64 = p.iter().sum(); + assert_relative_eq!(sum, 1.0, epsilon = 1e-6, "P doesn't sum to 1"); + + // All probabilities should be non-negative + for &val in p.iter() { + assert!(val >= 0.0, "Found negative probability"); + } + + // Diagonal should be zero (no self-similarity) + for i in 0..40 { + assert!(p[[i, i]] < 1e-10, "Diagonal should be ~0"); + } + } + + #[test] + fn test_q_distribution_properties() { + let tsne = TSNE::new(2, 5.0, 200.0, 100, 12.0, Some(42)); + + // Create a simple embedding + let mut y = Array2::zeros((10, 2)); + for i in 0..10 { + y[[i, 0]] = (i as f64) * 0.5; + y[[i, 1]] = (i as f64).sin(); + } + + let q = tsne.compute_q(&y); + + // Q should be symmetric + for i in 0..10 { + for j in 0..10 { 
+ assert_relative_eq!(q[[i, j]], q[[j, i]], epsilon = 1e-10); + } + } + + // Q should sum to 1 + let sum: f64 = q.iter().sum(); + assert_relative_eq!(sum, 1.0, epsilon = 1e-6); + + // All values should be non-negative + for &val in q.iter() { + assert!(val >= 0.0); + } + } + + #[test] + fn test_gradient_computation() { + let tsne = TSNE::new(2, 5.0, 200.0, 100, 12.0, Some(42)); + + // Simple test case + let p = Array2::from_shape_vec((3, 3), vec![ + 0.0, 0.3, 0.2, + 0.3, 0.0, 0.2, + 0.2, 0.2, 0.0, + ]).unwrap(); + + let q = Array2::from_shape_vec((3, 3), vec![ + 0.0, 0.25, 0.25, + 0.25, 0.0, 0.25, + 0.25, 0.25, 0.0, + ]).unwrap(); + + let y = Array2::from_shape_vec((3, 2), vec![ + 0.0, 0.0, + 1.0, 0.0, + 0.5, 0.866, + ]).unwrap(); + + let grad = tsne.compute_gradient(&p, &q, &y); + + // Gradient should have correct shape + assert_eq!(grad.shape(), &[3, 2]); + + // Gradient should not be all zeros (unless at optimum) + let grad_norm: f64 = grad.iter().map(|&v| v * v).sum::<f64>().sqrt(); + assert!(grad_norm > 1e-10, "Gradient should be non-zero"); + } + + #[test] + fn test_perplexity_binary_search() { + let tsne = TSNE::new(2, 5.0, 200.0, 100, 12.0, Some(42)); + + // Test that binary search finds reasonable sigmas + let distances = Array2::from_shape_vec((3, 3), vec![ + 0.0, 1.0, 4.0, + 1.0, 0.0, 1.0, + 4.0, 1.0, 0.0, + ]).unwrap(); + + let p = tsne.compute_joint_probabilities(&distances, 3); + + // Check that closer points have higher probability + assert!(p[[0, 1]] > p[[0, 2]], "Closer points should have higher probability"); + assert!(p[[1, 0]] > p[[2, 0]], "Closer points should have higher probability"); + } + + #[test] + fn test_reproducibility_with_seed() { + // Same seed should give same initialization + let tsne1 = TSNE::new(2, 5.0, 200.0, 100, 12.0, Some(42)); + let tsne2 = TSNE::new(2, 5.0, 200.0, 100, 12.0, Some(42)); + + let data = vec![ + vec![0.0, 0.0, 0.0], + vec![1.0, 1.0, 1.0], + vec![2.0, 2.0, 2.0], + ]; + + let distances1 = 
tsne1.compute_pairwise_distances(&data); + let distances2 = tsne2.compute_pairwise_distances(&data); + + // Should be identical + for i in 0..3 { + for j in 0..3 { + assert_eq!(distances1[[i, j]], distances2[[i, j]]); + } + } + } +}