diff --git a/src/kete/rust/fitting.rs b/src/kete/rust/fitting.rs deleted file mode 100644 index 7dffbb5..0000000 --- a/src/kete/rust/fitting.rs +++ /dev/null @@ -1,28 +0,0 @@ -//! Basic statistics -use kete_stats::prelude::{Data, UncertainData}; -use pyo3::{PyResult, pyfunction}; - -/// Perform a KS test between two vectors of values. -#[pyfunction] -#[pyo3(name = "ks_test")] -pub fn ks_test_py(sample_a: Vec, sample_b: Vec) -> PyResult { - let sample_a: Data = sample_a - .try_into() - .expect("Sample A did not contain valid data."); - let sample_b: Data = sample_b - .try_into() - .expect("Sample B did not contain valid data."); - Ok(sample_a - .into_sorted() - .two_sample_ks_statistic(&sample_b.into_sorted())) -} - -/// Fit the reduced chi squared value for a collection of data with uncertainties. -#[pyfunction] -#[pyo3(name = "fit_chi2")] -pub fn fit_chi2_py(data: Vec, sigmas: Vec) -> f64 { - let data: UncertainData = (data, sigmas) - .try_into() - .expect("Data or sigmas did not contain valid data."); - data.fit_reduced_chi2().unwrap() -} diff --git a/src/kete/rust/lib.rs b/src/kete/rust/lib.rs index da8a8ec..7c8fd05 100644 --- a/src/kete/rust/lib.rs +++ b/src/kete/rust/lib.rs @@ -30,7 +30,6 @@ use state::PyState; pub mod covariance; pub mod desigs; pub mod elements; -pub mod fitting; pub mod flux; pub mod fovs; pub mod frame; @@ -178,9 +177,6 @@ fn _core(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(state_transition::compute_stm_py, m)?)?; - m.add_function(wrap_pyfunction!(fitting::ks_test_py, m)?)?; - m.add_function(wrap_pyfunction!(fitting::fit_chi2_py, m)?)?; - m.add_function(wrap_pyfunction!(kete_core::cache::cache_path, m)?)?; m.add_function(wrap_pyfunction!(utils::ra_degrees_to_hms_py, m)?)?; diff --git a/src/kete_stats/src/data.rs b/src/kete_stats/src/data.rs index e111555..1aaf937 100644 --- a/src/kete_stats/src/data.rs +++ b/src/kete_stats/src/data.rs @@ -1,6 +1,6 @@ -//! # Statistics +//! # Data //! -//! Commonly used statistical methods. +//! Handling of finite, nonempty datasets for basic statistical calculations. //! // BSD 3-Clause License // @@ -32,8 +32,6 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. use std::{fmt::Debug, ops::Index}; -use crate::fitting::{FittingResult, newton_raphson}; - /// Error types for statistics calculations. #[derive(Debug, Clone, Copy, thiserror::Error)] #[non_exhaustive] @@ -70,16 +68,21 @@ where /// Dataset with associated uncertainties. /// +/// This structure pairs measurements with their one-sigma (1σ) uncertainties, representing +/// the standard deviation of each measurement. All statistical methods using these +/// uncertainties assume they represent Gaussian (normal) errors. +/// /// There is a one-to-one correspondence between values and uncertainties. #[derive(Clone, Debug)] pub struct UncertainData where T: num_traits::Float, { - /// Values of the dataset. + /// Measured values of the dataset. pub values: Data, - /// Uncertainties associated with the dataset. + /// One-sigma (1σ) uncertainties (standard deviations) for each measurement + /// assuming Gaussian errors. pub uncertainties: Data, } @@ -167,8 +170,11 @@ where } /// Find the k-th smallest element in the dataset. + /// + /// If k is larger than the length of the data, it returns the final element. #[must_use] pub fn kth_smallest(&mut self, k: usize) -> T { + let k = k.min(self.len() - 1); quickselect(&mut self.0, k) } @@ -262,17 +268,49 @@ where SortedData(self) } - /// Select a sample of N data points from the dataset. + /// Shuffle the dataset in-place using a Fisher-Yates shuffle. + /// + /// This uses a simple Linear Congruential Generator (LCG) for pseudorandom + /// number generation with the given seed for reproducibility. + /// + /// # Arguments + /// + /// * `seed` - The seed for the random number generator. + pub fn shuffle(&mut self, seed: u64) { + let mut rng_state = seed; + for i in (1..self.0.len()).rev() { + rng_state = rng_state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let j = (rng_state as usize) % (i + 1); + self.0.swap(i, j); + } + } + + /// Select a sample of N data points from the dataset, this always returns at + /// least one data point. This does not panic. /// /// If more data is requested than exists, the full dataset is returned. /// + /// If 0 data points are requested, a single data point is returned. + /// /// This assumes the data is nearly IID, samples at nearly even step sizes. #[must_use] pub fn sample_n(&self, n: usize) -> Self { + if n == 0 { + // Cannot sample 0 elements, return a single element + return Self::new_unchecked(vec![self.0[0]].into_boxed_slice()); + } + if n >= self.len() { return Self::new_unchecked(self.0.clone()); } + if n == 1 { + // Special case to avoid division by zero + return Self::new_unchecked(vec![self.0[0]].into_boxed_slice()); + } + let mut sampled_data = Vec::with_capacity(n); let step = (self.len() - 1) as f64 / (n - 1) as f64; for i in 0..n { @@ -292,12 +330,19 @@ where /// /// Exits early if no data points are removed in an iteration (convergence). /// + /// If all data points are filtered out, returns a dataset containing only the median + /// value to maintain the invariant that `Data` is never empty. + /// /// # Arguments /// * `lower_std` - Lower standard deviation threshold. /// * `upper_std` - Upper standard deviation threshold. /// * `n_iter` - Number of iterations to perform. #[must_use] pub fn sigma_clip(&self, lower_std: T, upper_std: T, n_iter: usize) -> Self { + // to avoid confusion with users passing negative std values + let lower_std = -lower_std.abs(); + let upper_std = upper_std.abs(); + let mut clipped_data = self.0.to_vec(); for _ in 0..n_iter { let prev_len = clipped_data.len(); @@ -314,6 +359,13 @@ where if clipped_data.len() == prev_len { break; } + + // If all points were filtered out, return median value to maintain invariant + if clipped_data.is_empty() { + let mut median_data = Self::new_unchecked(self.0.clone()); + let median = median_data.median(); + return Self::new_unchecked(vec![median].into_boxed_slice()); + } } Self::new_unchecked(clipped_data.into_boxed_slice()) } @@ -323,60 +375,109 @@ impl UncertainData where T: num_traits::Float + num_traits::NumAssignOps + std::iter::Sum, { - /// Compute the reduced chi squared value from known values and standard deviations. - /// This computes the reduced chi squared against a single desired value. - #[inline(always)] - pub fn reduced_chi2(&self, val: T) -> T { - self.values + /// Compute the weighted mean using inverse variance weighting (1/σ²). + /// + /// This is the optimal estimator for combining measurements with different uncertainties, + /// and is mathematically equivalent to the value that minimizes the reduced chi-squared + /// statistic. This provides the same result as fitting via chi-squared minimization + /// but is computed directly without iterative optimization. + /// + /// # Formula + /// ``weighted_mean = Σ(x_i / σ_i²) / Σ(1 / σ_i²)`` + #[must_use] + pub fn weighted_mean(&self) -> T { + let mut sum_weights = T::zero(); + let mut sum_weighted_values = T::zero(); + + for (value, sigma) in self.values.0.iter().zip(self.uncertainties.0.iter()) { + let weight = T::one() / sigma.powi(2); + sum_weighted_values += *value * weight; + sum_weights += weight; + } + + sum_weighted_values / sum_weights + } + + /// Compute the weighted variance using inverse variance weighting. + /// + /// # Formula + /// ``weighted_variance = 1 / Σ(1 / σ_i²)`` + #[must_use] + pub fn weighted_variance(&self) -> T { + let sum_weights: T = self + .uncertainties .0 .iter() - .zip(self.uncertainties.0.iter()) - .map(|(d, sigma)| ((*d - val) / *sigma).powi(2)) - .sum::() + .map(|sigma| T::one() / sigma.powi(2)) + .sum(); + + T::one() / sum_weights } - /// Compute the derivative of reduced chi squared value with respect to the set value. - #[inline(always)] - fn reduced_chi2_der(&self, val: T) -> T { - let two = T::from(2.0).unwrap(); - self.values + /// Compute the weighted standard deviation. + /// + /// This is the square root of the weighted variance and represents the + /// uncertainty in the weighted mean. + #[must_use] + pub fn weighted_std(&self) -> T { + self.weighted_variance().sqrt() + } + + /// Compute the effective sample size accounting for varying uncertainties. + /// + /// When all uncertainties are equal, this equals the number of samples. + /// When uncertainties vary, this is reduced based on the variance of the weights. + /// + /// # Formula + /// ``n_eff = (Σw_i)² / Σ(w_i²) where w_i = 1/σ_i²`` + #[must_use] + pub fn effective_sample_size(&self) -> T { + let weights: Vec = self + .uncertainties .0 .iter() - .zip(self.uncertainties.0.iter()) - .map(|(d, sigma)| two * (val - *d) / sigma.powi(2)) - .sum::() + .map(|sigma| T::one() / sigma.powi(2)) + .collect(); + + let sum_weights: T = weights.iter().copied().sum(); + let sum_weights_squared: T = weights.iter().map(|w| w.powi(2)).sum(); + + sum_weights.powi(2) / sum_weights_squared } - /// Compute the second derivative of reduced chi squared value with respect to the set value. + /// Compute the reduced chi squared value from known values and standard deviations. + /// This computes the reduced chi squared against a single desired value. #[inline(always)] - fn reduced_chi2_der_der(&self) -> T { - let two = T::from(2.0).unwrap(); - self.uncertainties + pub fn reduced_chi2(&self, val: T) -> T { + self.values .0 .iter() - .map(|sigma| two / sigma.powi(2)) + .zip(self.uncertainties.0.iter()) + .map(|(d, sigma)| ((*d - val) / *sigma).powi(2)) .sum::() } - /// Given a collection of data and standard deviations, fit the best reduced chi squared value - /// for the provided data. + /// Shuffle both values and uncertainties in-place using a Fisher-Yates shuffle. /// - /// # Errors - /// [`crate::fitting::ConvergenceError`] may be returned if newton raphson fails to converge. - #[allow( - clippy::missing_panics_doc, - reason = "By construction this cannot panic." - )] - pub fn fit_reduced_chi2(&self) -> FittingResult { - let n_sigmas = T::from(self.uncertainties.0.len()).unwrap(); - let cost = |val: T| -> T { self.reduced_chi2_der(val) / n_sigmas }; - let der = |_: T| -> T { self.reduced_chi2_der_der() / n_sigmas }; - newton_raphson( - cost, - der, - self.values.0[0], - T::epsilon() * T::from(1000.0).unwrap(), - ) + /// This maintains the one-to-one correspondence between values and their + /// uncertainties by applying the same permutation to both arrays. + /// + /// This uses a simple Linear Congruential Generator (LCG) for pseudorandom + /// number generation with the given seed for reproducibility. + /// + /// # Arguments + /// + /// * `seed` - The seed for the random number generator. + pub fn shuffle(&mut self, seed: u64) { + let mut rng_state = seed; + for i in (1..self.values.0.len()).rev() { + rng_state = rng_state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let j = (rng_state as usize) % (i + 1); + self.values.0.swap(i, j); + self.uncertainties.0.swap(i, j); + } } /// Length of the dataset. @@ -623,10 +724,10 @@ where let mut j = arr.len() - 1; loop { - while arr[i] < pivot { + while i < arr.len() && arr[i] < pivot { i += 1; } - while arr[j] > pivot { + while j > 0 && arr[j] > pivot { j -= 1; } if i >= j { @@ -668,6 +769,17 @@ where } } +impl TryFrom> for Data +where + T: num_traits::Float, +{ + type Error = DataError; + + fn try_from(value: Box<[T]>) -> Result { + value.into_vec().try_into() + } +} + impl TryFrom> for Data where T: Copy + num_traits::Float, @@ -686,6 +798,28 @@ where } } +impl TryFrom> for SortedData +where + T: Copy + num_traits::Float + num_traits::float::TotalOrder + num_traits::NumAssignOps + Debug, +{ + type Error = DataError; + + fn try_from(value: Vec) -> Result { + Data::try_from(value).map(Data::into_sorted) + } +} + +impl TryFrom<&[T]> for SortedData +where + T: num_traits::Float + num_traits::float::TotalOrder + num_traits::NumAssignOps + Debug, +{ + type Error = DataError; + + fn try_from(value: &[T]) -> Result { + Data::try_from(value).map(Data::into_sorted) + } +} + impl TryFrom<(&[T], &[T])> for UncertainData where T: Copy + num_traits::Float + num_traits::float::TotalOrder + num_traits::NumAssignOps + Debug, @@ -893,6 +1027,24 @@ mod tests { assert_eq!(sampled[0], 1.0); } + #[test] + fn test_sample_n_edge_case_n_is_one() { + // Edge case: sampling exactly 1 element (tests division by zero fix) + let data: Data<_> = vec![1.0, 2.0, 3.0, 4.0, 5.0].try_into().unwrap(); + let sampled = data.sample_n(1); + assert_eq!(sampled.len(), 1); + // Should return the first element + assert_eq!(sampled[0], 1.0); + } + + #[test] + fn test_kth_smallest_out_of_bounds() { + // Test that kth_smallest clamps to valid range when k is out of bounds + let mut data: Data<_> = vec![1.0, 2.0, 3.0].try_into().unwrap(); + // k=5 is out of bounds for length 3, should return the last element + assert_eq!(data.kth_smallest(5), 3.0); + } + #[test] fn test_two_sample_ks_statistic_identical() { let data1: Data<_> = vec![1.0, 2.0, 3.0, 4.0, 5.0].try_into().unwrap(); @@ -1061,6 +1213,29 @@ mod tests { assert!(clipped.len() <= data.len()); } + #[test] + fn test_sigma_clip_edge_case_all_equal() { + // Edge case: all values are equal (tests quickselect with pivot == all elements) + let data: Data<_> = vec![5.0, 5.0, 5.0, 5.0, 5.0].try_into().unwrap(); + + // Should not panic and should keep all data (std is 0) + let clipped = data.sigma_clip(2.0, 2.0, 1); + assert_eq!(clipped.len(), 5); + } + + #[test] + fn test_sigma_clip_all_filtered() { + // Edge case: threshold so tight that all data is filtered out + let data: Data<_> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0].try_into().unwrap(); + + // Use impossibly tight threshold - should return median value + let clipped = data.sigma_clip(0.001, 0.001, 1); + + // Should return a single element (the median) + assert_eq!(clipped.len(), 1); + assert_eq!(clipped[0], 3.5); // median of [1,2,3,4,5,6] is 3.5 + } + #[test] fn test_uncertain_data_creation() { use super::UncertainData; @@ -1160,60 +1335,216 @@ mod tests { } #[test] - fn test_fit_reduced_chi2_simple() { + fn test_weighted_mean_equal_uncertainties() { use super::UncertainData; - // Test fitting chi2 - should return the mean when uncertainties are equal - let values = vec![4.0, 5.0, 6.0]; - let uncertainties = vec![1.0, 1.0, 1.0]; + // When uncertainties are equal, weighted mean should equal regular mean + let values = vec![1.0, 2.0, 3.0, 4.0, 5.0]; + let uncertainties = vec![1.0, 1.0, 1.0, 1.0, 1.0]; let data: UncertainData = (values.as_slice(), uncertainties.as_slice()) .try_into() .unwrap(); - let fitted = data.fit_reduced_chi2().unwrap(); - // Should converge to the mean (5.0) - assert!((fitted - 5.0).abs() < 1e-6); + let weighted_mean = data.weighted_mean(); + let regular_mean = data.values.mean(); + + assert!((weighted_mean - regular_mean).abs() < 1e-10); + assert!((weighted_mean - 3.0).abs() < 1e-10); } #[test] - fn test_fit_reduced_chi2_weighted() { + fn test_weighted_mean_different_uncertainties() { use super::UncertainData; - // Test fitting with different uncertainties - // The fit should be weighted by 1/sigma^2 + // Test that weighted mean gives more weight to more precise measurements + // Value 1.0 with σ=0.1 should dominate over value 10.0 with σ=10.0 let values = vec![1.0, 10.0]; - let uncertainties = vec![0.1, 10.0]; // First point has much smaller uncertainty + let uncertainties = vec![0.1, 10.0]; + + let data: UncertainData = (values.as_slice(), uncertainties.as_slice()) + .try_into() + .unwrap(); + + let weighted_mean = data.weighted_mean(); + + // Should be much closer to 1.0 than 10.0 + assert!(weighted_mean < 2.0); + assert!((weighted_mean - 1.0).abs() < 1.0); + } + + #[test] + fn test_weighted_std() { + use super::UncertainData; + + // Weighted std should decrease as we add more measurements + let values = vec![5.0, 5.0, 5.0]; + let uncertainties = vec![1.0, 1.0, 1.0]; + + let data: UncertainData = (values.as_slice(), uncertainties.as_slice()) + .try_into() + .unwrap(); + + let weighted_std = data.weighted_std(); + let weighted_var = data.weighted_variance(); + + // Variance should be 1 / (3 * 1/1²) = 1/3 + assert!((weighted_var - 1.0 / 3.0).abs() < 1e-10); + assert!((weighted_std - (1.0 / 3.0_f64.sqrt())).abs() < 1e-10); + } + + #[test] + fn test_effective_sample_size_equal_uncertainties() { + use super::UncertainData; + + // When uncertainties are equal, n_eff should equal n + let values = vec![1.0, 2.0, 3.0, 4.0, 5.0]; + let uncertainties = vec![1.0, 1.0, 1.0, 1.0, 1.0]; + + let data: UncertainData = (values.as_slice(), uncertainties.as_slice()) + .try_into() + .unwrap(); + + let n_eff = data.effective_sample_size(); + + assert!((n_eff - 5.0).abs() < 1e-10); + } + + #[test] + fn test_effective_sample_size_different_uncertainties() { + use super::UncertainData; + + // When uncertainties vary, n_eff < n + let values = vec![1.0, 2.0, 3.0]; + let uncertainties = vec![0.1, 1.0, 10.0]; // Very different uncertainties let data: UncertainData = (values.as_slice(), uncertainties.as_slice()) .try_into() .unwrap(); - let fitted = data.fit_reduced_chi2().unwrap(); - // Should be much closer to 1.0 than 10.0 due to weighting - assert!(fitted < 5.0); - assert!((fitted - 1.0).abs() < 0.5); + let n_eff = data.effective_sample_size(); + + // Should be less than 3 due to varying weights + assert!(n_eff < 3.0); + assert!(n_eff > 0.0); } #[test] - fn test_fit_reduced_chi2_convergence() { + fn test_weighted_mean_minimizes_chi2() { use super::UncertainData; - // Test with more data points to ensure convergence - let values = vec![2.0, 3.0, 4.0, 5.0, 6.0]; - let uncertainties = vec![0.5, 0.5, 0.5, 0.5, 0.5]; + // The weighted mean should minimize the chi2 value + let values = vec![4.0, 5.0, 6.0]; + let uncertainties = vec![1.0, 0.5, 1.0]; let data: UncertainData = (values.as_slice(), uncertainties.as_slice()) .try_into() .unwrap(); - let fitted = data.fit_reduced_chi2().unwrap(); - // Should converge to the mean (4.0) - assert!((fitted - 4.0).abs() < 1e-6); + let weighted_mean = data.weighted_mean(); + let chi2_at_mean = data.reduced_chi2(weighted_mean); + let chi2_slightly_off = data.reduced_chi2(weighted_mean + 0.1); + + // Chi2 should be minimized at the weighted mean + assert!(chi2_at_mean < chi2_slightly_off); + } + + #[test] + fn test_data_shuffle() { + use super::Data; + + // Test that shuffle changes order but preserves elements + let original = vec![1.0, 2.0, 3.0, 4.0, 5.0]; + let mut data = Data::try_from(original.as_slice()).unwrap(); + + // Calculate sum and length before shuffle + let original_sum: f64 = original.iter().sum(); + let original_len = original.len(); + + let (original_mean, original_std) = data.mean_std(); + + data.shuffle(42); + + // Sum and length should be preserved + let shuffled_sum: f64 = data.as_slice().iter().sum(); + assert_eq!(data.len(), original_len); + assert!((original_sum - shuffled_sum).abs() < 1e-10); + + // Order should have changed (with high probability) + let changed = data.as_slice() != original.as_slice(); + assert!(changed); + + // mean and std should be preserved + let (shuffled_mean, shuffled_std) = data.mean_std(); + assert!((original_std - shuffled_std).abs() < 1e-10); + assert!((original_mean - shuffled_mean).abs() < 1e-10); + } + + #[test] + fn test_data_shuffle_reproducibility() { + use super::Data; + + // Test that same seed produces same shuffle + let original = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + + let mut data1 = Data::try_from(original.as_slice()).unwrap(); + let mut data2 = data1.clone(); + + data1.shuffle(12345); + data2.shuffle(12345); + + // Same seed should produce same result + assert_eq!(data1.as_slice(), data2.as_slice()); + + data1.shuffle(111); + data2.shuffle(222); + + // Different seeds should produce different results (with high probability) + assert_ne!(data1.as_slice(), data2.as_slice()); + } + + #[test] + fn test_uncertain_data_shuffle_preserves_pairing() { + use super::UncertainData; + + // Test that shuffle maintains value-uncertainty correspondence + let values = vec![1.0, 2.0, 3.0, 4.0, 5.0]; + let uncertainties = vec![0.1, 0.2, 0.3, 0.4, 0.5]; + + let mut data: UncertainData = (values.as_slice(), uncertainties.as_slice()) + .try_into() + .unwrap(); + + // Store original pairings + let original_pairs: Vec<(f64, f64)> = values + .iter() + .zip(uncertainties.iter()) + .map(|(&v, &u)| (v, u)) + .collect(); + + data.shuffle(999); + + // Check that all original pairs still exist + for i in 0..data.len() { + let pair = (data.values.as_slice()[i], data.uncertainties.as_slice()[i]); + assert!( + original_pairs.contains(&pair), + "Shuffled pair ({}, {}) not found in original pairs", + pair.0, + pair.1 + ); + } + + let mut data2 = data.clone(); - // Verify the chi2 at the fitted value is minimized - let chi2_at_fit = data.reduced_chi2(fitted); - let chi2_slightly_off = data.reduced_chi2(fitted + 0.1); - assert!(chi2_at_fit < chi2_slightly_off); + data.shuffle(54321); + data2.shuffle(54321); + + // Same seed should produce same result + assert_eq!(data.values.as_slice(), data2.values.as_slice()); + assert_eq!( + data.uncertainties.as_slice(), + data2.uncertainties.as_slice() + ); } }