//
// Sources/Compute/Intramodular/Statistics/CollectionStatistics.swift
// (new file — reconstructed from collapsed patch text)
//
// Copyright (c) Vatsal Manot
//

#if canImport(Swallow)
import Swallow
#endif

// MARK: - BinaryFloatingPoint Statistics

extension Collection where Element: BinaryFloatingPoint {
    
    /// The arithmetic mean of all elements in the collection.
    ///
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n)
    public var mean: Element? {
        guard !isEmpty else { return nil }
        return reduce(.zero, +) / Element(count)
    }
    
    /// The middle value when elements are sorted in ascending order.
    ///
    /// For even-count collections, returns the average of the two middle values.
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n log n)
    public var median: Element? {
        guard !isEmpty else { return nil }
        let sorted = self.sorted()
        let mid = sorted.count / 2
        // Even count: average the two central elements; odd count: take the middle one.
        if sorted.count % 2 == 0 {
            return (sorted[mid - 1] + sorted[mid]) / 2
        } else {
            return sorted[mid]
        }
    }
    
    /// The population variance of the collection.
    ///
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n)
    public var variance: Element? {
        guard let m = mean else { return nil }
        let sumOfSquaredDiffs = reduce(.zero) { $0 + ($1 - m) * ($1 - m) }
        return sumOfSquaredDiffs / Element(count)
    }
    
    /// The population standard deviation of the collection.
    ///
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n)
    public var standardDeviation: Element? {
        // `FloatingPoint.squareRoot()` keeps the computation in `Element`'s own
        // precision instead of round-tripping through `Double`, and removes the
        // dependency on a platform math library (Darwin/Glibc) for `sqrt`.
        variance.map { $0.squareRoot() }
    }
    
    /// Returns the value at the given percentile using linear interpolation.
    ///
    /// - Parameter p: A value in the range `0.0...1.0`. For example, `0.9`
    ///   gives the 90th percentile.
    /// - Returns: The interpolated value at that percentile, or `nil` if the
    ///   collection is empty or `p` is out of range.
    ///
    /// - Complexity: O(n log n)
    public func percentile(_ p: Double) -> Element? {
        guard !isEmpty, (0.0...1.0).contains(p) else { return nil }
        let sorted = self.sorted()
        guard sorted.count > 1 else { return sorted[0] }
        let index = p * Double(sorted.count - 1)
        let lower = Int(index)
        let upper = Swift.min(lower + 1, sorted.count - 1)
        let fraction = Element(index - Double(lower))
        return sorted[lower] + fraction * (sorted[upper] - sorted[lower])
    }
    
    /// Returns a new array where all values are linearly scaled to the range `[0, 1]`.
    ///
    /// Returns `nil` if the collection is empty or all values are identical (zero range).
    ///
    /// - Complexity: O(n)
    public func normalized() -> [Element]? {
        guard let minVal = self.min(), let maxVal = self.max() else { return nil }
        let range = maxVal - minVal
        guard range != .zero else { return nil }
        return map { ($0 - minVal) / range }
    }
    
    /// Returns a new array of z-scores: each value is shifted by the mean and
    /// scaled by the standard deviation.
    ///
    /// Returns `nil` if the collection is empty or the standard deviation is zero.
    ///
    /// - Complexity: O(n)
    public func standardized() -> [Element]? {
        guard let m = mean, let sd = standardDeviation, sd != .zero else { return nil }
        return map { ($0 - m) / sd }
    }
    
    /// Returns the Pearson correlation coefficient between this collection and
    /// another of equal length.
    ///
    /// A result of `1.0` indicates perfect positive correlation, `-1.0` perfect
    /// negative, and `0.0` no linear correlation.
    /// Returns `nil` if either collection is empty, they differ in length, or
    /// either has zero standard deviation.
    ///
    /// - Complexity: O(n)
    public func pearsonCorrelation(with other: some Collection<Element>) -> Element? {
        // NOTE(review): the collapsed patch read `some Collection` with no
        // primary associated type — the `<Element>` was evidently stripped
        // during extraction (required for `other.mean` and the element
        // arithmetic below to type-check); restored here.
        guard count == other.count, !isEmpty else { return nil }
        guard let meanX = mean, let meanY = other.mean else { return nil }
        let numerator = Swift.zip(self, other).reduce(.zero) { $0 + ($1.0 - meanX) * ($1.1 - meanY) }
        let denomX = reduce(.zero) { $0 + ($1 - meanX) * ($1 - meanX) }
        let denomY = other.reduce(.zero) { $0 + ($1 - meanY) * ($1 - meanY) }
        let denominator = (denomX * denomY).squareRoot()
        guard denominator != .zero else { return nil }
        return numerator / denominator
    }
}

// MARK: - Mode & Range Statistics

// `mode` only needs `Hashable` and `valueRange` only needs `Comparable`, so the
// original combined `Comparable & Hashable` constraint is split to make each
// member available to strictly more element types (backward-compatible).
extension Collection where Element: Hashable {
    
    /// The most frequently occurring element in the collection.
    ///
    /// If multiple elements share the highest frequency, the one returned is unspecified.
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n)
    public var mode: Element? {
        guard !isEmpty else { return nil }
        var counts: [Element: Int] = [:]
        for element in self {
            counts[element, default: 0] += 1
        }
        return counts.max(by: { $0.value < $1.value })?.key
    }
}

extension Collection where Element: Comparable {
    
    /// A closed range spanning from the minimum to the maximum element in the collection.
    ///
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n)
    public var valueRange: ClosedRange<Element>? {
        // NOTE(review): the collapsed patch read `ClosedRange?` — the generic
        // argument was evidently stripped during extraction; restored here.
        guard let lo = self.min(), let hi = self.max() else { return nil }
        return lo...hi
    }
}

//
// Sources/Compute/Intramodular/Statistics/RunningStatistics.swift
// (new file — reconstructed from collapsed patch text)
//
// Copyright (c) Vatsal Manot
//

// `String(format:)` used by `RunningStatistics.description` is Foundation API.
import Foundation
#if canImport(Darwin)
import Darwin
#endif
#if canImport(Swallow)
import Swallow
#endif
/// A type that incrementally computes statistical measures using O(1) memory.
///
/// Uses Welford's online algorithm to compute the mean and variance as values
/// are pushed one at a time, without storing the individual values.
/// This makes it suitable for large data streams, file processing, or
/// any context where holding all values in memory is not desirable.
///
/// ```swift
/// var stats = RunningStatistics()
///
/// for temperature in sensorReadings {
///     stats.push(temperature)
/// }
///
/// print(stats.mean)              // Optional(21.32)
/// print(stats.standardDeviation) // Optional(1.36)
/// print(stats.min)               // Optional(19.5)
/// print(stats.max)               // Optional(23.1)
/// ```
public struct RunningStatistics: Sendable {
    
    private var _count: Int = 0
    // Running mean of all pushed values (Welford's algorithm).
    private var _mean: Double = 0.0
    // Sum of squared deviations from the running mean (Welford's "M2").
    private var _m2: Double = 0.0
    private var _min: Double = .infinity
    private var _max: Double = -.infinity
    
    /// Creates an empty instance with no accumulated values.
    public init() {}
    
    // MARK: - Observed Properties
    
    /// The number of values pushed so far.
    public var count: Int {
        _count
    }
    
    /// Returns `true` if no values have been pushed yet.
    public var isEmpty: Bool {
        _count == 0
    }
    
    // MARK: - Statistical Properties
    
    /// The running arithmetic mean, or `nil` if no values have been pushed.
    public var mean: Double? {
        _count == 0 ? nil : _mean
    }
    
    /// The running population variance, or `nil` if no values have been pushed.
    public var variance: Double? {
        _count == 0 ? nil : _m2 / Double(_count)
    }
    
    /// The running population standard deviation, or `nil` if no values have been pushed.
    public var standardDeviation: Double? {
        // `FloatingPoint.squareRoot()` removes the dependency on a platform
        // math library (Darwin/Glibc) for `sqrt`.
        variance.map { $0.squareRoot() }
    }
    
    /// The smallest value pushed so far, or `nil` if no values have been pushed.
    public var min: Double? {
        _count == 0 ? nil : _min
    }
    
    /// The largest value pushed so far, or `nil` if no values have been pushed.
    public var max: Double? {
        _count == 0 ? nil : _max
    }
    
    /// A closed range from `min` to `max`, or `nil` if no values have been pushed.
    public var valueRange: ClosedRange<Double>? {
        // NOTE(review): the collapsed patch read `ClosedRange?` — the generic
        // argument was evidently stripped during extraction; restored here.
        guard let lo = min, let hi = max else { return nil }
        return lo...hi
    }
    
    // MARK: - Mutation
    
    /// Incorporates a new `Double` value into the running statistics.
    ///
    /// - Complexity: O(1)
    public mutating func push(_ value: Double) {
        _count += 1
        // Welford's update: `delta` uses the old mean, `delta2` the new one;
        // their product accumulates the sum of squared deviations stably.
        let delta = value - _mean
        _mean += delta / Double(_count)
        let delta2 = value - _mean
        _m2 += delta * delta2
        if value < _min { _min = value }
        if value > _max { _max = value }
    }
    
    /// Incorporates a new `BinaryFloatingPoint` value into the running statistics.
    ///
    /// - Complexity: O(1)
    public mutating func push<T: BinaryFloatingPoint>(_ value: T) {
        // NOTE(review): the collapsed patch read `push(_ value: T)` with no
        // generic parameter list — evidently stripped during extraction;
        // restored here (same below for the `Sequence` overload).
        push(Double(value))
    }
    
    /// Incorporates every element from a sequence into the running statistics.
    ///
    /// - Complexity: O(n)
    public mutating func push<S: Sequence>(_ values: S) where S.Element: BinaryFloatingPoint {
        for value in values {
            push(value)
        }
    }
    
    /// Resets all accumulated statistics back to the initial empty state.
    public mutating func reset() {
        _count = 0
        _mean = 0.0
        _m2 = 0.0
        _min = .infinity
        _max = -.infinity
    }
}

// MARK: - Merging

extension RunningStatistics {
    
    /// Merges another `RunningStatistics` into this one in-place.
    ///
    /// The resulting instance is statistically equivalent to having pushed
    /// all values from both instances into a single `RunningStatistics`.
    ///
    /// Uses the parallel/combined Welford formula for numerically stable merging.
    ///
    /// - Complexity: O(1)
    public mutating func mergeInPlace(with other: RunningStatistics) {
        // Empty operands degenerate to a no-op or a plain copy.
        guard other._count > 0 else { return }
        guard _count > 0 else {
            self = other
            return
        }
        let combinedCount = _count + other._count
        let delta = other._mean - _mean
        _mean = (_mean * Double(_count) + other._mean * Double(other._count)) / Double(combinedCount)
        // Chan et al. parallel-variance combination of the two M2 sums.
        _m2 = _m2 + other._m2 + delta * delta * Double(_count) * Double(other._count) / Double(combinedCount)
        _count = combinedCount
        if other._min < _min { _min = other._min }
        if other._max > _max { _max = other._max }
    }
    
    /// Returns a new `RunningStatistics` that is the combination of both instances.
    ///
    /// - Complexity: O(1)
    public func merging(_ other: RunningStatistics) -> RunningStatistics {
        var copy = self
        copy.mergeInPlace(with: other)
        return copy
    }
}

#if canImport(Swallow)
// `mergeInPlace(with:)` above is assumed to satisfy the protocol requirement —
// TODO(review): confirm against Swallow's `MergeOperatable` declaration.
// Guarding with `canImport` keeps the type usable in stdlib-only contexts.
extension RunningStatistics: MergeOperatable {}
#endif

// MARK: - Conformances

extension RunningStatistics: CustomStringConvertible {
    /// A human-readable summary; mean and standard deviation are formatted to
    /// 4 decimal places, `min`/`max` use `Double`'s default description.
    public var description: String {
        guard _count > 0 else {
            return "RunningStatistics(empty)"
        }
        let sdString = standardDeviation.map { String(format: "%.4f", $0) } ?? "nil"
        return "RunningStatistics(count: \(_count), mean: \(String(format: "%.4f", _mean)), stdDev: \(sdString), min: \(_min), max: \(_max))"
    }
}

// The hand-written `==` and `hash(into:)` compared/combined every stored
// property, which is exactly what the compiler synthesizes — so they are
// replaced by synthesized conformances (declared in the same file as the
// struct, as synthesis requires).
extension RunningStatistics: Hashable {}