Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 145 additions & 0 deletions Sources/Compute/Intramodular/Statistics/CollectionStatistics.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
//
// Copyright (c) Vatsal Manot
//

import Darwin
import Swallow

// MARK: - BinaryFloatingPoint Statistics

extension Collection where Element: BinaryFloatingPoint {

    /// The arithmetic mean of all elements in the collection.
    ///
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n)
    public var mean: Element? {
        guard !isEmpty else { return nil }
        return reduce(.zero, +) / Element(count)
    }

    /// The middle value when elements are sorted in ascending order.
    ///
    /// For even-count collections, returns the average of the two middle values.
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n log n)
    public var median: Element? {
        guard !isEmpty else { return nil }
        let sorted = self.sorted()
        let mid = sorted.count / 2
        if sorted.count % 2 == 0 {
            return (sorted[mid - 1] + sorted[mid]) / 2
        } else {
            return sorted[mid]
        }
    }

    /// The population variance of the collection (sum of squared deviations divided by `count`).
    ///
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n)
    public var variance: Element? {
        guard let m = mean else { return nil }
        let sumOfSquaredDiffs = reduce(.zero) { $0 + ($1 - m) * ($1 - m) }
        return sumOfSquaredDiffs / Element(count)
    }

    /// The population standard deviation of the collection (square root of `variance`).
    ///
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n)
    public var standardDeviation: Element? {
        // `squareRoot()` operates in `Element`'s own precision, so the value is never
        // narrowed (or overflowed) by a round-trip through `Double`.
        variance?.squareRoot()
    }

    /// Returns the value at the given percentile using linear interpolation.
    ///
    /// - Parameter p: A value in the range `0.0...1.0`. For example, `0.9` gives the 90th percentile.
    /// - Returns: The interpolated value at that percentile, or `nil` if the collection is empty or `p` is out of range.
    ///
    /// - Complexity: O(n log n)
    public func percentile(_ p: Double) -> Element? {
        guard !isEmpty, (0.0...1.0).contains(p) else { return nil }
        let sorted = self.sorted()
        guard sorted.count > 1 else { return sorted[0] }
        // Fractional position of the percentile within the sorted values.
        let index = p * Double(sorted.count - 1)
        let lower = Int(index)
        // Clamp so that `p == 1.0` does not read past the last element.
        let upper = Swift.min(lower + 1, sorted.count - 1)
        let fraction = Element(index - Double(lower))
        return sorted[lower] + fraction * (sorted[upper] - sorted[lower])
    }

    /// Returns a new array where all values are linearly scaled to the range `[0, 1]`.
    ///
    /// Returns `nil` if the collection is empty or all values are identical (zero range).
    ///
    /// - Complexity: O(n)
    public func normalized() -> [Element]? {
        guard let minVal = self.min(), let maxVal = self.max() else { return nil }
        let range = maxVal - minVal
        guard range != .zero else { return nil }
        return map { ($0 - minVal) / range }
    }

    /// Returns a new array of z-scores: each value is shifted by the mean and scaled by the standard deviation.
    ///
    /// Returns `nil` if the collection is empty or the standard deviation is zero.
    ///
    /// - Complexity: O(n)
    public func standardized() -> [Element]? {
        guard let m = mean, let sd = standardDeviation, sd != .zero else { return nil }
        return map { ($0 - m) / sd }
    }

    /// Returns the Pearson correlation coefficient between this collection and another of equal length.
    ///
    /// A result of `1.0` indicates perfect positive correlation, `-1.0` perfect negative, and `0.0` no linear correlation.
    ///
    /// - Parameter other: The collection to correlate against; elements are paired positionally.
    /// - Returns: The correlation coefficient, or `nil` if either collection is empty,
    ///   they differ in length, or either has zero standard deviation.
    ///
    /// - Complexity: O(n)
    public func pearsonCorrelation(with other: some Collection<Element>) -> Element? {
        guard count == other.count, !isEmpty else { return nil }
        guard let meanX = mean, let meanY = other.mean else { return nil }

        let numerator = Swift.zip(self, other).reduce(Element.zero) { $0 + ($1.0 - meanX) * ($1.1 - meanY) }
        let sumSquaresX = reduce(Element.zero) { $0 + ($1 - meanX) * ($1 - meanX) }
        let sumSquaresY = other.reduce(Element.zero) { $0 + ($1 - meanY) * ($1 - meanY) }

        // A single square root of the product is exact for perfectly correlated data,
        // but the product can overflow to infinity or underflow to zero for extreme
        // magnitudes. In those cases take the roots separately before multiplying.
        let product = sumSquaresX * sumSquaresY
        let denominator: Element
        if product.isFinite && product != .zero {
            denominator = product.squareRoot()
        } else {
            denominator = sumSquaresX.squareRoot() * sumSquaresY.squareRoot()
        }

        guard denominator != .zero else { return nil }
        return numerator / denominator
    }
}

// MARK: - Comparable & Hashable Statistics

extension Collection where Element: Comparable & Hashable {

    /// The most frequently occurring element in the collection.
    ///
    /// If multiple elements share the highest frequency, the smallest of them is returned,
    /// so the result does not depend on dictionary iteration order.
    /// Returns `nil` if the collection is empty.
    ///
    /// - Complexity: O(n)
    public var mode: Element? {
        guard !isEmpty else { return nil }
        var counts: [Element: Int] = [:]
        for element in self {
            counts[element, default: 0] += 1
        }
        let winner = counts.max { lhs, rhs in
            if lhs.value != rhs.value {
                return lhs.value < rhs.value
            }
            // Equal frequency: rank the larger key lower so the smallest key wins.
            return lhs.key > rhs.key
        }
        return winner?.key
    }

    /// A closed range spanning from the minimum to the maximum element in the collection.
    ///
    /// Returns `nil` if the collection is empty, or if the minimum and maximum do not
    /// satisfy `min <= max` (for example a floating-point NaN), since forming a
    /// `ClosedRange` from such bounds would trap.
    ///
    /// - Complexity: O(n)
    public var valueRange: ClosedRange<Element>? {
        guard let lo = self.min(), let hi = self.max(), lo <= hi else { return nil }
        return lo...hi
    }
}
191 changes: 191 additions & 0 deletions Sources/Compute/Intramodular/Statistics/RunningStatistics.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
//
// Copyright (c) Vatsal Manot
//

import Darwin
import Swallow

/// A type that incrementally computes statistical measures using O(1) memory.
///
/// Uses Welford's online algorithm to compute the mean and variance as values
/// are pushed one at a time, without storing the individual values.
/// This makes it suitable for large data streams, file processing, or
/// any context where holding all values in memory is not desirable.
///
/// ```swift
/// var stats = RunningStatistics()
///
/// for temperature in sensorReadings {
/// stats.push(temperature)
/// }
///
/// print(stats.mean) // Optional(21.32)
/// print(stats.standardDeviation) // Optional(1.36)
/// print(stats.min) // Optional(19.5)
/// print(stats.max) // Optional(23.1)
/// ```
public struct RunningStatistics: Sendable {

    // Accumulator state. `_m2` is the running sum of squared deviations from the
    // current mean; `_min`/`_max` start at sentinels that any ordered value replaces.
    private var _count: Int = 0
    private var _mean: Double = 0.0
    private var _m2: Double = 0.0
    private var _min: Double = .infinity
    private var _max: Double = -.infinity

    /// Creates an empty instance with no accumulated values.
    public init() {}

    // MARK: - Observed Properties

    /// The number of values pushed so far.
    public var count: Int {
        _count
    }

    /// Returns `true` if no values have been pushed yet.
    public var isEmpty: Bool {
        _count == 0
    }

    // MARK: - Statistical Properties

    /// The running arithmetic mean, or `nil` if no values have been pushed.
    public var mean: Double? {
        isEmpty ? nil : _mean
    }

    /// The running population variance, or `nil` if no values have been pushed.
    public var variance: Double? {
        isEmpty ? nil : _m2 / Double(_count)
    }

    /// The running population standard deviation, or `nil` if no values have been pushed.
    public var standardDeviation: Double? {
        variance?.squareRoot()
    }

    /// The smallest value pushed so far, or `nil` if no values have been pushed.
    ///
    /// NaN never compares as smaller, so pushing NaN leaves this value unchanged.
    public var min: Double? {
        isEmpty ? nil : _min
    }

    /// The largest value pushed so far, or `nil` if no values have been pushed.
    ///
    /// NaN never compares as larger, so pushing NaN leaves this value unchanged.
    public var max: Double? {
        isEmpty ? nil : _max
    }

    /// A closed range from `min` to `max`.
    ///
    /// Returns `nil` if no values have been pushed, or if no ordered value has been
    /// recorded yet (for example only NaN was pushed), because the bounds would
    /// otherwise be inverted and forming the range would trap.
    public var valueRange: ClosedRange<Double>? {
        guard let lo = min, let hi = max, lo <= hi else { return nil }
        return lo...hi
    }

    // MARK: - Mutation

    /// Incorporates a new `Double` value into the running statistics.
    ///
    /// - Parameter value: The sample to add.
    /// - Complexity: O(1)
    public mutating func push(_ value: Double) {
        _count += 1
        // Welford update: `delta` is measured against the old mean, `delta2` against the new one.
        let delta = value - _mean
        _mean += delta / Double(_count)
        let delta2 = value - _mean
        _m2 += delta * delta2
        if value < _min { _min = value }
        if value > _max { _max = value }
    }

    /// Incorporates a new `BinaryFloatingPoint` value into the running statistics.
    ///
    /// The value is converted to `Double` before being accumulated.
    ///
    /// - Parameter value: The sample to add.
    /// - Complexity: O(1)
    public mutating func push<T: BinaryFloatingPoint>(_ value: T) {
        push(Double(value))
    }

    /// Incorporates every element from a sequence into the running statistics.
    ///
    /// - Parameter values: The samples to add, in iteration order.
    /// - Complexity: O(n)
    public mutating func push<S: Sequence>(_ values: S) where S.Element: BinaryFloatingPoint {
        for value in values {
            push(value)
        }
    }

    /// Resets all accumulated statistics back to the initial empty state.
    public mutating func reset() {
        _count = 0
        _mean = 0.0
        _m2 = 0.0
        _min = .infinity
        _max = -.infinity
    }
}

// MARK: - Merging

extension RunningStatistics: MergeOperatable {

    /// Folds the accumulated state of `other` into this instance.
    ///
    /// The combined mean is the count-weighted average of both means, and the combined
    /// sum of squared deviations adds a correction term proportional to the squared
    /// gap between the two means. Minimum and maximum are widened as needed.
    ///
    /// - Parameter other: The statistics to absorb. An empty `other` leaves `self` untouched.
    /// - Complexity: O(1)
    public mutating func mergeInPlace(with other: RunningStatistics) {
        // Nothing to absorb.
        guard !other.isEmpty else { return }

        // Nothing accumulated locally yet: adopt the other side wholesale.
        if isEmpty {
            self = other
            return
        }

        let totalCount = _count + other._count
        let ownWeight = Double(_count)
        let otherWeight = Double(other._count)
        let totalWeight = Double(totalCount)
        let meanGap = other._mean - _mean

        let mergedMean = (_mean * ownWeight + other._mean * otherWeight) / totalWeight
        let mergedM2 = _m2 + other._m2 + meanGap * meanGap * ownWeight * otherWeight / totalWeight

        _mean = mergedMean
        _m2 = mergedM2
        _count = totalCount
        _min = other._min < _min ? other._min : _min
        _max = other._max > _max ? other._max : _max
    }

    /// Returns the combination of this instance and `other`, leaving both unchanged.
    ///
    /// - Parameter other: The statistics to combine with.
    /// - Returns: A copy of `self` with `other` merged into it.
    /// - Complexity: O(1)
    public func merging(_ other: RunningStatistics) -> RunningStatistics {
        var combined = self
        combined.mergeInPlace(with: other)
        return combined
    }
}

// MARK: - Conformances

extension RunningStatistics: CustomStringConvertible {
    /// A one-line summary: a fixed placeholder when empty, otherwise the count,
    /// the mean and standard deviation to four decimal places, and the raw min/max.
    public var description: String {
        if isEmpty {
            return "RunningStatistics(empty)"
        }

        let sdString: String
        if let deviation = standardDeviation {
            sdString = String(format: "%.4f", deviation)
        } else {
            sdString = "nil"
        }

        return "RunningStatistics(count: \(_count), mean: \(String(format: "%.4f", _mean)), stdDev: \(sdString), min: \(_min), max: \(_max))"
    }
}

extension RunningStatistics: Equatable {
    /// Two instances are equal when every accumulator field (count, mean,
    /// sum of squared deviations, minimum, maximum) compares equal.
    public static func == (lhs: RunningStatistics, rhs: RunningStatistics) -> Bool {
        guard lhs._count == rhs._count else { return false }
        guard lhs._mean == rhs._mean else { return false }
        guard lhs._m2 == rhs._m2 else { return false }
        guard lhs._min == rhs._min else { return false }
        return lhs._max == rhs._max
    }
}

extension RunningStatistics: Hashable {
    /// Feeds the same fields that `==` compares into the hasher: the count first,
    /// then mean, sum of squared deviations, minimum, and maximum, in that order.
    public func hash(into hasher: inout Hasher) {
        hasher.combine(_count)
        for component in [_mean, _m2, _min, _max] {
            hasher.combine(component)
        }
    }
}