From ac8238c80041dbf9723d1045c7e03ab43bccaf7c Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 5 Sep 2025 11:04:19 -0700 Subject: [PATCH 1/6] Improvements to integer typing and overflow protection. - Use the sanisizer library to avoid integer overflow, mostly during vector allocations but also in some additions and casts from float. - Slap const'ness on as many variables as possible, for safety. --- CMakeLists.txt | 3 +- cmake/Config.cmake.in | 1 + extern/CMakeLists.txt | 7 + include/umappp/Status.hpp | 14 +- include/umappp/combine_neighbor_sets.hpp | 13 +- include/umappp/find_ab.hpp | 36 ++--- include/umappp/initialize.hpp | 57 ++++---- include/umappp/neighbor_similarities.hpp | 46 ++++--- include/umappp/optimize_layout.hpp | 164 +++++++++++------------ include/umappp/parallelize.hpp | 2 +- include/umappp/spectral_init.hpp | 47 ++++--- 11 files changed, 201 insertions(+), 189 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77149c5..eda873c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,10 +25,11 @@ else() find_package(ltla_aarand CONFIG REQUIRED) find_package(ltla_irlba CONFIG REQUIRED) find_package(ltla_subpar 0.3.1 CONFIG REQUIRED) + find_package(ltla_sanisizer 0.1.3 CONFIG REQUIRED) find_package(knncolle_knncolle 2.3.0 CONFIG REQUIRED) endif() -target_link_libraries(umappp INTERFACE ltla::aarand ltla::irlba ltla::subpar knncolle::knncolle) +target_link_libraries(umappp INTERFACE ltla::aarand ltla::irlba ltla::subpar ltla::sanisizer knncolle::knncolle) # Tests if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in index 4df2b2f..9772c32 100644 --- a/cmake/Config.cmake.in +++ b/cmake/Config.cmake.in @@ -3,6 +3,7 @@ include(CMakeFindDependencyMacro) find_dependency(ltla_aarand CONFIG REQUIRED) find_dependency(ltla_subpar 0.3.1 CONFIG REQUIRED) +find_dependency(ltla_sanisizer 0.1.3 CONFIG REQUIRED) find_dependency(ltla_irlba CONFIG REQUIRED) find_dependency(knncolle_knncolle 2.3.0 CONFIG REQUIRED) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 5ccd313..890dca7 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -12,6 +12,12 @@ FetchContent_Declare( GIT_TAG master # ^0.3.1 ) +FetchContent_Declare( + sanisizer + GIT_REPOSITORY https://github.com/LTLA/sanisizer + GIT_TAG master # ^0.1.3 +) + FetchContent_Declare( knncolle GIT_REPOSITORY https://github.com/knncolle/knncolle @@ -26,5 +32,6 @@ FetchContent_Declare( FetchContent_MakeAvailable(aarand) FetchContent_MakeAvailable(subpar) +FetchContent_MakeAvailable(sanisizer) FetchContent_MakeAvailable(knncolle) FetchContent_MakeAvailable(irlba) diff --git a/include/umappp/Status.hpp b/include/umappp/Status.hpp index d503c67..5db8485 100644 --- a/include/umappp/Status.hpp +++ b/include/umappp/Status.hpp @@ -1,12 +1,14 @@ #ifndef UMAPPP_STATUS_HPP #define UMAPPP_STATUS_HPP -#include "Options.hpp" -#include "optimize_layout.hpp" - #include #include +#include "sanisizer/sanisizer.hpp" + +#include "Options.hpp" +#include "optimize_layout.hpp" + /** * @file Status.hpp * @brief Status of the UMAP algorithm. 
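To make the overflow-protection changes easier to follow, here is a small standalone sketch of the sanisizer idioms this patch leans on. The exact template signatures are assumptions inferred from how the calls are used in these diffs (create<Container>(n) validating the length against the container's size type, product<Size>/product_unsafe<Size> as checked/unchecked multiplication), not a substitute for the library's documentation.

#include <cstddef>
#include <vector>
#include "sanisizer/sanisizer.hpp"

// Sketch only: allocate a flattened num_obs-by-num_dim embedding and index into it
// without risking wraparound in the intermediate arithmetic.
std::vector<double> sketch_allocate_embedding(const int num_obs, const std::size_t num_dim) {
    // Checked product + checked allocation: throws instead of silently wrapping around.
    return sanisizer::create<std::vector<double> >(sanisizer::product<std::size_t>(num_obs, num_dim));
}

double* sketch_observation_start(std::vector<double>& embedding, const int i, const std::size_t num_dim) {
    // Unchecked product, acceptable because the full-sized product above already passed.
    return embedding.data() + sanisizer::product_unsafe<std::size_t>(i, num_dim);
}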
@@ -27,7 +29,7 @@ class Status { /** * @cond */ - Status(internal::EpochData epochs, Options options, std::size_t num_dim, Float_* embedding) : + Status(internal::EpochData epochs, Options options, const std::size_t num_dim, Float_* const embedding) : my_epochs(std::move(epochs)), my_options(std::move(options)), my_engine(my_options.seed), @@ -84,9 +86,9 @@ class Status { * The contents of the new array in `ptr` should be the same as the array that it replaces, as `run()` will continue the iteration from the coordinates inside the array. * This is enforced by default when `copy = true`, but if the supplied `ptr` already contains a copy, the caller may set `copy = false` to avoid extra work */ - void set_embedding(Float_* ptr, bool copy = true) { + void set_embedding(Float_* const ptr, const bool copy = true) { if (copy) { - std::size_t n = num_dimensions() * static_cast(num_observations()); // cast to avoid overflow. + const std::size_t n = sanisizer::product_unsafe(num_dimensions(), num_observations()); std::copy_n(my_embedding, n, ptr); } my_embedding = ptr; diff --git a/include/umappp/combine_neighbor_sets.hpp b/include/umappp/combine_neighbor_sets.hpp index 42e0430..7d3ade3 100644 --- a/include/umappp/combine_neighbor_sets.hpp +++ b/include/umappp/combine_neighbor_sets.hpp @@ -5,6 +5,8 @@ #include #include +#include "sanisizer/sanisizer.hpp" + #include "NeighborList.hpp" namespace umappp { @@ -12,9 +14,10 @@ namespace umappp { namespace internal { template -void combine_neighbor_sets(NeighborList& x, Float_ mix_ratio) { - Index_ num_obs = x.size(); - std::vector last(num_obs), original(num_obs); +void combine_neighbor_sets(NeighborList& x, const Float_ mix_ratio) { + const Index_ num_obs = x.size(); // assume that Index_ is large enough to store the number of observations. + auto last = sanisizer::create >(num_obs); + auto original = sanisizer::create >(num_obs); for (Index_ i = 0; i < num_obs; ++i) { auto& current = x[i]; @@ -22,7 +25,7 @@ void combine_neighbor_sets(NeighborList& x, Float_ mix_ratio) { original[i] = x[i].size(); } - for (Index_ i = 0; i < num_obs; ++i ){ + for (Index_ i = 0; i < num_obs; ++i) { auto& current = x[i]; // Looping through the neighbors and searching for self in each @@ -42,7 +45,7 @@ void combine_neighbor_sets(NeighborList& x, Float_ mix_ratio) { // previous iteration of the outermost loop where i and y.first // swap values. So we skip this to avoid adding it twice. if (i < y.first) { - Float_ product = y.second * target[curlast].second; + const Float_ product = y.second * target[curlast].second; Float_ prob_final; if (mix_ratio == 1) { diff --git a/include/umappp/find_ab.hpp b/include/umappp/find_ab.hpp index 7ae75d1..c8ade26 100644 --- a/include/umappp/find_ab.hpp +++ b/include/umappp/find_ab.hpp @@ -4,6 +4,8 @@ #include #include +#include "sanisizer/sanisizer.hpp" + namespace umappp { namespace internal { @@ -42,13 +44,15 @@ namespace internal { */ template -std::pair find_ab(Float_ spread, Float_ min_dist) { - constexpr Float_ grid = 300; +std::pair find_ab(const Float_ spread, const Float_ min_dist) { + constexpr std::size_t grid = 300; + auto grid_x = sanisizer::create >(grid); + auto grid_y = sanisizer::create >(grid); + auto log_x = sanisizer::create >(grid); // Compute the x and y coordinates of the expected distance curve. 
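For orientation, these are the two curves that find_ab() matches against each other, restated outside the template machinery; the optimizer itself (the derivative accumulation further down) is only summarized here as a Gauss-Newton-style least-squares update.

#include <cmath>

// Reference curve sampled into grid_y[]: flat at 1 up to min_dist, then exponential decay.
inline double reference_curve(const double x, const double spread, const double min_dist) {
    return x <= min_dist ? 1.0 : std::exp(-(x - min_dist) / spread);
}

// Smooth approximation fitted by find_ab(): the same family as fit_y[] = 1 / (1 + A * x^(2B)).
inline double fitted_curve(const double x, const double a, const double b) {
    return 1.0 / (1.0 + a * std::pow(x, 2.0 * b));
}

// find_ab() chooses (a, b) to minimize the sum of squared differences between these two
// curves over 300 grid points on (0, 3 * spread], starting from a closed-form guess at x_half.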
- std::vector grid_x(grid), grid_y(grid), log_x(grid); const Float_ delta = spread * 3 / grid; - for (int g = 0; g < grid; ++g) { + for (std::size_t g = 0; g < grid; ++g) { grid_x[g] = (g + 1) * delta; // +1 to avoid meaningless least squares result at x = 0, where both curves have y = 1 (and also the derivative w.r.t. b is not defined). log_x[g] = std::log(grid_x[g]); grid_y[g] = (grid_x[g] <= min_dist ? 1 : std::exp(- (grid_x[g] - min_dist) / spread)); @@ -59,25 +63,23 @@ std::pair find_ab(Float_ spread, Float_ min_dist) { // We use 'limit = 0.5' because that's where most interesting stuff // happens, given that the curve is bounded between 0 and 1 on the y-axis. constexpr Float_ limit = 0.5; - Float_ x_half = std::log(limit) * -spread + min_dist; // guaranteed > 0, as log(limit) is negative. - Float_ d_half = limit / -spread; // first derivative at x_half. + const Float_ x_half = std::log(limit) * -spread + min_dist; // guaranteed > 0, as log(limit) is negative. + const Float_ d_half = limit / -spread; // first derivative at x_half. Float_ b = - d_half * x_half / (1 / limit - 1) / (2 * limit * limit); Float_ a = (1 / limit - 1) / std::pow(x_half, 2 * b); - std::vector fit_y(grid), xpow(grid), grid_resid(grid); - auto compute_ss = [&](Float_ A, Float_ B) -> Float_ { - for (int g = 0; g < grid; ++g) { + auto fit_y = sanisizer::create >(grid); + auto xpow = sanisizer::create >(grid); + auto grid_resid = sanisizer::create >(grid); + + auto compute_ss = [&](const Float_ A, const Float_ B) -> Float_ { + Float_ ss = 0; + for (std::size_t g = 0; g < grid; ++g) { xpow[g] = std::pow(grid_x[g], 2 * B); fit_y[g] = 1 / (1 + A * xpow[g]); grid_resid[g] = grid_y[g] - fit_y[g]; + ss += grid_resid[g] * grid_resid[g]; } - - Float_ ss = 0; - for (int g = 0; g < grid; ++g) { - auto r = grid_resid[g]; - ss += r * r; - } - return ss; }; Float_ ss = compute_ss(a, b); @@ -98,7 +100,7 @@ std::pair find_ab(Float_ spread, Float_ min_dist) { */ Float_ da2 = 0, db2 = 0, dadb = 0, da_resid = 0, db_resid = 0; - for (int g = 0; g < grid; ++g) { + for (std::size_t g = 0; g < grid; ++g) { const Float_ x2b = xpow[g]; // set by the last compute_ss() call. const Float_ oy = fit_y[g]; // ditto const Float_ resid = grid_resid[g]; // ditto diff --git a/include/umappp/initialize.hpp b/include/umappp/initialize.hpp index fdb1c0a..15a600f 100644 --- a/include/umappp/initialize.hpp +++ b/include/umappp/initialize.hpp @@ -26,26 +26,25 @@ namespace umappp { namespace internal { template -int choose_num_epochs(int num_epochs, Index_ size) { - if (num_epochs < 0) { - // Choosing the number of epochs. We use a simple formula to decrease - // the number of epochs with increasing size, with the aim being that - // the 'extra work' beyond the minimal 200 epochs should be the same - // regardless of the numbe of observations. Given one calculation per - // observation per epoch, this amounts to 300 * 10000 calculations at - // the lower bound, so we simply choose a number of epochs that - // equalizes the number of calculations for any number of observations. - if (num_epochs < 0) { - constexpr Index_ limit = 10000; - const int minimal = 200, maximal = 300; - if (size <= limit) { - num_epochs = minimal + maximal; - } else { - num_epochs = minimal + static_cast(std::ceil(maximal * static_cast(limit) / static_cast(size))); - } - } +int choose_num_epochs(const int num_epochs, const Index_ size) { + if (num_epochs >= 0) { + return num_epochs; + } + + // Choosing the number of epochs. 
We use a simple formula to decrease + // the number of epochs with increasing size, with the aim being that + // the 'extra work' beyond the minimal 200 epochs should be the same + // regardless of the number of observations. Given one calculation per + // observation per epoch, this amounts to 300 * 10000 calculations at + // the lower bound, so we simply choose a number of epochs that + // equalizes the number of calculations for any number of observations. + constexpr Index_ limit = 10000; + const int minimal = 200, maximal = 300; + if (size <= limit) { + return minimal + maximal; + } else { + return minimal + static_cast(std::ceil(maximal * static_cast(limit) / static_cast(size))); } - return num_epochs; } } @@ -72,7 +71,7 @@ int choose_num_epochs(int num_epochs, Index_ size) { * Further calls to `Status::run()` will update the embeddings in `embedding`. */ template -Status initialize(NeighborList x, std::size_t num_dim, Float_* embedding, Options options) { +Status initialize(NeighborList x, const std::size_t num_dim, Float_* const embedding, Options options) { internal::NeighborSimilaritiesOptions nsopt; nsopt.local_connectivity = options.local_connectivity; nsopt.bandwidth = options.bandwidth; @@ -83,7 +82,7 @@ Status initialize(NeighborList x, std::size_t nu // Choosing the manner of initialization. if (options.initialize == InitializeMethod::SPECTRAL || options.initialize == InitializeMethod::SPECTRAL_ONLY) { - bool attempt = internal::spectral_init(x, num_dim, embedding, options.num_threads); + const bool attempt = internal::spectral_init(x, num_dim, embedding, options.num_threads); if (!attempt && options.initialize == InitializeMethod::SPECTRAL) { internal::random_init(x.size(), num_dim, embedding); } @@ -93,7 +92,7 @@ Status initialize(NeighborList x, std::size_t nu // Finding a good a/b pair. if (options.a <= 0 || options.b <= 0) { - auto found = internal::find_ab(options.spread, options.min_dist); + const auto found = internal::find_ab(options.spread, options.min_dist); options.a = found.first; options.b = found.second; } @@ -127,7 +126,7 @@ Status initialize(NeighborList x, std::size_t nu * Further calls to `Status::run()` will update the embeddings in `embedding`. 
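As a concrete reading of the epoch heuristic in internal::choose_num_epochs() above, the same formula with simplified types and a few example inputs:

#include <cmath>

inline int sketch_choose_num_epochs(const double num_obs) {
    constexpr double limit = 10000;
    constexpr int minimal = 200, maximal = 300;
    if (num_obs <= limit) {
        return minimal + maximal;
    }
    // Work beyond the 200-epoch floor stays near maximal * limit edge updates in total.
    return minimal + static_cast<int>(std::ceil(maximal * limit / num_obs));
}

// sketch_choose_num_epochs(5000)   == 500
// sketch_choose_num_epochs(20000)  == 350
// sketch_choose_num_epochs(100000) == 230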
*/ template -Status initialize(const knncolle::Prebuilt& prebuilt, std::size_t num_dim, Float_* embedding, Options options) { +Status initialize(const knncolle::Prebuilt& prebuilt, const std::size_t num_dim, Float_* const embedding, Options options) { auto output = knncolle::find_nearest_neighbors(prebuilt, options.num_neighbors, options.num_threads); return initialize(std::move(output), num_dim, embedding, std::move(options)); } @@ -156,15 +155,15 @@ Status initialize(const knncolle::Prebuilt > Status initialize( - std::size_t data_dim, - std::size_t num_obs, - const Float_* data, + const std::size_t data_dim, + const Index_ num_obs, + const Float_* const data, const knncolle::Builder& builder, - std::size_t num_dim, - Float_* embedding, + const std::size_t num_dim, + Float_* const embedding, Options options) { - auto prebuilt = builder.build_unique(knncolle::SimpleMatrix(data_dim, num_obs, data)); + const auto prebuilt = builder.build_unique(knncolle::SimpleMatrix(data_dim, num_obs, data)); return initialize(*prebuilt, num_dim, embedding, std::move(options)); } diff --git a/include/umappp/neighbor_similarities.hpp b/include/umappp/neighbor_similarities.hpp index 556dc12..6af4adc 100644 --- a/include/umappp/neighbor_similarities.hpp +++ b/include/umappp/neighbor_similarities.hpp @@ -7,6 +7,8 @@ #include #include +#include "sanisizer/sanisizer.hpp" + #include "NeighborList.hpp" #include "parallelize.hpp" @@ -51,22 +53,25 @@ false #endif , typename Index_, typename Float_> void neighbor_similarities(NeighborList& x, const NeighborSimilaritiesOptions& options) { - Index_ npoints = x.size(); - const int raw_connect_index = std::floor(options.local_connectivity); + // 'raw_connect_index' is the 1-based index of the first non-identical neighbor that is assumed to always be connected. + // This can also be fractional in which case the threshold distance is defined by interpolation. + const Index_ raw_connect_index = sanisizer::from_float(options.local_connectivity); const Float_ interpolation = options.local_connectivity - raw_connect_index; - parallelize(options.num_threads, npoints, [&](int, Index_ start, Index_ length) -> void { + const Index_ npoints = x.size(); + parallelize(options.num_threads, npoints, [&](const int, const Index_ start, const Index_ length) -> void { std::vector active_delta; for (Index_ i = start, end = start + length; i < end; ++i) { auto& all_neighbors = x[i]; - const int num_neighbors = all_neighbors.size(); + const Index_ num_neighbors = all_neighbors.size(); if (num_neighbors == 0) { continue; } - // Define rho as the distance to the nearest (non-identical) neighbor at 'local_connectivity', possibly with interpolation. - int num_zero = 0; + // Define 'rho' as the distance to the 'raw_connect_index'-th non-identical neighbor. + // In other words, the actual index in the array is 'num_zero + raw_connect_index - 1' (bacause it's 1-based). + Index_ num_zero = 0; for (const auto& f : all_neighbors) { if (f.second) { break; @@ -74,28 +79,25 @@ void neighbor_similarities(NeighborList& x, const NeighborSimila ++num_zero; } - const int connect_index = num_zero + raw_connect_index; - if (num_neighbors <= connect_index) { - // When this happens, 'rho' is just theoretically set to the - // maximum distance. In such cases, the weights are always just - // set to 1 in the remaining code, because no distance can be - // greater than 'rho'. If that's the case, we might as well - // save some time and compute it here. 
- for (int k = 0; k < num_neighbors; ++k) { + if (sanisizer::is_less_than_or_equal(num_neighbors - num_zero, raw_connect_index)) { + // When this happens, we set 'rho' to the maximum distance, because we can't define it within range. + // In such cases, the weights are always just set to 1 in the remaining code, because no distance can be + // greater than 'rho'. If that's the case, we might as well save some time and compute it here. + for (Index_ k = 0; k < num_neighbors; ++k) { all_neighbors[k].second = 1; } continue; } - - const Float_ lower = (connect_index > 0 ? all_neighbors[connect_index - 1].second : 0); // 'local_connectivity' (and thus 'connect_index') is 1-based, hence the -1. + const Index_ connect_index = num_zero + raw_connect_index; // guaranteed to fit in an Index_, as this should be less than 'num_neighbors'. + const Float_ lower = (connect_index > 0 ? all_neighbors[connect_index - 1].second : static_cast(0)); // 'connect_index' is 1-based, hence the subtraction. const Float_ upper = all_neighbors[connect_index].second; const Float_ rho = lower + interpolation * (upper - lower); // Pre-computing the difference between each distance and rho to reduce work in the inner iterations. active_delta.clear(); Float_ num_le_rho = num_zero; - for (int k = num_zero; k < num_neighbors; ++k) { - auto curdist = all_neighbors[k].second; + for (Index_ k = num_zero; k < num_neighbors; ++k) { + const auto curdist = all_neighbors[k].second; if (curdist > rho) { active_delta.push_back(curdist - rho); } else { @@ -105,7 +107,7 @@ void neighbor_similarities(NeighborList& x, const NeighborSimila if (active_delta.empty()) { // Same early-return logic as above. - for (int k = 0; k < num_neighbors; ++k) { + for (Index_ k = 0; k < num_neighbors; ++k) { all_neighbors[k].second = 1; } continue; @@ -135,8 +137,8 @@ void neighbor_similarities(NeighborList& x, const NeighborSimila // to the bounded nature of the Newton calculation and the // underflow-safe nature of the binary search. 
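The surrounding loop is easier to read with the calibration problem stated on its own: given the clipped distances delta[k] = max(0, d_k - rho), find sigma so that the total membership sum hits a fixed target (log2 of the neighbor count in the original UMAP formulation; the exact target and bandwidth handling here are the library's and are not shown in this hunk). A plain bisection is enough to illustrate the idea that the Newton step accelerates.

#include <cmath>
#include <vector>

inline double sketch_find_sigma(const std::vector<double>& delta, const double target) {
    double lo = 0, hi = 1e9;
    for (int it = 0; it < 64; ++it) {
        const double sigma = (lo + hi) / 2;
        double observed = 0;
        for (const auto d : delta) {
            observed += std::exp(-d / sigma); // entries with d == 0 contribute exactly 1.
        }
        (observed > target ? hi : lo) = sigma; // the sum increases monotonically with sigma.
    }
    return (lo + hi) / 2;
}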
const Float_ invsigma = 1 / sigma, invsigma2 = invsigma * invsigma; - for (auto d : active_delta) { - Float_ current = std::exp(- d * invsigma); + for (const auto d : active_delta) { + const Float_ current = std::exp(- d * invsigma); observed += current; deriv += d * current * invsigma2; } @@ -192,7 +194,7 @@ void neighbor_similarities(NeighborList& x, const NeighborSimila sigma = std::max(options.min_k_dist_scale * mean_dist, sigma); const Float_ invsigma = 1 / sigma; - for (int k = 0; k < num_neighbors; ++k) { + for (Index_ k = 0; k < num_neighbors; ++k) { Float_& dist = all_neighbors[k].second; if (dist > rho) { dist = std::exp(-(dist - rho) * invsigma); diff --git a/include/umappp/optimize_layout.hpp b/include/umappp/optimize_layout.hpp index 10c777c..6016eb0 100644 --- a/include/umappp/optimize_layout.hpp +++ b/include/umappp/optimize_layout.hpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #ifndef UMAPPP_NO_PARALLEL_OPTIMIZATION #include @@ -17,16 +17,23 @@ #include #endif -#include "NeighborList.hpp" #include "aarand/aarand.hpp" +#include "sanisizer/sanisizer.hpp" + +#include "NeighborList.hpp" namespace umappp { namespace internal { +template +std::remove_cv_t > I(const Input_ x) { + return x; +} + template struct EpochData { - EpochData(Index_ nobs) : head(nobs) {} + EpochData(const Index_ nobs) : head(sanisizer::sum(nobs, 1)) {} int total_epochs; int current_epoch = 0; @@ -41,33 +48,32 @@ struct EpochData { }; template -EpochData similarities_to_epochs(const NeighborList& p, int num_epochs, Float_ negative_sample_rate) { +EpochData similarities_to_epochs(const NeighborList& p, const int num_epochs, const Float_ negative_sample_rate) { Float_ maxed = 0; std::size_t count = 0; for (const auto& x : p) { - count += x.size(); + count = sanisizer::sum(count, x.size()); for (const auto& y : x) { maxed = std::max(maxed, y.second); } } - EpochData output(p.size()); + const Index_ num_obs = p.size(); // Index_ should be able to hold the number of observations. + EpochData output(num_obs); output.total_epochs = num_epochs; output.tail.reserve(count); output.epochs_per_sample.reserve(count); const Float_ limit = maxed / num_epochs; - std::size_t last = 0; - for (Index_ i = 0, num_obs = p.size(); i < num_obs; ++i) { + for (Index_ i = 0; i < num_obs; ++i) { const auto& x = p[i]; for (const auto& y : x) { if (y.second >= limit) { output.tail.push_back(y.first); output.epochs_per_sample.push_back(maxed / y.second); - ++last; } } - output.head[i] = last; + output.head[i + 1] = output.tail.size(); } // Filling in some epoch-related running statistics. 
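The loop above, together with the head(nobs + 1) constructor change and the later head.size() - 1 fixes in patches 3, 5 and 6, turns 'head'/'tail' into a standard CSR-style layout. A small sketch of how the retained edges are then traversed, with the types simplified to concrete ones:

#include <cstddef>
#include <vector>

// head holds nobs + 1 cumulative offsets into tail: head[0] == 0, head.back() == tail.size(),
// and the edges of observation i are tail[head[i]] .. tail[head[i + 1] - 1].
inline void sketch_visit_edges(const std::vector<std::size_t>& head, const std::vector<int>& tail) {
    const std::size_t num_obs = head.size() - 1;
    for (std::size_t i = 0; i < num_obs; ++i) {
        for (std::size_t j = head[i]; j < head[i + 1]; ++j) {
            const int neighbor = tail[j]; // this edge survived the epochs_per_sample filter
            (void) neighbor;              // placeholder for the attractive/repulsive updates
        }
    }
}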
@@ -78,11 +84,14 @@ EpochData similarities_to_epochs(const NeighborList(static_cast(num_epochs) * negative_sample_rate); + return output; } template -Float_ quick_squared_distance(const Float_* left, const Float_* right, std::size_t num_dim) { +Float_ quick_squared_distance(const Float_* const left, const Float_* const right, const std::size_t num_dim) { Float_ dist2 = 0; for (std::size_t d = 0; d < num_dim; ++d) { Float_ delta = (left[d] - right[d]); @@ -93,27 +102,24 @@ Float_ quick_squared_distance(const Float_* left, const Float_* right, std::size } template -Float_ clamp(Float_ input) { +Float_ clamp(const Float_ input) { constexpr Float_ min_gradient = -4; constexpr Float_ max_gradient = 4; return std::min(std::max(input, min_gradient), max_gradient); } template -unsigned long long compute_num_neg_samples(std::size_t j, Float_ epoch, const EpochData& setup) { +int compute_num_neg_samples(const std::size_t j, const Float_ epoch, const EpochData& setup) { // Remember that 'epochs_per_negative_sample' is defined as 'epochs_per_sample[j] / negative_sample_rate'. // We just use it inline below rather than defining a new variable and suffering floating-point round-off. Float_ num_neg_samples = (epoch - setup.epoch_of_next_negative_sample[j]) * setup.negative_sample_rate / setup.epochs_per_sample[j]; // i.e., 1/epochs_per_negative_sample. - // Avoiding problems with overflow. We return an unsigned long long to guarantee at least 64 bits, - // which should be more than enough to hold whatever num_neg_samples is. - constexpr auto max_value = std::numeric_limits::max(); - if (num_neg_samples <= static_cast(max_value)) { - return num_neg_samples; - } else { - return max_value; - } + // Maximum value of 'num_neg_samples' should be 'num_epochs * negative_sample_rate', because: + // - '(epoch - setup.epoch_of_next_negative_sample[j])' has a maximum value of 'num_epochs'. + // - 'setup.epochs_per_sample' has a minimum value of 1, when 'y.second == maxed'. + // So we just have to check that the cast is safe for the maximum value in initialize(). + return num_neg_samples; } /***************************************************** @@ -133,50 +139,47 @@ void optimize_layout( int epoch_limit ) { auto& n = setup.current_epoch; - auto num_epochs = setup.total_epochs; - auto limit_epochs = num_epochs; - if (epoch_limit> 0) { - limit_epochs = std::min(epoch_limit, num_epochs); - } + const auto num_epochs = setup.total_epochs; + const auto limit_epochs = (epoch_limit > 0 ? std::min(epoch_limit, num_epochs) : num_epochs); for (; n < limit_epochs; ++n) { const Float_ epoch = n; const Float_ alpha = initial_alpha * (1.0 - epoch / num_epochs); - Index_ num_obs = setup.head.size(); + const Index_ num_obs = setup.head.size(); for (Index_ i = 0; i < num_obs; ++i) { - std::size_t start = (i == 0 ? 0 : setup.head[i-1]), end = setup.head[i]; - Float_* left = embedding + static_cast(i) * num_dim; // cast to size_t to avoid overflow. + const auto start = setup.head[i], end = setup.head[i + 1]; + const auto left = embedding + sanisizer::product_unsafe(i, num_dim); - for (std::size_t j = start; j < end; ++j) { + for (auto j = start; j < end; ++j) { if (setup.epoch_of_next_sample[j] > epoch) { continue; } { - Float_* right = embedding + static_cast(setup.tail[j]) * num_dim; // again, casting to avoid overflow. 
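The bound argued in the comment above can be checked once, up front, in ordinary C++; per that comment, the patch performs the equivalent validation with sanisizer inside initialize(), so the following is only a restatement of the reasoning.

#include <limits>
#include <stdexcept>

inline void sketch_check_neg_sample_cap(const int num_epochs, const double negative_sample_rate) {
    // Worst case for compute_num_neg_samples(): every epoch has elapsed and epochs_per_sample[j] == 1.
    const double worst_case = static_cast<double>(num_epochs) * negative_sample_rate;
    if (worst_case > static_cast<double>(std::numeric_limits<int>::max())) {
        throw std::runtime_error("negative sample count could overflow an int");
    }
}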
- Float_ dist2 = quick_squared_distance(left, right, num_dim); + const auto right = embedding + sanisizer::product_unsafe(setup.tail[j], num_dim); + const Float_ dist2 = quick_squared_distance(left, right, num_dim); const Float_ pd2b = std::pow(dist2, b); const Float_ grad_coef = (-2 * a * b * pd2b) / (dist2 * (a * pd2b + 1.0)); for (std::size_t d = 0; d < num_dim; ++d) { auto& l = left[d]; auto& r = right[d]; - Float_ gradient = alpha * clamp(grad_coef * (l - r)); + const Float_ gradient = alpha * clamp(grad_coef * (l - r)); l += gradient; r -= gradient; } } - auto num_neg_samples = compute_num_neg_samples(j, epoch, setup); - for (decltype(num_neg_samples) p = 0; p < num_neg_samples; ++p) { - auto sampled = aarand::discrete_uniform(rng, num_obs); + const auto num_neg_samples = compute_num_neg_samples(j, epoch, setup); + for (decltype(I(num_neg_samples)) p = 0; p < num_neg_samples; ++p) { + const auto sampled = aarand::discrete_uniform(rng, num_obs); if (sampled == i) { continue; } - const Float_* right = embedding + static_cast(sampled) * num_dim; // again, casting to avoid overflow. - Float_ dist2 = quick_squared_distance(left, right, num_dim); + const auto right = embedding + sanisizer::product_unsafe(sampled, num_dim); + const Float_ dist2 = quick_squared_distance(left, right, num_dim); const Float_ grad_coef = 2 * gamma * b / ((0.001 + dist2) * (a * std::pow(dist2, b) + 1.0)); for (std::size_t d = 0; d < num_dim; ++d) { @@ -202,13 +205,14 @@ void optimize_layout( *****************************************************/ #ifndef UMAPPP_NO_PARALLEL_OPTIMIZATION -constexpr unsigned long long skip_ns_sentinel = static_cast(-1); +constexpr int skip_ns_sentinel = -1; template struct BusyWaiterInput { std::vector negative_sample_selections; - std::vector negative_sample_count; + std::vector negative_sample_count; Index_ observation; + std::size_t tail_offset; Float_ alpha; }; @@ -229,45 +233,42 @@ void optimize_single_observation(const BusyWaiterInput& input, B // We don't bother doing this for the neighbors, though, as it's // tedious to make sure that the modified values are available during negative sampling. // (This isn't a problem for the self, as the self cannot be its own negative sample.) - { - const Float_* left = state.embedding + static_cast(input.observation) * state.num_dim; // cast to avoid overflow. - std::copy_n(left, state.num_dim, state.self_modified.data()); - } + const auto left = state.embedding + sanisizer::product_unsafe(input.observation, state.num_dim); + std::copy_n(left, state.num_dim, state.self_modified.data()); - unsigned long long position = 0; - auto nsIt = input.negative_sample_count.begin(); - std::size_t start = (input.observation == 0 ? 0 : state.setup->head[input.observation-1]), end = state.setup->head[input.observation]; - - for (std::size_t j = start; j < end; ++j, ++nsIt) { - auto number = *nsIt; + decltype(I(input.negative_sample_selections.size())) position = 0; + const auto num_neighbors = input.negative_sample_count.size(); + for (decltype(I(num_neighbors)) n = 0; n < num_neighbors; ++n) { + const auto number = input.negative_sample_count[n]; if (number == skip_ns_sentinel) { continue; } { - Float_* left = state.self_modified.data(); - Float_* right = state.embedding + static_cast(state.setup->tail[j]) * state.num_dim; // cast to avoid overflow. 
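Both branches of the optimizer reduce to the same two scalar coefficients, restated here outside the templates; each coordinate difference is multiplied by the coefficient, clamped to [-4, 4], and scaled by the learning rate alpha.

#include <cmath>

// Attractive term for an observed edge at squared distance dist2.
inline double attractive_coef(const double dist2, const double a, const double b) {
    const double pd2b = std::pow(dist2, b);
    return (-2 * a * b * pd2b) / (dist2 * (a * pd2b + 1.0));
}

// Repulsive term for a negatively sampled pair; gamma weights the repulsion.
inline double repulsive_coef(const double dist2, const double a, const double b, const double gamma) {
    return 2 * gamma * b / ((0.001 + dist2) * (a * std::pow(dist2, b) + 1.0));
}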
+ const auto left = state.self_modified.data(); + const auto j = sanisizer::sum_unsafe(n, input.tail_offset); + const auto right = state.embedding + sanisizer::product_unsafe(state.setup->tail[j], state.num_dim); - Float_ dist2 = quick_squared_distance(left, right, state.num_dim); + const Float_ dist2 = quick_squared_distance(left, right, state.num_dim); const Float_ pd2b = std::pow(dist2, state.b); const Float_ grad_coef = (-2 * state.a * state.b * pd2b) / (dist2 * (state.a * pd2b + 1.0)); for (std::size_t d = 0; d < state.num_dim; ++d) { auto& l = left[d]; auto& r = right[d]; - Float_ gradient = input.alpha * clamp(grad_coef * (l - r)); + const Float_ gradient = input.alpha * clamp(grad_coef * (l - r)); l += gradient; r -= gradient; } } - auto original = position; + auto s = position; position += number; - for (decltype(position) s = original; s < position; ++s) { - Float_* left = state.self_modified.data(); - const Float_* right = state.embedding + static_cast(input.negative_sample_selections[s]) * state.num_dim; // cast to avoid overflow. + for (; s < position; ++s) { + const auto left = state.self_modified.data(); + const auto right = state.embedding + sanisizer::product_unsafe(input.negative_sample_selections[s], state.num_dim); - Float_ dist2 = quick_squared_distance(left, right, state.num_dim); + const Float_ dist2 = quick_squared_distance(left, right, state.num_dim); const Float_ grad_coef = 2 * state.gamma * state.b / ((0.001 + dist2) * (state.a * std::pow(dist2, state.b) + 1.0)); for (std::size_t d = 0; d < state.num_dim; ++d) { @@ -277,10 +278,7 @@ void optimize_single_observation(const BusyWaiterInput& input, B } // Copying it back to the embedding once we're done. - { - std::size_t offset = static_cast(input.observation) * state.num_dim; // cast to avoid overflow. - std::copy(state.self_modified.begin(), state.self_modified.end(), state.embedding + offset); - } + std::copy(state.self_modified.begin(), state.self_modified.end(), left); } template @@ -362,24 +360,21 @@ class BusyWaiterThread { template void optimize_layout_parallel( - std::size_t num_dim, - Float_* embedding, + const std::size_t num_dim, + Float_* const embedding, EpochData& setup, - Float_ a, - Float_ b, - Float_ gamma, - Float_ initial_alpha, + const Float_ a, + const Float_ b, + const Float_ gamma, + const Float_ initial_alpha, Rng_& rng, - int epoch_limit, - int nthreads + const int epoch_limit, + const int nthreads ) { #ifndef UMAPPP_NO_PARALLEL_OPTIMIZATION auto& n = setup.current_epoch; - auto num_epochs = setup.total_epochs; - auto limit_epochs = num_epochs; - if (epoch_limit> 0) { - limit_epochs = std::min(epoch_limit, num_epochs); - } + const auto num_epochs = setup.total_epochs; + const auto limit_epochs = (epoch_limit > 0 ? std::min(epoch_limit, num_epochs) : num_epochs); BusyWaiterState state; state.num_dim = num_dim; @@ -400,7 +395,7 @@ void optimize_layout_parallel( pool.emplace_back(state); } - std::vector > raw_inputs(nthreads); + auto raw_inputs = sanisizer::create > >(nthreads); BusyWaiterInput* main_input = &(raw_inputs.back()); std::vector*> pool_inputs; pool_inputs.reserve(nthreads - 1); @@ -456,15 +451,16 @@ void optimize_layout_parallel( ttype = WRITE; } - const std::size_t start = (i == 0 ? 
0 : setup.head[i-1]), end = setup.head[i]; - for (std::size_t j = start; j < end; ++j) { + const auto start = setup.head[i], end = setup.head[i + 1]; + input.tail_offset = start; + for (auto j = start; j < end; ++j) { if (setup.epoch_of_next_sample[j] > epoch) { ns_count.push_back(skip_ns_sentinel); continue; } { - auto neighbor = setup.tail[j]; + const auto neighbor = setup.tail[j]; auto& touched = last_touched_iteration[neighbor]; auto& ttype = touch_type[neighbor]; // if (PRINT) { std::cout << "\tNEIGHBOR: " << neighbor << ": " << touched << " (" << ttype << ")" << std::endl; } @@ -478,10 +474,10 @@ void optimize_layout_parallel( ttype = WRITE; } - auto prior_size = ns_selections.size(); - auto num_neg_samples = compute_num_neg_samples(j, epoch, setup); - for (decltype(num_neg_samples) p = 0; p < num_neg_samples; ++p) { - Index_ sampled = aarand::discrete_uniform(rng, num_obs); + const auto prior_size = ns_selections.size(); + const auto num_neg_samples = compute_num_neg_samples(j, epoch, setup); + for (decltype(I(num_neg_samples)) p = 0; p < num_neg_samples; ++p) { + const Index_ sampled = aarand::discrete_uniform(rng, num_obs); if (sampled == i) { continue; } diff --git a/include/umappp/parallelize.hpp b/include/umappp/parallelize.hpp index a164f43..db62000 100644 --- a/include/umappp/parallelize.hpp +++ b/include/umappp/parallelize.hpp @@ -26,7 +26,7 @@ namespace umappp { * Any user-defined macro should accept the same arguments as `subpar::parallelize_range()`. */ template -void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range) { +void parallelize(const int num_workers, const Task_ num_tasks, Run_ run_task_range) { #ifndef UMAPPP_CUSTOM_PARALLEL // Don't make this nothrow_ = true, there's too many allocations and the // derived methods for the nearest neighbors search could do anything... diff --git a/include/umappp/spectral_init.hpp b/include/umappp/spectral_init.hpp index ab990be..f9701bd 100644 --- a/include/umappp/spectral_init.hpp +++ b/include/umappp/spectral_init.hpp @@ -1,16 +1,17 @@ #ifndef UMAPPP_SPECTRAL_INIT_HPP #define UMAPPP_SPECTRAL_INIT_HPP -#include "irlba/irlba.hpp" -#include "irlba/parallel.hpp" - #include #include #include #include -#include "NeighborList.hpp" #include "aarand/aarand.hpp" +#include "irlba/irlba.hpp" +#include "irlba/parallel.hpp" +#include "sanisizer/sanisizer.hpp" + +#include "NeighborList.hpp" namespace umappp { @@ -22,20 +23,18 @@ namespace internal { * It is assumed that 'edges' has already been symmetrized. */ template -bool normalized_laplacian(const NeighborList& edges, std::size_t num_dim, Float_* Y, int nthreads) { - Index_ nobs = edges.size(); - std::vector sums(nobs); // we deliberately use double-precision to avoid difficult problems from overflow/underflow inside IRLBA. - std::vector pointers; - pointers.reserve(nobs + 1); - pointers.push_back(0); +bool normalized_laplacian(const NeighborList& edges, const std::size_t num_dim, Float_* const Y, const int nthreads) { + const Index_ nobs = edges.size(); + auto sums = sanisizer::create >(nobs); // we deliberately use double-precision to avoid difficult problems from overflow/underflow inside IRLBA. + std::vector pointers(sanisizer::sum::size_type>(nobs, 1)); std::size_t reservable = 0; for (Index_ c = 0; c < nobs; ++c) { const auto& current = edges[c]; - // +1 for self, assuming that no entry of 'current' is equal to 'c'. 
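For context on what normalized_laplacian() assembles in the remainder of this hunk: writing W for the symmetrized edge weights and D for the diagonal matrix of total edge weights, and assuming the elided code stores sqrt(D_ii) in sums[] so that each off-diagonal entry becomes W_ij / sqrt(D_ii D_jj), the matrix handed to IRLBA is

    L_sym = I - D^{-1/2} W D^{-1/2},    M = 2I - L_sym = I + D^{-1/2} W D^{-1/2},

which is what the "* (-1) ... + 2" transform below encodes. Because the eigenvalues of L_sym lie in [0, 2], M is positive semi-definite and its largest eigenvectors are exactly the smallest eigenvectors of L_sym; computing num_dim + 1 of them and keeping rightCols(num_dim) drops the trivial constant eigenvector.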
- reservable += current.size() + 1; - pointers.push_back(reservable); + reservable = sanisizer::sum(reservable, current.size()); + reservable = sanisizer::sum(reservable, 1); // +1 for self, assuming that no entry of 'current' is equal to 'c'. + pointers[c + 1] = reservable; double& sum = sums[c]; for (const auto& f : current) { @@ -54,9 +53,10 @@ bool normalized_laplacian(const NeighborList& edges, std::size_t for (Index_ c = 0; c < nobs; ++c) { const auto& current = edges[c]; - auto cIt = current.begin(), last = current.end(); + auto cIt = current.begin(); + const auto cLast = current.end(); - for (; cIt != last && cIt->first < c; ++cIt) { + for (; cIt != cLast && cIt->first < c; ++cIt) { indices.push_back(cIt->first); values.push_back(- static_cast(cIt->second) / sums[cIt->first] / sums[c] /* TRANSFORM */ * (-1) ); } @@ -65,7 +65,7 @@ bool normalized_laplacian(const NeighborList& edges, std::size_t indices.push_back(c); values.push_back(1 /* TRANSFORM */ * (-1) + 2); - for (; cIt != current.end(); ++cIt) { + for (; cIt != cLast; ++cIt) { indices.push_back(cIt->first); values.push_back(- static_cast(cIt->second) / sums[cIt->first] / sums[c] /* TRANSFORM */ * (-1) ); } @@ -96,7 +96,7 @@ bool normalized_laplacian(const NeighborList& edges, std::size_t * see LTLA/umappp#4 for the discussion. */ - irlba::ParallelSparseMatrix< + const irlba::ParallelSparseMatrix< decltype(values), decltype(indices), decltype(pointers) @@ -104,9 +104,8 @@ bool normalized_laplacian(const NeighborList& edges, std::size_t > mat(nobs, nobs, std::move(values), std::move(indices), std::move(pointers), /* column_major = */ true, nthreads); irlba::EigenThreadScope tscope(nthreads); - irlba::Options opt; - auto actual = irlba::compute(mat, num_dim + 1, opt); - auto ev = actual.U.rightCols(num_dim); + const auto actual = irlba::compute(mat, num_dim + 1, irlba::Options{}); + const auto ev = actual.U.rightCols(num_dim); // Getting the maximum value; this is assumed to be non-zero, // otherwise this entire thing is futile. @@ -114,8 +113,8 @@ bool normalized_laplacian(const NeighborList& edges, std::size_t const double expansion = (max_val > 0 ? 10 / max_val : 1); for (Index_ c = 0; c < nobs; ++c) { - for (std::size_t d = 0; d < num_dim; ++d, ++Y) { - *Y = ev.coeff(c, d) * expansion; // TODO: put back the jitter step. + for (std::size_t d = 0; d < num_dim; ++d) { + Y[sanisizer::nd_offset(d, num_dim, c)] = ev.coeff(c, d) * expansion; // TODO: put back the jitter step. } } return true; @@ -150,7 +149,7 @@ bool has_multiple_components(const NeighborList& edges) { } template -bool spectral_init(const NeighborList& edges, std::size_t num_dim, Float_* vals, int nthreads) { +bool spectral_init(const NeighborList& edges, const std::size_t num_dim, Float_* const vals, const int nthreads) { if (!has_multiple_components(edges)) { if (normalized_laplacian(edges, num_dim, vals, nthreads)) { return true; @@ -160,7 +159,7 @@ bool spectral_init(const NeighborList& edges, std::size_t num_di } template -void random_init(Index_ num_obs, std::size_t num_dim, Float_ * vals) { +void random_init(const Index_ num_obs, const std::size_t num_dim, Float_ * const vals) { std::size_t num = static_cast(num_obs) * num_dim; // cast to avoid overflow. std::mt19937_64 rng(num); // for a bit of deterministic variety. for (std::size_t i = 0; i < num; ++i) { From 01d2ff6bc3b91cbb875e11842097474573c6bc61 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 5 Sep 2025 11:22:22 -0700 Subject: [PATCH 2/6] Minor fixes. 
--- include/umappp/optimize_layout.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/umappp/optimize_layout.hpp b/include/umappp/optimize_layout.hpp index 6016eb0..015b5b1 100644 --- a/include/umappp/optimize_layout.hpp +++ b/include/umappp/optimize_layout.hpp @@ -112,7 +112,7 @@ template int compute_num_neg_samples(const std::size_t j, const Float_ epoch, const EpochData& setup) { // Remember that 'epochs_per_negative_sample' is defined as 'epochs_per_sample[j] / negative_sample_rate'. // We just use it inline below rather than defining a new variable and suffering floating-point round-off. - Float_ num_neg_samples = (epoch - setup.epoch_of_next_negative_sample[j]) * + const Float_ num_neg_samples = (epoch - setup.epoch_of_next_negative_sample[j]) * setup.negative_sample_rate / setup.epochs_per_sample[j]; // i.e., 1/epochs_per_negative_sample. // Maximum value of 'num_neg_samples' should be 'num_epochs * negative_sample_rate', because: @@ -233,8 +233,8 @@ void optimize_single_observation(const BusyWaiterInput& input, B // We don't bother doing this for the neighbors, though, as it's // tedious to make sure that the modified values are available during negative sampling. // (This isn't a problem for the self, as the self cannot be its own negative sample.) - const auto left = state.embedding + sanisizer::product_unsafe(input.observation, state.num_dim); - std::copy_n(left, state.num_dim, state.self_modified.data()); + const auto source = state.embedding + sanisizer::product_unsafe(input.observation, state.num_dim); + std::copy_n(source, state.num_dim, state.self_modified.data()); decltype(I(input.negative_sample_selections.size())) position = 0; const auto num_neighbors = input.negative_sample_count.size(); @@ -278,7 +278,7 @@ void optimize_single_observation(const BusyWaiterInput& input, B } // Copying it back to the embedding once we're done. - std::copy(state.self_modified.begin(), state.self_modified.end(), left); + std::copy(state.self_modified.begin(), state.self_modified.end(), source); } template From 4ec6da307af876cd321ad3464fa272889aa99eb9 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 5 Sep 2025 11:29:18 -0700 Subject: [PATCH 3/6] Minor fix for the head. 
--- include/umappp/optimize_layout.hpp | 4 ++-- tests/src/optimize_layout.cpp | 10 +--------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/include/umappp/optimize_layout.hpp b/include/umappp/optimize_layout.hpp index 015b5b1..7a44d0c 100644 --- a/include/umappp/optimize_layout.hpp +++ b/include/umappp/optimize_layout.hpp @@ -146,7 +146,7 @@ void optimize_layout( const Float_ epoch = n; const Float_ alpha = initial_alpha * (1.0 - epoch / num_epochs); - const Index_ num_obs = setup.head.size(); + const Index_ num_obs = setup.head.size() - 1; for (Index_ i = 0; i < num_obs; ++i) { const auto start = setup.head[i], end = setup.head[i + 1]; const auto left = embedding + sanisizer::product_unsafe(i, num_dim); @@ -403,7 +403,7 @@ void optimize_layout_parallel( pool_inputs.push_back(&(raw_inputs[t])); } - const Index_ num_obs = setup.head.size(); + const Index_ num_obs = setup.head.size() - 1; std::vector last_touched_iteration(num_obs); std::vector touch_type(num_obs); diff --git a/tests/src/optimize_layout.cpp b/tests/src/optimize_layout.cpp index ca60d4a..d56769c 100644 --- a/tests/src/optimize_layout.cpp +++ b/tests/src/optimize_layout.cpp @@ -135,17 +135,9 @@ INSTANTIATE_TEST_SUITE_P( TEST(OptimizeLayout, ChooseNumNegSamplesFailsafe) { umappp::internal::EpochData setup(1); - setup.epochs_per_sample.push_back(0); + setup.epochs_per_sample.push_back(1); setup.epoch_of_next_negative_sample.push_back(0); setup.negative_sample_rate = 5; - - auto num_neg = umappp::internal::compute_num_neg_samples(0, 1.0, setup); // division by zero for epochs_per_sample. - EXPECT_EQ(num_neg, std::numeric_limits::max()); - - num_neg = umappp::internal::compute_num_neg_samples(0, 0.0, setup); // still works if the initial calculation yields an NaN. - EXPECT_EQ(num_neg, std::numeric_limits::max()); - - setup.epochs_per_sample.back() = 1; num_neg = umappp::internal::compute_num_neg_samples(0, 1.0, setup); // as a control, checking that we don't hit the cap. EXPECT_EQ(num_neg, 5); } From b68cd318eae1b2bda7808c052871a35ce6a15809 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 5 Sep 2025 14:56:06 -0700 Subject: [PATCH 4/6] Fixes for the tests. --- tests/R/package/src/Makevars | 3 +-- tests/src/optimize_layout.cpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/R/package/src/Makevars b/tests/R/package/src/Makevars index a718fed..729a428 100644 --- a/tests/R/package/src/Makevars +++ b/tests/R/package/src/Makevars @@ -3,8 +3,7 @@ # are available. BUILD_DIR=../../../../build/_deps PKG_CPPFLAGS=-I${BUILD_DIR}/aarand-src/include \ - -I${BUILD_DIR}/kmeans-src/include \ - -I${BUILD_DIR}/powerit-src/include \ + -I${BUILD_DIR}/sanisizer-src/include \ -I${BUILD_DIR}/knncolle-src/include \ -I${BUILD_DIR}/irlba-src/include \ -I${BUILD_DIR}/subpar-src/include \ diff --git a/tests/src/optimize_layout.cpp b/tests/src/optimize_layout.cpp index d56769c..71c75f2 100644 --- a/tests/src/optimize_layout.cpp +++ b/tests/src/optimize_layout.cpp @@ -138,6 +138,6 @@ TEST(OptimizeLayout, ChooseNumNegSamplesFailsafe) { setup.epochs_per_sample.push_back(1); setup.epoch_of_next_negative_sample.push_back(0); setup.negative_sample_rate = 5; - num_neg = umappp::internal::compute_num_neg_samples(0, 1.0, setup); // as a control, checking that we don't hit the cap. + auto num_neg = umappp::internal::compute_num_neg_samples(0, 1.0, setup); // as a control, checking that we don't hit the cap. 
EXPECT_EQ(num_neg, 5); } From 8594282486be28068ba0df5897236197198b1401 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 5 Sep 2025 15:13:09 -0700 Subject: [PATCH 5/6] yet another attempt to get this to work. --- tests/src/optimize_layout.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/src/optimize_layout.cpp b/tests/src/optimize_layout.cpp index 71c75f2..d7f3d27 100644 --- a/tests/src/optimize_layout.cpp +++ b/tests/src/optimize_layout.cpp @@ -59,7 +59,7 @@ TEST_P(OptimizeTest, Epochs) { stored[0][0].second = 1e-8; // check for correct removal. auto epoch = umappp::internal::similarities_to_epochs(stored, 500, 5.0); - EXPECT_EQ(epoch.head.size(), nobs); + EXPECT_EQ(epoch.head.size(), nobs + 1); EXPECT_EQ(epoch.tail.size(), epoch.epochs_per_sample.size()); EXPECT_EQ(epoch.tail.size(), epoch.head.back()); From 5414cf3cf52ddfd59e83453ecaf01c22ecc9675e Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 5 Sep 2025 15:32:20 -0700 Subject: [PATCH 6/6] Yet more fixes. --- include/umappp/Status.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/umappp/Status.hpp b/include/umappp/Status.hpp index 5db8485..eca3344 100644 --- a/include/umappp/Status.hpp +++ b/include/umappp/Status.hpp @@ -113,7 +113,7 @@ class Status { * @return The number of observations in the dataset. */ Index_ num_observations() const { - return my_epochs.head.size(); + return my_epochs.head.size() - 1; } public:
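Finally, a minimal end-to-end sketch of how the patched entry points are meant to be driven. The neighbor-list element type, the include path, and the zero-argument Status::run() call are assumptions based on the headers touched in this series rather than guarantees of the public API.

#include <cstddef>
#include <vector>
#include "umappp/initialize.hpp"

// Assumed: umappp::NeighborList<Index_, Float_> is a per-observation vector of
// (neighbor index, distance) pairs, and Status::run() iterates to the configured
// number of epochs when called without an epoch limit.
std::vector<double> sketch_embed(const umappp::NeighborList<int, double>& neighbors) {
    const std::size_t num_dim = 2;
    std::vector<double> embedding(num_dim * neighbors.size());

    umappp::Options options;
    options.num_threads = 4;

    auto status = umappp::initialize(neighbors, num_dim, embedding.data(), options);
    status.run(); // coordinates for observation i start at embedding[i * num_dim]
    return embedding;
}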