libscran · LTLA · Sep 5, 2025 · Sep 5, 2025 · Sep 5, 2025 · Sep 5, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -25,10 +25,11 @@ else()
     find_package(ltla_aarand CONFIG REQUIRED)
     find_package(ltla_irlba CONFIG REQUIRED)
     find_package(ltla_subpar 0.4.0 CONFIG REQUIRED)
+    find_package(ltla_sanisizer 0.1.3 CONFIG REQUIRED)
     find_package(knncolle_knncolle 3.0.0 CONFIG REQUIRED)
 endif()
 
-target_link_libraries(umappp INTERFACE ltla::aarand ltla::irlba ltla::subpar knncolle::knncolle)
+target_link_libraries(umappp INTERFACE ltla::aarand ltla::irlba ltla::subpar ltla::sanisizer knncolle::knncolle)
 
 # Tests
 if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)

diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in
@@ -2,6 +2,7 @@
 
 include(CMakeFindDependencyMacro)
 find_dependency(ltla_aarand CONFIG REQUIRED)
+find_dependency(ltla_sanisizer 0.1.3 CONFIG REQUIRED)
 find_dependency(ltla_subpar 0.4.0 CONFIG REQUIRED)
 find_dependency(ltla_irlba CONFIG REQUIRED)
 find_dependency(knncolle_knncolle 3.0.0 CONFIG REQUIRED)

diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt
@@ -12,6 +12,12 @@ FetchContent_Declare(
   GIT_TAG master # ^0.4.0
 )
 
+FetchContent_Declare(
+  sanisizer
+  GIT_REPOSITORY https://github.com/LTLA/sanisizer
+  GIT_TAG master # ^0.1.3
+)
+
 FetchContent_Declare(
   knncolle
   GIT_REPOSITORY https://github.com/knncolle/knncolle
@@ -26,5 +32,6 @@ FetchContent_Declare(
 
 FetchContent_MakeAvailable(aarand)
 FetchContent_MakeAvailable(subpar)
+FetchContent_MakeAvailable(sanisizer)
 FetchContent_MakeAvailable(knncolle)
 FetchContent_MakeAvailable(irlba)
diff --git a/include/umappp/Status.hpp b/include/umappp/Status.hpp
@@ -1,12 +1,14 @@
 #ifndef UMAPPP_STATUS_HPP
 #define UMAPPP_STATUS_HPP
 
-#include "Options.hpp"
-#include "optimize_layout.hpp"
-
 #include <random>
 #include <cstddef>
 
+#include "sanisizer/sanisizer.hpp"
+
+#include "Options.hpp"
+#include "optimize_layout.hpp"
+
 /**
  * @file Status.hpp
  * @brief Status of the UMAP algorithm.
@@ -27,7 +29,7 @@ class Status {
     /**
      * @cond
      */
-    Status(internal::EpochData<Index_, Float_> epochs, Options options, std::size_t num_dim, Float_* embedding) : 
+    Status(internal::EpochData<Index_, Float_> epochs, Options options, const std::size_t num_dim, Float_* const embedding) : 
         my_epochs(std::move(epochs)),
         my_options(std::move(options)),
         my_engine(my_options.seed),
@@ -84,9 +86,9 @@ class Status {
      * The contents of the new array in `ptr` should be the same as the array that it replaces, as `run()` will continue the iteration from the coordinates inside the array.
      * This is enforced by default when `copy = true`, but if the supplied `ptr` already contains a copy, the caller may set `copy = false` to avoid extra work
      */
-    void set_embedding(Float_* ptr, bool copy = true) {
+    void set_embedding(Float_* const ptr, const bool copy = true) {
         if (copy) {
-            std::size_t n = num_dimensions() * static_cast<std::size_t>(num_observations()); // cast to avoid overflow.
+            const std::size_t n = sanisizer::product_unsafe<std::size_t>(num_dimensions(), num_observations());
             std::copy_n(my_embedding, n, ptr);
         }
         my_embedding = ptr;
@@ -111,7 +113,7 @@ class Status {
      * @return The number of observations in the dataset.
      */
     Index_ num_observations() const {
-        return my_epochs.head.size();
+        return my_epochs.head.size() - 1;
     }
 
 public:

diff --git a/include/umappp/combine_neighbor_sets.hpp b/include/umappp/combine_neighbor_sets.hpp
@@ -5,24 +5,27 @@
 #include <algorithm>
 #include <type_traits>
 
+#include "sanisizer/sanisizer.hpp"
+
 #include "NeighborList.hpp"
 
 namespace umappp {
 
 namespace internal {
 
 template<typename Index_, typename Float_>
-void combine_neighbor_sets(NeighborList<Index_, Float_>& x, Float_ mix_ratio) {
-    Index_ num_obs = x.size();
-    std::vector<decltype(x[0].size())> last(num_obs), original(num_obs);
+void combine_neighbor_sets(NeighborList<Index_, Float_>& x, const Float_ mix_ratio) {
+    const Index_ num_obs = x.size(); // assume that Index_ is large enough to store the number of observations.
+    auto last = sanisizer::create<std::vector<Index_> >(num_obs);
+    auto original = sanisizer::create<std::vector<Index_> >(num_obs);
 
     for (Index_ i = 0; i < num_obs; ++i) {
         auto& current = x[i];
         std::sort(current.begin(), current.end()); // sorting by ID, see below.
         original[i] = x[i].size();
     }
 
-    for (Index_ i = 0; i < num_obs; ++i ){
+    for (Index_ i = 0; i < num_obs; ++i) {
         auto& current = x[i];
 
         // Looping through the neighbors and searching for self in each
@@ -42,7 +45,7 @@ void combine_neighbor_sets(NeighborList<Index_, Float_>& x, Float_ mix_ratio) {
                 // previous iteration of the outermost loop where i and y.first
                 // swap values. So we skip this to avoid adding it twice.
                 if (i < y.first) { 
-                    Float_ product = y.second * target[curlast].second;
+                    const Float_ product = y.second * target[curlast].second;
                     Float_ prob_final;
 
                     if (mix_ratio == 1) {

diff --git a/include/umappp/find_ab.hpp b/include/umappp/find_ab.hpp
@@ -4,6 +4,8 @@
 #include <cmath>
 #include <vector>
 
+#include "sanisizer/sanisizer.hpp"
+
 namespace umappp {
 
 namespace internal {
@@ -42,13 +44,15 @@ namespace internal {
  */
 
 template<typename Float_>
-std::pair<Float_, Float_> find_ab(Float_ spread, Float_ min_dist) {
-    constexpr Float_ grid = 300;
+std::pair<Float_, Float_> find_ab(const Float_ spread, const Float_ min_dist) {
+    constexpr std::size_t grid = 300;
+    auto grid_x = sanisizer::create<std::vector<Float_> >(grid);
+    auto grid_y = sanisizer::create<std::vector<Float_> >(grid);
+    auto log_x = sanisizer::create<std::vector<Float_> >(grid);
 
     // Compute the x and y coordinates of the expected distance curve.
-    std::vector<Float_> grid_x(grid), grid_y(grid), log_x(grid);
     const Float_ delta = spread * 3 / grid;
-    for (int g = 0; g < grid; ++g) {
+    for (std::size_t g = 0; g < grid; ++g) {
         grid_x[g] = (g + 1) * delta; // +1 to avoid meaningless least squares result at x = 0, where both curves have y = 1 (and also the derivative w.r.t. b is not defined).
         log_x[g] = std::log(grid_x[g]);
         grid_y[g] = (grid_x[g] <= min_dist ? 1 : std::exp(- (grid_x[g] - min_dist) / spread));
@@ -59,25 +63,23 @@ std::pair<Float_, Float_> find_ab(Float_ spread, Float_ min_dist) {
     // We use 'limit = 0.5' because that's where most interesting stuff
     // happens, given that the curve is bounded between 0 and 1 on the y-axis.
     constexpr Float_ limit = 0.5;
-    Float_ x_half = std::log(limit) * -spread + min_dist; // guaranteed > 0, as log(limit) is negative.
-    Float_ d_half = limit / -spread; // first derivative at x_half.
+    const Float_ x_half = std::log(limit) * -spread + min_dist; // guaranteed > 0, as log(limit) is negative.
+    const Float_ d_half = limit / -spread; // first derivative at x_half.
     Float_ b = - d_half * x_half / (1 / limit - 1) / (2 * limit * limit);
     Float_ a = (1 / limit - 1) / std::pow(x_half, 2 * b);
 
-    std::vector<Float_> fit_y(grid), xpow(grid), grid_resid(grid);
-    auto compute_ss = [&](Float_ A, Float_ B) -> Float_ {
-        for (int g = 0; g < grid; ++g) {
+    auto fit_y = sanisizer::create<std::vector<Float_> >(grid);
+    auto xpow = sanisizer::create<std::vector<Float_> >(grid);
+    auto grid_resid = sanisizer::create<std::vector<Float_> >(grid);
+
+    auto compute_ss = [&](const Float_ A, const Float_ B) -> Float_ {
+        Float_ ss = 0;
+        for (std::size_t g = 0; g < grid; ++g) {
             xpow[g] = std::pow(grid_x[g], 2 * B);
             fit_y[g] = 1 / (1 + A * xpow[g]);
             grid_resid[g] = grid_y[g] - fit_y[g];
+            ss += grid_resid[g] * grid_resid[g];
         }
-
-        Float_ ss = 0;
-        for (int g = 0; g < grid; ++g) {
-            auto r = grid_resid[g];
-            ss += r * r;
-        }
-
         return ss;
     };
     Float_ ss = compute_ss(a, b);
@@ -98,7 +100,7 @@ std::pair<Float_, Float_> find_ab(Float_ spread, Float_ min_dist) {
          */
         Float_ da2 = 0, db2 = 0, dadb = 0, da_resid = 0, db_resid = 0;
 
-        for (int g = 0; g < grid; ++g) {
+        for (std::size_t g = 0; g < grid; ++g) {
             const Float_ x2b = xpow[g]; // set by the last compute_ss() call.
             const Float_ oy = fit_y[g]; // ditto
             const Float_ resid = grid_resid[g]; // ditto

diff --git a/include/umappp/initialize.hpp b/include/umappp/initialize.hpp
@@ -26,26 +26,25 @@ namespace umappp {
 namespace internal {
 
 template<typename Index_>
-int choose_num_epochs(int num_epochs, Index_ size) {
-    if (num_epochs < 0) {
-        // Choosing the number of epochs. We use a simple formula to decrease
-        // the number of epochs with increasing size, with the aim being that
-        // the 'extra work' beyond the minimal 200 epochs should be the same
-        // regardless of the numbe of observations. Given one calculation per
-        // observation per epoch, this amounts to 300 * 10000 calculations at
-        // the lower bound, so we simply choose a number of epochs that
-        // equalizes the number of calculations for any number of observations.
-        if (num_epochs < 0) {
-            constexpr Index_ limit = 10000;
-            const int minimal = 200, maximal = 300;
-            if (size <= limit) {
-                num_epochs = minimal + maximal;
-            } else {
-                num_epochs = minimal + static_cast<int>(std::ceil(maximal * static_cast<double>(limit) / static_cast<double>(size)));
-            }
-        }
+int choose_num_epochs(const int num_epochs, const Index_ size) {
+    if (num_epochs >= 0) {
+        return num_epochs;
+    }
+
+    // Choosing the number of epochs. We use a simple formula to decrease
+    // the number of epochs with increasing size, with the aim being that
+    // the 'extra work' beyond the minimal 200 epochs should be the same
+    // regardless of the number of observations. Given one calculation per
+    // observation per epoch, this amounts to 300 * 10000 calculations at
+    // the lower bound, so we simply choose a number of epochs that
+    // equalizes the number of calculations for any number of observations.
+    constexpr Index_ limit = 10000;
+    const int minimal = 200, maximal = 300;
+    if (size <= limit) {
+        return minimal + maximal;
+    } else {
+        return minimal + static_cast<int>(std::ceil(maximal * static_cast<double>(limit) / static_cast<double>(size)));
     }
-    return num_epochs;
 }
 
 }
@@ -72,7 +71,7 @@ int choose_num_epochs(int num_epochs, Index_ size) {
  * Further calls to `Status::run()` will update the embeddings in `embedding`.
  */
 template<typename Index_, typename Float_>
-Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, std::size_t num_dim, Float_* embedding, Options options) {
+Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, const std::size_t num_dim, Float_* const embedding, Options options) {
     internal::NeighborSimilaritiesOptions<Float_> nsopt;
     nsopt.local_connectivity = options.local_connectivity;
     nsopt.bandwidth = options.bandwidth;
@@ -83,7 +82,7 @@ Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, std::size_t nu
 
     // Choosing the manner of initialization.
     if (options.initialize == InitializeMethod::SPECTRAL || options.initialize == InitializeMethod::SPECTRAL_ONLY) {
-        bool attempt = internal::spectral_init(x, num_dim, embedding, options.num_threads);
+        const bool attempt = internal::spectral_init(x, num_dim, embedding, options.num_threads);
         if (!attempt && options.initialize == InitializeMethod::SPECTRAL) {
             internal::random_init<Index_>(x.size(), num_dim, embedding);
         }
@@ -93,7 +92,7 @@ Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, std::size_t nu
 
     // Finding a good a/b pair.
     if (options.a <= 0 || options.b <= 0) {
-        auto found = internal::find_ab(options.spread, options.min_dist);
+        const auto found = internal::find_ab(options.spread, options.min_dist);
         options.a = found.first;
         options.b = found.second;
     }
@@ -127,7 +126,7 @@ Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, std::size_t nu
  * Further calls to `Status::run()` will update the embeddings in `embedding`.
  */
 template<typename Index_, typename Input_, typename Float_>
-Status<Index_, Float_> initialize(const knncolle::Prebuilt<Index_, Input_, Float_>& prebuilt, std::size_t num_dim, Float_* embedding, Options options) { 
+Status<Index_, Float_> initialize(const knncolle::Prebuilt<Index_, Input_, Float_>& prebuilt, const std::size_t num_dim, Float_* const embedding, Options options) { 
     auto output = knncolle::find_nearest_neighbors(prebuilt, options.num_neighbors, options.num_threads);
     return initialize(std::move(output), num_dim, embedding, std::move(options));
 }
@@ -156,15 +155,15 @@ Status<Index_, Float_> initialize(const knncolle::Prebuilt<Index_, Input_, Float
  */
 template<typename Index_, typename Float_, class Matrix_ = knncolle::Matrix<Index_, Float_> >
 Status<Index_, Float_> initialize(
-    std::size_t data_dim,
-    std::size_t num_obs,
-    const Float_* data,
+    const std::size_t data_dim,
+    const Index_ num_obs,
+    const Float_* const data,
     const knncolle::Builder<Index_, Float_, Float_, Matrix_>& builder,
-    std::size_t num_dim,
-    Float_* embedding,
+    const std::size_t num_dim,
+    Float_* const embedding,
     Options options)
 { 
-    auto prebuilt = builder.build_unique(knncolle::SimpleMatrix<Index_, Float_>(data_dim, num_obs, data));
+    const auto prebuilt = builder.build_unique(knncolle::SimpleMatrix<Index_, Float_>(data_dim, num_obs, data));
     return initialize(*prebuilt, num_dim, embedding, std::move(options));
 }