Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ else()
find_package(ltla_aarand CONFIG REQUIRED)
find_package(ltla_irlba CONFIG REQUIRED)
find_package(ltla_subpar 0.4.0 CONFIG REQUIRED)
find_package(ltla_sanisizer 0.1.3 CONFIG REQUIRED)
find_package(knncolle_knncolle 3.0.0 CONFIG REQUIRED)
endif()

target_link_libraries(umappp INTERFACE ltla::aarand ltla::irlba ltla::subpar knncolle::knncolle)
target_link_libraries(umappp INTERFACE ltla::aarand ltla::irlba ltla::subpar ltla::sanisizer knncolle::knncolle)

# Tests
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
Expand Down
1 change: 1 addition & 0 deletions cmake/Config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

include(CMakeFindDependencyMacro)
find_dependency(ltla_aarand CONFIG REQUIRED)
find_dependency(ltla_sanisizer 0.1.3 CONFIG REQUIRED)
find_dependency(ltla_subpar 0.4.0 CONFIG REQUIRED)
find_dependency(ltla_irlba CONFIG REQUIRED)
find_dependency(knncolle_knncolle 3.0.0 CONFIG REQUIRED)
Expand Down
7 changes: 7 additions & 0 deletions extern/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ FetchContent_Declare(
GIT_TAG master # ^0.4.0
)

FetchContent_Declare(
sanisizer
GIT_REPOSITORY https://github.com/LTLA/sanisizer
GIT_TAG master # ^0.1.3
)

FetchContent_Declare(
knncolle
GIT_REPOSITORY https://github.com/knncolle/knncolle
Expand All @@ -26,5 +32,6 @@ FetchContent_Declare(

FetchContent_MakeAvailable(aarand)
FetchContent_MakeAvailable(subpar)
FetchContent_MakeAvailable(sanisizer)
FetchContent_MakeAvailable(knncolle)
FetchContent_MakeAvailable(irlba)
16 changes: 9 additions & 7 deletions include/umappp/Status.hpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#ifndef UMAPPP_STATUS_HPP
#define UMAPPP_STATUS_HPP

#include "Options.hpp"
#include "optimize_layout.hpp"

#include <random>
#include <cstddef>

#include "sanisizer/sanisizer.hpp"

#include "Options.hpp"
#include "optimize_layout.hpp"

/**
* @file Status.hpp
* @brief Status of the UMAP algorithm.
Expand All @@ -27,7 +29,7 @@ class Status {
/**
* @cond
*/
Status(internal::EpochData<Index_, Float_> epochs, Options options, std::size_t num_dim, Float_* embedding) :
Status(internal::EpochData<Index_, Float_> epochs, Options options, const std::size_t num_dim, Float_* const embedding) :
my_epochs(std::move(epochs)),
my_options(std::move(options)),
my_engine(my_options.seed),
Expand Down Expand Up @@ -84,9 +86,9 @@ class Status {
* The contents of the new array in `ptr` should be the same as the array that it replaces, as `run()` will continue the iteration from the coordinates inside the array.
* This is enforced by default when `copy = true`, but if the supplied `ptr` already contains a copy, the caller may set `copy = false` to avoid extra work
*/
void set_embedding(Float_* ptr, bool copy = true) {
void set_embedding(Float_* const ptr, const bool copy = true) {
if (copy) {
std::size_t n = num_dimensions() * static_cast<std::size_t>(num_observations()); // cast to avoid overflow.
const std::size_t n = sanisizer::product_unsafe<std::size_t>(num_dimensions(), num_observations());
std::copy_n(my_embedding, n, ptr);
}
my_embedding = ptr;
Expand All @@ -111,7 +113,7 @@ class Status {
* @return The number of observations in the dataset.
*/
Index_ num_observations() const {
return my_epochs.head.size();
return my_epochs.head.size() - 1;
}

public:
Expand Down
13 changes: 8 additions & 5 deletions include/umappp/combine_neighbor_sets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,27 @@
#include <algorithm>
#include <type_traits>

#include "sanisizer/sanisizer.hpp"

#include "NeighborList.hpp"

namespace umappp {

namespace internal {

template<typename Index_, typename Float_>
void combine_neighbor_sets(NeighborList<Index_, Float_>& x, Float_ mix_ratio) {
Index_ num_obs = x.size();
std::vector<decltype(x[0].size())> last(num_obs), original(num_obs);
void combine_neighbor_sets(NeighborList<Index_, Float_>& x, const Float_ mix_ratio) {
const Index_ num_obs = x.size(); // assume that Index_ is large enough to store the number of observations.
auto last = sanisizer::create<std::vector<Index_> >(num_obs);
auto original = sanisizer::create<std::vector<Index_> >(num_obs);

for (Index_ i = 0; i < num_obs; ++i) {
auto& current = x[i];
std::sort(current.begin(), current.end()); // sorting by ID, see below.
original[i] = x[i].size();
}

for (Index_ i = 0; i < num_obs; ++i ){
for (Index_ i = 0; i < num_obs; ++i) {
auto& current = x[i];

// Looping through the neighbors and searching for self in each
Expand All @@ -42,7 +45,7 @@ void combine_neighbor_sets(NeighborList<Index_, Float_>& x, Float_ mix_ratio) {
// previous iteration of the outermost loop where i and y.first
// swap values. So we skip this to avoid adding it twice.
if (i < y.first) {
Float_ product = y.second * target[curlast].second;
const Float_ product = y.second * target[curlast].second;
Float_ prob_final;

if (mix_ratio == 1) {
Expand Down
36 changes: 19 additions & 17 deletions include/umappp/find_ab.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <cmath>
#include <vector>

#include "sanisizer/sanisizer.hpp"

namespace umappp {

namespace internal {
Expand Down Expand Up @@ -42,13 +44,15 @@ namespace internal {
*/

template<typename Float_>
std::pair<Float_, Float_> find_ab(Float_ spread, Float_ min_dist) {
constexpr Float_ grid = 300;
std::pair<Float_, Float_> find_ab(const Float_ spread, const Float_ min_dist) {
constexpr std::size_t grid = 300;
auto grid_x = sanisizer::create<std::vector<Float_> >(grid);
auto grid_y = sanisizer::create<std::vector<Float_> >(grid);
auto log_x = sanisizer::create<std::vector<Float_> >(grid);

// Compute the x and y coordinates of the expected distance curve.
std::vector<Float_> grid_x(grid), grid_y(grid), log_x(grid);
const Float_ delta = spread * 3 / grid;
for (int g = 0; g < grid; ++g) {
for (std::size_t g = 0; g < grid; ++g) {
grid_x[g] = (g + 1) * delta; // +1 to avoid meaningless least squares result at x = 0, where both curves have y = 1 (and also the derivative w.r.t. b is not defined).
log_x[g] = std::log(grid_x[g]);
grid_y[g] = (grid_x[g] <= min_dist ? 1 : std::exp(- (grid_x[g] - min_dist) / spread));
Expand All @@ -59,25 +63,23 @@ std::pair<Float_, Float_> find_ab(Float_ spread, Float_ min_dist) {
// We use 'limit = 0.5' because that's where most interesting stuff
// happens, given that the curve is bounded between 0 and 1 on the y-axis.
constexpr Float_ limit = 0.5;
Float_ x_half = std::log(limit) * -spread + min_dist; // guaranteed > 0, as log(limit) is negative.
Float_ d_half = limit / -spread; // first derivative at x_half.
const Float_ x_half = std::log(limit) * -spread + min_dist; // guaranteed > 0, as log(limit) is negative.
const Float_ d_half = limit / -spread; // first derivative at x_half.
Float_ b = - d_half * x_half / (1 / limit - 1) / (2 * limit * limit);
Float_ a = (1 / limit - 1) / std::pow(x_half, 2 * b);

std::vector<Float_> fit_y(grid), xpow(grid), grid_resid(grid);
auto compute_ss = [&](Float_ A, Float_ B) -> Float_ {
for (int g = 0; g < grid; ++g) {
auto fit_y = sanisizer::create<std::vector<Float_> >(grid);
auto xpow = sanisizer::create<std::vector<Float_> >(grid);
auto grid_resid = sanisizer::create<std::vector<Float_> >(grid);

auto compute_ss = [&](const Float_ A, const Float_ B) -> Float_ {
Float_ ss = 0;
for (std::size_t g = 0; g < grid; ++g) {
xpow[g] = std::pow(grid_x[g], 2 * B);
fit_y[g] = 1 / (1 + A * xpow[g]);
grid_resid[g] = grid_y[g] - fit_y[g];
ss += grid_resid[g] * grid_resid[g];
}

Float_ ss = 0;
for (int g = 0; g < grid; ++g) {
auto r = grid_resid[g];
ss += r * r;
}

return ss;
};
Float_ ss = compute_ss(a, b);
Expand All @@ -98,7 +100,7 @@ std::pair<Float_, Float_> find_ab(Float_ spread, Float_ min_dist) {
*/
Float_ da2 = 0, db2 = 0, dadb = 0, da_resid = 0, db_resid = 0;

for (int g = 0; g < grid; ++g) {
for (std::size_t g = 0; g < grid; ++g) {
const Float_ x2b = xpow[g]; // set by the last compute_ss() call.
const Float_ oy = fit_y[g]; // ditto
const Float_ resid = grid_resid[g]; // ditto
Expand Down
57 changes: 28 additions & 29 deletions include/umappp/initialize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,26 +26,25 @@ namespace umappp {
namespace internal {

template<typename Index_>
int choose_num_epochs(int num_epochs, Index_ size) {
if (num_epochs < 0) {
// Choosing the number of epochs. We use a simple formula to decrease
// the number of epochs with increasing size, with the aim being that
// the 'extra work' beyond the minimal 200 epochs should be the same
// regardless of the numbe of observations. Given one calculation per
// observation per epoch, this amounts to 300 * 10000 calculations at
// the lower bound, so we simply choose a number of epochs that
// equalizes the number of calculations for any number of observations.
if (num_epochs < 0) {
constexpr Index_ limit = 10000;
const int minimal = 200, maximal = 300;
if (size <= limit) {
num_epochs = minimal + maximal;
} else {
num_epochs = minimal + static_cast<int>(std::ceil(maximal * static_cast<double>(limit) / static_cast<double>(size)));
}
}
int choose_num_epochs(const int num_epochs, const Index_ size) {
if (num_epochs >= 0) {
return num_epochs;
}

// Choosing the number of epochs. We use a simple formula to decrease
// the number of epochs with increasing size, with the aim being that
// the 'extra work' beyond the minimal 200 epochs should be the same
// regardless of the number of observations. Given one calculation per
// observation per epoch, this amounts to 300 * 10000 calculations at
// the lower bound, so we simply choose a number of epochs that
// equalizes the number of calculations for any number of observations.
constexpr Index_ limit = 10000;
const int minimal = 200, maximal = 300;
if (size <= limit) {
return minimal + maximal;
} else {
return minimal + static_cast<int>(std::ceil(maximal * static_cast<double>(limit) / static_cast<double>(size)));
}
return num_epochs;
}

}
Expand All @@ -72,7 +71,7 @@ int choose_num_epochs(int num_epochs, Index_ size) {
* Further calls to `Status::run()` will update the embeddings in `embedding`.
*/
template<typename Index_, typename Float_>
Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, std::size_t num_dim, Float_* embedding, Options options) {
Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, const std::size_t num_dim, Float_* const embedding, Options options) {
internal::NeighborSimilaritiesOptions<Float_> nsopt;
nsopt.local_connectivity = options.local_connectivity;
nsopt.bandwidth = options.bandwidth;
Expand All @@ -83,7 +82,7 @@ Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, std::size_t nu

// Choosing the manner of initialization.
if (options.initialize == InitializeMethod::SPECTRAL || options.initialize == InitializeMethod::SPECTRAL_ONLY) {
bool attempt = internal::spectral_init(x, num_dim, embedding, options.num_threads);
const bool attempt = internal::spectral_init(x, num_dim, embedding, options.num_threads);
if (!attempt && options.initialize == InitializeMethod::SPECTRAL) {
internal::random_init<Index_>(x.size(), num_dim, embedding);
}
Expand All @@ -93,7 +92,7 @@ Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, std::size_t nu

// Finding a good a/b pair.
if (options.a <= 0 || options.b <= 0) {
auto found = internal::find_ab(options.spread, options.min_dist);
const auto found = internal::find_ab(options.spread, options.min_dist);
options.a = found.first;
options.b = found.second;
}
Expand Down Expand Up @@ -127,7 +126,7 @@ Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, std::size_t nu
* Further calls to `Status::run()` will update the embeddings in `embedding`.
*/
template<typename Index_, typename Input_, typename Float_>
Status<Index_, Float_> initialize(const knncolle::Prebuilt<Index_, Input_, Float_>& prebuilt, std::size_t num_dim, Float_* embedding, Options options) {
Status<Index_, Float_> initialize(const knncolle::Prebuilt<Index_, Input_, Float_>& prebuilt, const std::size_t num_dim, Float_* const embedding, Options options) {
auto output = knncolle::find_nearest_neighbors(prebuilt, options.num_neighbors, options.num_threads);
return initialize(std::move(output), num_dim, embedding, std::move(options));
}
Expand Down Expand Up @@ -156,15 +155,15 @@ Status<Index_, Float_> initialize(const knncolle::Prebuilt<Index_, Input_, Float
*/
template<typename Index_, typename Float_, class Matrix_ = knncolle::Matrix<Index_, Float_> >
Status<Index_, Float_> initialize(
std::size_t data_dim,
std::size_t num_obs,
const Float_* data,
const std::size_t data_dim,
const Index_ num_obs,
const Float_* const data,
const knncolle::Builder<Index_, Float_, Float_, Matrix_>& builder,
std::size_t num_dim,
Float_* embedding,
const std::size_t num_dim,
Float_* const embedding,
Options options)
{
auto prebuilt = builder.build_unique(knncolle::SimpleMatrix<Index_, Float_>(data_dim, num_obs, data));
const auto prebuilt = builder.build_unique(knncolle::SimpleMatrix<Index_, Float_>(data_dim, num_obs, data));
return initialize(*prebuilt, num_dim, embedding, std::move(options));
}

Expand Down
Loading