From 52441398f4feefabd6a1d19df72f6f0a43c58bb5 Mon Sep 17 00:00:00 2001 From: eleobert Date: Sat, 6 Feb 2021 16:35:53 +0300 Subject: [PATCH] Add dynamic matrix --- include/dkm.hpp | 97 ++++++++++++++++++++++-------------------- include/dkm_matrix.hpp | 63 +++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 47 deletions(-) create mode 100644 include/dkm_matrix.hpp diff --git a/include/dkm.hpp b/include/dkm.hpp index 38d1b68..5144706 100644 --- a/include/dkm.hpp +++ b/include/dkm.hpp @@ -4,6 +4,8 @@ #ifndef DKM_KMEANS_H #define DKM_KMEANS_H +#include "dkm_matrix.hpp" + #include #include #include @@ -28,33 +30,33 @@ namespace details { /* Calculate the square of the distance between two points. */ -template -T distance_squared(const std::array& point_a, const std::array& point_b) { +template +T distance_squared(const std::vector& point_a, const std::vector& point_b) { T d_squared = T(); - for (typename std::array::size_type i = 0; i < N; ++i) { + for (typename std::vector::size_type i = 0; i < point_a.size(); ++i) { auto delta = point_a[i] - point_b[i]; d_squared += delta * delta; } return d_squared; } -template -T distance(const std::array& point_a, const std::array& point_b) { +template +T distance(const std::vector& point_a, const std::vector& point_b) { return std::sqrt(distance_squared(point_a, point_b)); } /* Calculate the smallest distance between each of the data points and any of the input means. */ -template +template std::vector closest_distance( - const std::vector>& means, const std::vector>& data) { + const std::vector>& means, const dkm::as_matrix& data) { std::vector distances; - distances.reserve(data.size()); - for (auto& d : data) { - T closest = distance_squared(d, means[0]); + distances.reserve(data.n_rows); + for (size_t i = 0; i < data.n_rows; i++) { + T closest = distance_squared(data.row(i), means[0]); for (auto& m : means) { - T distance = distance_squared(d, m); + T distance = distance_squared(data.row(i), m); if (distance < closest) closest = distance; } @@ -67,20 +69,20 @@ std::vector closest_distance( This is an alternate initialization method based on the [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B) initialization algorithm. */ -template -std::vector> random_plusplus(const std::vector>& data, uint32_t k, uint64_t seed) { +template +std::vector> random_plusplus(const dkm::as_matrix& data, uint32_t k, uint64_t seed) { assert(k > 0); - assert(data.size() > 0); - using input_size_t = typename std::array::size_type; - std::vector> means; + assert(data.n_rows > 0); + using input_size_t = typename std::vector::size_type; + std::vector> means; // Using a very simple PRBS generator, parameters selected according to // https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use std::linear_congruential_engine rand_engine(seed); // Select first mean at random from the set { - std::uniform_int_distribution uniform_generator(0, data.size() - 1); - means.push_back(data[uniform_generator(rand_engine)]); + std::uniform_int_distribution uniform_generator(0, data.n_rows - 1); + means.push_back(data.row(uniform_generator(rand_engine))); } for (uint32_t count = 1; count < k; ++count) { @@ -94,7 +96,7 @@ std::vector> random_plusplus(const std::vector input_size_t i = 0; std::discrete_distribution generator(distances.size(), 0.0, 0.0, [&distances, &i](double) { return distances[i++]; }); #endif - means.push_back(data[generator(rand_engine)]); + means.push_back(data.row(generator(rand_engine))); } return means; } @@ -102,11 +104,11 @@ std::vector> random_plusplus(const std::vector /* Calculate the index of the mean a particular data point is closest to (euclidean distance) */ -template -uint32_t closest_mean(const std::array& point, const std::vector>& means) { +template +uint32_t closest_mean(const std::vector& point, const std::vector>& means) { assert(!means.empty()); T smallest_distance = distance_squared(point, means[0]); - typename std::array::size_type index = 0; + typename std::vector::size_type index = 0; T distance; for (size_t i = 1; i < means.size(); ++i) { distance = distance_squared(point, means[i]); @@ -121,12 +123,12 @@ uint32_t closest_mean(const std::array& point, const std::vector +template std::vector calculate_clusters( - const std::vector>& data, const std::vector>& means) { + const dkm::as_matrix& data, const std::vector>& means) { std::vector clusters; - for (auto& point : data) { - clusters.push_back(closest_mean(point, means)); + for (size_t i = 0; i < data.n_rows; i++) { + clusters.push_back(closest_mean(data.row(i), means)); } return clusters; } @@ -134,18 +136,19 @@ std::vector calculate_clusters( /* Calculate means based on data points and their cluster assignments. */ -template -std::vector> calculate_means(const std::vector>& data, +template +std::vector> calculate_means(const dkm::as_matrix& data, const std::vector& clusters, - const std::vector>& old_means, + const std::vector>& old_means, uint32_t k) { - std::vector> means(k); + std::vector> means(k); std::vector count(k, T()); - for (size_t i = 0; i < std::min(clusters.size(), data.size()); ++i) { + for (size_t i = 0; i < std::min(clusters.size(), data.n_rows); ++i) { auto& mean = means[clusters[i]]; + mean = (mean.empty())? std::vector(data.n_cols): mean; count[clusters[i]] += 1; - for (size_t j = 0; j < std::min(data[i].size(), mean.size()); ++j) { - mean[j] += data[i][j]; + for (size_t j = 0; j < std::min(data.n_cols, mean.size()); ++j) { + mean[j] += data(i, j); } } for (size_t i = 0; i < k; ++i) { @@ -160,9 +163,9 @@ std::vector> calculate_means(const std::vector return means; } -template +template std::vector deltas( - const std::vector>& old_means, const std::vector>& means) + const std::vector>& old_means, const std::vector>& means) { std::vector distances; distances.reserve(means.size()); @@ -267,19 +270,19 @@ with the [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B) used for initializing the means. */ -template -std::tuple>, std::vector> kmeans_lloyd( - const std::vector>& data, const clustering_parameters& parameters) { +template +std::tuple>, std::vector> kmeans_lloyd( + const dkm::as_matrix& data, const clustering_parameters& parameters) { static_assert(std::is_arithmetic::value && std::is_signed::value, "kmeans_lloyd requires the template parameter T to be a signed arithmetic type (e.g. float, double, int)"); assert(parameters.get_k() > 0); // k must be greater than zero - assert(data.size() >= parameters.get_k()); // there must be at least k data points + assert(data.n_rows >= parameters.get_k()); // there must be at least k data points std::random_device rand_device; uint64_t seed = parameters.has_random_seed() ? parameters.get_random_seed() : rand_device(); - std::vector> means = details::random_plusplus(data, parameters.get_k(), seed); + std::vector> means = details::random_plusplus(data, parameters.get_k(), seed); - std::vector> old_means; - std::vector> old_old_means; + std::vector> old_means; + std::vector> old_old_means; std::vector clusters; // Calculate new means until convergence is reached or we hit the maximum iteration count uint64_t count = 0; @@ -293,7 +296,7 @@ std::tuple>, std::vector> kmeans_lloyd( && !(parameters.has_max_iteration() && count == parameters.get_max_iteration()) && !(parameters.has_min_delta() && details::deltas_below_limit(details::deltas(old_means, means), parameters.get_min_delta()))); - return std::tuple>, std::vector>(means, clusters); + return std::tuple>, std::vector>(means, clusters); } /* @@ -301,10 +304,10 @@ This overload exists to support legacy code which uses this signature of the kme Any code still using this signature should move to the version of this function that uses a `clustering_parameters` struct for configuration. */ -template -std::tuple>, std::vector> kmeans_lloyd( - const std::vector>& data, uint32_t k, - uint64_t max_iter = 0, T min_delta = -1.0) { +template +std::tuple>, std::vector> kmeans_lloyd( + const dkm::as_matrix& data, uint32_t k, + uint64_t max_iter = 0, T min_delta = -1.0) { clustering_parameters parameters(k); if (max_iter != 0) { parameters.set_max_iteration(max_iter); diff --git a/include/dkm_matrix.hpp b/include/dkm_matrix.hpp new file mode 100644 index 0000000..a468610 --- /dev/null +++ b/include/dkm_matrix.hpp @@ -0,0 +1,63 @@ +#pragma once + +#ifndef DKM_MATRIX_HPP +#define DKM_MATRIX_HPP + +#include +#include +#include + +namespace dkm +{ +// This class is only an interface! Not designed to be used outside library internals. + +template +class as_matrix +{ + const T *data = nullptr; + + // column major indexer + auto cm_indexer(size_t i, size_t j) const -> const T& + { + assert(i >= 0 && j >= 0 && i < n_rows && j < n_cols); + return data[j * n_rows + i]; + } + + // row major indexer + auto rm_indexer(size_t i, size_t j) const -> const T& + { + assert(i >= 0 && j >= 0 && i < n_rows && j < n_cols); + return data[i * n_cols + j]; + } + + const T& (as_matrix::*indexer) (size_t, size_t) const = nullptr; + +public: + const size_t n_rows, n_cols; + + as_matrix(const T *data, size_t n_rows, size_t n_cols, bool col_major = true) + : data(data), n_rows(n_rows), n_cols(n_cols), + indexer((col_major)? &as_matrix::cm_indexer: &as_matrix::rm_indexer) + {} + + auto row(size_t i) const -> std::vector; + auto operator()(size_t i, size_t j) const -> const T& + { + return ((this)->*(this->indexer))(i, j); + } +}; + + +template +auto as_matrix::row(size_t i) const -> std::vector +{ + auto res = std::vector(n_cols); + for(size_t j = 0; j < n_cols; j++) + { + res[j] = (*this)(i, j); + } + return res; +} + +} +#endif