diff --git a/include/flucoma/algorithms/public/KMeans.hpp b/include/flucoma/algorithms/public/KMeans.hpp index fab4528f..3302e888 100644 --- a/include/flucoma/algorithms/public/KMeans.hpp +++ b/include/flucoma/algorithms/public/KMeans.hpp @@ -11,6 +11,7 @@ under the European Union’s Horizon 2020 research and innovation programme #pragma once #include "../util/DistanceFuncs.hpp" +#include "../util/EigenRandom.hpp" #include "../util/FluidEigenMappings.hpp" #include "../../data/FluidDataSet.hpp" #include "../../data/FluidIndex.hpp" @@ -32,20 +33,21 @@ namespace _impl::kmeans_init { /// @param input input data /// @param k number of clusters /// @return a 2D Eigen array of means -auto randomPartition(const Eigen::MatrixXd& input, index k) +auto randomPartition(const Eigen::MatrixXd& input, index k, index seed) { // Means come from randomly assigning points and taking average std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(seed < 0? rd() : seed); std::uniform_int_distribution distrib(0, k - 1); Eigen::ArrayXXd means = Eigen::ArrayXXd::Zero(k, input.cols()); - Eigen::ArrayXi weights(k); + Eigen::ArrayXi weights = Eigen::ArrayXi::Zero(k); std::for_each_n(input.rowwise().begin(), input.rows(), [&gen, &distrib, &means, &weights](auto row) { index label = distrib(gen); means(label, Eigen::all) += row.array(); weights(label)++; }); + weights = (weights != 0).select(weights,1); means /= weights.replicate(1, 2).cast(); return means; } @@ -54,11 +56,11 @@ auto randomPartition(const Eigen::MatrixXd& input, index k) /// @param input input data /// @param k number of clusters /// @return 2D Eigen expression of sampled input points -auto randomPoints(const Eigen::MatrixXd& input, index k) +auto randomPoints(const Eigen::MatrixXd& input, index k, index seed) { // Means come from k random points std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(seed < 0? rd() : seed); std::uniform_int_distribution distrib(0, input.rows() - 1); std::vector rows(asUnsigned(k)); @@ -68,31 +70,32 @@ auto randomPoints(const Eigen::MatrixXd& input, index k) } auto squareEuclidiean = [](Eigen::Ref const& a, - Eigen::Ref const& b, - bool squared = true) { - double a_sqnorm = a.squaredNorm(); - double b_sqnorm = b.squaredNorm(); - Eigen::ArrayXXd result = (a * b.transpose()).array(); - result *= -2; - result += (a_sqnorm + b_sqnorm); - return squared ? result: result.sqrt(); + Eigen::Ref const& b) { + double a_sqnorm = a.squaredNorm(); + double b_sqnorm = b.squaredNorm(); + Eigen::ArrayXXd result = (a * b.transpose()).array(); + result *= -2; + result += (a_sqnorm + b_sqnorm); + return result; }; -auto cosine = [](auto a, auto b){ - return 1.0 - (a * b.transpose()).array(); -}; +auto squareCosine = [](Eigen::Ref const& a, + Eigen::Ref const& b) { + return (1.0 - (a * b.transpose()).array()).pow(2); +}; /// @brief initilaize means using markov chain montecarlo approximation of Kmeans++ (kmc2) /// @tparam DistanceFn function object that performs distance calculation /// @param input /// @param k /// @param distance -/// @return -template -auto akmc2(Eigen::MatrixXd const& input, index k, DistanceFn distance) +/// @return +template +auto akmc2(Eigen::MatrixXd const& input, index k, DistanceFn&& distance, + index seed) { std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(seed < 0? rd() : seed); Eigen::MatrixXd centres(k, input.cols()); // First mean sampled at random from input @@ -100,7 +103,7 @@ auto akmc2(Eigen::MatrixXd const& input, index k, DistanceFn distance) std::uniform_int_distribution(0, input.rows() - 1)(gen); centres.row(0) = input.row(centre0); - Eigen::ArrayXd q = distance(input, centres.row(0)).pow(2); + Eigen::ArrayXd q = distance(input, centres.row(0)); q /= (2 * q.sum() + 2 * q.rows()); std::discrete_distribution proposalDistribution(q.begin(), q.end()); @@ -149,7 +152,7 @@ class KMeans bool initialized() const { return mTrained; } void train(const FluidDataSet& dataset, index k, - index maxIter, InitMethod init) + index maxIter, InitMethod init, index seed) { using namespace Eigen; using namespace _impl; @@ -166,15 +169,15 @@ class KMeans { case InitMethod::randomSampling: { - mMeans = akmc2(dataPoints, mK, squareEuclidiean); + mMeans = akmc2(dataPoints, mK, squareEuclidiean, seed); break; } case InitMethod::randomPoint: { - mMeans = randomPoints(dataPoints, mK); + mMeans = randomPoints(dataPoints, mK, seed); break; } - default: mMeans = randomPartition(dataPoints, mK); + default: mMeans = randomPartition(dataPoints, mK, seed); } mEmpty = std::vector(asUnsigned(mK), false); diff --git a/include/flucoma/algorithms/public/SKMeans.hpp b/include/flucoma/algorithms/public/SKMeans.hpp index 4d16a00b..552b1838 100644 --- a/include/flucoma/algorithms/public/SKMeans.hpp +++ b/include/flucoma/algorithms/public/SKMeans.hpp @@ -33,7 +33,7 @@ class SKMeans : public KMeans using KMeans::InitMethod; void train(const FluidDataSet& dataset, index k, - index maxIter, InitMethod initialize ) + index maxIter, InitMethod initialize, index seed) { using namespace Eigen; using namespace _impl; @@ -45,7 +45,7 @@ class SKMeans : public KMeans { mK = k; mDims = dataset.pointSize(); - initMeans(dataPoints, initialize); + initMeans(dataPoints, initialize, seed); } while (maxIter-- > 0) @@ -72,7 +72,7 @@ class SKMeans : public KMeans } private: - void initMeans(Eigen::MatrixXd& dataPoints, InitMethod init) + void initMeans(Eigen::MatrixXd& dataPoints, InitMethod init, index seed) { using namespace Eigen; mMeans = ArrayXXd::Zero(mK, mDims); @@ -81,17 +81,17 @@ class SKMeans : public KMeans switch(init) { case InitMethod::randomSampling: - { - mMeans = akmc2(dataPoints, mK,cosine); + { + mMeans = akmc2(dataPoints, mK, squareCosine, seed); break; } case InitMethod::randomPoint: { - mMeans = randomPoints(dataPoints, mK); + mMeans = randomPoints(dataPoints, mK, seed); break; } default: { - mMeans = randomPartition(dataPoints, mK); + mMeans = randomPartition(dataPoints, mK, seed); mMeans.matrix().rowwise().normalize(); } } diff --git a/include/flucoma/clients/nrt/KMeansClient.hpp b/include/flucoma/clients/nrt/KMeansClient.hpp index 92288f1a..45a2906a 100644 --- a/include/flucoma/clients/nrt/KMeansClient.hpp +++ b/include/flucoma/clients/nrt/KMeansClient.hpp @@ -26,7 +26,8 @@ constexpr auto KMeansParams = defineParameters( LongParam("numClusters", "Number of Clusters", 4, Min(1)), LongParam("maxIter", "Max number of Iterations", 100, Min(1)), EnumParam("initMethod", "Initialize method", 0, "Random Assignment", - "Random Points", "Sampling")); + "Random Points", "Sampling"), + LongParam("seed","Random Seed", -1)); class KMeansClient : public FluidBaseClient, OfflineIn, @@ -34,7 +35,7 @@ class KMeansClient : public FluidBaseClient, ModelObject, public DataClient { - enum {kName, kNumClusters, kMaxIter, kInit}; + enum {kName, kNumClusters, kMaxIter, kInit, kRandomSeed}; ParameterTrackChanges mTracker; public: using string = std::string; @@ -84,7 +85,8 @@ class KMeansClient : public FluidBaseClient, if (dataSet.size() == 0) return Error(EmptyDataSet); if (k <= 1) return Error(SmallK); if(mTracker.changed(k)) mAlgorithm.clear(); - mAlgorithm.train(dataSet, k, maxIter, static_cast(get())); + mAlgorithm.train(dataSet, k, maxIter, static_cast(get()), + get()); IndexVector assignments(dataSet.size()); mAlgorithm.getAssignments(assignments); return getCounts(assignments, k); @@ -104,7 +106,8 @@ class KMeansClient : public FluidBaseClient, if (k <= 1) return Error(SmallK); if (maxIter <= 0) maxIter = 100; if(mTracker.changed(k)) mAlgorithm.clear(); - mAlgorithm.train(dataSet, k, maxIter, static_cast(get())); + mAlgorithm.train(dataSet, k, maxIter, static_cast(get()), + get()); IndexVector assignments(dataSet.size()); mAlgorithm.getAssignments(assignments); StringVectorView ids = dataSet.getIds(); @@ -172,7 +175,8 @@ class KMeansClient : public FluidBaseClient, if (dataSet.size() == 0) return Error(EmptyDataSet); if (k <= 1) return Error(SmallK); if (maxIter <= 0) maxIter = 100; - mAlgorithm.train(dataSet, k, maxIter, static_cast(get())); + mAlgorithm.train(dataSet, k, maxIter, static_cast(get()), + get()); IndexVector assignments(dataSet.size()); mAlgorithm.getAssignments(assignments); transform(srcClient, dstClient); diff --git a/include/flucoma/clients/nrt/SKMeansClient.hpp b/include/flucoma/clients/nrt/SKMeansClient.hpp index df09e8c6..f6b2f41c 100644 --- a/include/flucoma/clients/nrt/SKMeansClient.hpp +++ b/include/flucoma/clients/nrt/SKMeansClient.hpp @@ -20,7 +20,7 @@ namespace fluid { namespace client { namespace skmeans { -enum { kName, kNumClusters, kThreshold, kMaxIter, kInit }; +enum { kName, kNumClusters, kThreshold, kMaxIter, kInit, kRandomSeed }; constexpr auto SKMeansParams = defineParameters( StringParam>("name", "Name"), @@ -28,7 +28,8 @@ constexpr auto SKMeansParams = defineParameters( FloatParam("encodingThreshold", "Encoding Threshold", 0.25, Min(0), Max(1)), LongParam("maxIter", "Max number of Iterations", 100, Min(1)), EnumParam("initMethod", "Initialize method", 0, "Random Assignment", - "Random Points", "Sampling")); + "Random Points", "Sampling"), + LongParam("seed","Random Seed", -1)); class SKMeansClient : public FluidBaseClient, OfflineIn, @@ -83,7 +84,8 @@ class SKMeansClient : public FluidBaseClient, if (dataSet.size() == 0) return Error(EmptyDataSet); if (k <= 1) return Error(SmallK); if(mTracker.changed(k)) mAlgorithm.clear(); - mAlgorithm.train(dataSet, k, maxIter, static_cast(get())); + mAlgorithm.train(dataSet, k, maxIter, static_cast(get()), + get()); IndexVector assignments(dataSet.size()); mAlgorithm.getAssignments(assignments); return getCounts(assignments, k); @@ -104,7 +106,8 @@ class SKMeansClient : public FluidBaseClient, if (k <= 1) return Error(SmallK); if (maxIter <= 0) maxIter = 100; if(mTracker.changed(k)) mAlgorithm.clear(); - mAlgorithm.train(dataSet, k, maxIter, static_cast(get())); + mAlgorithm.train(dataSet, k, maxIter, static_cast(get()), + get()); IndexVector assignments(dataSet.size()); mAlgorithm.getAssignments(assignments); StringVectorView ids = dataSet.getIds(); @@ -175,7 +178,8 @@ class SKMeansClient : public FluidBaseClient, if (k <= 1) return Error(SmallK); if (maxIter <= 0) maxIter = 100; if(mTracker.changed(k)) mAlgorithm.clear(); - mAlgorithm.train(dataSet, k, maxIter, static_cast(get())); + mAlgorithm.train(dataSet, k, maxIter, static_cast(get()), + get()); IndexVector assignments(dataSet.size()); mAlgorithm.getAssignments(assignments); encode(srcClient, dstClient); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 63af0ac4..6702da23 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -116,6 +116,7 @@ add_test_executable(TestTransientSlice algorithms/public/TestTransientSlice.cpp) add_test_executable(TestMLP algorithms/public/TestMLP.cpp) add_test_executable(TestKMeans algorithms/public/TestKMeans.cpp) +add_test_executable(TestSKMeans algorithms/public/TestSKMeans.cpp) add_test_executable(TestNMFCross algorithms/public/TestNMFCross.cpp) add_test_executable(TestGriffinLim algorithms/public/TestGriffinLim.cpp) add_test_executable(TestNMF algorithms/public/TestNMF.cpp) @@ -160,6 +161,7 @@ catch_discover_tests(TestBufferedProcess WORKING_DIRECTORY "${CMAKE_BINARY_DIR}" catch_discover_tests(TestMLP WORKING_DIRECTORY "${CMAKE_BINARY_DIR}") catch_discover_tests(TestKMeans WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +catch_discover_tests(TestSKMeans) catch_discover_tests(TestEigenRandom WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) catch_discover_tests(TestNMFCross WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) catch_discover_tests(TestGriffinLim WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) diff --git a/tests/algorithms/public/TestKMeans.cpp b/tests/algorithms/public/TestKMeans.cpp index 67d3b635..fbb079c6 100644 --- a/tests/algorithms/public/TestKMeans.cpp +++ b/tests/algorithms/public/TestKMeans.cpp @@ -14,7 +14,7 @@ TEST_CASE("KMeans against hand worked example") fluid::algorithm::KMeans algo; FluidTensor initialMeans{{0,0},{1,1}}; algo.setMeans(initialMeans); - algo.train(ds, 2, 2, KMeans::InitMethod::randomPartion); + algo.train(ds, 2, 2, KMeans::InitMethod::randomPartion, -1); FluidTensor means(2, 2); algo.getMeans(means); @@ -31,4 +31,45 @@ TEST_CASE("KMeans against hand worked example") REQUIRE_THAT(assignments, Catch::Matchers::RangeEquals({0, 0, 1, 1})); } +TEST_CASE("KMeans is reproducable with manual random seed") +{ + + using Tensor = FluidTensor; + + Tensor points{{0, 0}, {0.5, 0.}, {0.5, 1}, {1, 1}}; + FluidTensor ids{"0", "1", "2", "3"}; + FluidDataSet ds(ids, points); + + std::vector means(3,Tensor(2,2)); + + auto initmethod = GENERATE(algorithm::KMeans::InitMethod::randomPartion, + algorithm::KMeans::InitMethod::randomPoint, + algorithm::KMeans::InitMethod::randomSampling); + INFO("Init method " << static_cast(initmethod)); + algorithm::KMeans algo; + algo.train(ds, 2, 1, initmethod,42); + algo.getMeans(means[0]); + algo.clear(); + + algo.train(ds, 2, 1, initmethod,42); + algo.getMeans(means[1]); + algo.clear(); + + algo.train(ds, 2, 1, initmethod,4398); + algo.getMeans(means[2]); + algo.clear(); + + using Catch::Matchers::RangeEquals; + using Catch::Matchers::WithinRel; + auto comp = [](double x, double y) -> bool { + return Catch::Matchers::WithinRel(x).match(y); + }; + + REQUIRE_THAT(means[1], RangeEquals(means[0], comp)); +if(static_cast(initmethod) != 2) // can converge for randomsampling + { + REQUIRE_THAT(means[1], !RangeEquals(means[2], comp)); + } +} + } // namespace fluid::algorithm \ No newline at end of file diff --git a/tests/algorithms/public/TestSKMeans.cpp b/tests/algorithms/public/TestSKMeans.cpp new file mode 100644 index 00000000..9a7132ef --- /dev/null +++ b/tests/algorithms/public/TestSKMeans.cpp @@ -0,0 +1,48 @@ +#define CATCH_CONFIG_MAIN +#include +#include + +namespace fluid::algorithm { + +TEST_CASE("SKMeans is reproducable with manual random seed") +{ + + using Tensor = FluidTensor; + + Tensor points{{0.1, 0.1}, {0.5, 0.1}, {0.5, 1}, {1, 1}}; + FluidTensor ids{"0", "1", "2", "3"}; + FluidDataSet ds(ids, points); + + std::vector means(3,Tensor(2,2)); + + auto initmethod = GENERATE(algorithm::KMeans::InitMethod::randomPartion, + algorithm::KMeans::InitMethod::randomPoint, + algorithm::KMeans::InitMethod::randomSampling); + INFO("Init method " << static_cast(initmethod)); + algorithm::SKMeans algo; + algo.train(ds, 2, 1, initmethod,42); + algo.getMeans(means[0]); + algo.clear(); + + algo.train(ds, 2, 1, initmethod,42); + algo.getMeans(means[1]); + algo.clear(); + + algo.train(ds, 2, 1, initmethod,4398); + algo.getMeans(means[2]); + algo.clear(); + + using Catch::Matchers::RangeEquals; + + auto comp = [](double x, double y) -> bool { + return Catch::Matchers::WithinRel(x).match(y); + }; + + REQUIRE_THAT(means[1], RangeEquals(means[0], comp)); + if(static_cast(initmethod) != 2) // can converge for randomsampling + { + REQUIRE_THAT(means[1], !RangeEquals(means[2], comp)); + } +} + +} // namespace fluid::algorithm \ No newline at end of file