diff --git a/.gitignore b/.gitignore index 7dfea74f..90a24ee0 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ # build dir /build/ +/build_debug/ # editor configuration /.vscode/ diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index bae58feb..4d8e80ff 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -20,8 +20,21 @@ namespace chopper { +enum class partitioning_scheme : uint8_t +{ + blocked, // 0 + sorted, // 1 + folded, // 2 + weighted_fold, // 3 + similarity, // 4 + lsh, // 5 + lsh_sim // 6 +}; + struct configuration { + partitioning_scheme partitioning_approach{partitioning_scheme::lsh_sim}; + /*!\name General Configuration * \{ */ @@ -77,6 +90,10 @@ struct configuration mutable seqan::hibf::concurrent_timer union_estimation_timer{}; mutable seqan::hibf::concurrent_timer rearrangement_timer{}; mutable seqan::hibf::concurrent_timer dp_algorithm_timer{}; + mutable seqan::hibf::concurrent_timer lsh_algorithm_timer{}; + mutable seqan::hibf::concurrent_timer search_partition_algorithm_timer{}; + mutable seqan::hibf::concurrent_timer intital_partition_timer{}; + mutable seqan::hibf::concurrent_timer small_layouts_timer{}; void read_from(std::istream & stream); diff --git a/include/chopper/layout/determine_split_bins.hpp b/include/chopper/layout/determine_split_bins.hpp new file mode 100644 index 00000000..91400e42 --- /dev/null +++ b/include/chopper/layout/determine_split_bins.hpp @@ -0,0 +1,29 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides chopper::determine_split_bins. + * \author Svenja Mehringer + */ + +#pragma once + +#include + +#include + +namespace chopper::layout +{ + +std::pair determine_split_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions); + +} \ No newline at end of file diff --git a/include/chopper/layout/execute.hpp b/include/chopper/layout/execute.hpp index 52bfc3b7..00c4fa2e 100644 --- a/include/chopper/layout/execute.hpp +++ b/include/chopper/layout/execute.hpp @@ -13,12 +13,14 @@ #include #include +#include namespace chopper::layout { int execute(chopper::configuration & config, std::vector> const & filenames, - std::vector const & sketches); + std::vector const & sketches, + std::vector const & minHash_sketches); } // namespace chopper::layout diff --git a/include/chopper/lsh.hpp b/include/chopper/lsh.hpp new file mode 100644 index 00000000..caa83975 --- /dev/null +++ b/include/chopper/lsh.hpp @@ -0,0 +1,202 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides chopper::adjust_seed. + * \author Svenja Mehringer + */ + +#pragma once + +#include + +namespace chopper +{ + +/*\brief foo + */ +struct Cluster +{ +protected: + size_t representative_id{}; // representative id of the cluster; identifier; + + std::vector user_bins{}; // the user bins contained in thus cluster + + std::optional moved_id{std::nullopt}; // where this Clusters user bins where moved to + +public: + Cluster() = default; + Cluster(Cluster const &) = default; + Cluster(Cluster &&) = default; + Cluster & operator=(Cluster const &) = default; + Cluster & operator=(Cluster &&) = default; + ~Cluster() = default; + + Cluster(size_t const id, size_t const user_bins_id) : representative_id{id}, user_bins({user_bins_id}) + {} + + Cluster(size_t const id) : Cluster{id, id} + {} + + size_t id() const + { + return representative_id; + } + + std::vector const & contained_user_bins() const + { + return user_bins; + } + + bool has_been_moved() const + { + return moved_id.has_value(); + } + + bool empty() const + { + return user_bins.empty(); + } + + size_t size() const + { + return user_bins.size(); + } + + bool is_valid(size_t id) const + { + bool const ids_equal = representative_id == id; + bool const properly_moved = has_been_moved() && empty(); + bool const not_moved = !has_been_moved() && !empty(); + + return ids_equal && (properly_moved || not_moved); + } + + size_t moved_to_cluster_id() const + { + assert(moved_id.has_value()); + assert(is_valid(representative_id)); + return moved_id.value(); + } + + void move_to(Cluster & target_cluster) + { + target_cluster.user_bins.insert(target_cluster.user_bins.end(), this->user_bins.begin(), this->user_bins.end()); + this->user_bins.clear(); + moved_id = target_cluster.id(); + } + + void sort_by_cardinality(std::vector const & cardinalities) + { + std::ranges::sort(user_bins, + [&cardinalities](auto const & v1, auto const & v2) + { + return cardinalities[v1] > cardinalities[v2]; + }); + } +}; + +struct MultiCluster : Cluster +{ +protected: + std::vector> user_bins{}; // the user bins contained in this cluster + +public: + MultiCluster() = default; + MultiCluster(MultiCluster const &) = default; + MultiCluster(MultiCluster &&) = default; + MultiCluster & operator=(MultiCluster const &) = default; + MultiCluster & operator=(MultiCluster &&) = default; + ~MultiCluster() = default; + + MultiCluster(Cluster const & clust) + { + representative_id = clust.id(); + + if (clust.has_been_moved()) + moved_id = clust.moved_to_cluster_id(); + else + user_bins.push_back(clust.contained_user_bins()); + } + + std::vector> const & contained_user_bins() const + { + return user_bins; + } + + // needs to be defined again because of redefinition of `user_bins` + bool empty() const + { + return user_bins.empty(); + } + + // needs to be defined again because of redefinition of `user_bins` + size_t size() const + { + return user_bins.size(); + } + + bool is_valid(size_t id) const + { + bool const ids_equal = representative_id == id; + bool const properly_moved = has_been_moved() && empty(); + bool const not_moved = !has_been_moved() && !empty(); + + return ids_equal && (properly_moved || not_moved); + } + + void move_to(MultiCluster & target_cluster) + { + target_cluster.user_bins.insert(target_cluster.user_bins.end(), this->user_bins.begin(), this->user_bins.end()); + this->user_bins.clear(); + moved_id = target_cluster.id(); + } + + // sort user bins within a Cluster by cardinality and the clusters themselves by size + void sort_by_cardinality(std::vector const & cardinalities) + { + std::ranges::for_each(user_bins, + [&cardinalities](auto & user_bin_cluster) + { + std::ranges::sort(user_bin_cluster, + [&cardinalities](auto const & v1, auto const & v2) + { + return cardinalities[v1] > cardinalities[v2]; + }); + }); + + std::ranges::sort(user_bins, + [](auto const & c1, auto const & c2) + { + return c1.size() > c2.size(); + }); + } +}; + +// A valid cluster is one that hasn't been moved but actually contains user bins +// A valid cluster at position i is identified by the following equality: cluster[i].size() >= 1 && cluster[i][0] == i +// A moved cluster is one that has been joined and thereby moved to another cluster +// A moved cluster i is identified by the following: cluster[i].size() == 1 && cluster[i][0] != i +// returns position of the representative cluster +template +size_t LSH_find_representative_cluster(std::vector const & clusters, size_t current_id) +{ + std::reference_wrapper representative = clusters[current_id]; + + assert(representative.get().is_valid(current_id)); + + while (representative.get().has_been_moved()) + { + current_id = representative.get().moved_to_cluster_id(); + representative = clusters[current_id]; // replace by next cluster + assert(representative.get().is_valid(current_id)); + } + + return current_id; +} + +} // namespace chopper diff --git a/src/chopper_layout.cpp b/src/chopper_layout.cpp index 5fda4ddb..9df893a3 100644 --- a/src/chopper_layout.cpp +++ b/src/chopper_layout.cpp @@ -93,6 +93,7 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) std::vector> filenames{}; std::vector sketches{}; + std::vector minHash_sketches{}; if (input_is_a_sketch_file) { @@ -106,6 +107,7 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) filenames = std::move(sin.filenames); // No need to call check_filenames because the files are not read. sketches = std::move(sin.hll_sketches); + minHash_sketches = std::move(sin.minHash_sketches); validate_configuration(parser, config, sin.chopper_config); } else @@ -128,17 +130,18 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) if (!input_is_a_sketch_file) { config.compute_sketches_timer.start(); - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); config.compute_sketches_timer.stop(); } - exit_code |= chopper::layout::execute(config, filenames, sketches); + exit_code |= chopper::layout::execute(config, filenames, sketches, minHash_sketches); if (!config.disable_sketch_output) { chopper::sketch::sketch_file sout{.chopper_config = config, .filenames = std::move(filenames), - .hll_sketches = std::move(sketches)}; + .hll_sketches = std::move(sketches), + .minHash_sketches = std::move(minHash_sketches)}; std::ofstream os{config.sketch_directory, std::ios::binary}; cereal::BinaryOutputArchive oarchive{os}; oarchive(sout); @@ -151,11 +154,19 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser) output_stream << "sketching_in_seconds\t" << "layouting_in_seconds\t" << "union_estimation_in_seconds\t" - << "rearrangement_in_seconds\n"; + << "rearrangement_in_seconds\t" + << "lsh_in_seconds\t" + << "intital_partition_timer_in_seconds\t" + << "small_layouts_timer_in_seconds\t" + << "search_best_p_in_seconds\n"; output_stream << config.compute_sketches_timer.in_seconds() << '\t'; output_stream << config.dp_algorithm_timer.in_seconds() << '\t'; output_stream << config.union_estimation_timer.in_seconds() << '\t'; output_stream << config.rearrangement_timer.in_seconds() << '\t'; + output_stream << config.lsh_algorithm_timer.in_seconds() << '\t'; + output_stream << config.intital_partition_timer.in_seconds() << '\t'; + output_stream << config.small_layouts_timer.in_seconds() << '\t'; + output_stream << config.search_partition_algorithm_timer.in_seconds() << '\n'; } return exit_code; diff --git a/src/layout/CMakeLists.txt b/src/layout/CMakeLists.txt index 915c83d7..f2b16735 100644 --- a/src/layout/CMakeLists.txt +++ b/src/layout/CMakeLists.txt @@ -4,8 +4,14 @@ if (TARGET chopper::layout) return () endif () -add_library (chopper_layout STATIC determine_best_number_of_technical_bins.cpp execute.cpp hibf_statistics.cpp - ibf_query_cost.cpp input.cpp output.cpp +add_library (chopper_layout STATIC + determine_best_number_of_technical_bins.cpp + determine_split_bins.cpp + execute.cpp + hibf_statistics.cpp + ibf_query_cost.cpp + input.cpp + output.cpp ) target_link_libraries (chopper_layout PUBLIC chopper::shared) add_library (chopper::layout ALIAS chopper_layout) diff --git a/src/layout/determine_split_bins.cpp b/src/layout/determine_split_bins.cpp new file mode 100644 index 00000000..fb4ba653 --- /dev/null +++ b/src/layout/determine_split_bins.cpp @@ -0,0 +1,153 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include // for allocator, string +#include // for vector + +#include +#include + +#include +#include // for data_store +#include + +namespace chopper::layout +{ + +std::pair determine_split_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions) +{ + if (num_user_bins == 0) + return {0, 0}; + + assert(num_technical_bins > 0u); + assert(num_user_bins > 0u); + assert(num_user_bins <= num_technical_bins); + + auto const fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = num_technical_bins}); + + std::vector> matrix(num_technical_bins); // rows + for (auto & v : matrix) + v.resize(num_user_bins, std::numeric_limits::max()); // columns + + std::vector> trace(num_technical_bins); // rows + for (auto & v : trace) + v.resize(num_user_bins, std::numeric_limits::max()); // columns + + size_t const extra_bins = num_technical_bins - num_user_bins + 1; + + // initialize first column (first row is initialized with inf) + double const ub_cardinality = static_cast(cardinalities[positions[0]]); + for (size_t i = 0; i < extra_bins; ++i) + { + size_t const corrected_ub_cardinality = static_cast(ub_cardinality * fpr_correction[i + 1]); + matrix[i][0] = seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i + 1u); + } + + // we must iterate column wise + for (size_t j = 1; j < num_user_bins; ++j) + { + double const ub_cardinality = static_cast(cardinalities[positions[j]]); + + for (size_t i = j; i < j + extra_bins; ++i) + { + size_t minimum{std::numeric_limits::max()}; + + for (size_t i_prime = j - 1; i_prime < i; ++i_prime) + { + size_t const corrected_ub_cardinality = + static_cast(ub_cardinality * fpr_correction[(i - i_prime)]); + size_t score = std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), + matrix[i_prime][j - 1]); + + // std::cout << "j:" << j << " i:" << i << " i':" << i_prime << " score:" << score << std::endl; + + minimum = (score < minimum) ? (trace[i][j] = i_prime, score) : minimum; + } + + matrix[i][j] = minimum; + } + } + + // seqan::hibf::layout::print_matrix(matrix, num_technical_bins, num_user_bins, std::numeric_limits::max()); + //seqan::hibf::layout::print_matrix(trace, num_technical_bins, num_user_bins, std::numeric_limits::max()); + + // backtracking + // first, in the last column, find the row with minimum score (it can happen that the last rows are equally good) + // the less rows, the better + size_t trace_i{num_technical_bins - 1}; + size_t best_score{std::numeric_limits::max()}; + for (size_t best_i{0}; best_i < num_technical_bins; ++best_i) + { + if (matrix[best_i][num_user_bins - 1] < best_score) + { + best_score = matrix[best_i][num_user_bins - 1]; + trace_i = best_i; + } + } + size_t const number_of_split_tbs{trace_i + 1}; // trace_i is the position. So +1 for the number + + // now that we found the best trace_i start usual backtracking + size_t trace_j = num_user_bins - 1; + + // size_t max_id{}; + size_t max_size{}; + + size_t bin_id{}; + + while (trace_j > 0) + { + size_t next_i = trace[trace_i][trace_j]; + size_t const number_of_bins = (trace_i - next_i); + size_t const cardinality = cardinalities[positions[trace_j]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[number_of_bins]); + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, number_of_bins); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < number_of_bins; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[trace_j]); + ++bin_id; + } + + trace_i = trace[trace_i][trace_j]; + --trace_j; + } + ++trace_i; // because we want the length not the index. Now trace_i == number_of_bins + size_t const cardinality = cardinalities[positions[0]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[trace_i]); + // NOLINTNEXTLINE(clang-analyzer-core.DivideZero) + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, trace_i); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < trace_i; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[0]); + ++bin_id; + } + + return {number_of_split_tbs, max_size}; +} + +} // namespace chopper::layout diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 6630447e..e4931196 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -5,6 +5,7 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include #include #include #include @@ -12,6 +13,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -19,60 +23,1200 @@ #include #include +#include #include #include #include +#include #include +#include +#include +#include #include +#include // for compute_relaxed_fpr_correction #include +#include #include #include // for estimate_kmer_counts #include +#include namespace chopper::layout { +uint64_t lsh_hash_the_sketch(std::vector const & sketch, size_t const number_of_hashes_to_consider) +{ + return std::reduce(sketch.begin(), sketch.begin() + number_of_hashes_to_consider); +} + +template +auto LSH_fill_hashtable(std::vector const & clusters, + std::vector const & minHash_sketches, + size_t const current_sketch_index, + size_t const current_number_of_sketch_hashes) +{ + static constexpr bool is_multi_cluster = std::same_as; + static_assert(std::same_as || std::same_as); + + robin_hood::unordered_flat_map> table; + + [[maybe_unused]] size_t processed_user_bins{0}; // only for sanity check + + auto get_user_bin_idx = [](cluster_type const & cluster) + { + if constexpr (is_multi_cluster) + return std::views::join(cluster.contained_user_bins()); + else + return cluster.contained_user_bins(); + }; + + for (size_t pos = 0; pos < clusters.size(); ++pos) + { + auto const & current = clusters[pos]; + assert(current.is_valid(pos)); + + if (current.has_been_moved()) // cluster has been moved somewhere else, don't process + continue; + + for (size_t const user_bin_idx : get_user_bin_idx(current)) + { + ++processed_user_bins; + uint64_t const key = lsh_hash_the_sketch(minHash_sketches[user_bin_idx].table[current_sketch_index], + current_number_of_sketch_hashes); + table[key].push_back(current.id()); // insert representative for all user bins + } + } + assert(processed_user_bins == clusters.size()); // all user bins should've been processed by one of the clusters + + // uniquify list. Since I am inserting representative_idx's into the table, the same number can + // be inserted into multiple splots, and multiple times in the same slot. + for (auto & [key, list] : table) + { + std::ranges::sort(list); + auto const ret = std::ranges::unique(list); + list.erase(ret.begin(), ret.end()); + } + + return table; +} + +// minHash_sketches data structure: +// Vector L1 : number of user bins +// Vector L2 : number_of_max_minHash_sketches (LSH ADD+OR parameter b) +// Vector L3 : minHash_sketch_size (LSH ADD+OR parameter r) + +std::vector very_similar_LSH_partitioning(std::vector const & minHash_sketches, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + size_t const average_technical_bin_size, + chopper::configuration const & config) +{ + assert(!minHash_sketches.empty()); + assert(!minHash_sketches[0].table.empty()); + assert(!minHash_sketches[0].table[0].empty()); + + size_t const number_of_user_bins{positions.size()}; + assert(number_of_user_bins <= minHash_sketches.size()); + size_t const number_of_max_minHash_sketches{3}; // LSH ADD+OR parameter b + // size_t const minHash_sketch_size{minHash_sketches[0].table[0].size()}; // LSH ADD+OR parameter r + size_t const minHash_sketch_size{5}; // LSH ADD+OR parameter r + seqan::hibf::sketch::hyperloglog const empty_sketch{config.hibf_config.sketch_bits}; + // std::cout << "sketch size available: " << minHash_sketches[0].table[0].size() << std::endl; + // std::cout << "sketch size used here: " << minHash_sketch_size << std::endl; + // initialise clusters with a signle user bin per cluster. + // clusters are either + // 1) of size 1; containing an id != position where the id points to the cluster it has been moved to + // e.g. cluster[Y] = {Z} (Y has been moved into Z, so Z could look likes this cluster[Z] = {Z, Y}) + // 2) of size >= 1; with the first entry beging id == position (a valid cluster) + // e.g. cluster[X] = {X} // valid singleton + // e.g. cluster[X] = {X, a, b, c, ...} // valid cluster with more joined entries + // The clusters could me moved recursively, s.t. + // cluster[A] = {B} + // cluster[B] = {C} + // cluster[C] = {C, A, B} // is valid cluster since cluster[C][0] == C; contains A and B + std::vector clusters; + clusters.reserve(number_of_user_bins); + + std::vector current_cluster_cardinality(number_of_user_bins); + std::vector current_cluster_sketches(number_of_user_bins, empty_sketch); + size_t current_max_cluster_size{0}; + size_t current_number_of_sketch_hashes{minHash_sketch_size}; // start with high r but decrease it iteratively + size_t current_sketch_index{0}; + + for (size_t pos = 0; pos < number_of_user_bins; ++pos) + { + clusters.emplace_back(pos, positions[pos]); + current_cluster_cardinality[pos] = cardinalities[positions[pos]]; + current_cluster_sketches[pos] = sketches[positions[pos]]; + current_max_cluster_size = std::max(current_max_cluster_size, cardinalities[positions[pos]]); + } + + auto get_cluster = [&clusters](size_t const current) -> Cluster & + { + size_t const cluster_id = LSH_find_representative_cluster(clusters, current); + return clusters[cluster_id]; + }; + + // refine clusters + //std::cout << "Start clustering with threshold average_technical_bin_size: " << average_technical_bin_size << std::endl; + while (current_max_cluster_size < average_technical_bin_size + && /*number_of_clusters / static_cast(number_of_user_bins) > 0.5 &&*/ + current_sketch_index < number_of_max_minHash_sketches) // I want to cluster 10%? + { + // fill LSH collision hashtable + robin_hood::unordered_flat_map> table = + LSH_fill_hashtable(clusters, minHash_sketches, current_sketch_index, current_number_of_sketch_hashes); + + // read out LSH collision hashtable + // for each present key, if the list contains more than one cluster, we merge everything contained in the list + // into the first cluster, since those clusters collide and should be joined in the same bucket + for (auto & [key, list] : table) + { + assert(!list.empty()); + + if (list.size() <= 1) // nothing to do here + continue; + + // Now combine all clusters into the first. + + // 1) find the representative cluster to merge everything else into + // It can happen, that the representative has already been joined with another cluster + // e.g. + // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {11,13} // now I want to merge clusters[13] into clusters[11] but the latter has been moved + auto & representative_cluster = get_cluster(list[0]); + assert(representative_cluster.id() == clusters[representative_cluster.id()].id()); + + auto & representative_cluster_sketch = current_cluster_sketches[representative_cluster.id()]; + + for (size_t const current : std::views::drop(list, 1)) + { + // For every other entry in the list, it can happen that I already joined that list with another + // e.g. + // [key1] = {0,11} // then clusters[11] is merged into clusters[0] + // [key2] = {0, 2, 11} // now I want to do it again + auto & next_cluster = get_cluster(current); + + if (next_cluster.id() == representative_cluster.id()) // already joined + continue; + + next_cluster.move_to(representative_cluster); // otherwise join next_cluster into representative_cluster + assert(next_cluster.empty()); + assert(next_cluster.has_been_moved()); + assert(representative_cluster.size() > 1); // there should be at least two user bins now + + representative_cluster_sketch.merge(current_cluster_sketches[next_cluster.id()]); + } + + current_cluster_cardinality[representative_cluster.id()] = representative_cluster_sketch.estimate(); + } + current_max_cluster_size = std::ranges::max(current_cluster_cardinality); + + ++current_sketch_index; + } + + return clusters; +} + +void post_process_clusters(std::vector & clusters, + std::vector const & cardinalities, + chopper::configuration const & config) +{ + // clusters are done. Start post processing + // since post processing involves re-ordering the clusters, the moved_to_cluster_id value of a cluster will not + // refer to the position of the cluster in the `clusters` vecto anymore but the cluster with the resprive id() + // would neet to be found + for (size_t pos = 0; pos < clusters.size(); ++pos) + { + assert(clusters[pos].is_valid(pos)); + clusters[pos].sort_by_cardinality(cardinalities); + } + + // push largest p clusters to the front + std::ranges::partial_sort(clusters, + std::ranges::next(clusters.begin(), config.hibf_config.tmax, clusters.end()), + [&cardinalities](auto const & v1, auto const & v2) + { + // Note: If v2 is empty, so is v1. + if (v1.size() == v2.size() && !v2.empty()) + return cardinalities[v1.contained_user_bins().front()] + > cardinalities[v2.contained_user_bins().front()]; + + return v1.size() > v2.size(); + }); + + // after filling up the partitions with the biggest clusters, sort the clusters by cardinality of the biggest ub + // s.t. that euqally sizes ub are assigned after each other and the small stuff is added at last. + // the largest ub is already at the start because of former sorting. + std::ranges::sort(std::ranges::next(clusters.begin(), config.hibf_config.tmax, clusters.end()), + clusters.end(), + [&cardinalities](auto const & v1, auto const & v2) + { + if (v1.empty()) + return false; // v1 can never be larger than v2 then + + if (v2.empty()) // and v1 is not, since the first if would catch + return true; + + return cardinalities[v1.contained_user_bins().front()] + > cardinalities[v2.contained_user_bins().front()]; + }); + + assert(clusters.size() < 2 || clusters[0].size() >= clusters[1].size()); // sanity check + // assert(cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax)].contained_user_bins()[0]] >= cardinalities[clusters[std::min(clusters.size(), config.hibf_config.tmax) + 1].contained_user_bins()[0]]); // sanity check + + // debug + for (size_t cidx = 1; cidx < clusters.size(); ++cidx) + { + // once empty - always empty; all empty clusters should be at the end + if (clusters[cidx - 1].empty() && !clusters[cidx].empty()) + throw std::runtime_error{"sorting did not work"}; + } + // debug +} + +bool find_best_partition(chopper::configuration const & config, + size_t const number_of_partitions, + size_t & corrected_estimate_per_part, + std::vector const & cluster, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector> & positions, + std::vector & partition_sketches, + std::vector & max_partition_cardinality, + std::vector & min_partition_cardinality) +{ + seqan::hibf::sketch::hyperloglog const current_sketch = [&sketches, &cluster, &config]() + { + seqan::hibf::sketch::hyperloglog result{config.hibf_config.sketch_bits}; + + for (size_t const user_bin_idx : cluster) + result.merge(sketches[user_bin_idx]); + + return result; + }(); + + size_t const max_card = [&cardinalities, &cluster]() + { + size_t max{0}; + + for (size_t const user_bin_idx : cluster) + max = std::max(max, cardinalities[user_bin_idx]); + + return max; + }(); + + // TODO: afterwads check if I should again merge by how little the effective text ratio grows + + // search best partition fit by similarity + // similarity here is defined as: + // "whose (<-partition) effective text size is subsumed most by the current user bin" + // or in other words: + // "which partition has the largest intersection with user bin b compared to its own (partition) size." + // double best_subsume_ratio{0.0}; + size_t smallest_change{std::numeric_limits::max()}; + size_t best_p{0}; + bool best_p_found{false}; + + auto penalty_lower_level = [&](size_t const additional_number_of_user_bins, size_t const p) -> size_t + { + size_t const min = std::min(max_card, min_partition_cardinality[p]); + size_t const max = std::max(max_card, max_partition_cardinality[p]); + + if (positions[p].size() > config.hibf_config.tmax) // already a third level + { + // if there must already be another lower level because the current merged bin contains more than tmax + // user bins, then the current user bin is very likely stored multiple times. Therefore, the penalty is set + // to the cardinality of the current user bin times the number of levels, e.g. the number of times this user + // bin needs to be stored additionally + size_t const num_ubs_in_merged_bin{positions[p].size() + additional_number_of_user_bins}; + double const levels = std::log(num_ubs_in_merged_bin) / std::log(config.hibf_config.tmax); + return static_cast(max_card * levels); + } + else if (positions[p].size() + additional_number_of_user_bins > config.hibf_config.tmax) // now a third level + { + // if the current merged bin contains exactly tmax UBS, adding otherone must + // result in another lower level. Most likely, the smallest user bin will end up on the lower level + // therefore the penalty is set to 'min * tmax' + // of course, there could also be a third level with a lower number of user bins, but this is hard to + // estimate. + size_t const penalty = min * config.hibf_config.tmax; + return penalty; + } + else // positions[p].size() + additional_number_of_user_bins < tmax + { + // if the new user bin is smaller than all other already contained user bins + // the waste of space is high if stored in a single technical bin + if (max_card < min) + return (max - max_card); + // if the new user bin is bigger than all other already contained user bins, the IBF size increases + else if (max_card > max) + return (max_card - max) * config.hibf_config.tmax; + // else, end if-else-block and zero is returned + } + + return 0u; + }; + + for (size_t p = 0; p < number_of_partitions; ++p) + { + seqan::hibf::sketch::hyperloglog union_sketch = current_sketch; + union_sketch.merge(partition_sketches[p]); + size_t const union_estimate = union_sketch.estimate(); + size_t const current_partition_size = partition_sketches[p].estimate(); + + assert(union_estimate >= current_partition_size); + size_t const penalty_current_bin = union_estimate - current_partition_size; + size_t const penalty_current_ibf = + config.hibf_config.tmax + * ((union_estimate <= corrected_estimate_per_part) ? 0u : union_estimate - corrected_estimate_per_part); + size_t const change = penalty_current_bin + penalty_current_ibf + penalty_lower_level(cluster.size(), p); + + // size_t const intersection = current_sketch.estimate() - change; + // double const subsume_ratio = static_cast(intersection) / current_partition_size; + //std::cout << "p:" << p << " p-#UBs" << positions[p].size() << " penalty:" << penalty(cluster.size(), p) << " change:" << change << " union-current_p:" << (union_estimate - current_partition_size) << " union:" << union_estimate << " current_p:" << current_partition_size << " t:" << corrected_estimate_per_part << std::endl; + if (change == 0 || /* If there is no penalty at all, this is a best fit even if the partition is "full"*/ + (smallest_change > change /*&& subsume_ratio > best_subsume_ratio &&*/ + /*current_partition_size < corrected_estimate_per_part*/)) + { + //std::cout << "smaller!" << std::endl; + // best_subsume_ratio = subsume_ratio; + smallest_change = change; + best_p = p; + best_p_found = true; + } + } + + if (!best_p_found) + throw "currently there are no safety measures if a partition is not found because it is very unlikely"; + + //std::cout << "best_p:" << best_p << std::endl<< std::endl; + + // now that we know which partition fits best (`best_p`), add those indices to it + for (size_t const user_bin_idx : cluster) + { + positions[best_p].push_back(user_bin_idx); + max_partition_cardinality[best_p] = std::max(max_partition_cardinality[best_p], cardinalities[user_bin_idx]); + min_partition_cardinality[best_p] = std::min(min_partition_cardinality[best_p], cardinalities[user_bin_idx]); + } + partition_sketches[best_p].merge(current_sketch); + corrected_estimate_per_part = std::max(corrected_estimate_per_part, partition_sketches[best_p].estimate()); + + return true; +} + +size_t split_bins(chopper::configuration const & config, + std::vector const & sorted_positions, + std::vector const & cardinalities, + std::vector> & partitions, + size_t const sum_of_cardinalities, + size_t const end_idx) +{ + // assign split bins to the end. makes it easier to process merged bins afterwards + size_t pos{partitions.size() - 1}; + for (size_t idx = 0; idx < end_idx; ++idx) + { + size_t const ub_idx{sorted_positions[idx]}; + // Question: divide and ceil? + size_t const number_of_split_tbs = + std::max(1u, config.hibf_config.tmax * cardinalities[ub_idx] / sum_of_cardinalities); + std::cout << "number_of_split_tbs: " << number_of_split_tbs + << " cardinalities[ub_idx]:" << cardinalities[ub_idx] + << " sum_of_cardinalities:" << sum_of_cardinalities << std::endl; + // fill partitions from behind to ensure an easier layouting + for (size_t i = 0; i < number_of_split_tbs; ++i) + { + assert(pos > 0); + assert(partitions[pos].empty()); + partitions[pos].push_back(ub_idx); + --pos; + } + } + std::cout << "pos: " << pos << std::endl; + return pos + 1; +} + +std::pair find_bins_to_be_split(std::vector const & sorted_positions, + std::vector const & cardinalities, + size_t threshold, + size_t const max_bins) +{ + size_t idx{0}; + size_t sum{0}; + size_t number_of_split_bins = max_bins + 1; // initialise to something bigger to trigger while loop below + + auto find_idx_and_sum = [&]() + { + while (idx < sorted_positions.size() && cardinalities[sorted_positions[idx]] > threshold) + { + sum += cardinalities[sorted_positions[idx]]; + ++idx; + } + }; + + while (number_of_split_bins > max_bins) + { + idx = 0; + sum = 0; + find_idx_and_sum(); + number_of_split_bins = sum / threshold; + // max(1.01, ...) because a tie would end in an endless loop. + threshold = static_cast(threshold) * std::max(1.01, static_cast(sum) / (threshold * max_bins)); + } + + // Maybe there are more user bins (position of idx) than available split bins. + // Thus, clamp the number of user bins to split to maximally the number_of_split_bins + idx = std::min(idx, number_of_split_bins); + + return {idx, number_of_split_bins}; +} + +size_t lsh_sim_approach(chopper::configuration const & config, + std::vector const & sorted_positions2, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + std::vector> & partitions, + size_t const number_of_remaining_tbs, + size_t const technical_bin_size_threshold, + size_t const sum_of_cardinalities) +{ + uint8_t const sketch_bits{config.hibf_config.sketch_bits}; + //std::cout << "LSH partitioning into " << config.hibf_config.tmax << std::endl; + std::vector partition_sketches(number_of_remaining_tbs, + seqan::hibf::sketch::hyperloglog(sketch_bits)); + + std::vector max_partition_cardinality(number_of_remaining_tbs, 0u); + std::vector min_partition_cardinality(number_of_remaining_tbs, std::numeric_limits::max()); + + // initial partitioning using locality sensitive hashing (LSH) + config.lsh_algorithm_timer.start(); + std::vector clusters = very_similar_LSH_partitioning(minHash_sketches, + sorted_positions2, + cardinalities, + sketches, + technical_bin_size_threshold, + config); + post_process_clusters(clusters, cardinalities, config); + config.lsh_algorithm_timer.stop(); + + std::ofstream ofs{"/tmp/final.clusters"}; + for (size_t i = 0; i < clusters.size(); ++i) + { + seqan::hibf::sketch::hyperloglog sketch(config.hibf_config.sketch_bits); + for (size_t j = 0; j < clusters[i].size(); ++j) + { + sketch.merge(sketches[clusters[i].contained_user_bins()[j]]); + } + ofs << i << ":" << sketch.estimate() << ":" << clusters[i].size() << std::endl; + } + + std::vector> remaining_clusters{}; + + // initialise partitions with the first p largest clusters (post_processing sorts by size) + size_t cidx{0}; // current cluster index + for (size_t p = 0; p < number_of_remaining_tbs; ++p) + { + assert(!clusters[cidx].empty()); + auto const & cluster = clusters[cidx].contained_user_bins(); + bool split_cluster = false; + + if (cluster.size() > config.hibf_config.tmax) + { + size_t card{0}; + for (size_t uidx = 0; uidx < cluster.size(); ++uidx) + card += cardinalities[cluster[uidx]]; + + if (card > 0.05 * sum_of_cardinalities / config.hibf_config.tmax) + split_cluster = true; + } + + size_t end = (split_cluster) ? cluster.size() : std::min(cluster.size(), config.hibf_config.tmax); + for (size_t uidx = 0; uidx < end; ++uidx) + { + size_t const user_bin_idx = cluster[uidx]; + // if a single cluster already exceeds the cardinality_per_part, + // then the remaining user bins of the cluster must spill over into the next partition + if ((uidx != 0 && (uidx % config.hibf_config.tmax == 0)) + || partition_sketches[p].estimate() > technical_bin_size_threshold) + { + ++p; + + if (p >= number_of_remaining_tbs) + { + split_cluster = true; + end = uidx; + break; + } + } + + partition_sketches[p].merge(sketches[user_bin_idx]); + partitions[p].push_back(user_bin_idx); + max_partition_cardinality[p] = std::max(max_partition_cardinality[p], cardinalities[user_bin_idx]); + min_partition_cardinality[p] = std::min(min_partition_cardinality[p], cardinalities[user_bin_idx]); + } + + if (split_cluster) + { + std::vector remainder(cluster.begin() + end, cluster.end()); + remaining_clusters.insert(remaining_clusters.end(), remainder); + } + + ++cidx; + } + + for (size_t i = cidx; i < clusters.size(); ++i) + { + if (clusters[i].empty()) + break; + + remaining_clusters.insert(remaining_clusters.end(), clusters[i].contained_user_bins()); + } + + // assign the rest by similarity + size_t merged_threshold{technical_bin_size_threshold}; + for (size_t ridx = 0; ridx < remaining_clusters.size(); ++ridx) + { + auto const & cluster = remaining_clusters[ridx]; + + config.search_partition_algorithm_timer.start(); + find_best_partition(config, + number_of_remaining_tbs, + merged_threshold, + cluster, + cardinalities, + sketches, + partitions, + partition_sketches, + max_partition_cardinality, + min_partition_cardinality); + config.search_partition_algorithm_timer.start(); + } + + // compute actual max size + size_t max_size{0}; + for (auto const & sketch : partition_sketches) + max_size = std::max(max_size, (size_t)sketch.estimate()); + + return max_size; +} + +void partition_user_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + std::vector> & partitions) +{ + // all approaches need sorted positions + std::vector const sorted_positions = [&positions, &cardinalities]() + { + std::vector ps(positions.begin(), positions.end()); + seqan::hibf::sketch::toolbox::sort_by_cardinalities(cardinalities, ps); + return ps; + }(); + + auto const [sum_of_cardinalities, max_cardinality, joint_estimate] = [&]() + { + size_t sum{0}; + size_t max{0}; + seqan::hibf::sketch::hyperloglog sketch{config.hibf_config.sketch_bits}; + + for (size_t const pos : positions) + { + sum += cardinalities[pos]; + max = std::max(max, cardinalities[pos]); + sketch.merge(sketches[pos]); + } + + return std::tuple{sum, max, sketch.estimate()}; + }(); + + // If the effective text size is very low, it can happen that the joint_estimate divided by the number of partitions + // is lower than the largest single user bin. But of course, we can never reach a smaller max technical bin size + // then that of the largest user user bin. Thus we can correct the estimate_per_part beforehand. + // This way we make sure there is at least 1 LSH clustering step. + [[maybe_unused]] size_t const estimate_per_part = + std::max(seqan::hibf::divide_and_ceil(joint_estimate, config.hibf_config.tmax), max_cardinality + 1); + + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction( + {.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); + //std::cout << "sum_of_cardinalities:" << sum_of_cardinalities << " joint_estimate:" << joint_estimate << std::endl; + + size_t split_threshold{}; + size_t idx{0}; // start in sorted positions + size_t number_of_split_tbs{0}; + size_t number_of_merged_tbs{config.hibf_config.tmax}; + size_t max_split_size{0}; + size_t max_merged_size{0}; + + // split_threshold = seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.hibf_config.tmax); + split_threshold = seqan::hibf::divide_and_ceil(joint_estimate, config.hibf_config.tmax); + + auto parition_split_bins = [&]() + { + size_t number_of_potential_split_bins{0}; + std::tie(idx, number_of_potential_split_bins) = + find_bins_to_be_split(sorted_positions, cardinalities, split_threshold, config.hibf_config.tmax - 1); + std::tie(number_of_split_tbs, max_split_size) = + chopper::layout::determine_split_bins(config, + sorted_positions, + cardinalities, + number_of_potential_split_bins, + idx, + partitions); + number_of_merged_tbs = config.hibf_config.tmax - number_of_split_tbs; + }; + + auto partitions_merged_bins = [&]() + { + // determine number of split bins + std::vector const sorted_positions2(sorted_positions.begin() + idx, sorted_positions.end()); + + // distribute the rest to merged bins + size_t const corrected_max_split_size = max_split_size / relaxed_fpr_correction; + size_t const merged_threshold = std::max(corrected_max_split_size, split_threshold); + + max_merged_size = lsh_sim_approach(config, + sorted_positions2, + cardinalities, + sketches, + minHash_sketches, + partitions, + number_of_merged_tbs, + merged_threshold, + sum_of_cardinalities); + }; + + parition_split_bins(); + partitions_merged_bins(); + + int64_t const difference = + static_cast(max_merged_size * relaxed_fpr_correction) - static_cast(max_split_size); + + std::cout << "number_of_split_tbs:" << number_of_split_tbs << " difference:" << difference << std::endl; + std::cout << "Reconfiguring threshold. from:" << split_threshold; + + if (number_of_split_tbs == 0) + split_threshold = (split_threshold + max_merged_size) / 2; // increase threshold + else if (difference > 0) // need more merged bins -> increase threshold + split_threshold = + static_cast(split_threshold) + * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); + else // need more split bins -> decrease threshold + split_threshold = + static_cast(split_threshold) + * ((static_cast(max_merged_size) * relaxed_fpr_correction) / static_cast(max_split_size)); + + std::cout << " to:" << split_threshold << std::endl; + + // reset result + partitions.clear(); + partitions.resize(config.hibf_config.tmax); + idx = 0; + number_of_split_tbs = 0; + number_of_merged_tbs = config.hibf_config.tmax; + max_split_size = 0; + max_merged_size = 0; + + parition_split_bins(); + partitions_merged_bins(); + + // sanity check: + size_t sum{0}; + size_t last_index{positions.size()}; // non existing user bin idx + for (auto const & p : partitions) + { + if (p.size() == 1) + { + // for split bins, avoid additional counts + if (p[0] != last_index) + ++sum; + + last_index = p[0]; + } + else + { + sum += p.size(); + } + } + + if (sum != positions.size()) + { + std::string str{"Not all user bins have been assigned to the "}; + str += std::to_string(partitions.size()); + str += " partitions! ("; + str += std::to_string(sum); + str += "/"; + str += std::to_string(positions.size()); + str += ")\n"; + for (auto const & p : partitions) + { + str += "["; + for (auto const h : p) + { + str += std::to_string(h); + str += ","; + } + str.back() = ']'; + str += '\n'; + } + + throw std::logic_error{str}; + } + + // assert([&](){ bool x{false}; for (auto const & p : partitions) { x &= !p.empty(); }; return x; }); +} + +seqan::hibf::layout::layout general_layout(chopper::configuration const & config, + std::vector positions, + std::vector const & cardinalities, + std::vector const & sketches) +{ + seqan::hibf::layout::layout hibf_layout; + + seqan::hibf::concurrent_timer union_estimation_timer{}; + seqan::hibf::concurrent_timer rearrangement_timer{}; + seqan::hibf::concurrent_timer dp_algorithm_timer{}; + + dp_algorithm_timer.start(); + hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, + cardinalities, + sketches, + std::move(positions), + union_estimation_timer, + rearrangement_timer); + dp_algorithm_timer.stop(); + + return hibf_layout; +} + +bool do_I_need_a_fast_layout(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities) +{ + // the fast layout heuristic would greedily merge even if merging only 2 bins at a time + // merging only little number of bins is highly disadvantegous for lower levels because few bins + // will be heavily split and this will raise the fpr correction for split bins + // Thus, if the average number of user bins per technical bin is less then 64, we should not fast layout + if (positions.size() < (64 * config.hibf_config.tmax)) + return false; + + if (positions.size() > 500'000) // layout takes more than half a day (should this be a user option?) + return true; + + size_t largest_size{0}; + size_t sum_of_cardinalities{0}; + + for (size_t const i : positions) + { + sum_of_cardinalities += cardinalities[i]; + largest_size = std::max(largest_size, cardinalities[i]); + } + + size_t const cardinality_per_tb = sum_of_cardinalities / config.hibf_config.tmax; + + bool const largest_user_bin_might_be_split = largest_size > cardinality_per_tb; + + // if no splitting is needed, its worth it to use a fast-merge-only algorithm + if (!largest_user_bin_might_be_split) + return true; + + return false; +} + +void add_level_to_layout(chopper::configuration const & config, + seqan::hibf::layout::layout & hibf_layout, + std::vector> const & partitions, + std::vector const & sketches, + std::vector const & previous) +{ + size_t max_bin_id{0}; + size_t max_size{0}; + + auto const split_fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = partitions.size()}); + + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction( + {.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); + + // we assume here that the user bins have been sorted by user bin id such that pos = idx + for (size_t partition_idx{0}; partition_idx < partitions.size(); ++partition_idx) + { + auto const & partition = partitions[partition_idx]; + + if (partition.size() > 1) // merged bin + { + seqan::hibf::sketch::hyperloglog current_sketch{sketches[0]}; // ensure same bit size + current_sketch.reset(); + + for (size_t const user_bin_id : partition) + { + assert(hibf_layout.user_bins[user_bin_id].idx == user_bin_id); + auto & current_user_bin = hibf_layout.user_bins[user_bin_id]; + + // update + assert(previous == current_user_bin.previous_TB_indices); + current_user_bin.previous_TB_indices.push_back(partition_idx); + current_sketch.merge(sketches[user_bin_id]); + } + + // update max_bin_id, max_size + size_t const current_size = current_sketch.estimate() * relaxed_fpr_correction; + if (current_size > max_size) + { + max_bin_id = partition_idx; + max_size = current_size; + } + } + else if (partition.size() == 0) // should not happen.. dge case? + { + continue; + } + else // single or split bin (partition.size() == 1) + { + auto & current_user_bin = hibf_layout.user_bins[partitions[partition_idx][0]]; + assert(current_user_bin.idx == partitions[partition_idx][0]); + current_user_bin.storage_TB_id = partition_idx; + current_user_bin.number_of_technical_bins = 1; // initialise to 1 + + while (partition_idx + 1 < partitions.size() && partitions[partition_idx].size() == 1 + && partitions[partition_idx + 1].size() == 1 + && partitions[partition_idx][0] == partitions[partition_idx + 1][0]) + { + ++current_user_bin.number_of_technical_bins; + ++partition_idx; + } + + // update max_bin_id, max_size + size_t const current_size = sketches[current_user_bin.idx].estimate() + * split_fpr_correction[current_user_bin.number_of_technical_bins]; + if (current_size > max_size) + { + max_bin_id = current_user_bin.storage_TB_id; + max_size = current_size; + } + } + } + + hibf_layout.max_bins.emplace_back(previous, max_bin_id); // add lower level meta information +} + +void update_layout_from_child_layout(seqan::hibf::layout::layout & child_layout, + seqan::hibf::layout::layout & hibf_layout, + std::vector const & new_previous) +{ + hibf_layout.max_bins.emplace_back(new_previous, child_layout.top_level_max_bin_id); + + for (auto & max_bin : child_layout.max_bins) + { + max_bin.previous_TB_indices.insert(max_bin.previous_TB_indices.begin(), + new_previous.begin(), + new_previous.end()); + hibf_layout.max_bins.push_back(max_bin); + } + + for (auto const & user_bin : child_layout.user_bins) + { + auto & actual_user_bin = hibf_layout.user_bins[user_bin.idx]; + + actual_user_bin.previous_TB_indices.insert(actual_user_bin.previous_TB_indices.end(), + user_bin.previous_TB_indices.begin(), + user_bin.previous_TB_indices.end()); + actual_user_bin.number_of_technical_bins = user_bin.number_of_technical_bins; + actual_user_bin.storage_TB_id = user_bin.storage_TB_id; + } +} + +void fast_layout_recursion(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout, + std::vector const & previous) +{ + std::vector> tmax_partitions(config.hibf_config.tmax); + + // here we assume that we want to start with a fast layout + partition_user_bins(config, positions, cardinalities, sketches, minHash_sketches, tmax_partitions); + +#pragma omp critical + { + add_level_to_layout(config, hibf_layout, tmax_partitions, sketches, previous); + } + + for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) + { + auto const & partition = tmax_partitions[partition_idx]; + auto const new_previous = [&]() + { + auto cpy{previous}; + cpy.push_back(partition_idx); + return cpy; + }(); + + if (partition.empty() || partition.size() == 1) // nothing to merge + continue; + + if (do_I_need_a_fast_layout(config, partition, cardinalities)) + { + fast_layout_recursion(config, + partition, + cardinalities, + sketches, + minHash_sketches, + hibf_layout, + new_previous); // recurse fast_layout + } + else + { + auto child_layout = general_layout(config, partition, cardinalities, sketches); + +#pragma omp critical + { + update_layout_from_child_layout(child_layout, hibf_layout, new_previous); + } + } + } +} + +void fast_layout(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout) +{ + auto config_copy = config; + + std::vector> tmax_partitions(config.hibf_config.tmax); + + // here we assume that we want to start with a fast layout + config.intital_partition_timer.start(); + partition_user_bins(config_copy, positions, cardinalities, sketches, minHash_sketches, tmax_partitions); + config.intital_partition_timer.stop(); + + auto const split_fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = config.hibf_config.tmax}); + + double const relaxed_fpr_correction = seqan::hibf::layout::compute_relaxed_fpr_correction( + {.fpr = config.hibf_config.maximum_fpr, // + .relaxed_fpr = config.hibf_config.relaxed_fpr, + .hash_count = config.hibf_config.number_of_hash_functions}); + + size_t max_bin_id{0}; + size_t max_size{0}; + hibf_layout.user_bins.resize(config_copy.hibf_config.number_of_user_bins); + + // initialise user bins in layout + for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) + { + if (tmax_partitions[partition_idx].size() > 1) // merged bin + { + seqan::hibf::sketch::hyperloglog current_sketch{sketches[0]}; // ensure same bit size + current_sketch.reset(); + + for (size_t const user_bin_id : tmax_partitions[partition_idx]) + { + hibf_layout.user_bins[user_bin_id] = {.previous_TB_indices = {partition_idx}, + .storage_TB_id = 0 /*not determiend yet*/, + .number_of_technical_bins = 1 /*not determiend yet*/, + .idx = user_bin_id}; + current_sketch.merge(sketches[user_bin_id]); + } + + // update max_bin_id, max_size + size_t const current_size = current_sketch.estimate() * relaxed_fpr_correction; + if (current_size > max_size) + { + max_bin_id = partition_idx; + max_size = current_size; + } + } + else if (tmax_partitions[partition_idx].size() == 0) // should not happen. Edge case? + { + continue; + } + else // single or split bin (tmax_partitions[partition_idx].size() == 1) + { + assert(tmax_partitions[partition_idx].size() == 1); + size_t const user_bin_id = tmax_partitions[partition_idx][0]; + hibf_layout.user_bins[user_bin_id] = {.previous_TB_indices = {}, + .storage_TB_id = partition_idx, + .number_of_technical_bins = 1 /*determiend below*/, + .idx = user_bin_id}; + + while (partition_idx + 1 < tmax_partitions.size() && tmax_partitions[partition_idx].size() == 1 + && tmax_partitions[partition_idx + 1].size() == 1 + && tmax_partitions[partition_idx][0] == tmax_partitions[partition_idx + 1][0]) + { + ++hibf_layout.user_bins[user_bin_id].number_of_technical_bins; + ++partition_idx; + } + + // update max_bin_id, max_size + size_t const current_size = + sketches[user_bin_id].estimate() + * split_fpr_correction[hibf_layout.user_bins[user_bin_id].number_of_technical_bins]; + if (current_size > max_size) + { + max_bin_id = hibf_layout.user_bins[user_bin_id].storage_TB_id; + max_size = current_size; + } + } + } + + hibf_layout.top_level_max_bin_id = max_bin_id; + + config.small_layouts_timer.start(); +#pragma omp parallel +#pragma omp single + { +#pragma omp taskloop + for (size_t partition_idx = 0; partition_idx < tmax_partitions.size(); ++partition_idx) + { + auto const & partition = tmax_partitions[partition_idx]; + + if (partition.empty() || partition.size() == 1) // nothing to merge + continue; + + if (do_I_need_a_fast_layout(config_copy, partition, cardinalities)) + { + fast_layout_recursion(config_copy, + partition, + cardinalities, + sketches, + minHash_sketches, + hibf_layout, + {partition_idx}); // recurse fast_layout + } + else + { + auto small_layout = general_layout(config, partition, cardinalities, sketches); + +#pragma omp critical + { + update_layout_from_child_layout(small_layout, hibf_layout, std::vector{partition_idx}); + } + } + } + } + config.small_layouts_timer.stop(); +} + int execute(chopper::configuration & config, std::vector> const & filenames, - std::vector const & sketches) + std::vector const & sketches, + std::vector const & minHash_sketches) { config.hibf_config.validate_and_set_defaults(); - seqan::hibf::layout::layout hibf_layout; - std::vector kmer_counts; - seqan::hibf::sketch::estimate_kmer_counts(sketches, kmer_counts); + std::vector cardinalities; + seqan::hibf::sketch::estimate_kmer_counts(sketches, cardinalities); - if (config.determine_best_tmax) + if (true) // 0 == unset == single HIBF, 1 == single HIBF { - hibf_layout = determine_best_number_of_technical_bins(config, kmer_counts, sketches); + seqan::hibf::layout::layout hibf_layout; + + if (config.determine_best_tmax) + { + hibf_layout = determine_best_number_of_technical_bins(config, cardinalities, sketches); + } + else + { + config.dp_algorithm_timer.start(); + fast_layout(config, + seqan::hibf::iota_vector(sketches.size()), + cardinalities, + sketches, + minHash_sketches, + hibf_layout); + config.dp_algorithm_timer.stop(); + + // hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, + // cardinalities, + // sketches, + // seqan::hibf::iota_vector(sketches.size()), + // config.union_estimation_timer, + // config.rearrangement_timer); + + // sort records ascending by the number of bin indices (corresponds to the IBF levels) + // GCOVR_EXCL_START + std::ranges::sort(hibf_layout.max_bins, + [](auto const & r, auto const & l) + { + if (r.previous_TB_indices.size() == l.previous_TB_indices.size()) + return std::ranges::lexicographical_compare(r.previous_TB_indices, + l.previous_TB_indices); + else + return r.previous_TB_indices.size() < l.previous_TB_indices.size(); + }); + // GCOVR_EXCL_STOP + + if (config.output_verbose_statistics) + { + size_t dummy{}; + chopper::layout::hibf_statistics global_stats{config, sketches, cardinalities}; + global_stats.hibf_layout = hibf_layout; + global_stats.print_header_to(std::cout); + global_stats.print_summary_to(dummy, std::cout); + } + } + + // brief Write the output to the layout file. + std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + hibf_layout.write_to(fout); } else { - config.dp_algorithm_timer.start(); - hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, - kmer_counts, - sketches, - seqan::hibf::iota_vector(sketches.size()), - config.union_estimation_timer, - config.rearrangement_timer); - config.dp_algorithm_timer.stop(); + std::vector> positions(config.hibf_config.tmax); // asign positions for each partition + + std::vector ps; + ps.resize(cardinalities.size()); + std::iota(ps.begin(), ps.end(), 0); + partition_user_bins(config, ps, cardinalities, sketches, minHash_sketches, positions); + + std::vector hibf_layouts(config.hibf_config.tmax); // multiple layouts - if (config.output_verbose_statistics) +#pragma omp parallel for schedule(dynamic) num_threads(config.hibf_config.threads) + for (size_t i = 0; i < config.hibf_config.tmax; ++i) { - size_t dummy{}; - chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts}; - global_stats.hibf_layout = hibf_layout; - global_stats.print_header_to(std::cout); - global_stats.print_summary_to(dummy, std::cout); + // reset tmax to fit number of user bins in layout + auto local_hibf_config = config.hibf_config; // every thread needs to set individual tmax + local_hibf_config.tmax = + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(positions[i].size())))); + + config.dp_algorithm_timer.start(); + hibf_layouts[i] = seqan::hibf::layout::compute_layout(local_hibf_config, + cardinalities, + sketches, + std::move(positions[i]), + config.union_estimation_timer, + config.rearrangement_timer); + config.dp_algorithm_timer.stop(); } - } - // brief Write the output to the layout file. - std::ofstream fout{config.output_filename}; - chopper::layout::write_user_bins_to(filenames, fout); - config.write_to(fout); - hibf_layout.write_to(fout); + // brief Write the output to the layout file. + std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + + for (size_t i = 0; i < config.hibf_config.tmax; ++i) + hibf_layouts[i].write_to(fout); + } return 0; } diff --git a/src/layout/hibf_statistics.cpp b/src/layout/hibf_statistics.cpp index df86775c..d764b394 100644 --- a/src/layout/hibf_statistics.cpp +++ b/src/layout/hibf_statistics.cpp @@ -398,7 +398,9 @@ void hibf_statistics::collect_bins() # pragma GCC diagnostic ignored "-Warray-bounds=" # pragma GCC diagnostic ignored "-Wstringop-overflow=" #endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV - ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); + + // ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); + #if CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV # pragma GCC diagnostic pop #endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMMOV diff --git a/test/api/CMakeLists.txt b/test/api/CMakeLists.txt index 050affcf..3c46f8a4 100644 --- a/test/api/CMakeLists.txt +++ b/test/api/CMakeLists.txt @@ -9,5 +9,6 @@ include (add_subdirectories) add_api_test (config_test.cpp) add_api_test (input_functor_test.cpp) +add_api_test (lsh_test.cpp) add_subdirectories () diff --git a/test/api/layout/CMakeLists.txt b/test/api/layout/CMakeLists.txt index 9cdb4076..45f5b1ce 100644 --- a/test/api/layout/CMakeLists.txt +++ b/test/api/layout/CMakeLists.txt @@ -9,6 +9,7 @@ cmake_minimum_required (VERSION 3.18) add_api_test (ibf_query_cost_test.cpp) add_api_test (execute_layout_test.cpp) +add_api_test (fast_layout_test.cpp) add_api_test (execute_with_estimation_test.cpp) target_use_datasources (execute_with_estimation_test FILES seq1.fa) @@ -26,3 +27,4 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" endif () add_api_test (user_bin_io_test.cpp) add_api_test (input_test.cpp) +add_api_test (determine_split_bins_test.cpp) diff --git a/test/api/layout/determine_split_bins_test.cpp b/test/api/layout/determine_split_bins_test.cpp new file mode 100644 index 00000000..4b0c49d2 --- /dev/null +++ b/test/api/layout/determine_split_bins_test.cpp @@ -0,0 +1,193 @@ +#include // for Test, TestInfo, EXPECT_EQ, Message, TEST, TestPartResult + +#include // for operator<<, char_traits, basic_ostream, basic_stringstream, strings... +#include // for allocator, string +#include // for vector + +#include + +#include +#include // for data_store +#include + +std::pair determine_split_bins(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + size_t const num_technical_bins, + size_t const num_user_bins, + std::vector> & partitions) +{ + if (num_user_bins == 0) + return {0, 0}; + + assert(num_technical_bins > 0u); + assert(num_user_bins > 0u); + assert(num_user_bins <= num_technical_bins); + + auto const fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = config.hibf_config.maximum_fpr, // + .hash_count = config.hibf_config.number_of_hash_functions, + .t_max = num_technical_bins}); + + std::vector> matrix(num_technical_bins); // rows + for (auto & v : matrix) + v.resize(num_user_bins, std::numeric_limits::max()); // columns + + std::vector> trace(num_technical_bins); // rows + for (auto & v : trace) + v.resize(num_user_bins, std::numeric_limits::max()); // columns + + size_t const extra_bins = num_technical_bins - num_user_bins + 1; + + // initialize first column (first row is initialized with inf) + double const ub_cardinality = static_cast(cardinalities[positions[0]]); + for (size_t i = 0; i < extra_bins; ++i) + { + size_t const corrected_ub_cardinality = static_cast(ub_cardinality * fpr_correction[i + 1]); + matrix[i][0] = seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i + 1u); + } + + // we must iterate column wise + for (size_t j = 1; j < num_user_bins; ++j) + { + double const ub_cardinality = static_cast(cardinalities[positions[j]]); + + for (size_t i = j; i < j + extra_bins; ++i) + { + size_t minimum{std::numeric_limits::max()}; + + for (size_t i_prime = j - 1; i_prime < i; ++i_prime) + { + size_t const corrected_ub_cardinality = + static_cast(ub_cardinality * fpr_correction[(i - i_prime)]); + size_t score = std::max(seqan::hibf::divide_and_ceil(corrected_ub_cardinality, i - i_prime), + matrix[i_prime][j - 1]); + + // std::cout << "j:" << j << " i:" << i << " i':" << i_prime << " score:" << score << std::endl; + + minimum = (score < minimum) ? (trace[i][j] = i_prime, score) : minimum; + } + + matrix[i][j] = minimum; + } + } + + // seqan::hibf::layout::print_matrix(matrix, num_technical_bins, num_user_bins, std::numeric_limits::max()); + //seqan::hibf::layout::print_matrix(trace, num_technical_bins, num_user_bins, std::numeric_limits::max()); + + // backtracking + // first, in the last column, find the row with minimum score (it can happen that the last rows are equally good) + // the less rows, the better + size_t trace_i{num_technical_bins - 1}; + size_t best_score{std::numeric_limits::max()}; + for (size_t best_i{0}; best_i < num_technical_bins; ++best_i) + { + if (matrix[best_i][num_user_bins - 1] < best_score) + { + best_score = matrix[best_i][num_user_bins - 1]; + trace_i = best_i; + } + } + size_t const number_of_split_tbs{trace_i + 1}; // trace_i is the position. So +1 for the number + + // now that we found the best trace_i start usual backtracking + size_t trace_j = num_user_bins - 1; + + // size_t max_id{}; + size_t max_size{}; + + size_t bin_id{}; + + while (trace_j > 0) + { + size_t next_i = trace[trace_i][trace_j]; + size_t const number_of_bins = (trace_i - next_i); + size_t const cardinality = cardinalities[positions[trace_j]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[number_of_bins]); + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, number_of_bins); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < number_of_bins; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[trace_j]); + ++bin_id; + } + + trace_i = trace[trace_i][trace_j]; + --trace_j; + } + ++trace_i; // because we want the length not the index. Now trace_i == number_of_bins + size_t const cardinality = cardinalities[positions[0]]; + size_t const corrected_cardinality = static_cast(cardinality * fpr_correction[trace_i]); + // NOLINTNEXTLINE(clang-analyzer-core.DivideZero) + size_t const cardinality_per_bin = seqan::hibf::divide_and_ceil(corrected_cardinality, trace_i); + + if (cardinality_per_bin > max_size) + { + // max_id = bin_id; + max_size = cardinality_per_bin; + } + + for (size_t splits{0}; splits < trace_i; ++splits) + { + partitions[partitions.size() - 1 - bin_id].push_back(positions[0]); + ++bin_id; + } + + return {number_of_split_tbs, max_size}; +} + +TEST(simple_split, first) +{ + chopper::configuration config{}; + + std::vector cardinalities{}; + for (size_t i{0}; i < 20; ++i) + cardinalities.push_back(2400); + cardinalities.push_back(200); + for (size_t i{0}; i < 84; ++i) + cardinalities.push_back(50); + + std::vector positions(cardinalities.size()); + std::iota(positions.begin(), positions.end(), 0u); + + size_t num_technical_bins{63}; + size_t num_user_bins{21}; + + std::vector> partitions(64); + + auto const [num_splits, max_size] = + determine_split_bins(config, positions, cardinalities, num_technical_bins, num_user_bins, partitions); + + EXPECT_EQ(num_splits, 61); + EXPECT_EQ(max_size, 1452); + + size_t ub_idx{20}; + size_t tb_idx{partitions.size() - 1}; + + // TBs: 0, 1, 2, 3, ... + // UBs: 20,19,19,19,18,18,18,17,17,17,16,16,16,15,15,15,14,14,14,13,13,13,12,12,12,11,11,11,10,10,10,9,9,9,8,8,8,7,7,7,6,6,6,5,5,5,4,4,4,3,3,3,2,2,2,1,1,1,0,0,0, + + EXPECT_EQ(partitions[tb_idx][0], ub_idx); // single bin with card = 200 + --tb_idx; + --ub_idx; + + for (size_t i{partitions.size() - 1}; i > partitions.size() - num_splits - 1; --i) + std::cerr << partitions[i][0] << ","; + + for (; tb_idx > partitions.size() - num_splits - 1; --tb_idx) + { + for (size_t three = 0; three < 3; ++three) + { + EXPECT_EQ(partitions[tb_idx][0], ub_idx); + --tb_idx; + } + ++tb_idx; // one too much in loop before + --ub_idx; + } +} \ No newline at end of file diff --git a/test/api/layout/execute_layout_test.cpp b/test/api/layout/execute_layout_test.cpp index 5449ec12..d77a22f7 100644 --- a/test/api/layout/execute_layout_test.cpp +++ b/test/api/layout/execute_layout_test.cpp @@ -46,9 +46,10 @@ TEST(execute_test, few_ubs) filenames{{"seq0a", "seq0b"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); std::string const expected_file{"@CHOPPER_USER_BINS\n" "@0 seq0a seq0b\n" @@ -142,9 +143,10 @@ TEST(execute_test, set_default_tmax) filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); EXPECT_EQ(config.hibf_config.tmax, 64u); } @@ -178,9 +180,10 @@ TEST(execute_test, many_ubs) config.hibf_config.disable_estimate_union = true; // also disables rearrangement std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); std::string const expected_file{"@CHOPPER_USER_BINS\n" "@0 seq0\n" diff --git a/test/api/layout/execute_with_estimation_test.cpp b/test/api/layout/execute_with_estimation_test.cpp index c33d0243..9951befe 100644 --- a/test/api/layout/execute_with_estimation_test.cpp +++ b/test/api/layout/execute_with_estimation_test.cpp @@ -53,9 +53,10 @@ TEST(execute_estimation_test, few_ubs) filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); @@ -114,9 +115,10 @@ TEST(execute_estimation_test, many_ubs) config.hibf_config.disable_estimate_union = true; // also disables rearrangement std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); @@ -421,9 +423,10 @@ TEST(execute_estimation_test, many_ubs_force_all) config.hibf_config.disable_estimate_union = true; // also disables rearrangement std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); @@ -523,9 +526,10 @@ TEST(execute_estimation_test, with_rearrangement) config.hibf_config.number_of_user_bins = filenames.size(); std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); ASSERT_TRUE(std::filesystem::exists(stats_file)); diff --git a/test/api/layout/fast_layout_test.cpp b/test/api/layout/fast_layout_test.cpp new file mode 100644 index 00000000..9f175672 --- /dev/null +++ b/test/api/layout/fast_layout_test.cpp @@ -0,0 +1,422 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "../api_test.hpp" +#include "print_debug_file.hpp" + +TEST(execute_test, many_ubs) +{ + seqan3::test::tmp_directory tmp_dir{}; + std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"}; + + std::vector> many_filenames; + + for (size_t i{0}; i < 96u; ++i) + many_filenames.push_back({seqan3::detail::to_string("seq", i)}); + + // Creates sizes of the following series + // [100,101,...,120,222,223,...,241,343,344,...362,464,465,...,483,585,586,...,600] + // See also https://godbolt.org/z/9517eaaaG + auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it) + { + size_t const desired_kmer_count = 1001 * ((num + 20) / 20) + num; + for (auto hash : std::views::iota(num * 600, num * 600 + desired_kmer_count)) + it = hash; + }; + + chopper::configuration config{}; + config.output_filename = layout_file; + config.disable_sketch_output = true; + config.hibf_config.tmax = 64; + config.hibf_config.input_fn = simulated_input; + config.hibf_config.number_of_user_bins = many_filenames.size(); + config.hibf_config.disable_estimate_union = true; // also disables rearrangement + + std::vector sketches; + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); + + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); + + std::string const expected_file{"@CHOPPER_USER_BINS\n" + "@0 seq0\n" + "@1 seq1\n" + "@2 seq2\n" + "@3 seq3\n" + "@4 seq4\n" + "@5 seq5\n" + "@6 seq6\n" + "@7 seq7\n" + "@8 seq8\n" + "@9 seq9\n" + "@10 seq10\n" + "@11 seq11\n" + "@12 seq12\n" + "@13 seq13\n" + "@14 seq14\n" + "@15 seq15\n" + "@16 seq16\n" + "@17 seq17\n" + "@18 seq18\n" + "@19 seq19\n" + "@20 seq20\n" + "@21 seq21\n" + "@22 seq22\n" + "@23 seq23\n" + "@24 seq24\n" + "@25 seq25\n" + "@26 seq26\n" + "@27 seq27\n" + "@28 seq28\n" + "@29 seq29\n" + "@30 seq30\n" + "@31 seq31\n" + "@32 seq32\n" + "@33 seq33\n" + "@34 seq34\n" + "@35 seq35\n" + "@36 seq36\n" + "@37 seq37\n" + "@38 seq38\n" + "@39 seq39\n" + "@40 seq40\n" + "@41 seq41\n" + "@42 seq42\n" + "@43 seq43\n" + "@44 seq44\n" + "@45 seq45\n" + "@46 seq46\n" + "@47 seq47\n" + "@48 seq48\n" + "@49 seq49\n" + "@50 seq50\n" + "@51 seq51\n" + "@52 seq52\n" + "@53 seq53\n" + "@54 seq54\n" + "@55 seq55\n" + "@56 seq56\n" + "@57 seq57\n" + "@58 seq58\n" + "@59 seq59\n" + "@60 seq60\n" + "@61 seq61\n" + "@62 seq62\n" + "@63 seq63\n" + "@64 seq64\n" + "@65 seq65\n" + "@66 seq66\n" + "@67 seq67\n" + "@68 seq68\n" + "@69 seq69\n" + "@70 seq70\n" + "@71 seq71\n" + "@72 seq72\n" + "@73 seq73\n" + "@74 seq74\n" + "@75 seq75\n" + "@76 seq76\n" + "@77 seq77\n" + "@78 seq78\n" + "@79 seq79\n" + "@80 seq80\n" + "@81 seq81\n" + "@82 seq82\n" + "@83 seq83\n" + "@84 seq84\n" + "@85 seq85\n" + "@86 seq86\n" + "@87 seq87\n" + "@88 seq88\n" + "@89 seq89\n" + "@90 seq90\n" + "@91 seq91\n" + "@92 seq92\n" + "@93 seq93\n" + "@94 seq94\n" + "@95 seq95\n" + "@CHOPPER_USER_BINS_END\n" + "@CHOPPER_CONFIG\n" + "@{\n" + "@ \"chopper_config\": {\n" + "@ \"version\": 2,\n" + "@ \"data_file\": {\n" + "@ \"value0\": \"\"\n" + "@ },\n" + "@ \"debug\": false,\n" + "@ \"sketch_directory\": {\n" + "@ \"value0\": \"\"\n" + "@ },\n" + "@ \"k\": 19,\n" + "@ \"window_size\": 19,\n" + "@ \"disable_sketch_output\": true,\n" + "@ \"precomputed_files\": false,\n" + "@ \"output_filename\": {\n" + "@ \"value0\": \"" + + layout_file.string() + + "\"\n" + "@ },\n" + "@ \"determine_best_tmax\": false,\n" + "@ \"force_all_binnings\": false\n" + "@ }\n" + "@}\n" + "@CHOPPER_CONFIG_END\n" + "@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 2,\n" + "@ \"number_of_user_bins\": 96,\n" + "@ \"number_of_hash_functions\": 2,\n" + "@ \"maximum_fpr\": 0.05,\n" + "@ \"relaxed_fpr\": 0.3,\n" + "@ \"threads\": 1,\n" + "@ \"sketch_bits\": 12,\n" + "@ \"tmax\": 64,\n" + "@ \"empty_bin_fraction\": 0.0,\n" + "@ \"alpha\": 1.2,\n" + "@ \"max_rearrangement_ratio\": 0.5,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": true\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n" +#ifdef _LIBCPP_VERSION + "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" + "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" + "#LOWER_LEVEL_IBF_3 fullest_technical_bin_idx:6\n" + "#LOWER_LEVEL_IBF_4 fullest_technical_bin_idx:15\n" + "#LOWER_LEVEL_IBF_5 fullest_technical_bin_idx:0\n" + "#LOWER_LEVEL_IBF_7 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_8 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_10 fullest_technical_bin_idx:35\n" + "#LOWER_LEVEL_IBF_13 fullest_technical_bin_idx:7\n" + "#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS\n" + "0 5;0 1;2\n" + "1 10;0 1;2\n" + "2 3;0 1;2\n" + "3 13;0 1;2\n" + "4 13;2 1;2\n" + "5 7;0 1;2\n" + "6 10;2 1;2\n" + "7 10;4 1;2\n" + "8 3;2 1;2\n" + "9 3;4 1;2\n" + "10 4;0 1;2\n" + "11 7;2 1;2\n" + "12 2;0 1;2\n" + "13 2;2 1;2\n" + "14 2;4 1;2\n" + "15 4;2 1;2\n" + "16 13;4 1;3\n" + "17 7;4 1;2\n" + "18 5;2 1;3\n" + "19 10;6 1;2\n" + "20 8;0 1;12\n" + "21 8;12 1;12\n" + "22 7;6 1;9\n" + "23 7;15 1;9\n" + "24 10;8 1;9\n" + "25 10;17 1;9\n" + "26 4;4 1;11\n" + "27 4;15 1;11\n" + "28 1;0 1;12\n" + "29 1;12 1;12\n" + "30 2;6 1;11\n" + "31 2;17 1;11\n" + "32 13;7 1;13\n" + "33 7;24 1;9\n" + "34 5;5 1;14\n" + "35 10;26 1;9\n" + "36 3;6 1;13\n" + "37 22 1\n" + "38 21 1\n" + "39 20 1\n" + "40 15 1\n" + "41 18 1\n" + "42 19 1\n" + "43 12 1\n" + "44 17 1\n" + "45 16 1\n" + "46 9 1\n" + "47 14 1\n" + "48 11 1\n" + "49 8;24 1;40\n" + "50 13;20 1;44\n" + "51 7;33 1;31\n" + "52 5;19 1;45\n" + "53 10;35 1;29\n" + "54 3;19 1;45\n" + "55 4;26 1;38\n" + "56 6 1\n" + "57 1;24 1;40\n" + "58 0 1\n" + "59 2;28 1;36\n" + "60 63 1\n" + "61 62 1\n" + "62 60 1\n" + "63 61 1\n" + "64 59 1\n" + "65 58 1\n" + "66 57 1\n" + "67 55 1\n" + "68 56 1\n" + "69 54 1\n" + "70 53 1\n" + "71 52 1\n" + "72 51 1\n" + "73 49 1\n" + "74 48 1\n" + "75 50 1\n" + "76 45 1\n" + "77 47 1\n" + "78 46 1\n" + "79 44 1\n" + "80 43 1\n" + "81 42 1\n" + "82 41 1\n" + "83 40 1\n" + "84 39 1\n" + "85 37 1\n" + "86 38 1\n" + "87 36 1\n" + "88 35 1\n" + "89 34 1\n" + "90 29 2\n" + "91 31 2\n" + "92 27 2\n" + "93 23 2\n" + "94 33 1\n" + "95 25 2\n" +#else + "#TOP_LEVEL_IBF fullest_technical_bin_idx:23\n" + "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:28\n" + "#LOWER_LEVEL_IBF_3 fullest_technical_bin_idx:6\n" + "#LOWER_LEVEL_IBF_4 fullest_technical_bin_idx:15\n" + "#LOWER_LEVEL_IBF_5 fullest_technical_bin_idx:0\n" + "#LOWER_LEVEL_IBF_6 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_8 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_9 fullest_technical_bin_idx:35\n" + "#LOWER_LEVEL_IBF_12 fullest_technical_bin_idx:7\n" + "#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS\n" + "0 5;0 1;2\n" + "1 9;0 1;2\n" + "2 3;0 1;2\n" + "3 12;0 1;2\n" + "4 12;2 1;2\n" + "5 6;0 1;2\n" + "6 9;2 1;2\n" + "7 9;4 1;2\n" + "8 3;2 1;2\n" + "9 3;4 1;2\n" + "10 4;0 1;2\n" + "11 6;2 1;2\n" + "12 2;0 1;2\n" + "13 2;2 1;2\n" + "14 2;4 1;2\n" + "15 4;2 1;2\n" + "16 12;4 1;3\n" + "17 6;4 1;2\n" + "18 5;2 1;3\n" + "19 9;6 1;2\n" + "20 8;0 1;12\n" + "21 8;12 1;12\n" + "22 6;6 1;9\n" + "23 6;15 1;9\n" + "24 9;8 1;9\n" + "25 9;17 1;9\n" + "26 4;4 1;11\n" + "27 4;15 1;11\n" + "28 1;0 1;12\n" + "29 1;12 1;12\n" + "30 2;6 1;11\n" + "31 2;17 1;11\n" + "32 12;7 1;13\n" + "33 6;24 1;9\n" + "34 5;5 1;14\n" + "35 9;26 1;9\n" + "36 3;6 1;13\n" + "37 22 1\n" + "38 21 1\n" + "39 20 1\n" + "40 15 1\n" + "41 18 1\n" + "42 19 1\n" + "43 13 1\n" + "44 17 1\n" + "45 14 1\n" + "46 10 1\n" + "47 16 1\n" + "48 11 1\n" + "49 8;24 1;40\n" + "50 12;20 1;44\n" + "51 6;33 1;31\n" + "52 5;19 1;45\n" + "53 9;35 1;29\n" + "54 3;19 1;45\n" + "55 4;26 1;38\n" + "56 7 1\n" + "57 1;24 1;40\n" + "58 0 1\n" + "59 2;28 1;36\n" + "60 63 1\n" + "61 62 1\n" + "62 60 1\n" + "63 61 1\n" + "64 59 1\n" + "65 58 1\n" + "66 57 1\n" + "67 55 1\n" + "68 56 1\n" + "69 54 1\n" + "70 52 1\n" + "71 53 1\n" + "72 51 1\n" + "73 49 1\n" + "74 48 1\n" + "75 50 1\n" + "76 45 1\n" + "77 47 1\n" + "78 46 1\n" + "79 44 1\n" + "80 39 1\n" + "81 40 1\n" + "82 41 1\n" + "83 42 1\n" + "84 38 1\n" + "85 37 1\n" + "86 43 1\n" + "87 36 1\n" + "88 35 1\n" + "89 34 1\n" + "90 29 2\n" + "91 31 2\n" + "92 27 2\n" + "93 23 2\n" + "94 33 1\n" + "95 25 2\n" +#endif + }; + std::string const actual_file{string_from_file(layout_file)}; + + EXPECT_EQ(actual_file, expected_file) << actual_file << std::endl; +} diff --git a/test/api/layout/hibf_statistics_test.cpp b/test/api/layout/hibf_statistics_test.cpp index 718b43dc..054fbcb2 100644 --- a/test/api/layout/hibf_statistics_test.cpp +++ b/test/api/layout/hibf_statistics_test.cpp @@ -133,11 +133,12 @@ TEST(execute_test, chopper_layout_statistics) .disable_estimate_union = true /* also disable rearrangement */}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); testing::internal::CaptureStdout(); testing::internal::CaptureStderr(); - chopper::layout::execute(config, many_filenames, sketches); + chopper::layout::execute(config, many_filenames, sketches, minHash_sketches); std::string layout_result_stdout = testing::internal::GetCapturedStdout(); std::string layout_result_stderr = testing::internal::GetCapturedStderr(); @@ -190,9 +191,10 @@ TEST(execute_test, chopper_layout_statistics_determine_best_bins) .disable_estimate_union = true /* also disable rearrangement */}}; std::vector sketches; - seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches); + std::vector minHash_sketches{}; + seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches); - chopper::layout::execute(config, filenames, sketches); + chopper::layout::execute(config, filenames, sketches, minHash_sketches); std::string expected_cout = R"expected_cout(## ### Parameters ### diff --git a/test/api/lsh_test.cpp b/test/api/lsh_test.cpp new file mode 100644 index 00000000..1ebfd552 --- /dev/null +++ b/test/api/lsh_test.cpp @@ -0,0 +1,159 @@ +#include // for Test, TestInfo, EXPECT_EQ, Message, TEST, TestPartResult + +#include // for size_t +#include // for operator<<, char_traits, basic_ostream, basic_stringstream, strings... +#include // for allocator, string +#include // for operator<< +#include // for vector + +#include + +TEST(Cluster_test, ctor_from_id) +{ + size_t user_bin_idx{5}; + chopper::Cluster const cluster{user_bin_idx}; + + EXPECT_EQ(cluster.id(), user_bin_idx); + EXPECT_FALSE(cluster.empty()); + EXPECT_EQ(cluster.size(), 1u); + ASSERT_EQ(cluster.contained_user_bins().size(), 1u); + EXPECT_EQ(cluster.contained_user_bins()[0], user_bin_idx); + EXPECT_TRUE(cluster.is_valid(user_bin_idx)); +} + +TEST(Cluster_test, move_to) +{ + size_t user_bin_idx1{5}; + size_t user_bin_idx2{7}; + chopper::Cluster cluster1{user_bin_idx1}; + chopper::Cluster cluster2{user_bin_idx2}; + + EXPECT_TRUE(cluster1.is_valid(user_bin_idx1)); + EXPECT_TRUE(cluster2.is_valid(user_bin_idx2)); + + cluster2.move_to(cluster1); + + // cluster1 now contains user bins 5 and 7 + EXPECT_EQ(cluster1.size(), 2u); + ASSERT_EQ(cluster1.contained_user_bins().size(), 2u); + EXPECT_EQ(cluster1.contained_user_bins()[0], user_bin_idx1); + EXPECT_EQ(cluster1.contained_user_bins()[1], user_bin_idx2); + + // cluster 2 is empty + EXPECT_TRUE(cluster2.has_been_moved()); + EXPECT_TRUE(cluster2.empty()); + EXPECT_EQ(cluster2.size(), 0u); + EXPECT_EQ(cluster2.contained_user_bins().size(), 0u); + EXPECT_EQ(cluster2.moved_to_cluster_id(), cluster1.id()); + + // both should still be valid + EXPECT_TRUE(cluster1.is_valid(user_bin_idx1)); + EXPECT_TRUE(cluster2.is_valid(user_bin_idx2)); +} + +TEST(Multicluster_test, ctor_from_cluster) +{ + chopper::Cluster const cluster1{5}; + chopper::MultiCluster const multi_cluster1{cluster1}; + + EXPECT_EQ(multi_cluster1.id(), cluster1.id()); + EXPECT_FALSE(multi_cluster1.empty()); + EXPECT_EQ(multi_cluster1.size(), 1u); + ASSERT_EQ(multi_cluster1.contained_user_bins().size(), 1u); + ASSERT_EQ(multi_cluster1.contained_user_bins()[0].size(), 1u); + EXPECT_EQ(multi_cluster1.contained_user_bins()[0][0], 5u); + EXPECT_TRUE(multi_cluster1.is_valid(cluster1.id())); +} + +TEST(Multicluster_test, ctor_from_moved_cluster) +{ + chopper::Cluster cluster1{5}; + chopper::Cluster cluster2{7}; + cluster2.move_to(cluster1); + + chopper::MultiCluster const multi_cluster2{cluster2}; + + EXPECT_EQ(multi_cluster2.id(), cluster2.id()); + EXPECT_TRUE(multi_cluster2.empty()); + EXPECT_TRUE(multi_cluster2.has_been_moved()); + EXPECT_EQ(multi_cluster2.moved_to_cluster_id(), cluster2.moved_to_cluster_id()); + EXPECT_EQ(multi_cluster2.size(), 0u); + ASSERT_EQ(multi_cluster2.contained_user_bins().size(), 0u); + EXPECT_TRUE(multi_cluster2.is_valid(cluster2.id())); +} + +TEST(Multicluster_test, move_to) +{ + chopper::Cluster cluster1{5}; + chopper::Cluster cluster2{7}; + cluster2.move_to(cluster1); + ASSERT_EQ(cluster1.size(), 2u); + chopper::Cluster const cluster3{13}; + + chopper::MultiCluster multi_cluster1{cluster1}; + EXPECT_TRUE(multi_cluster1.is_valid(cluster1.id())); + EXPECT_EQ(multi_cluster1.size(), 1u); + EXPECT_EQ(multi_cluster1.contained_user_bins().size(), 1u); + EXPECT_EQ(multi_cluster1.contained_user_bins()[0].size(), 2u); + + chopper::MultiCluster multi_cluster3{cluster3}; + EXPECT_TRUE(multi_cluster3.is_valid(cluster3.id())); + EXPECT_EQ(multi_cluster3.size(), 1u); + + multi_cluster1.move_to(multi_cluster3); + + EXPECT_TRUE(multi_cluster1.is_valid(cluster1.id())); + EXPECT_TRUE(multi_cluster3.is_valid(cluster3.id())); + + // multi_cluster1 has been moved and is empty now + EXPECT_TRUE(multi_cluster1.empty()); + EXPECT_EQ(multi_cluster1.size(), 0u); + EXPECT_TRUE(multi_cluster1.has_been_moved()); + EXPECT_EQ(multi_cluster1.moved_to_cluster_id(), multi_cluster3.id()); + + // multi_cluster3 contains 2 clusters now, {13} and {5, 7} + EXPECT_FALSE(multi_cluster3.empty()); + EXPECT_EQ(multi_cluster3.size(), 2u); // two clusters + ASSERT_EQ(multi_cluster3.contained_user_bins().size(), 2u); + ASSERT_EQ(multi_cluster3.contained_user_bins()[0].size(), 1u); + ASSERT_EQ(multi_cluster3.contained_user_bins()[1].size(), 2u); + ASSERT_EQ(multi_cluster3.contained_user_bins()[0][0], cluster3.id()); + ASSERT_EQ(multi_cluster3.contained_user_bins()[1][0], cluster1.id()); + ASSERT_EQ(multi_cluster3.contained_user_bins()[1][1], cluster2.id()); +} + +TEST(LSH_find_representative_cluster_test, cluster_one_move) +{ + std::vector clusters{chopper::Cluster{0}, chopper::Cluster{1}}; + clusters[1].move_to(clusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(clusters, clusters[1].id()), clusters[0].id()); +} + +TEST(LSH_find_representative_cluster_test, multi_cluster_one_move) +{ + std::vector mclusters{{chopper::Cluster{0}}, {chopper::Cluster{1}}}; + mclusters[1].move_to(mclusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[1].id()), mclusters[0].id()); +} + +TEST(LSH_find_representative_cluster_test, cluster_two_moves) +{ + std::vector clusters{chopper::Cluster{0}, chopper::Cluster{1}, chopper::Cluster{2}}; + clusters[2].move_to(clusters[1]); + clusters[1].move_to(clusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(clusters, clusters[1].id()), clusters[0].id()); + EXPECT_EQ(chopper::LSH_find_representative_cluster(clusters, clusters[2].id()), clusters[0].id()); +} + +TEST(LSH_find_representative_cluster_test, multi_cluster_two_moves) +{ + std::vector mclusters{{chopper::Cluster{0}}, {chopper::Cluster{1}}, {chopper::Cluster{2}}}; + mclusters[2].move_to(mclusters[1]); + mclusters[1].move_to(mclusters[0]); + + EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[1].id()), mclusters[0].id()); + EXPECT_EQ(chopper::LSH_find_representative_cluster(mclusters, mclusters[2].id()), mclusters[0].id()); +}