From 7ea2e5d7f94dfc2f74c776c67ad25c52b4e0e271 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Thu, 9 Nov 2023 09:34:53 +0100 Subject: [PATCH 01/30] [FEATURE] Compute a partitioned HIBF layout. --- include/chopper/configuration.hpp | 20 +++ include/chopper/layout/input.hpp | 5 +- src/layout/execute.cpp | 239 ++++++++++++++++++++++------ src/layout/input.cpp | 15 +- src/measure_HIBF.cpp | 87 ++++++++++ src/set_up_parser.cpp | 27 ++++ src/util/display_layout/general.cpp | 14 +- src/util/display_layout/sizes.cpp | 44 ++--- test/api/layout/input_test.cpp | 50 +++++- 9 files changed, 415 insertions(+), 86 deletions(-) create mode 100644 src/measure_HIBF.cpp diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index d39b2a5d..a21ed4e0 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -19,8 +19,15 @@ namespace chopper { +enum partitioning_scheme +{ + blocked, + sorted +}; + struct configuration { + int partitioning_approach; /*!\name General Configuration * \{ */ @@ -46,6 +53,16 @@ struct configuration bool precomputed_files{false}; //!\} + /*!\name Partitioned HIBF configuration + * \{ + */ + //!\brief The maximum index size that the HIBF should not exceed. number_of_paritions will be set accordingly. + size_t maximum_index_size{0}; + + //!\brief The number of partitions for the HIBF index. + size_t number_of_partitions{0}; + //!\} + /*!\name Configuration of size estimates * \{ */ @@ -93,6 +110,9 @@ struct configuration archive(CEREAL_NVP(disable_sketch_output)); archive(CEREAL_NVP(precomputed_files)); + archive(CEREAL_NVP(maximum_index_size)); + archive(CEREAL_NVP(number_of_partitions)); + archive(CEREAL_NVP(output_filename)); archive(CEREAL_NVP(determine_best_tmax)); archive(CEREAL_NVP(force_all_binnings)); diff --git a/include/chopper/layout/input.hpp b/include/chopper/layout/input.hpp index 855856c1..037d7503 100644 --- a/include/chopper/layout/input.hpp +++ b/include/chopper/layout/input.hpp @@ -20,7 +20,8 @@ namespace chopper::layout { std::vector> read_filenames_from(std::istream & stream); -std::tuple>, configuration, seqan::hibf::layout::layout> -read_layout_file(std::istream & stream); + +std::tuple>, configuration, std::vector> +read_layouts_file(std::istream & stream); } // namespace chopper::layout diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index f3e6ef67..470f08be 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -27,12 +28,74 @@ #include #include +#include #include #include +#include namespace chopper::layout { +void partition_user_bins(chopper::configuration const & config, + std::vector const & cardinalities, + std::vector> & positions) +{ + // all approaches need sorted positions + std::vector const sorted_positions = [&cardinalities]() + { + std::vector ps; + ps.resize(cardinalities.size()); + std::iota(ps.begin(), ps.end(), 0); + seqan::hibf::sketch::toolbox::sort_by_cardinalities(cardinalities, ps); + return ps; + }(); + + if (config.partitioning_approach == partitioning_scheme::blocked) + { + size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); + size_t const block_size = std::min(u_bins_per_part, + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); + + size_t current_part{0u}; + size_t current_block_count{0}; + + for (size_t const current_user_bin_id : sorted_positions) + { + positions[current_part].push_back(current_user_bin_id); + ++current_block_count; + + if (current_block_count >= block_size) + { + current_block_count = 0; + ++current_part; + if (current_part == config.number_of_partitions) // we need to circle back to the first partition + current_part = 0; + } + } + } + else if (config.partitioning_approach == partitioning_scheme::sorted) + { + size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); + size_t const cardinality_per_part = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); + + size_t current_cardinality{0u}; + size_t current_part{0}; + + for (size_t const current_user_bin_id : sorted_positions) + { + positions[current_part].push_back(current_user_bin_id); + current_cardinality += cardinalities[current_user_bin_id]; + + if (current_cardinality >= cardinality_per_part) + { + current_cardinality = 0; + ++current_part; + } + } + } +} + int execute(chopper::configuration & config, std::vector> const & filenames) { assert(config.hibf_config.number_of_user_bins > 0); @@ -58,71 +121,143 @@ int execute(chopper::configuration & config, std::vector sketches; + if (config.number_of_partitions < 2) // 0 == unset == single HIBF, 1 == single HIBF + { + seqan::hibf::layout::layout hibf_layout; + std::vector sketches; - seqan::hibf::concurrent_timer compute_sketches_timer{}; - seqan::hibf::concurrent_timer union_estimation_timer{}; - seqan::hibf::concurrent_timer rearrangement_timer{}; - seqan::hibf::concurrent_timer dp_algorithm_timer{}; + seqan::hibf::concurrent_timer compute_sketches_timer{}; + seqan::hibf::concurrent_timer union_estimation_timer{}; + seqan::hibf::concurrent_timer rearrangement_timer{}; + seqan::hibf::concurrent_timer dp_algorithm_timer{}; - if (config.determine_best_tmax) - { - std::tie(hibf_layout, sketches) = determine_best_number_of_technical_bins(config); + if (config.determine_best_tmax) + { + std::tie(hibf_layout, sketches) = determine_best_number_of_technical_bins(config); + } + else + { + std::vector kmer_counts; + + compute_sketches_timer.start(); + seqan::hibf::sketch::compute_sketches(config.hibf_config, kmer_counts, sketches); + compute_sketches_timer.stop(); + + std::vector positions = [&kmer_counts]() + { + std::vector ps; + ps.resize(kmer_counts.size()); + std::iota(ps.begin(), ps.end(), 0); + return ps; + }(); // GCOVR_EXCL_LINE + + dp_algorithm_timer.start(); + hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, + kmer_counts, + sketches, + std::move(positions), + union_estimation_timer, + rearrangement_timer); + dp_algorithm_timer.stop(); + + if (config.output_verbose_statistics) + { + size_t dummy{}; + chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts}; + global_stats.hibf_layout = hibf_layout; + global_stats.print_header_to(std::cout); + global_stats.print_summary_to(dummy, std::cout); + } + } + + if (!config.disable_sketch_output) + { + if (!std::filesystem::exists(config.sketch_directory)) + std::filesystem::create_directory(config.sketch_directory); + + assert(filenames.size() == sketches.size()); + for (size_t i = 0; i < filenames.size(); ++i) + sketch::write_sketch_file(filenames[i][0], sketches[i], config); + } + + // brief Write the output to the layout file. + std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + hibf_layout.write_to(fout); + + if (!config.output_timings.empty()) + { + std::ofstream output_stream{config.output_timings}; + output_stream << std::fixed << std::setprecision(2); + output_stream << "sketching_in_seconds\t" + << "layouting_in_seconds\t" + << "union_estimation_in_seconds\t" + << "rearrangement_in_seconds\n"; + output_stream << compute_sketches_timer.in_seconds() << '\t'; + output_stream << dp_algorithm_timer.in_seconds() << '\t'; + output_stream << union_estimation_timer.in_seconds() << '\t'; + output_stream << rearrangement_timer.in_seconds() << '\t'; + } } else { - std::vector kmer_counts; + std::vector cardinalities; + std::vector sketches; + std::vector> positions(config.number_of_partitions); // asign positions for each partition + // compute sketches of all user bins + seqan::hibf::concurrent_timer compute_sketches_timer{}; compute_sketches_timer.start(); - seqan::hibf::sketch::compute_sketches(config.hibf_config, kmer_counts, sketches); + seqan::hibf::sketch::compute_sketches(config.hibf_config, cardinalities, sketches); compute_sketches_timer.stop(); - dp_algorithm_timer.start(); - hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, - kmer_counts, - sketches, - union_estimation_timer, - rearrangement_timer); - dp_algorithm_timer.stop(); - - if (config.output_verbose_statistics) + + partition_user_bins(config, cardinalities, positions); + + std::vector hibf_layouts(config.number_of_partitions); // multiple layouts + +#pragma omp parallel for schedule(dynamic) num_threads(config.hibf_config.threads) + for (size_t i = 0; i < config.number_of_partitions; ++i) { - size_t dummy{}; - chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts}; - global_stats.hibf_layout = hibf_layout; - global_stats.print_header_to(std::cout); - global_stats.print_summary_to(dummy, std::cout); - } - } + seqan::hibf::concurrent_timer union_estimation_timer{}; + seqan::hibf::concurrent_timer rearrangement_timer{}; + seqan::hibf::concurrent_timer dp_algorithm_timer{}; - if (!config.disable_sketch_output) - { - if (!std::filesystem::exists(config.sketch_directory)) - std::filesystem::create_directory(config.sketch_directory); + // reset tmax to fit number of user bins in layout + config.hibf_config.tmax = + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(positions[i].size())))); - assert(filenames.size() == sketches.size()); - for (size_t i = 0; i < filenames.size(); ++i) - sketch::write_sketch_file(filenames[i][0], sketches[i], config); - } + dp_algorithm_timer.start(); + hibf_layouts[i] = seqan::hibf::layout::compute_layout(config.hibf_config, + cardinalities, + sketches, + std::move(positions[i]), + union_estimation_timer, + rearrangement_timer); + dp_algorithm_timer.stop(); - // brief Write the output to the layout file. - std::ofstream fout{config.output_filename}; - chopper::layout::write_user_bins_to(filenames, fout); - config.write_to(fout); - hibf_layout.write_to(fout); + if (!config.output_timings.empty()) + { + std::ofstream output_stream{config.output_timings, std::ios_base::app}; + output_stream << std::fixed << std::setprecision(2); + output_stream << "sketching_in_seconds\t" + << "layouting_in_seconds\t" + << "union_estimation_in_seconds\t" + << "rearrangement_in_seconds\n"; + output_stream << compute_sketches_timer.in_seconds() << '\t'; + output_stream << dp_algorithm_timer.in_seconds() << '\t'; + output_stream << union_estimation_timer.in_seconds() << '\t'; + output_stream << rearrangement_timer.in_seconds() << '\t'; + } + } - if (!config.output_timings.empty()) - { - std::ofstream output_stream{config.output_timings}; - output_stream << std::fixed << std::setprecision(2); - output_stream << "sketching_in_seconds\t" - << "layouting_in_seconds\t" - << "union_estimation_in_seconds\t" - << "rearrangement_in_seconds\n"; - output_stream << compute_sketches_timer.in_seconds() << '\t'; - output_stream << dp_algorithm_timer.in_seconds() << '\t'; - output_stream << union_estimation_timer.in_seconds() << '\t'; - output_stream << rearrangement_timer.in_seconds() << '\t'; + // brief Write the output to the layout file. + std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + + for (size_t i = 0; i < config.number_of_partitions; ++i) + hibf_layouts[i].write_to(fout); } return 0; diff --git a/src/layout/input.cpp b/src/layout/input.cpp index 2b1a7459..76e8efab 100644 --- a/src/layout/input.cpp +++ b/src/layout/input.cpp @@ -66,15 +66,20 @@ std::vector> read_filenames_from(std::istream & stream) return filenames; } -std::tuple>, configuration, seqan::hibf::layout::layout> -read_layout_file(std::istream & stream) +std::tuple>, configuration, std::vector> +read_layouts_file(std::istream & stream) { std::vector> filenames = chopper::layout::read_filenames_from(stream); chopper::configuration chopper_config; chopper_config.read_from(stream); - seqan::hibf::layout::layout hibf_layout{}; - hibf_layout.read_from(stream); - return std::make_tuple(std::move(filenames), std::move(chopper_config), std::move(hibf_layout)); + std::vector layouts; + while (stream.good()) + { + seqan::hibf::layout::layout hibf_layout{}; + hibf_layout.read_from(stream); + layouts.push_back(std::move(hibf_layout)); + } + return std::make_tuple(std::move(filenames), std::move(chopper_config), std::move(layouts)); } } // namespace chopper::layout diff --git a/src/measure_HIBF.cpp b/src/measure_HIBF.cpp new file mode 100644 index 00000000..a972ba91 --- /dev/null +++ b/src/measure_HIBF.cpp @@ -0,0 +1,87 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +struct config +{ + std::filesystem::path input_index{}; + std::filesystem::path general_stats_output{"general.stats"}; +}; + +struct stats +{ + std::vector ibf_sizes; + std::vector ibf_levels; + std::vector ibf_load_factor; +}; + +inline void set_up_parser(sharg::parser & parser, config & cfg) +{ + parser.info.version = "1.0.0"; + parser.info.author = "Svenja Mehringer"; + parser.info.email = "svenja.mehringer@fu-berlin.de"; + parser.info.short_description = "Compute a top-level HIBF layout figure file"; + + parser.info.description.emplace_back("Computes an table to display the top-level layout."); + + parser.add_subsection("Main options:"); + parser.add_option(cfg.input_index, + sharg::config{.short_id = '\0', + .long_id = "index", + .description = "The input must be an index computed via raptor layout/build. ", + .required = true, + .validator = sharg::input_file_validator{}}); + parser.add_option(cfg.general_stats_output, + sharg::config{.short_id = '\0', + .long_id = "output", + .description = "The output. ", + .required = true, + .validator = sharg::output_file_validator{}}); +} + +int main(int argc, char const * argv[]) +{ + sharg::parser parser{"layout_stats", argc, argv, sharg::update_notifications::off}; + parser.info.version = "1.0.0"; + + config cfg{}; + set_up_parser(parser, cfg); + + try + { + parser.parse(); + } + catch (sharg::parser_error const & ext) + { + std::cerr << "[ERROR] " << ext.what() << '\n'; + return -1; + } + + auto index = raptor::raptor_index{}; + raptor::load_index(index, arguments); +} diff --git a/src/set_up_parser.cpp b/src/set_up_parser.cpp index 5f18a3a9..4be7096f 100644 --- a/src/set_up_parser.cpp +++ b/src/set_up_parser.cpp @@ -78,6 +78,7 @@ void set_up_parser(sharg::parser & parser, configuration & config) "accuracy.", .default_message = "k-mer size", }); + parser.add_option(config.output_timings, sharg::config{.short_id = '\0', .long_id = "timing-output", @@ -85,6 +86,32 @@ void set_up_parser(sharg::parser & parser, configuration & config) .default_message = "", .validator = sharg::output_file_validator{}}); + parser.add_option( + config.maximum_index_size, + sharg::config{ + .short_id = '\0', + .long_id = "maximum-index-size", + .description = + "You can restrict the hibf index to have a maximum index size which will partition the index into " + "several partitions if needed. The number of partitions is computed based on your input data. " + "you can manually set the number of partitions with --number-of-partitions"}); + + parser.add_option( + config.number_of_partitions, + sharg::config{ + .short_id = '\0', + .long_id = "number-of-partitions", + .description = + "The number of partitions of the HIBF. We recommend to instead use the option maximum-index-size if " + "your goal is to reduce the index size and thereby peak mempry usage of searching with the HIBF.", + .advanced = true}); + + parser.add_option(config.partitioning_approach, + sharg::config{.short_id = '\0', + .long_id = "partitioning-approach", + .description = "this is only configurable for debugging.", + .advanced = true}); + parser.add_option( config.hibf_config.tmax, sharg::config{ diff --git a/src/util/display_layout/general.cpp b/src/util/display_layout/general.cpp index 4ed3f76f..29c1f813 100644 --- a/src/util/display_layout/general.cpp +++ b/src/util/display_layout/general.cpp @@ -192,7 +192,7 @@ int execute(config const & cfg) auto chopper_config = std::move(std::get<1>(tuple)); auto hibf_layout = std::move(std::get<2>(tuple)); #else - auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); + auto [filenames, chopper_config, hibf_layouts] = chopper::layout::read_layouts_file(layout_file); #endif auto const & hibf_config = chopper_config.hibf_config; @@ -223,7 +223,7 @@ int execute(config const & cfg) // If the index is the same, sort by file sizes (happens for merged bins). // Using the smallest file to initialise the shared k-mers later will be less work. std::ranges::sort( - hibf_layout.user_bins, + hibf_layouts[0].user_bins, [&filesizes](seqan::hibf::layout::layout::user_bin const & lhs, seqan::hibf::layout::layout::user_bin const & rhs) { @@ -232,7 +232,7 @@ int execute(config const & cfg) return first_idx < second_idx || (first_idx == second_idx && filesizes[lhs.idx] < filesizes[rhs.idx]); }); - size_t const total_ub_count = hibf_layout.user_bins.size(); + size_t const total_ub_count = hibf_layouts[0].user_bins.size(); progress_bar progress{total_ub_count}; // Create chunks containing user bin indices for one technical bin. @@ -249,8 +249,8 @@ int execute(config const & cfg) // Two user bins belong to the same chunk if they are in the same technical bin. auto predicate = [&](size_t const lhs, size_t const rhs) { - auto const & lhs_ub = hibf_layout.user_bins[lhs]; - auto const & rhs_ub = hibf_layout.user_bins[rhs]; + auto const & lhs_ub = hibf_layouts[0].user_bins[lhs]; + auto const & rhs_ub = hibf_layouts[0].user_bins[rhs]; // The top-level technical bin index for the current user bin. // user_bin.previous_TB_indices.size() == 0: true for split bins, false for merged bins // user_bin.storage_TB_id: technical bin index on the lowest level @@ -289,7 +289,7 @@ int execute(config const & cfg) for (size_t const ub_index : chunk) { - auto const & user_bin = hibf_layout.user_bins[ub_index]; + auto const & user_bin = hibf_layouts[0].user_bins[ub_index]; current_kmers.clear(); // We don't need to keep the current_kmers if there are no shared k-mers to merge them with. @@ -325,7 +325,7 @@ int execute(config const & cfg) } // Into how many techincal bins is the user bin split? Always 1 for merged bins. - size_t const split_count{is_merged ? 1u : hibf_layout.user_bins[chunk[0]].number_of_technical_bins}; + size_t const split_count{is_merged ? 1u : hibf_layouts[0].user_bins[chunk[0]].number_of_technical_bins}; size_t const avg_kmer_count = (current_kmer_set.size() + split_count - 1u) / split_count; size_t const sketch_estimate = (sketch.estimate() + split_count - 1u) / split_count; diff --git a/src/util/display_layout/sizes.cpp b/src/util/display_layout/sizes.cpp index 9218b596..5ec9b69d 100644 --- a/src/util/display_layout/sizes.cpp +++ b/src/util/display_layout/sizes.cpp @@ -288,9 +288,9 @@ void execute_general_stats(config const & cfg) // https://godbolt.org/z/WoWf55KPb auto filenames = std::move(std::get<0>(tuple)); auto chopper_config = std::move(std::get<1>(tuple)); - auto hibf_layout = std::move(std::get<2>(tuple)); + auto hibf_layouts = std::move(std::get<2>(tuple)); #else - auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); + auto [filenames, chopper_config, hibf_layouts] = chopper::layout::read_layouts_file(layout_file); #endif // Prepare configs @@ -311,28 +311,34 @@ void execute_general_stats(config const & cfg) auto const & hibf_config = chopper_config.hibf_config; // Prepare stats - size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u; - std::vector stats(number_of_ibfs); + assert(hibf_layouts.size() > 0); + size_t part = (hibf_layouts.size() == 1) ? 0 : 1; + for (auto const & hibf_layout : hibf_layouts) + { + size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u; + std::vector stats(number_of_ibfs); - // Prepare data - seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}}; - seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root; - size_t const t_max{root_node.number_of_technical_bins}; - data.fpr_correction = seqan::hibf::layout::compute_fpr_correction( - {.fpr = hibf_config.maximum_fpr, .hash_count = hibf_config.number_of_hash_functions, .t_max = t_max}); + // Prepare data + seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}}; + seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root; + size_t const t_max{root_node.number_of_technical_bins}; + data.fpr_correction = seqan::hibf::layout::compute_fpr_correction( + {.fpr = hibf_config.maximum_fpr, .hash_count = hibf_config.number_of_hash_functions, .t_max = t_max}); - // Get stats - hierarchical_stats(stats, root_node, data); + // Get stats + hierarchical_stats(stats, root_node, data); - // Get stats per level - per_level_stats const level_stats{stats}; + // Get stats per level + per_level_stats const level_stats{stats}; - // Output - std::ofstream output_stream{cfg.output}; - if (!output_stream.good() || !output_stream.is_open()) - throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; + // Output + std::ofstream output_stream(cfg.output.string(), std::ios::app); + if (!output_stream.good() || !output_stream.is_open()) + throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading (appending)."}; - level_stats.print(output_stream, hibf_layout.user_bins.size()); + level_stats.print(output_stream, hibf_layout.user_bins.size()); + ++part; + } } void execute_sizes(config const & cfg) diff --git a/test/api/layout/input_test.cpp b/test/api/layout/input_test.cpp index a1b240e2..74b692c8 100644 --- a/test/api/layout/input_test.cpp +++ b/test/api/layout/input_test.cpp @@ -71,7 +71,9 @@ TEST(layout_test, read_single_layout) 5 1;2;3;4;22 1;1;1;1;21 )layout_file"}; - auto [filenames, chopper_config, layout] = chopper::layout::read_layout_file(ss); + auto [filenames, chopper_config, layouts] = chopper::layout::read_layout_file(ss); + + auto const & layout = layouts[0]; EXPECT_EQ(layout.top_level_max_bin_id, 111); EXPECT_EQ(layout.max_bins[0], (seqan::hibf::layout::layout::max_bin{{0}, 0})); @@ -81,3 +83,49 @@ TEST(layout_test, read_single_layout) EXPECT_EQ(layout.user_bins[1], (seqan::hibf::layout::layout::user_bin{std::vector{1}, 0, 22, 4})); EXPECT_EQ(layout.user_bins[2], (seqan::hibf::layout::layout::user_bin{std::vector{1, 2, 3, 4}, 22, 21, 5})); } + +TEST(layout_test, read_from_partitioned_layout) +{ + // layout consists of three partitions, written one after the other + std::stringstream ss{layout_header + R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +)layout_file"}; + + auto [filenames, chopper_config, hibf_layouts] = chopper::layout::read_layouts_file(ss); + + for (size_t i = 0; i < 3; ++i) + { + auto layout = hibf_layouts[i]; + + EXPECT_EQ(layout.top_level_max_bin_id, 111); + EXPECT_EQ(layout.max_bins[0], (seqan::hibf::layout::layout::max_bin{{0}, 0})); + EXPECT_EQ(layout.max_bins[1], (seqan::hibf::layout::layout::max_bin{{2}, 2})); + EXPECT_EQ(layout.max_bins[2], (seqan::hibf::layout::layout::max_bin{{1, 2, 3, 4}, 22})); + EXPECT_EQ(layout.user_bins[0], (seqan::hibf::layout::layout::user_bin{std::vector{}, 0, 1, 7})); + EXPECT_EQ(layout.user_bins[1], (seqan::hibf::layout::layout::user_bin{std::vector{1}, 0, 22, 4})); + EXPECT_EQ(layout.user_bins[2], + (seqan::hibf::layout::layout::user_bin{std::vector{1, 2, 3, 4}, 22, 21, 5})); + } +} From 0590816813a8fd0619a224d60169f47cd8032bf7 Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Tue, 28 Nov 2023 13:33:17 +0100 Subject: [PATCH 02/30] [MISC] automatic linting --- src/layout/execute.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 470f08be..04660c9d 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -53,8 +53,9 @@ void partition_user_bins(chopper::configuration const & config, if (config.partitioning_approach == partitioning_scheme::blocked) { size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); - size_t const block_size = std::min(u_bins_per_part, - chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); + size_t const block_size = + std::min(u_bins_per_part, + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); size_t current_part{0u}; size_t current_block_count{0}; From d75d814d6476174dd8c7c78dce167049082b3ffd Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Mon, 4 Dec 2023 08:26:16 +0100 Subject: [PATCH 03/30] fix configuration variable initialisation --- include/chopper/configuration.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index a21ed4e0..886da51d 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -27,7 +27,7 @@ enum partitioning_scheme struct configuration { - int partitioning_approach; + int partitioning_approach{}; /*!\name General Configuration * \{ */ From 46f461994607261bef7067f1c1f8d0617aa04d38 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Mon, 4 Dec 2023 08:27:00 +0100 Subject: [PATCH 04/30] fix thread local variable hibf_config. --- src/layout/execute.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 04660c9d..d5f816d4 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -225,11 +225,12 @@ int execute(chopper::configuration & config, std::vector(std::ceil(std::sqrt(positions[i].size())))); dp_algorithm_timer.start(); - hibf_layouts[i] = seqan::hibf::layout::compute_layout(config.hibf_config, + hibf_layouts[i] = seqan::hibf::layout::compute_layout(local_hibf_config, cardinalities, sketches, std::move(positions[i]), From e29b5a5d3164cc972897abf14ce767cf03eef674 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Tue, 5 Dec 2023 11:00:13 +0100 Subject: [PATCH 05/30] NEW paritioning approach: folded. --- include/chopper/configuration.hpp | 3 ++- src/layout/execute.cpp | 35 +++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index 886da51d..50cc5c29 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -22,7 +22,8 @@ namespace chopper enum partitioning_scheme { blocked, - sorted + sorted, + folded }; struct configuration diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index d5f816d4..98130924 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -95,6 +95,41 @@ void partition_user_bins(chopper::configuration const & config, } } } + else if (config.partitioning_approach == partitioning_scheme::folded) + { + size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); + size_t const cardinality_per_part = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); + + size_t current_cardinality{0u}; + size_t current_part{0}; + size_t current_pos = 0; + size_t end_pos = sorted_positions.size(); + + while (current_pos < end_pos) + { + size_t const current_user_bin_id = sorted_positions[current_pos]; + + // check if adding next UB would surpass the `cardinality_per_part` + if (current_cardinality + cardinalities[current_user_bin_id] > cardinality_per_part) + { + // if so, fill the remaining space with small UBs from the right hand side. + while (end_pos > (current_pos + 1) && current_cardinality < cardinality_per_part) + { + size_t const smallest_user_bin_id = sorted_positions[end_pos - 1]; + current_cardinality += cardinalities[smallest_user_bin_id]; + positions[current_part].push_back(smallest_user_bin_id); + --end_pos; + } + + current_cardinality = 0; + ++current_part; + } + + current_cardinality += cardinalities[current_user_bin_id]; + positions[current_part].push_back(current_user_bin_id); + } + } } int execute(chopper::configuration & config, std::vector> const & filenames) From cab641f4d22ca29f50685da1907bd643cc4aed73 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 6 Dec 2023 08:45:47 +0100 Subject: [PATCH 06/30] fix alternative --- src/layout/execute.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 98130924..374add14 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -124,10 +124,15 @@ void partition_user_bins(chopper::configuration const & config, current_cardinality = 0; ++current_part; + + // In the while loop above, we always slightly overfill the current partition. + // That way, it shouldn't happen that the last user bin cannot fit into the last partition. + assert(current_part < config.number_of_partitions); } current_cardinality += cardinalities[current_user_bin_id]; positions[current_part].push_back(current_user_bin_id); + ++current_pos; } } } From 968684a73f6983725e2bc95b30e52dd2aa7db510 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 6 Dec 2023 09:48:50 +0100 Subject: [PATCH 07/30] adapt approach --- src/layout/execute.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 374add14..1fc213ee 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -100,6 +100,11 @@ void partition_user_bins(chopper::configuration const & config, size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); size_t const cardinality_per_part = seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); + size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); + size_t const block_size = + std::min(u_bins_per_part, + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); + size_t const cardinality_per_block = cardinality_per_part / block_size; size_t current_cardinality{0u}; size_t current_part{0}; @@ -111,7 +116,7 @@ void partition_user_bins(chopper::configuration const & config, size_t const current_user_bin_id = sorted_positions[current_pos]; // check if adding next UB would surpass the `cardinality_per_part` - if (current_cardinality + cardinalities[current_user_bin_id] > cardinality_per_part) + if (current_cardinality + cardinality_per_block > cardinality_per_part) { // if so, fill the remaining space with small UBs from the right hand side. while (end_pos > (current_pos + 1) && current_cardinality < cardinality_per_part) From 168d12bc4f0ed40b3dab7f663772fb26b710f2d1 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Thu, 7 Dec 2023 09:33:19 +0100 Subject: [PATCH 08/30] improve layout. --- src/layout/execute.cpp | 44 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 1fc213ee..300b5bde 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -98,46 +98,30 @@ void partition_user_bins(chopper::configuration const & config, else if (config.partitioning_approach == partitioning_scheme::folded) { size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); - size_t const cardinality_per_part = - seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); - size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); - size_t const block_size = - std::min(u_bins_per_part, - chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); - size_t const cardinality_per_block = cardinality_per_part / block_size; + size_t const cardinality_per_part_halved = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions * 2); size_t current_cardinality{0u}; + std::vector const parts = [&config]() + { + size_t const len{config.number_of_partitions}; + std::vector result(len * 2); + std::iota(result.begin(), result.begin() + len, 0); + std::copy(result.rbegin() + len, result.rend(), result.begin() + len); + return result; + }(); size_t current_part{0}; - size_t current_pos = 0; - size_t end_pos = sorted_positions.size(); - while (current_pos < end_pos) + for (size_t const current_user_bin_id : sorted_positions) { - size_t const current_user_bin_id = sorted_positions[current_pos]; + positions[parts[current_part]].push_back(current_user_bin_id); + current_cardinality += cardinalities[current_user_bin_id]; - // check if adding next UB would surpass the `cardinality_per_part` - if (current_cardinality + cardinality_per_block > cardinality_per_part) + if (current_cardinality >= cardinality_per_part_halved) { - // if so, fill the remaining space with small UBs from the right hand side. - while (end_pos > (current_pos + 1) && current_cardinality < cardinality_per_part) - { - size_t const smallest_user_bin_id = sorted_positions[end_pos - 1]; - current_cardinality += cardinalities[smallest_user_bin_id]; - positions[current_part].push_back(smallest_user_bin_id); - --end_pos; - } - current_cardinality = 0; ++current_part; - - // In the while loop above, we always slightly overfill the current partition. - // That way, it shouldn't happen that the last user bin cannot fit into the last partition. - assert(current_part < config.number_of_partitions); } - - current_cardinality += cardinalities[current_user_bin_id]; - positions[current_part].push_back(current_user_bin_id); - ++current_pos; } } } From 9eeb4bffe86053993c7c05441d81192b2c7524c6 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Fri, 8 Dec 2023 08:48:37 +0100 Subject: [PATCH 09/30] new approach --- include/chopper/configuration.hpp | 3 +- src/layout/execute.cpp | 81 +++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index 50cc5c29..03e32de0 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -23,7 +23,8 @@ enum partitioning_scheme { blocked, sorted, - folded + folded, + weighted_fold }; struct configuration diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 300b5bde..ca9d3275 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -124,6 +124,87 @@ void partition_user_bins(chopper::configuration const & config, } } } + else if (config.partitioning_approach == partitioning_scheme::weighted_fold) + { + size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); + size_t const cardinality_per_part = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); + size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); + + size_t current_big_pos{0}; // the next largest user bin to assign to a partition + size_t current_small_pos{cardinalities.size() - 1}; // the next small user bin + + for (size_t current_part = 0; current_part < config.number_of_partitions - 1; ++current_part) + { + size_t current_cardinality{0}; + std::vector small_bins; + size_t new_small_bin_addition{0}; + + auto compute_score = [&]() + { + double const correct_weight = static_cast(current_cardinality) / cardinality_per_part; + double const correct_amount = static_cast(positions[current_part].size() + small_bins.size() + new_small_bin_addition) / u_bins_per_part; + return (correct_amount + correct_weight) / 2; + }; + + while (current_cardinality < cardinality_per_part) + { + positions[current_part].push_back(sorted_positions[current_big_pos]); + current_cardinality += cardinalities[sorted_positions[current_big_pos]]; + ++current_big_pos; + } + + double local_optimum = compute_score(); + + while(true) + { + size_t const cache_last_small_pos{current_small_pos}; + current_cardinality -= cardinalities[sorted_positions[current_big_pos]]; + while (current_cardinality < cardinality_per_part && + (positions[current_part].size() + small_bins.size() + new_small_bin_addition) < u_bins_per_part) + { + current_cardinality += cardinalities[sorted_positions[current_small_pos]]; + --current_small_pos; + ++new_small_bin_addition; + } + + // can we further improve the ratio by adding more small bins? + double improved_score{}; + do + { + improved_score = compute_score(); + current_cardinality += cardinalities[sorted_positions[current_small_pos]]; + --current_small_pos; + ++new_small_bin_addition; + } while (compute_score() > improved_score); + // remove overstep + ++current_small_pos; + current_cardinality -= cardinalities[sorted_positions[current_small_pos]]; + --new_small_bin_addition; + + if (local_optimum > compute_score()) // score would decrease. Stop + { + current_small_pos = cache_last_small_pos; + break; + } + else // update + { + positions[current_part].pop_back(); + --current_big_pos; + for (size_t pos = cache_last_small_pos; pos > current_small_pos; --pos) + small_bins.push_back(sorted_positions[pos]); + } + } + positions[current_part].insert(positions[current_part].end(), small_bins.begin(), small_bins.end()); + } + + // remaining user bins go to last partition + while (current_big_pos <= current_small_pos) + { + positions[config.number_of_partitions - 1].push_back(sorted_positions[current_big_pos]); + ++current_big_pos; + } + } } int execute(chopper::configuration & config, std::vector> const & filenames) From 49ffd1692815ec4487bc04ffd83f2ad835264b08 Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Fri, 8 Dec 2023 08:49:30 +0100 Subject: [PATCH 10/30] [MISC] automatic linting --- src/layout/execute.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index ca9d3275..ce4d4fdf 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -131,7 +131,7 @@ void partition_user_bins(chopper::configuration const & config, seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); - size_t current_big_pos{0}; // the next largest user bin to assign to a partition + size_t current_big_pos{0}; // the next largest user bin to assign to a partition size_t current_small_pos{cardinalities.size() - 1}; // the next small user bin for (size_t current_part = 0; current_part < config.number_of_partitions - 1; ++current_part) @@ -143,7 +143,9 @@ void partition_user_bins(chopper::configuration const & config, auto compute_score = [&]() { double const correct_weight = static_cast(current_cardinality) / cardinality_per_part; - double const correct_amount = static_cast(positions[current_part].size() + small_bins.size() + new_small_bin_addition) / u_bins_per_part; + double const correct_amount = + static_cast(positions[current_part].size() + small_bins.size() + new_small_bin_addition) + / u_bins_per_part; return (correct_amount + correct_weight) / 2; }; @@ -156,12 +158,13 @@ void partition_user_bins(chopper::configuration const & config, double local_optimum = compute_score(); - while(true) + while (true) { size_t const cache_last_small_pos{current_small_pos}; current_cardinality -= cardinalities[sorted_positions[current_big_pos]]; - while (current_cardinality < cardinality_per_part && - (positions[current_part].size() + small_bins.size() + new_small_bin_addition) < u_bins_per_part) + while (current_cardinality < cardinality_per_part + && (positions[current_part].size() + small_bins.size() + new_small_bin_addition) + < u_bins_per_part) { current_cardinality += cardinalities[sorted_positions[current_small_pos]]; --current_small_pos; @@ -176,7 +179,8 @@ void partition_user_bins(chopper::configuration const & config, current_cardinality += cardinalities[sorted_positions[current_small_pos]]; --current_small_pos; ++new_small_bin_addition; - } while (compute_score() > improved_score); + } + while (compute_score() > improved_score); // remove overstep ++current_small_pos; current_cardinality -= cardinalities[sorted_positions[current_small_pos]]; From a8c8ce91b549f25c692f878af8d7c4f7117bd4f7 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Fri, 8 Dec 2023 10:33:06 +0100 Subject: [PATCH 11/30] FIX --- src/layout/execute.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index ce4d4fdf..1fe553ec 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -134,7 +134,7 @@ void partition_user_bins(chopper::configuration const & config, size_t current_big_pos{0}; // the next largest user bin to assign to a partition size_t current_small_pos{cardinalities.size() - 1}; // the next small user bin - for (size_t current_part = 0; current_part < config.number_of_partitions - 1; ++current_part) + for (size_t current_part = 0; current_part + 1 < config.number_of_partitions; ++current_part) { size_t current_cardinality{0}; std::vector small_bins; @@ -142,11 +142,9 @@ void partition_user_bins(chopper::configuration const & config, auto compute_score = [&]() { - double const correct_weight = static_cast(current_cardinality) / cardinality_per_part; - double const correct_amount = - static_cast(positions[current_part].size() + small_bins.size() + new_small_bin_addition) - / u_bins_per_part; - return (correct_amount + correct_weight) / 2; + double const weight = static_cast(current_cardinality) / cardinality_per_part; + double const amount = static_cast(positions[current_part].size() + small_bins.size() + new_small_bin_addition) / u_bins_per_part; + return (std::abs(1 - weight) + std::abs(1 - amount)) / 2; }; while (current_cardinality < cardinality_per_part) @@ -161,6 +159,8 @@ void partition_user_bins(chopper::configuration const & config, while (true) { size_t const cache_last_small_pos{current_small_pos}; + // remove a big user bin and fill the partition with small user bins + // fill the partition until either the maximum cardinality or the maximum amount of bins is reached current_cardinality -= cardinalities[sorted_positions[current_big_pos]]; while (current_cardinality < cardinality_per_part && (positions[current_part].size() + small_bins.size() + new_small_bin_addition) @@ -180,13 +180,13 @@ void partition_user_bins(chopper::configuration const & config, --current_small_pos; ++new_small_bin_addition; } - while (compute_score() > improved_score); + while (compute_score() < improved_score); // smaller is better // remove overstep ++current_small_pos; current_cardinality -= cardinalities[sorted_positions[current_small_pos]]; --new_small_bin_addition; - if (local_optimum > compute_score()) // score would decrease. Stop + if (local_optimum < compute_score()) // score would increase. Stop { current_small_pos = cache_last_small_pos; break; From 36c354136520c3ff84f6b4c31c409c632b21ab3f Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Fri, 8 Dec 2023 10:35:31 +0100 Subject: [PATCH 12/30] [MISC] automatic linting --- src/layout/execute.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 1fe553ec..623d3fd8 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -143,7 +143,9 @@ void partition_user_bins(chopper::configuration const & config, auto compute_score = [&]() { double const weight = static_cast(current_cardinality) / cardinality_per_part; - double const amount = static_cast(positions[current_part].size() + small_bins.size() + new_small_bin_addition) / u_bins_per_part; + double const amount = + static_cast(positions[current_part].size() + small_bins.size() + new_small_bin_addition) + / u_bins_per_part; return (std::abs(1 - weight) + std::abs(1 - amount)) / 2; }; From 9a1f52948969db52ab98bf9ffd28858882016aa0 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Fri, 8 Dec 2023 11:15:05 +0100 Subject: [PATCH 13/30] fix --- src/layout/execute.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 623d3fd8..61f316cb 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -146,9 +146,10 @@ void partition_user_bins(chopper::configuration const & config, double const amount = static_cast(positions[current_part].size() + small_bins.size() + new_small_bin_addition) / u_bins_per_part; - return (std::abs(1 - weight) + std::abs(1 - amount)) / 2; + return std::abs(1.0 - weight) + std::abs(1.0 - amount); }; + // first add all large bins that fit while (current_cardinality < cardinality_per_part) { positions[current_part].push_back(sorted_positions[current_big_pos]); @@ -158,20 +159,12 @@ void partition_user_bins(chopper::configuration const & config, double local_optimum = compute_score(); + // then remove big bins and add small bins until a local optima is reached while (true) { size_t const cache_last_small_pos{current_small_pos}; // remove a big user bin and fill the partition with small user bins - // fill the partition until either the maximum cardinality or the maximum amount of bins is reached current_cardinality -= cardinalities[sorted_positions[current_big_pos]]; - while (current_cardinality < cardinality_per_part - && (positions[current_part].size() + small_bins.size() + new_small_bin_addition) - < u_bins_per_part) - { - current_cardinality += cardinalities[sorted_positions[current_small_pos]]; - --current_small_pos; - ++new_small_bin_addition; - } // can we further improve the ratio by adding more small bins? double improved_score{}; From f8540407e300fa5c93ce3169fb041f88c7e72aa5 Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Fri, 8 Dec 2023 12:56:03 +0100 Subject: [PATCH 14/30] [MISC] automatic linting --- src/layout/execute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 61f316cb..850fcb65 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -287,7 +287,7 @@ int execute(chopper::configuration & config, std::vector Date: Sat, 9 Dec 2023 13:50:03 +0100 Subject: [PATCH 15/30] [INFRA] Use hibf branch --- lib/hibf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/hibf b/lib/hibf index b068f7ed..46e5e8ce 160000 --- a/lib/hibf +++ b/lib/hibf @@ -1 +1 @@ -Subproject commit b068f7ed2fe6716453ced9059321a4e1fa2c5a35 +Subproject commit 46e5e8ce624c791115ba2ec140cc7a9cfcfeff08 From 11e940d8795d45b6b399957a08b37224faa869aa Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Sat, 9 Dec 2023 14:29:18 +0100 Subject: [PATCH 16/30] [MISC] read_layouts_file --- src/util/display_layout/general.cpp | 2 +- src/util/display_layout/sizes.cpp | 2 +- test/api/config_test.cpp | 6 ++++-- test/api/layout/input_test.cpp | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/util/display_layout/general.cpp b/src/util/display_layout/general.cpp index 29c1f813..404960d7 100644 --- a/src/util/display_layout/general.cpp +++ b/src/util/display_layout/general.cpp @@ -186,7 +186,7 @@ int execute(config const & cfg) // https://godbolt.org/z/PeKnxzjn1 #if defined(__clang__) - auto tuple = chopper::layout::read_layout_file(layout_file); + auto tuple = chopper::layout::read_layouts_file(layout_file); // https://godbolt.org/z/WoWf55KPb auto filenames = std::move(std::get<0>(tuple)); auto chopper_config = std::move(std::get<1>(tuple)); diff --git a/src/util/display_layout/sizes.cpp b/src/util/display_layout/sizes.cpp index 5ec9b69d..ef363272 100644 --- a/src/util/display_layout/sizes.cpp +++ b/src/util/display_layout/sizes.cpp @@ -284,7 +284,7 @@ void execute_general_stats(config const & cfg) // https://godbolt.org/z/PeKnxzjn1 #if defined(__clang__) - auto tuple = chopper::layout::read_layout_file(layout_file); + auto tuple = chopper::layout::read_layouts_file(layout_file); // https://godbolt.org/z/WoWf55KPb auto filenames = std::move(std::get<0>(tuple)); auto chopper_config = std::move(std::get<1>(tuple)); diff --git a/test/api/config_test.cpp b/test/api/config_test.cpp index be717ddc..2cd5c652 100644 --- a/test/api/config_test.cpp +++ b/test/api/config_test.cpp @@ -149,7 +149,7 @@ TEST(config_test, read_from_with_more_meta) } // Easier to do in the config_test because of existing helper functions -TEST(input, read_layout_file) +TEST(input, read_layouts_file) { std::string config_string{"@CHOPPER_USER_BINS\n" "@0 file1.fa\n" @@ -169,7 +169,9 @@ TEST(input, read_layout_file) std::stringstream ss{config_string}; - auto [filenames, config, layout] = chopper::layout::read_layout_file(ss); + auto [filenames, config, layouts] = chopper::layout::read_layouts_file(ss); + + auto const & layout = layouts[0]; std::vector> const expected_filenames{{"file1.fa"}, {"file2.fa"}, diff --git a/test/api/layout/input_test.cpp b/test/api/layout/input_test.cpp index 74b692c8..4dc485cc 100644 --- a/test/api/layout/input_test.cpp +++ b/test/api/layout/input_test.cpp @@ -71,7 +71,7 @@ TEST(layout_test, read_single_layout) 5 1;2;3;4;22 1;1;1;1;21 )layout_file"}; - auto [filenames, chopper_config, layouts] = chopper::layout::read_layout_file(ss); + auto [filenames, chopper_config, layouts] = chopper::layout::read_layouts_file(ss); auto const & layout = layouts[0]; From f9f8f5b8a88d8eaa4c2a6607a33a81afd0406b4b Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Sat, 9 Dec 2023 15:35:21 +0100 Subject: [PATCH 17/30] [FIX] tests --- test/api/config_test.cpp | 2 ++ test/api/layout/execute_layout_test.cpp | 4 ++++ test/api/layout/execute_with_estimation_test.cpp | 2 ++ test/api/layout/input_test.cpp | 2 ++ test/cli/cli_chopper_pipeline_test.cpp | 4 ++++ test/cli/util_display_layout_test.cpp | 2 ++ 6 files changed, 16 insertions(+) diff --git a/test/api/config_test.cpp b/test/api/config_test.cpp index 2cd5c652..1eee2aec 100644 --- a/test/api/config_test.cpp +++ b/test/api/config_test.cpp @@ -84,6 +84,8 @@ static constexpr std::string_view config_string_view{"@CHOPPER_CONFIG\n" "@ \"window_size\": 24,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": true,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"file.layout\"\n" "@ },\n" diff --git a/test/api/layout/execute_layout_test.cpp b/test/api/layout/execute_layout_test.cpp index 696a976e..3e4fb923 100644 --- a/test/api/layout/execute_layout_test.cpp +++ b/test/api/layout/execute_layout_test.cpp @@ -70,6 +70,8 @@ TEST(execute_test, few_ubs) "@ \"window_size\": 19,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + layout_file.string() @@ -281,6 +283,8 @@ TEST(execute_test, many_ubs) "@ \"window_size\": 19,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + layout_file.string() diff --git a/test/api/layout/execute_with_estimation_test.cpp b/test/api/layout/execute_with_estimation_test.cpp index 50658941..79166653 100644 --- a/test/api/layout/execute_with_estimation_test.cpp +++ b/test/api/layout/execute_with_estimation_test.cpp @@ -250,6 +250,8 @@ TEST(execute_estimation_test, many_ubs) "@ \"window_size\": 19,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + layout_file.string() diff --git a/test/api/layout/input_test.cpp b/test/api/layout/input_test.cpp index 4dc485cc..fadcc80c 100644 --- a/test/api/layout/input_test.cpp +++ b/test/api/layout/input_test.cpp @@ -31,6 +31,8 @@ std::string const layout_header{"@CHOPPER_USER_BINS\n" "@ \"window_size\": 15,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"foo.layout\"\n" "@ },\n" diff --git a/test/cli/cli_chopper_pipeline_test.cpp b/test/cli/cli_chopper_pipeline_test.cpp index 25eaa6f2..184d6c3f 100644 --- a/test/cli/cli_chopper_pipeline_test.cpp +++ b/test/cli/cli_chopper_pipeline_test.cpp @@ -78,6 +78,8 @@ TEST_F(cli_test, chopper_layout) "@ \"window_size\": 15,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + binning_filename.string() @@ -180,6 +182,8 @@ TEST_F(cli_test, chopper_layout2) "@ \"window_size\": 19,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + binning_filename.string() diff --git a/test/cli/util_display_layout_test.cpp b/test/cli/util_display_layout_test.cpp index 1ba5a25e..f81c8f73 100644 --- a/test/cli/util_display_layout_test.cpp +++ b/test/cli/util_display_layout_test.cpp @@ -49,6 +49,8 @@ std::string get_layout_with_correct_filenames(std::string_view const seq1_filena "@ \"window_size\": 15,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + output_filename.data() From 27f250616e9e5557353df67470f9c2ffccf84d27 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Sat, 9 Dec 2023 15:54:51 +0100 Subject: [PATCH 18/30] [FIX] clang --- src/util/display_layout/general.cpp | 2 +- src/util/display_layout/sizes.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util/display_layout/general.cpp b/src/util/display_layout/general.cpp index 404960d7..e92c5fdc 100644 --- a/src/util/display_layout/general.cpp +++ b/src/util/display_layout/general.cpp @@ -190,7 +190,7 @@ int execute(config const & cfg) // https://godbolt.org/z/WoWf55KPb auto filenames = std::move(std::get<0>(tuple)); auto chopper_config = std::move(std::get<1>(tuple)); - auto hibf_layout = std::move(std::get<2>(tuple)); + auto hibf_layouts = std::move(std::get<2>(tuple)); #else auto [filenames, chopper_config, hibf_layouts] = chopper::layout::read_layouts_file(layout_file); #endif diff --git a/src/util/display_layout/sizes.cpp b/src/util/display_layout/sizes.cpp index ef363272..ff234a3c 100644 --- a/src/util/display_layout/sizes.cpp +++ b/src/util/display_layout/sizes.cpp @@ -312,7 +312,7 @@ void execute_general_stats(config const & cfg) // Prepare stats assert(hibf_layouts.size() > 0); - size_t part = (hibf_layouts.size() == 1) ? 0 : 1; + // size_t part = (hibf_layouts.size() == 1) ? 0 : 1; for (auto const & hibf_layout : hibf_layouts) { size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u; @@ -337,7 +337,7 @@ void execute_general_stats(config const & cfg) throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading (appending)."}; level_stats.print(output_stream, hibf_layout.user_bins.size()); - ++part; + // ++part; } } From c4e6087448aff8e07bb0239645a0be07d5d21497 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Mon, 11 Dec 2023 08:42:45 +0100 Subject: [PATCH 19/30] [FEATURE] general.cpp can process multiple layouts. --- src/util/display_layout/general.cpp | 47 ++++++++++++++--------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/src/util/display_layout/general.cpp b/src/util/display_layout/general.cpp index e92c5fdc..414fd82e 100644 --- a/src/util/display_layout/general.cpp +++ b/src/util/display_layout/general.cpp @@ -177,27 +177,14 @@ void process_and_write_records_to(std::vector & records, std::ostream & stream << std::flush; } -int execute(config const & cfg) +int execute(config const & cfg, + std::vector> const & filenames, + chopper::configuration const & chopper_config, + seqan::hibf::layout::layout & hibf_layout) { - std::ifstream layout_file{cfg.input}; - - if (!layout_file.good() || !layout_file.is_open()) - throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; - -// https://godbolt.org/z/PeKnxzjn1 -#if defined(__clang__) - auto tuple = chopper::layout::read_layouts_file(layout_file); - // https://godbolt.org/z/WoWf55KPb - auto filenames = std::move(std::get<0>(tuple)); - auto chopper_config = std::move(std::get<1>(tuple)); - auto hibf_layouts = std::move(std::get<2>(tuple)); -#else - auto [filenames, chopper_config, hibf_layouts] = chopper::layout::read_layouts_file(layout_file); -#endif auto const & hibf_config = chopper_config.hibf_config; - layout_file.close(); - std::ofstream output_stream{cfg.output}; + std::ofstream output_stream{cfg.output, std::ios_base::app}; if (!output_stream.good() || !output_stream.is_open()) throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; @@ -223,7 +210,7 @@ int execute(config const & cfg) // If the index is the same, sort by file sizes (happens for merged bins). // Using the smallest file to initialise the shared k-mers later will be less work. std::ranges::sort( - hibf_layouts[0].user_bins, + hibf_layout.user_bins, [&filesizes](seqan::hibf::layout::layout::user_bin const & lhs, seqan::hibf::layout::layout::user_bin const & rhs) { @@ -232,7 +219,7 @@ int execute(config const & cfg) return first_idx < second_idx || (first_idx == second_idx && filesizes[lhs.idx] < filesizes[rhs.idx]); }); - size_t const total_ub_count = hibf_layouts[0].user_bins.size(); + size_t const total_ub_count = hibf_layout.user_bins.size(); progress_bar progress{total_ub_count}; // Create chunks containing user bin indices for one technical bin. @@ -249,8 +236,8 @@ int execute(config const & cfg) // Two user bins belong to the same chunk if they are in the same technical bin. auto predicate = [&](size_t const lhs, size_t const rhs) { - auto const & lhs_ub = hibf_layouts[0].user_bins[lhs]; - auto const & rhs_ub = hibf_layouts[0].user_bins[rhs]; + auto const & lhs_ub = hibf_layout.user_bins[lhs]; + auto const & rhs_ub = hibf_layout.user_bins[rhs]; // The top-level technical bin index for the current user bin. // user_bin.previous_TB_indices.size() == 0: true for split bins, false for merged bins // user_bin.storage_TB_id: technical bin index on the lowest level @@ -289,7 +276,7 @@ int execute(config const & cfg) for (size_t const ub_index : chunk) { - auto const & user_bin = hibf_layouts[0].user_bins[ub_index]; + auto const & user_bin = hibf_layout.user_bins[ub_index]; current_kmers.clear(); // We don't need to keep the current_kmers if there are no shared k-mers to merge them with. @@ -325,7 +312,7 @@ int execute(config const & cfg) } // Into how many techincal bins is the user bin split? Always 1 for merged bins. - size_t const split_count{is_merged ? 1u : hibf_layouts[0].user_bins[chunk[0]].number_of_technical_bins}; + size_t const split_count{is_merged ? 1u : hibf_layout.user_bins[chunk[0]].number_of_technical_bins}; size_t const avg_kmer_count = (current_kmer_set.size() + split_count - 1u) / split_count; size_t const sketch_estimate = (sketch.estimate() + split_count - 1u) / split_count; @@ -348,5 +335,15 @@ int execute(config const & cfg) void execute_general(config const & cfg) { - execute(cfg); + std::ifstream layout_file{cfg.input}; + + if (!layout_file.good() || !layout_file.is_open()) + throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; + + auto [filenames, chopper_config, hibf_layouts] = chopper::layout::read_layouts_file(layout_file); + + layout_file.close(); + + for (auto & hibf_layout : hibf_layouts) + execute(cfg, filenames, chopper_config, hibf_layout); } From 75c0e031ab43117d0c894d8b00473a0ff3b84546 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 13 Dec 2023 14:20:01 +0100 Subject: [PATCH 20/30] knuts similarity approach --- include/chopper/configuration.hpp | 3 +- src/layout/execute.cpp | 61 +++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index 03e32de0..b11b297e 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -24,7 +24,8 @@ enum partitioning_scheme blocked, sorted, folded, - weighted_fold + weighted_fold, + similarity }; struct configuration diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 850fcb65..744f210e 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -38,6 +38,7 @@ namespace chopper::layout void partition_user_bins(chopper::configuration const & config, std::vector const & cardinalities, + std::vector const & sketches, std::vector> & positions) { // all approaches need sorted positions @@ -204,6 +205,66 @@ void partition_user_bins(chopper::configuration const & config, ++current_big_pos; } } + else if (config.partitioning_approach == partitioning_scheme::similarity) + { + uint8_t sketch_bits{10}; + std::vector partition_sketches(config.number_of_partitions, seqan::hibf::sketch::hyperloglog(sketch_bits)); + size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); + size_t const block_size = + std::min(u_bins_per_part, + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); + size_t const number_of_blocks = seqan::hibf::divide_and_ceil(cardinalities.size(), block_size); + + size_t current_part{0u}; + size_t current_block_count{0}; + seqan::hibf::sketch::hyperloglog current_sketch(sketch_bits); + + size_t current_pos{0}; + + // initialise partitions with the first config.number_of_partitions blocks + assert(number_of_blocks >= config.number_of_partitions); + for (size_t i = 0; i < config.number_of_partitions; ++i) + { + do + { + partition_sketches[i].merge(sketches[sorted_positions[current_pos]]); + positions[i].push_back(sorted_positions[current_pos]); + ++current_pos; + } + while (current_pos % block_size != 0); + } + + // assign the rest by similarity + for (size_t i = config.number_of_partitions; i < number_of_blocks; ++i) + { + size_t count{}; // we need to track count for the last partition that is not block_size long + do // init sketch + { + current_sketch.merge(sketches[sorted_positions[current_pos]]); + ++current_pos; + ++count; + } + while (current_pos % block_size != 0 && current_pos < cardinalities.size()); + + // search best parition fit + size_t smallest_change{std::numeric_limits::max()}; + size_t best_p{0}; + for (size_t p = 0; p < config.number_of_partitions; ++p) + { + seqan::hibf::sketch::hyperloglog tmp = current_sketch; + tmp.merge(partition_sketches[p]); + + if (tmp.estimate() < smallest_change) + { + smallest_change = tmp.estimate(); + best_p = p; + } + } + + for (size_t add = current_pos - count; add < count; ++add) + positions[best_p].push_back(add); + } + } } int execute(chopper::configuration & config, std::vector> const & filenames) From df418f14d5ad7289bfb7b3c7d3655fba52298b95 Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Wed, 13 Dec 2023 14:20:50 +0100 Subject: [PATCH 21/30] [MISC] automatic linting --- src/layout/execute.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 744f210e..6495456d 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -208,7 +208,8 @@ void partition_user_bins(chopper::configuration const & config, else if (config.partitioning_approach == partitioning_scheme::similarity) { uint8_t sketch_bits{10}; - std::vector partition_sketches(config.number_of_partitions, seqan::hibf::sketch::hyperloglog(sketch_bits)); + std::vector partition_sketches(config.number_of_partitions, + seqan::hibf::sketch::hyperloglog(sketch_bits)); size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); size_t const block_size = std::min(u_bins_per_part, @@ -238,7 +239,7 @@ void partition_user_bins(chopper::configuration const & config, for (size_t i = config.number_of_partitions; i < number_of_blocks; ++i) { size_t count{}; // we need to track count for the last partition that is not block_size long - do // init sketch + do // init sketch { current_sketch.merge(sketches[sorted_positions[current_pos]]); ++current_pos; From fec8cc200bcde96afdab4d714f1653c717d29e8f Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 13 Dec 2023 14:24:56 +0100 Subject: [PATCH 22/30] fix stuff --- src/layout/execute.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 6495456d..7fa8bc0b 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -216,8 +216,6 @@ void partition_user_bins(chopper::configuration const & config, chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); size_t const number_of_blocks = seqan::hibf::divide_and_ceil(cardinalities.size(), block_size); - size_t current_part{0u}; - size_t current_block_count{0}; seqan::hibf::sketch::hyperloglog current_sketch(sketch_bits); size_t current_pos{0}; @@ -384,7 +382,7 @@ int execute(chopper::configuration & config, std::vector hibf_layouts(config.number_of_partitions); // multiple layouts From b48519b47fd1b1ce001b8f857de1e5dd708e0822 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 13 Dec 2023 15:20:16 +0100 Subject: [PATCH 23/30] fic --- src/layout/execute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 7fa8bc0b..b4bb9cbf 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -260,7 +260,7 @@ void partition_user_bins(chopper::configuration const & config, } } - for (size_t add = current_pos - count; add < count; ++add) + for (size_t add = current_pos - count; add < current_pos; ++add) positions[best_p].push_back(add); } } From eeee4be2f5522d3ece244a8661cc1d5ca0af416b Mon Sep 17 00:00:00 2001 From: smehringer Date: Tue, 9 Jan 2024 12:03:12 +0100 Subject: [PATCH 24/30] [FIX] Similarity approach: Prohibit assigning to a partition that is full. --- src/layout/execute.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index b4bb9cbf..4a37ca45 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -210,6 +210,8 @@ void partition_user_bins(chopper::configuration const & config, uint8_t sketch_bits{10}; std::vector partition_sketches(config.number_of_partitions, seqan::hibf::sketch::hyperloglog(sketch_bits)); + size_t const cardinality_per_part = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); size_t const block_size = std::min(u_bins_per_part, @@ -253,7 +255,7 @@ void partition_user_bins(chopper::configuration const & config, seqan::hibf::sketch::hyperloglog tmp = current_sketch; tmp.merge(partition_sketches[p]); - if (tmp.estimate() < smallest_change) + if (tmp.estimate() < smallest_change && partition_sketches[p].estimate() < cardinality_per_part) { smallest_change = tmp.estimate(); best_p = p; From 256596c9abb3c87aaf0148b4e4e8312c8fbc836d Mon Sep 17 00:00:00 2001 From: smehringer Date: Tue, 9 Jan 2024 12:33:37 +0100 Subject: [PATCH 25/30] [FEATURE] Similarity Approach: process blocks in random order. Might improve user bin distribution across partitions. --- src/layout/execute.cpp | 44 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 4a37ca45..441486e7 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -218,36 +218,37 @@ void partition_user_bins(chopper::configuration const & config, chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); size_t const number_of_blocks = seqan::hibf::divide_and_ceil(cardinalities.size(), block_size); - seqan::hibf::sketch::hyperloglog current_sketch(sketch_bits); - - size_t current_pos{0}; - // initialise partitions with the first config.number_of_partitions blocks assert(number_of_blocks >= config.number_of_partitions); for (size_t i = 0; i < config.number_of_partitions; ++i) { - do + for (size_t x = 0; x < block_size; ++x) { - partition_sketches[i].merge(sketches[sorted_positions[current_pos]]); - positions[i].push_back(sorted_positions[current_pos]); - ++current_pos; + partition_sketches[i].merge(sketches[sorted_positions[i + x]]); + positions[i].push_back(sorted_positions[i + x]); } - while (current_pos % block_size != 0); } // assign the rest by similarity - for (size_t i = config.number_of_partitions; i < number_of_blocks; ++i) + // but don't move from largest to smallest but pick the next block to process randomly. + // this probably leads to more evenly distributed partitions (evenly in terms of number of user bins) + std::vector indices(number_of_blocks - config.number_of_partitions); + std::iota(indices.begin(), indices.end(), config.number_of_partitions); + std::random_device shuffle_random_device; + std::mt19937 shuffle_engine(shuffle_random_device()); + std::shuffle(indices.begin(), indices.end(), shuffle_engine); + + for (size_t const i : indices) { - size_t count{}; // we need to track count for the last partition that is not block_size long - do // init sketch - { - current_sketch.merge(sketches[sorted_positions[current_pos]]); - ++current_pos; - ++count; - } - while (current_pos % block_size != 0 && current_pos < cardinalities.size()); + seqan::hibf::sketch::hyperloglog current_sketch(sketch_bits); + + // initialise sketch of the current block of indices + for (size_t x = 0; x < block_size && (i + x) < sorted_positions.size(); ++x) + current_sketch.merge(sketches[sorted_positions[i + x]]); - // search best parition fit + // search best partition fit by similarity + // similarity here is defined as: + // "whose (<-partition) effective text size will increase the least when current_block is added to it" size_t smallest_change{std::numeric_limits::max()}; size_t best_p{0}; for (size_t p = 0; p < config.number_of_partitions; ++p) @@ -262,8 +263,9 @@ void partition_user_bins(chopper::configuration const & config, } } - for (size_t add = current_pos - count; add < current_pos; ++add) - positions[best_p].push_back(add); + // now that we know which partition fits best (`best_p`), add those indices to it + for (size_t x = 0; x < block_size && (i + x) < sorted_positions.size(); ++x) + positions[best_p].push_back(i + x); } } } From 3bcce0233bcf91c7837ac6142cb37b3c79db3b5a Mon Sep 17 00:00:00 2001 From: smehringer Date: Wed, 10 Jan 2024 11:07:17 +0100 Subject: [PATCH 26/30] fix new similarity approach. --- src/layout/execute.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 441486e7..9e547e79 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -210,6 +211,7 @@ void partition_user_bins(chopper::configuration const & config, uint8_t sketch_bits{10}; std::vector partition_sketches(config.number_of_partitions, seqan::hibf::sketch::hyperloglog(sketch_bits)); + size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); size_t const cardinality_per_part = seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); @@ -266,6 +268,7 @@ void partition_user_bins(chopper::configuration const & config, // now that we know which partition fits best (`best_p`), add those indices to it for (size_t x = 0; x < block_size && (i + x) < sorted_positions.size(); ++x) positions[best_p].push_back(i + x); + partition_sketches[best_p].merge(current_sketch); } } } From d21f2b88c0bace018dad4d764444b4a262a9481e Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Wed, 10 Jan 2024 11:08:17 +0100 Subject: [PATCH 27/30] [MISC] automatic linting --- src/layout/execute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 9e547e79..f18947ff 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -13,11 +13,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include From 9b241bce2d52f9b3343d9c63741228515eb3e0b4 Mon Sep 17 00:00:00 2001 From: smehringer Date: Thu, 18 Jan 2024 16:57:43 +0100 Subject: [PATCH 28/30] fix similarity approach --- src/layout/execute.cpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index f18947ff..ebc09d9f 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -5,6 +5,7 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include @@ -208,9 +210,11 @@ void partition_user_bins(chopper::configuration const & config, } else if (config.partitioning_approach == partitioning_scheme::similarity) { - uint8_t sketch_bits{10}; + uint8_t const sketch_bits{config.hibf_config.sketch_bits}; std::vector partition_sketches(config.number_of_partitions, seqan::hibf::sketch::hyperloglog(sketch_bits)); + std::vector partition_cardinality(config.number_of_partitions, 0u); + size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); size_t const cardinality_per_part = seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); @@ -222,12 +226,13 @@ void partition_user_bins(chopper::configuration const & config, // initialise partitions with the first config.number_of_partitions blocks assert(number_of_blocks >= config.number_of_partitions); - for (size_t i = 0; i < config.number_of_partitions; ++i) + for (size_t p = 0; p < config.number_of_partitions; ++p) { for (size_t x = 0; x < block_size; ++x) { - partition_sketches[i].merge(sketches[sorted_positions[i + x]]); - positions[i].push_back(sorted_positions[i + x]); + partition_sketches[p].merge(sketches[sorted_positions[block_size * p + x]]); + partition_cardinality[p] += cardinalities[sorted_positions[block_size * p + x]]; + positions[p].push_back(sorted_positions[block_size * p + x]); } } @@ -235,6 +240,7 @@ void partition_user_bins(chopper::configuration const & config, // but don't move from largest to smallest but pick the next block to process randomly. // this probably leads to more evenly distributed partitions (evenly in terms of number of user bins) std::vector indices(number_of_blocks - config.number_of_partitions); + std::iota(indices.begin(), indices.end(), config.number_of_partitions); std::random_device shuffle_random_device; std::mt19937 shuffle_engine(shuffle_random_device()); @@ -245,8 +251,8 @@ void partition_user_bins(chopper::configuration const & config, seqan::hibf::sketch::hyperloglog current_sketch(sketch_bits); // initialise sketch of the current block of indices - for (size_t x = 0; x < block_size && (i + x) < sorted_positions.size(); ++x) - current_sketch.merge(sketches[sorted_positions[i + x]]); + for (size_t x = 0; x < block_size && ((block_size * i + x) < sorted_positions.size()); ++x) + current_sketch.merge(sketches[sorted_positions[block_size * i + x]]); // search best partition fit by similarity // similarity here is defined as: @@ -258,7 +264,7 @@ void partition_user_bins(chopper::configuration const & config, seqan::hibf::sketch::hyperloglog tmp = current_sketch; tmp.merge(partition_sketches[p]); - if (tmp.estimate() < smallest_change && partition_sketches[p].estimate() < cardinality_per_part) + if (tmp.estimate() < smallest_change && partition_cardinality[p] < cardinality_per_part) { smallest_change = tmp.estimate(); best_p = p; @@ -266,13 +272,17 @@ void partition_user_bins(chopper::configuration const & config, } // now that we know which partition fits best (`best_p`), add those indices to it - for (size_t x = 0; x < block_size && (i + x) < sorted_positions.size(); ++x) - positions[best_p].push_back(i + x); + for (size_t x = 0; x < block_size && ((block_size * i + x) < sorted_positions.size()); ++x) + { + positions[best_p].push_back(sorted_positions[block_size * i + x]); + partition_cardinality[best_p] += cardinalities[sorted_positions[block_size * i + x]]; + } partition_sketches[best_p].merge(current_sketch); } } } + int execute(chopper::configuration & config, std::vector> const & filenames) { assert(config.hibf_config.number_of_user_bins > 0); From d19c2fb80d7c774230fd9792cbb32a2fffc1784f Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Thu, 18 Jan 2024 16:58:33 +0100 Subject: [PATCH 29/30] [MISC] automatic linting --- src/layout/execute.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index ebc09d9f..22022973 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -282,7 +281,6 @@ void partition_user_bins(chopper::configuration const & config, } } - int execute(chopper::configuration & config, std::vector> const & filenames) { assert(config.hibf_config.number_of_user_bins > 0); From 457b61439a4758a07e6edc3f1356b896657581fb Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Thu, 15 Feb 2024 12:51:45 +0100 Subject: [PATCH 30/30] [FEATURE] Adapt similarity approach by Knuts suggestions: random seed, choosing best-p by subsumption ratio. --- src/layout/execute.cpp | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index 22022973..157824d8 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -223,28 +223,29 @@ void partition_user_bins(chopper::configuration const & config, chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); size_t const number_of_blocks = seqan::hibf::divide_and_ceil(cardinalities.size(), block_size); - // initialise partitions with the first config.number_of_partitions blocks + // don't move from largest to smallest but pick the next block to process randomly. + // this probably leads to more evenly distributed partitions (evenly in terms of number of user bins) + std::vector indices(number_of_blocks); + std::iota(indices.begin(), indices.end(), 0); + std::random_device shuffle_random_device; + std::mt19937 shuffle_engine(shuffle_random_device()); + std::shuffle(indices.begin(), indices.end(), shuffle_engine); + + // initialise partitions with the first random config.number_of_partitions blocks assert(number_of_blocks >= config.number_of_partitions); for (size_t p = 0; p < config.number_of_partitions; ++p) { + size_t const i = indices[p]; for (size_t x = 0; x < block_size; ++x) { - partition_sketches[p].merge(sketches[sorted_positions[block_size * p + x]]); - partition_cardinality[p] += cardinalities[sorted_positions[block_size * p + x]]; - positions[p].push_back(sorted_positions[block_size * p + x]); + partition_sketches[p].merge(sketches[sorted_positions[block_size * i + x]]); + partition_cardinality[p] += cardinalities[sorted_positions[block_size * i + x]]; + positions[p].push_back(sorted_positions[block_size * i + x]); } } + indices.erase(indices.begin(), indices.begin() + config.number_of_partitions); // assign the rest by similarity - // but don't move from largest to smallest but pick the next block to process randomly. - // this probably leads to more evenly distributed partitions (evenly in terms of number of user bins) - std::vector indices(number_of_blocks - config.number_of_partitions); - - std::iota(indices.begin(), indices.end(), config.number_of_partitions); - std::random_device shuffle_random_device; - std::mt19937 shuffle_engine(shuffle_random_device()); - std::shuffle(indices.begin(), indices.end(), shuffle_engine); - for (size_t const i : indices) { seqan::hibf::sketch::hyperloglog current_sketch(sketch_bits); @@ -255,17 +256,24 @@ void partition_user_bins(chopper::configuration const & config, // search best partition fit by similarity // similarity here is defined as: - // "whose (<-partition) effective text size will increase the least when current_block is added to it" - size_t smallest_change{std::numeric_limits::max()}; + // "whose (<-partition) effective text size is subsumed most by the current user bin" + // or in other words: + // "which partition has the largest intersection with user bin b compared to its own (partition) size." + double best_subsume_ratio{0.0}; size_t best_p{0}; for (size_t p = 0; p < config.number_of_partitions; ++p) { seqan::hibf::sketch::hyperloglog tmp = current_sketch; tmp.merge(partition_sketches[p]); + size_t const tmp_estimate = tmp.estimate(); + assert(tmp_estimate >= partition_cardinality[p]); + size_t const change = tmp_estimate - partition_cardinality[p]; + size_t const intersection = current_sketch.estimate() - change; + double const subsume_ratio = static_cast(intersection) / partition_cardinality[p]; - if (tmp.estimate() < smallest_change && partition_cardinality[p] < cardinality_per_part) + if (subsume_ratio > best_subsume_ratio && partition_cardinality[p] < cardinality_per_part) { - smallest_change = tmp.estimate(); + best_subsume_ratio = subsume_ratio; best_p = p; } }