diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index d39b2a5d..b11b297e 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -19,8 +19,18 @@ namespace chopper { +enum partitioning_scheme +{ + blocked, + sorted, + folded, + weighted_fold, + similarity +}; + struct configuration { + int partitioning_approach{}; /*!\name General Configuration * \{ */ @@ -46,6 +56,16 @@ struct configuration bool precomputed_files{false}; //!\} + /*!\name Partitioned HIBF configuration + * \{ + */ + //!\brief The maximum index size that the HIBF should not exceed. number_of_paritions will be set accordingly. + size_t maximum_index_size{0}; + + //!\brief The number of partitions for the HIBF index. + size_t number_of_partitions{0}; + //!\} + /*!\name Configuration of size estimates * \{ */ @@ -93,6 +113,9 @@ struct configuration archive(CEREAL_NVP(disable_sketch_output)); archive(CEREAL_NVP(precomputed_files)); + archive(CEREAL_NVP(maximum_index_size)); + archive(CEREAL_NVP(number_of_partitions)); + archive(CEREAL_NVP(output_filename)); archive(CEREAL_NVP(determine_best_tmax)); archive(CEREAL_NVP(force_all_binnings)); diff --git a/include/chopper/layout/input.hpp b/include/chopper/layout/input.hpp index 855856c1..037d7503 100644 --- a/include/chopper/layout/input.hpp +++ b/include/chopper/layout/input.hpp @@ -20,7 +20,8 @@ namespace chopper::layout { std::vector> read_filenames_from(std::istream & stream); -std::tuple>, configuration, seqan::hibf::layout::layout> -read_layout_file(std::istream & stream); + +std::tuple>, configuration, std::vector> +read_layouts_file(std::istream & stream); } // namespace chopper::layout diff --git a/lib/hibf b/lib/hibf index b068f7ed..46e5e8ce 160000 --- a/lib/hibf +++ b/lib/hibf @@ -1 +1 @@ -Subproject commit b068f7ed2fe6716453ced9059321a4e1fa2c5a35 +Subproject commit 46e5e8ce624c791115ba2ec140cc7a9cfcfeff08 diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp index f3e6ef67..157824d8 100644 --- a/src/layout/execute.cpp +++ b/src/layout/execute.cpp @@ -5,6 +5,7 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include #include #include #include @@ -12,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -27,12 +30,265 @@ #include #include +#include #include #include +#include namespace chopper::layout { +void partition_user_bins(chopper::configuration const & config, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector> & positions) +{ + // all approaches need sorted positions + std::vector const sorted_positions = [&cardinalities]() + { + std::vector ps; + ps.resize(cardinalities.size()); + std::iota(ps.begin(), ps.end(), 0); + seqan::hibf::sketch::toolbox::sort_by_cardinalities(cardinalities, ps); + return ps; + }(); + + if (config.partitioning_approach == partitioning_scheme::blocked) + { + size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); + size_t const block_size = + std::min(u_bins_per_part, + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); + + size_t current_part{0u}; + size_t current_block_count{0}; + + for (size_t const current_user_bin_id : sorted_positions) + { + positions[current_part].push_back(current_user_bin_id); + ++current_block_count; + + if (current_block_count >= block_size) + { + current_block_count = 0; + ++current_part; + if (current_part == config.number_of_partitions) // we need to circle back to the first partition + current_part = 0; + } + } + } + else if (config.partitioning_approach == partitioning_scheme::sorted) + { + size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); + size_t const cardinality_per_part = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); + + size_t current_cardinality{0u}; + size_t current_part{0}; + + for (size_t const current_user_bin_id : sorted_positions) + { + positions[current_part].push_back(current_user_bin_id); + current_cardinality += cardinalities[current_user_bin_id]; + + if (current_cardinality >= cardinality_per_part) + { + current_cardinality = 0; + ++current_part; + } + } + } + else if (config.partitioning_approach == partitioning_scheme::folded) + { + size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); + size_t const cardinality_per_part_halved = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions * 2); + + size_t current_cardinality{0u}; + std::vector const parts = [&config]() + { + size_t const len{config.number_of_partitions}; + std::vector result(len * 2); + std::iota(result.begin(), result.begin() + len, 0); + std::copy(result.rbegin() + len, result.rend(), result.begin() + len); + return result; + }(); + size_t current_part{0}; + + for (size_t const current_user_bin_id : sorted_positions) + { + positions[parts[current_part]].push_back(current_user_bin_id); + current_cardinality += cardinalities[current_user_bin_id]; + + if (current_cardinality >= cardinality_per_part_halved) + { + current_cardinality = 0; + ++current_part; + } + } + } + else if (config.partitioning_approach == partitioning_scheme::weighted_fold) + { + size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); + size_t const cardinality_per_part = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); + size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); + + size_t current_big_pos{0}; // the next largest user bin to assign to a partition + size_t current_small_pos{cardinalities.size() - 1}; // the next small user bin + + for (size_t current_part = 0; current_part + 1 < config.number_of_partitions; ++current_part) + { + size_t current_cardinality{0}; + std::vector small_bins; + size_t new_small_bin_addition{0}; + + auto compute_score = [&]() + { + double const weight = static_cast(current_cardinality) / cardinality_per_part; + double const amount = + static_cast(positions[current_part].size() + small_bins.size() + new_small_bin_addition) + / u_bins_per_part; + return std::abs(1.0 - weight) + std::abs(1.0 - amount); + }; + + // first add all large bins that fit + while (current_cardinality < cardinality_per_part) + { + positions[current_part].push_back(sorted_positions[current_big_pos]); + current_cardinality += cardinalities[sorted_positions[current_big_pos]]; + ++current_big_pos; + } + + double local_optimum = compute_score(); + + // then remove big bins and add small bins until a local optima is reached + while (true) + { + size_t const cache_last_small_pos{current_small_pos}; + // remove a big user bin and fill the partition with small user bins + current_cardinality -= cardinalities[sorted_positions[current_big_pos]]; + + // can we further improve the ratio by adding more small bins? + double improved_score{}; + do + { + improved_score = compute_score(); + current_cardinality += cardinalities[sorted_positions[current_small_pos]]; + --current_small_pos; + ++new_small_bin_addition; + } + while (compute_score() < improved_score); // smaller is better + // remove overstep + ++current_small_pos; + current_cardinality -= cardinalities[sorted_positions[current_small_pos]]; + --new_small_bin_addition; + + if (local_optimum < compute_score()) // score would increase. Stop + { + current_small_pos = cache_last_small_pos; + break; + } + else // update + { + positions[current_part].pop_back(); + --current_big_pos; + for (size_t pos = cache_last_small_pos; pos > current_small_pos; --pos) + small_bins.push_back(sorted_positions[pos]); + } + } + positions[current_part].insert(positions[current_part].end(), small_bins.begin(), small_bins.end()); + } + + // remaining user bins go to last partition + while (current_big_pos <= current_small_pos) + { + positions[config.number_of_partitions - 1].push_back(sorted_positions[current_big_pos]); + ++current_big_pos; + } + } + else if (config.partitioning_approach == partitioning_scheme::similarity) + { + uint8_t const sketch_bits{config.hibf_config.sketch_bits}; + std::vector partition_sketches(config.number_of_partitions, + seqan::hibf::sketch::hyperloglog(sketch_bits)); + std::vector partition_cardinality(config.number_of_partitions, 0u); + + size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{}); + size_t const cardinality_per_part = + seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions); + size_t const u_bins_per_part = seqan::hibf::divide_and_ceil(cardinalities.size(), config.number_of_partitions); + size_t const block_size = + std::min(u_bins_per_part, + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(u_bins_per_part))))); + size_t const number_of_blocks = seqan::hibf::divide_and_ceil(cardinalities.size(), block_size); + + // don't move from largest to smallest but pick the next block to process randomly. + // this probably leads to more evenly distributed partitions (evenly in terms of number of user bins) + std::vector indices(number_of_blocks); + std::iota(indices.begin(), indices.end(), 0); + std::random_device shuffle_random_device; + std::mt19937 shuffle_engine(shuffle_random_device()); + std::shuffle(indices.begin(), indices.end(), shuffle_engine); + + // initialise partitions with the first random config.number_of_partitions blocks + assert(number_of_blocks >= config.number_of_partitions); + for (size_t p = 0; p < config.number_of_partitions; ++p) + { + size_t const i = indices[p]; + for (size_t x = 0; x < block_size; ++x) + { + partition_sketches[p].merge(sketches[sorted_positions[block_size * i + x]]); + partition_cardinality[p] += cardinalities[sorted_positions[block_size * i + x]]; + positions[p].push_back(sorted_positions[block_size * i + x]); + } + } + indices.erase(indices.begin(), indices.begin() + config.number_of_partitions); + + // assign the rest by similarity + for (size_t const i : indices) + { + seqan::hibf::sketch::hyperloglog current_sketch(sketch_bits); + + // initialise sketch of the current block of indices + for (size_t x = 0; x < block_size && ((block_size * i + x) < sorted_positions.size()); ++x) + current_sketch.merge(sketches[sorted_positions[block_size * i + x]]); + + // search best partition fit by similarity + // similarity here is defined as: + // "whose (<-partition) effective text size is subsumed most by the current user bin" + // or in other words: + // "which partition has the largest intersection with user bin b compared to its own (partition) size." + double best_subsume_ratio{0.0}; + size_t best_p{0}; + for (size_t p = 0; p < config.number_of_partitions; ++p) + { + seqan::hibf::sketch::hyperloglog tmp = current_sketch; + tmp.merge(partition_sketches[p]); + size_t const tmp_estimate = tmp.estimate(); + assert(tmp_estimate >= partition_cardinality[p]); + size_t const change = tmp_estimate - partition_cardinality[p]; + size_t const intersection = current_sketch.estimate() - change; + double const subsume_ratio = static_cast(intersection) / partition_cardinality[p]; + + if (subsume_ratio > best_subsume_ratio && partition_cardinality[p] < cardinality_per_part) + { + best_subsume_ratio = subsume_ratio; + best_p = p; + } + } + + // now that we know which partition fits best (`best_p`), add those indices to it + for (size_t x = 0; x < block_size && ((block_size * i + x) < sorted_positions.size()); ++x) + { + positions[best_p].push_back(sorted_positions[block_size * i + x]); + partition_cardinality[best_p] += cardinalities[sorted_positions[block_size * i + x]]; + } + partition_sketches[best_p].merge(current_sketch); + } + } +} + int execute(chopper::configuration & config, std::vector> const & filenames) { assert(config.hibf_config.number_of_user_bins > 0); @@ -58,71 +314,144 @@ int execute(chopper::configuration & config, std::vector sketches; + if (config.number_of_partitions < 2) // 0 == unset == single HIBF, 1 == single HIBF + { + seqan::hibf::layout::layout hibf_layout; + std::vector sketches; - seqan::hibf::concurrent_timer compute_sketches_timer{}; - seqan::hibf::concurrent_timer union_estimation_timer{}; - seqan::hibf::concurrent_timer rearrangement_timer{}; - seqan::hibf::concurrent_timer dp_algorithm_timer{}; + seqan::hibf::concurrent_timer compute_sketches_timer{}; + seqan::hibf::concurrent_timer union_estimation_timer{}; + seqan::hibf::concurrent_timer rearrangement_timer{}; + seqan::hibf::concurrent_timer dp_algorithm_timer{}; - if (config.determine_best_tmax) - { - std::tie(hibf_layout, sketches) = determine_best_number_of_technical_bins(config); + if (config.determine_best_tmax) + { + std::tie(hibf_layout, sketches) = determine_best_number_of_technical_bins(config); + } + else + { + std::vector kmer_counts; + + compute_sketches_timer.start(); + seqan::hibf::sketch::compute_sketches(config.hibf_config, kmer_counts, sketches); + compute_sketches_timer.stop(); + + std::vector positions = [&kmer_counts]() + { + std::vector ps; + ps.resize(kmer_counts.size()); + std::iota(ps.begin(), ps.end(), 0); + return ps; + }(); // GCOVR_EXCL_LINE + + dp_algorithm_timer.start(); + hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, + kmer_counts, + sketches, + std::move(positions), + union_estimation_timer, + rearrangement_timer); + dp_algorithm_timer.stop(); + + if (config.output_verbose_statistics) + { + size_t dummy{}; + chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts}; + global_stats.hibf_layout = hibf_layout; + global_stats.print_header_to(std::cout); + global_stats.print_summary_to(dummy, std::cout); + } + } + + if (!config.disable_sketch_output) + { + if (!std::filesystem::exists(config.sketch_directory)) + std::filesystem::create_directory(config.sketch_directory); + + assert(filenames.size() == sketches.size()); + for (size_t i = 0; i < filenames.size(); ++i) + sketch::write_sketch_file(filenames[i][0], sketches[i], config); + } + + // brief Write the output to the layout file. + std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + hibf_layout.write_to(fout); + + if (!config.output_timings.empty()) + { + std::ofstream output_stream{config.output_timings}; + output_stream << std::fixed << std::setprecision(2); + output_stream << "sketching_in_seconds\t" + << "layouting_in_seconds\t" + << "union_estimation_in_seconds\t" + << "rearrangement_in_seconds\n"; + output_stream << compute_sketches_timer.in_seconds() << '\t'; + output_stream << dp_algorithm_timer.in_seconds() << '\t'; + output_stream << union_estimation_timer.in_seconds() << '\t'; + output_stream << rearrangement_timer.in_seconds() << '\t'; + } } else { - std::vector kmer_counts; + std::vector cardinalities; + std::vector sketches; + std::vector> positions(config.number_of_partitions); // asign positions for each partition + // compute sketches of all user bins + seqan::hibf::concurrent_timer compute_sketches_timer{}; compute_sketches_timer.start(); - seqan::hibf::sketch::compute_sketches(config.hibf_config, kmer_counts, sketches); + seqan::hibf::sketch::compute_sketches(config.hibf_config, cardinalities, sketches); compute_sketches_timer.stop(); - dp_algorithm_timer.start(); - hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, - kmer_counts, - sketches, - union_estimation_timer, - rearrangement_timer); - dp_algorithm_timer.stop(); - - if (config.output_verbose_statistics) + + partition_user_bins(config, cardinalities, sketches, positions); + + std::vector hibf_layouts(config.number_of_partitions); // multiple layouts + +#pragma omp parallel for schedule(dynamic) num_threads(config.hibf_config.threads) + for (size_t i = 0; i < config.number_of_partitions; ++i) { - size_t dummy{}; - chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts}; - global_stats.hibf_layout = hibf_layout; - global_stats.print_header_to(std::cout); - global_stats.print_summary_to(dummy, std::cout); - } - } + seqan::hibf::concurrent_timer union_estimation_timer{}; + seqan::hibf::concurrent_timer rearrangement_timer{}; + seqan::hibf::concurrent_timer dp_algorithm_timer{}; - if (!config.disable_sketch_output) - { - if (!std::filesystem::exists(config.sketch_directory)) - std::filesystem::create_directory(config.sketch_directory); + // reset tmax to fit number of user bins in layout + auto local_hibf_config = config.hibf_config; // every thread needs to set individual tmax + local_hibf_config.tmax = + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(positions[i].size())))); - assert(filenames.size() == sketches.size()); - for (size_t i = 0; i < filenames.size(); ++i) - sketch::write_sketch_file(filenames[i][0], sketches[i], config); - } + dp_algorithm_timer.start(); + hibf_layouts[i] = seqan::hibf::layout::compute_layout(local_hibf_config, + cardinalities, + sketches, + std::move(positions[i]), + union_estimation_timer, + rearrangement_timer); + dp_algorithm_timer.stop(); - // brief Write the output to the layout file. - std::ofstream fout{config.output_filename}; - chopper::layout::write_user_bins_to(filenames, fout); - config.write_to(fout); - hibf_layout.write_to(fout); + if (!config.output_timings.empty()) + { + std::ofstream output_stream{config.output_timings, std::ios_base::app}; + output_stream << std::fixed << std::setprecision(2); + output_stream << "sketching_in_seconds\t" + << "layouting_in_seconds\t" + << "union_estimation_in_seconds\t" + << "rearrangement_in_seconds\n"; + output_stream << compute_sketches_timer.in_seconds() << '\t'; + output_stream << dp_algorithm_timer.in_seconds() << '\t'; + output_stream << union_estimation_timer.in_seconds() << '\t'; + output_stream << rearrangement_timer.in_seconds() << '\t'; + } + } - if (!config.output_timings.empty()) - { - std::ofstream output_stream{config.output_timings}; - output_stream << std::fixed << std::setprecision(2); - output_stream << "sketching_in_seconds\t" - << "layouting_in_seconds\t" - << "union_estimation_in_seconds\t" - << "rearrangement_in_seconds\n"; - output_stream << compute_sketches_timer.in_seconds() << '\t'; - output_stream << dp_algorithm_timer.in_seconds() << '\t'; - output_stream << union_estimation_timer.in_seconds() << '\t'; - output_stream << rearrangement_timer.in_seconds() << '\t'; + // brief Write the output to the layout file. + std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + + for (size_t i = 0; i < config.number_of_partitions; ++i) + hibf_layouts[i].write_to(fout); } return 0; diff --git a/src/layout/input.cpp b/src/layout/input.cpp index 2b1a7459..76e8efab 100644 --- a/src/layout/input.cpp +++ b/src/layout/input.cpp @@ -66,15 +66,20 @@ std::vector> read_filenames_from(std::istream & stream) return filenames; } -std::tuple>, configuration, seqan::hibf::layout::layout> -read_layout_file(std::istream & stream) +std::tuple>, configuration, std::vector> +read_layouts_file(std::istream & stream) { std::vector> filenames = chopper::layout::read_filenames_from(stream); chopper::configuration chopper_config; chopper_config.read_from(stream); - seqan::hibf::layout::layout hibf_layout{}; - hibf_layout.read_from(stream); - return std::make_tuple(std::move(filenames), std::move(chopper_config), std::move(hibf_layout)); + std::vector layouts; + while (stream.good()) + { + seqan::hibf::layout::layout hibf_layout{}; + hibf_layout.read_from(stream); + layouts.push_back(std::move(hibf_layout)); + } + return std::make_tuple(std::move(filenames), std::move(chopper_config), std::move(layouts)); } } // namespace chopper::layout diff --git a/src/measure_HIBF.cpp b/src/measure_HIBF.cpp new file mode 100644 index 00000000..a972ba91 --- /dev/null +++ b/src/measure_HIBF.cpp @@ -0,0 +1,87 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +struct config +{ + std::filesystem::path input_index{}; + std::filesystem::path general_stats_output{"general.stats"}; +}; + +struct stats +{ + std::vector ibf_sizes; + std::vector ibf_levels; + std::vector ibf_load_factor; +}; + +inline void set_up_parser(sharg::parser & parser, config & cfg) +{ + parser.info.version = "1.0.0"; + parser.info.author = "Svenja Mehringer"; + parser.info.email = "svenja.mehringer@fu-berlin.de"; + parser.info.short_description = "Compute a top-level HIBF layout figure file"; + + parser.info.description.emplace_back("Computes an table to display the top-level layout."); + + parser.add_subsection("Main options:"); + parser.add_option(cfg.input_index, + sharg::config{.short_id = '\0', + .long_id = "index", + .description = "The input must be an index computed via raptor layout/build. ", + .required = true, + .validator = sharg::input_file_validator{}}); + parser.add_option(cfg.general_stats_output, + sharg::config{.short_id = '\0', + .long_id = "output", + .description = "The output. ", + .required = true, + .validator = sharg::output_file_validator{}}); +} + +int main(int argc, char const * argv[]) +{ + sharg::parser parser{"layout_stats", argc, argv, sharg::update_notifications::off}; + parser.info.version = "1.0.0"; + + config cfg{}; + set_up_parser(parser, cfg); + + try + { + parser.parse(); + } + catch (sharg::parser_error const & ext) + { + std::cerr << "[ERROR] " << ext.what() << '\n'; + return -1; + } + + auto index = raptor::raptor_index{}; + raptor::load_index(index, arguments); +} diff --git a/src/set_up_parser.cpp b/src/set_up_parser.cpp index 5f18a3a9..4be7096f 100644 --- a/src/set_up_parser.cpp +++ b/src/set_up_parser.cpp @@ -78,6 +78,7 @@ void set_up_parser(sharg::parser & parser, configuration & config) "accuracy.", .default_message = "k-mer size", }); + parser.add_option(config.output_timings, sharg::config{.short_id = '\0', .long_id = "timing-output", @@ -85,6 +86,32 @@ void set_up_parser(sharg::parser & parser, configuration & config) .default_message = "", .validator = sharg::output_file_validator{}}); + parser.add_option( + config.maximum_index_size, + sharg::config{ + .short_id = '\0', + .long_id = "maximum-index-size", + .description = + "You can restrict the hibf index to have a maximum index size which will partition the index into " + "several partitions if needed. The number of partitions is computed based on your input data. " + "you can manually set the number of partitions with --number-of-partitions"}); + + parser.add_option( + config.number_of_partitions, + sharg::config{ + .short_id = '\0', + .long_id = "number-of-partitions", + .description = + "The number of partitions of the HIBF. We recommend to instead use the option maximum-index-size if " + "your goal is to reduce the index size and thereby peak mempry usage of searching with the HIBF.", + .advanced = true}); + + parser.add_option(config.partitioning_approach, + sharg::config{.short_id = '\0', + .long_id = "partitioning-approach", + .description = "this is only configurable for debugging.", + .advanced = true}); + parser.add_option( config.hibf_config.tmax, sharg::config{ diff --git a/src/util/display_layout/general.cpp b/src/util/display_layout/general.cpp index 4ed3f76f..414fd82e 100644 --- a/src/util/display_layout/general.cpp +++ b/src/util/display_layout/general.cpp @@ -177,27 +177,14 @@ void process_and_write_records_to(std::vector & records, std::ostream & stream << std::flush; } -int execute(config const & cfg) +int execute(config const & cfg, + std::vector> const & filenames, + chopper::configuration const & chopper_config, + seqan::hibf::layout::layout & hibf_layout) { - std::ifstream layout_file{cfg.input}; - - if (!layout_file.good() || !layout_file.is_open()) - throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; - -// https://godbolt.org/z/PeKnxzjn1 -#if defined(__clang__) - auto tuple = chopper::layout::read_layout_file(layout_file); - // https://godbolt.org/z/WoWf55KPb - auto filenames = std::move(std::get<0>(tuple)); - auto chopper_config = std::move(std::get<1>(tuple)); - auto hibf_layout = std::move(std::get<2>(tuple)); -#else - auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); -#endif auto const & hibf_config = chopper_config.hibf_config; - layout_file.close(); - std::ofstream output_stream{cfg.output}; + std::ofstream output_stream{cfg.output, std::ios_base::app}; if (!output_stream.good() || !output_stream.is_open()) throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; @@ -348,5 +335,15 @@ int execute(config const & cfg) void execute_general(config const & cfg) { - execute(cfg); + std::ifstream layout_file{cfg.input}; + + if (!layout_file.good() || !layout_file.is_open()) + throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; + + auto [filenames, chopper_config, hibf_layouts] = chopper::layout::read_layouts_file(layout_file); + + layout_file.close(); + + for (auto & hibf_layout : hibf_layouts) + execute(cfg, filenames, chopper_config, hibf_layout); } diff --git a/src/util/display_layout/sizes.cpp b/src/util/display_layout/sizes.cpp index 9218b596..ff234a3c 100644 --- a/src/util/display_layout/sizes.cpp +++ b/src/util/display_layout/sizes.cpp @@ -284,13 +284,13 @@ void execute_general_stats(config const & cfg) // https://godbolt.org/z/PeKnxzjn1 #if defined(__clang__) - auto tuple = chopper::layout::read_layout_file(layout_file); + auto tuple = chopper::layout::read_layouts_file(layout_file); // https://godbolt.org/z/WoWf55KPb auto filenames = std::move(std::get<0>(tuple)); auto chopper_config = std::move(std::get<1>(tuple)); - auto hibf_layout = std::move(std::get<2>(tuple)); + auto hibf_layouts = std::move(std::get<2>(tuple)); #else - auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); + auto [filenames, chopper_config, hibf_layouts] = chopper::layout::read_layouts_file(layout_file); #endif // Prepare configs @@ -311,28 +311,34 @@ void execute_general_stats(config const & cfg) auto const & hibf_config = chopper_config.hibf_config; // Prepare stats - size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u; - std::vector stats(number_of_ibfs); + assert(hibf_layouts.size() > 0); + // size_t part = (hibf_layouts.size() == 1) ? 0 : 1; + for (auto const & hibf_layout : hibf_layouts) + { + size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u; + std::vector stats(number_of_ibfs); - // Prepare data - seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}}; - seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root; - size_t const t_max{root_node.number_of_technical_bins}; - data.fpr_correction = seqan::hibf::layout::compute_fpr_correction( - {.fpr = hibf_config.maximum_fpr, .hash_count = hibf_config.number_of_hash_functions, .t_max = t_max}); + // Prepare data + seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}}; + seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root; + size_t const t_max{root_node.number_of_technical_bins}; + data.fpr_correction = seqan::hibf::layout::compute_fpr_correction( + {.fpr = hibf_config.maximum_fpr, .hash_count = hibf_config.number_of_hash_functions, .t_max = t_max}); - // Get stats - hierarchical_stats(stats, root_node, data); + // Get stats + hierarchical_stats(stats, root_node, data); - // Get stats per level - per_level_stats const level_stats{stats}; + // Get stats per level + per_level_stats const level_stats{stats}; - // Output - std::ofstream output_stream{cfg.output}; - if (!output_stream.good() || !output_stream.is_open()) - throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; + // Output + std::ofstream output_stream(cfg.output.string(), std::ios::app); + if (!output_stream.good() || !output_stream.is_open()) + throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading (appending)."}; - level_stats.print(output_stream, hibf_layout.user_bins.size()); + level_stats.print(output_stream, hibf_layout.user_bins.size()); + // ++part; + } } void execute_sizes(config const & cfg) diff --git a/test/api/config_test.cpp b/test/api/config_test.cpp index be717ddc..1eee2aec 100644 --- a/test/api/config_test.cpp +++ b/test/api/config_test.cpp @@ -84,6 +84,8 @@ static constexpr std::string_view config_string_view{"@CHOPPER_CONFIG\n" "@ \"window_size\": 24,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": true,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"file.layout\"\n" "@ },\n" @@ -149,7 +151,7 @@ TEST(config_test, read_from_with_more_meta) } // Easier to do in the config_test because of existing helper functions -TEST(input, read_layout_file) +TEST(input, read_layouts_file) { std::string config_string{"@CHOPPER_USER_BINS\n" "@0 file1.fa\n" @@ -169,7 +171,9 @@ TEST(input, read_layout_file) std::stringstream ss{config_string}; - auto [filenames, config, layout] = chopper::layout::read_layout_file(ss); + auto [filenames, config, layouts] = chopper::layout::read_layouts_file(ss); + + auto const & layout = layouts[0]; std::vector> const expected_filenames{{"file1.fa"}, {"file2.fa"}, diff --git a/test/api/layout/execute_layout_test.cpp b/test/api/layout/execute_layout_test.cpp index 696a976e..3e4fb923 100644 --- a/test/api/layout/execute_layout_test.cpp +++ b/test/api/layout/execute_layout_test.cpp @@ -70,6 +70,8 @@ TEST(execute_test, few_ubs) "@ \"window_size\": 19,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + layout_file.string() @@ -281,6 +283,8 @@ TEST(execute_test, many_ubs) "@ \"window_size\": 19,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + layout_file.string() diff --git a/test/api/layout/execute_with_estimation_test.cpp b/test/api/layout/execute_with_estimation_test.cpp index 50658941..79166653 100644 --- a/test/api/layout/execute_with_estimation_test.cpp +++ b/test/api/layout/execute_with_estimation_test.cpp @@ -250,6 +250,8 @@ TEST(execute_estimation_test, many_ubs) "@ \"window_size\": 19,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + layout_file.string() diff --git a/test/api/layout/input_test.cpp b/test/api/layout/input_test.cpp index a1b240e2..fadcc80c 100644 --- a/test/api/layout/input_test.cpp +++ b/test/api/layout/input_test.cpp @@ -31,6 +31,8 @@ std::string const layout_header{"@CHOPPER_USER_BINS\n" "@ \"window_size\": 15,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"foo.layout\"\n" "@ },\n" @@ -71,7 +73,9 @@ TEST(layout_test, read_single_layout) 5 1;2;3;4;22 1;1;1;1;21 )layout_file"}; - auto [filenames, chopper_config, layout] = chopper::layout::read_layout_file(ss); + auto [filenames, chopper_config, layouts] = chopper::layout::read_layouts_file(ss); + + auto const & layout = layouts[0]; EXPECT_EQ(layout.top_level_max_bin_id, 111); EXPECT_EQ(layout.max_bins[0], (seqan::hibf::layout::layout::max_bin{{0}, 0})); @@ -81,3 +85,49 @@ TEST(layout_test, read_single_layout) EXPECT_EQ(layout.user_bins[1], (seqan::hibf::layout::layout::user_bin{std::vector{1}, 0, 22, 4})); EXPECT_EQ(layout.user_bins[2], (seqan::hibf::layout::layout::user_bin{std::vector{1, 2, 3, 4}, 22, 21, 5})); } + +TEST(layout_test, read_from_partitioned_layout) +{ + // layout consists of three partitions, written one after the other + std::stringstream ss{layout_header + R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +)layout_file"}; + + auto [filenames, chopper_config, hibf_layouts] = chopper::layout::read_layouts_file(ss); + + for (size_t i = 0; i < 3; ++i) + { + auto layout = hibf_layouts[i]; + + EXPECT_EQ(layout.top_level_max_bin_id, 111); + EXPECT_EQ(layout.max_bins[0], (seqan::hibf::layout::layout::max_bin{{0}, 0})); + EXPECT_EQ(layout.max_bins[1], (seqan::hibf::layout::layout::max_bin{{2}, 2})); + EXPECT_EQ(layout.max_bins[2], (seqan::hibf::layout::layout::max_bin{{1, 2, 3, 4}, 22})); + EXPECT_EQ(layout.user_bins[0], (seqan::hibf::layout::layout::user_bin{std::vector{}, 0, 1, 7})); + EXPECT_EQ(layout.user_bins[1], (seqan::hibf::layout::layout::user_bin{std::vector{1}, 0, 22, 4})); + EXPECT_EQ(layout.user_bins[2], + (seqan::hibf::layout::layout::user_bin{std::vector{1, 2, 3, 4}, 22, 21, 5})); + } +} diff --git a/test/cli/cli_chopper_pipeline_test.cpp b/test/cli/cli_chopper_pipeline_test.cpp index 25eaa6f2..184d6c3f 100644 --- a/test/cli/cli_chopper_pipeline_test.cpp +++ b/test/cli/cli_chopper_pipeline_test.cpp @@ -78,6 +78,8 @@ TEST_F(cli_test, chopper_layout) "@ \"window_size\": 15,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + binning_filename.string() @@ -180,6 +182,8 @@ TEST_F(cli_test, chopper_layout2) "@ \"window_size\": 19,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + binning_filename.string() diff --git a/test/cli/util_display_layout_test.cpp b/test/cli/util_display_layout_test.cpp index 1ba5a25e..f81c8f73 100644 --- a/test/cli/util_display_layout_test.cpp +++ b/test/cli/util_display_layout_test.cpp @@ -49,6 +49,8 @@ std::string get_layout_with_correct_filenames(std::string_view const seq1_filena "@ \"window_size\": 15,\n" "@ \"disable_sketch_output\": true,\n" "@ \"precomputed_files\": false,\n" + "@ \"maximum_index_size\": 0,\n" + "@ \"number_of_partitions\": 0,\n" "@ \"output_filename\": {\n" "@ \"value0\": \"" + output_filename.data()