Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

# build dir
/build/
/build_debug/

# editor configuration
/.vscode/
17 changes: 17 additions & 0 deletions include/chopper/configuration.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,21 @@
namespace chopper
{

/*!\brief The partitioning schemes that chopper::configuration::partitioning_approach can be set to.
 * \details The numeric value of every enumerator is spelled out explicitly so that it stays stable
 *          when enumerators are added or reordered.
 */
enum class partitioning_scheme : uint8_t
{
    blocked = 0,
    sorted = 1,
    folded = 2,
    weighted_fold = 3,
    similarity = 4,
    lsh = 5,
    lsh_sim = 6
};

struct configuration
{
partitioning_scheme partitioning_approach{partitioning_scheme::lsh_sim};

/*!\name General Configuration
* \{
*/
Expand Down Expand Up @@ -77,6 +90,10 @@ struct configuration
mutable seqan::hibf::concurrent_timer union_estimation_timer{};
mutable seqan::hibf::concurrent_timer rearrangement_timer{};
mutable seqan::hibf::concurrent_timer dp_algorithm_timer{};
mutable seqan::hibf::concurrent_timer lsh_algorithm_timer{};
mutable seqan::hibf::concurrent_timer search_partition_algorithm_timer{};
mutable seqan::hibf::concurrent_timer intital_partition_timer{};
mutable seqan::hibf::concurrent_timer small_layouts_timer{};

void read_from(std::istream & stream);

Expand Down
29 changes: 29 additions & 0 deletions include/chopper/layout/determine_split_bins.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// --------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md
// --------------------------------------------------------------------------------------------------

/*!\file
* \brief Provides chopper::determine_split_bins.
* \author Svenja Mehringer <svenja.mehringer AT fu-berlin.de>
*/

#pragma once

#include <vector>

#include <chopper/configuration.hpp>

namespace chopper::layout
{

/*!\brief Declares chopper::layout::determine_split_bins; only the declaration lives in this header.
 * \param[in] config The chopper configuration, passed by const reference.
 * \param[in] positions NOTE(review): presumably the indices of the user bins to consider - confirm against
 *                      the definition.
 * \param[in] cardinalities NOTE(review): presumably one cardinality per user bin - confirm against the
 *                          definition.
 * \param[in] num_technical_bins NOTE(review): presumably the number of technical bins available - confirm.
 * \param[in] num_user_bins NOTE(review): presumably the number of user bins to distribute - confirm.
 * \param[in,out] partitions Taken by non-const reference, so the function is able to modify it.
 * \returns A pair of two `size_t` values. Their meaning cannot be derived from this declaration -
 *          TODO document once confirmed against the definition.
 */
std::pair<size_t, size_t> determine_split_bins(chopper::configuration const & config,
std::vector<size_t> const & positions,
std::vector<size_t> const & cardinalities,
size_t const num_technical_bins,
size_t const num_user_bins,
std::vector<std::vector<size_t>> & partitions);

}
4 changes: 3 additions & 1 deletion include/chopper/layout/execute.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@
#include <chopper/configuration.hpp>

#include <hibf/sketch/hyperloglog.hpp>
#include <hibf/sketch/minhashes.hpp>

namespace chopper::layout
{

int execute(chopper::configuration & config,
std::vector<std::vector<std::string>> const & filenames,
std::vector<seqan::hibf::sketch::hyperloglog> const & sketches);
std::vector<seqan::hibf::sketch::hyperloglog> const & sketches,
std::vector<seqan::hibf::sketch::minhashes> const & minHash_sketches);

} // namespace chopper::layout
202 changes: 202 additions & 0 deletions include/chopper/lsh.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
// --------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md
// --------------------------------------------------------------------------------------------------

/*!\file
* \brief Provides chopper::Cluster, chopper::MultiCluster and chopper::LSH_find_representative_cluster.
* \author Svenja Mehringer <svenja.mehringer AT fu-berlin.de>
*/

#pragma once

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <functional>
#include <iterator>
#include <optional>
#include <vector>

namespace chopper
{

/*!\brief A group of user bins that is identified by a representative id.
 * \details
 * A cluster is consistent (see is_valid()) in exactly two states:
 *  - not moved: it still holds at least one user bin, or
 *  - moved: its user bins were handed over to another cluster via move_to(); it is then empty and
 *    remembers the id of the cluster that received them.
 */
struct Cluster
{
protected:
    size_t representative_id{}; // representative id of the cluster; identifier

    std::vector<size_t> user_bins{}; // the user bins contained in this cluster

    std::optional<size_t> moved_id{std::nullopt}; // id of the cluster this cluster's user bins were moved to

public:
    Cluster() = default;
    Cluster(Cluster const &) = default;
    Cluster(Cluster &&) = default;
    Cluster & operator=(Cluster const &) = default;
    Cluster & operator=(Cluster &&) = default;
    ~Cluster() = default;

    //!\brief Creates a cluster with the given `id` that contains the single user bin `user_bins_id`.
    Cluster(size_t const id, size_t const user_bins_id) : representative_id{id}, user_bins({user_bins_id})
    {}

    //!\brief Creates a cluster whose id and single contained user bin are both `id`.
    Cluster(size_t const id) : Cluster{id, id}
    {}

    //!\brief Returns the representative id of this cluster.
    size_t id() const
    {
        return representative_id;
    }

    //!\brief Returns the user bins currently held by this cluster.
    std::vector<size_t> const & contained_user_bins() const
    {
        return user_bins;
    }

    //!\brief Returns `true` if move_to() has handed this cluster's user bins to another cluster.
    bool has_been_moved() const
    {
        return moved_id.has_value();
    }

    //!\brief Returns `true` if this cluster holds no user bins.
    bool empty() const
    {
        return user_bins.empty();
    }

    //!\brief Returns the number of user bins held by this cluster.
    size_t size() const
    {
        return user_bins.size();
    }

    /*!\brief Checks that this cluster has the expected id and is in a consistent state.
     * \param[in] id The id this cluster is expected to have.
     * \returns `true` if the ids match and the cluster is either (moved and empty) or (not moved and non-empty).
     */
    bool is_valid(size_t id) const
    {
        bool const ids_equal = representative_id == id;
        bool const properly_moved = has_been_moved() && empty();
        bool const not_moved = !has_been_moved() && !empty();

        return ids_equal && (properly_moved || not_moved);
    }

    /*!\brief Returns the id of the cluster that received this cluster's user bins.
     * \details Asserts that the cluster has been moved and is valid; `std::optional::value()` additionally
     *          throws `std::bad_optional_access` if the cluster has not been moved.
     */
    size_t moved_to_cluster_id() const
    {
        assert(moved_id.has_value());
        assert(is_valid(representative_id));
        return moved_id.value();
    }

    /*!\brief Appends all user bins of this cluster to `target_cluster` and marks this cluster as moved.
     * \param[in,out] target_cluster The cluster that receives the user bins.
     * \details Moving a cluster into itself is a no-op.
     */
    void move_to(Cluster & target_cluster)
    {
        // Inserting a vector's own range into itself is undefined behaviour, and a cluster that is marked as
        // moved to itself could never be resolved to a representative.
        if (this == &target_cluster)
            return;

        target_cluster.user_bins.insert(target_cluster.user_bins.end(), user_bins.begin(), user_bins.end());
        user_bins.clear();
        moved_id = target_cluster.id();
    }

    /*!\brief Sorts the contained user bins by descending cardinality.
     * \param[in] cardinalities Indexed by user bin id; must cover every contained user bin.
     */
    void sort_by_cardinality(std::vector<size_t> const & cardinalities)
    {
        std::sort(user_bins.begin(),
                  user_bins.end(),
                  [&cardinalities](size_t const lhs, size_t const rhs)
                  {
                      return cardinalities[lhs] > cardinalities[rhs];
                  });
    }
};

/*!\brief A cluster of clusters: every entry of the contained user bins is itself a group of user bins.
 * \details
 * Reuses the id and moved-to bookkeeping of Cluster but stores its user bins as a vector of vectors.
 * The member functions that read the user bins are declared again here because they have to operate on
 * the MultiCluster's own `user_bins` member.
 */
struct MultiCluster : Cluster
{
protected:
    // The user bins contained in this cluster, grouped by the cluster they originate from.
    // This deliberately hides Cluster::user_bins; the inherited member is never filled by this type.
    std::vector<std::vector<size_t>> user_bins{};

public:
    MultiCluster() = default;
    MultiCluster(MultiCluster const &) = default;
    MultiCluster(MultiCluster &&) = default;
    MultiCluster & operator=(MultiCluster const &) = default;
    MultiCluster & operator=(MultiCluster &&) = default;
    ~MultiCluster() = default;

    /*!\brief Creates a MultiCluster from a Cluster.
     * \details Takes over the id of `clust`. If `clust` has been moved, only its moved-to id is taken over and
     *          no user bins are stored; otherwise the user bins of `clust` become the single contained group.
     */
    MultiCluster(Cluster const & clust)
    {
        representative_id = clust.id();

        if (clust.has_been_moved())
            moved_id = clust.moved_to_cluster_id();
        else
            user_bins.push_back(clust.contained_user_bins());
    }

    //!\brief Returns the groups of user bins currently held by this cluster.
    std::vector<std::vector<size_t>> const & contained_user_bins() const
    {
        return user_bins;
    }

    //!\brief Returns `true` if this cluster holds no group of user bins.
    // needs to be defined again because of redefinition of `user_bins`
    bool empty() const
    {
        return user_bins.empty();
    }

    //!\brief Returns the number of groups of user bins held by this cluster.
    // needs to be defined again because of redefinition of `user_bins`
    size_t size() const
    {
        return user_bins.size();
    }

    /*!\brief Checks that this cluster has the expected id and is in a consistent state.
     * \param[in] id The id this cluster is expected to have.
     * \returns `true` if the ids match and the cluster is either (moved and empty) or (not moved and non-empty).
     */
    bool is_valid(size_t id) const
    {
        bool const ids_equal = representative_id == id;
        bool const properly_moved = has_been_moved() && empty();
        bool const not_moved = !has_been_moved() && !empty();

        return ids_equal && (properly_moved || not_moved);
    }

    /*!\brief Appends all groups of this cluster to `target_cluster` and marks this cluster as moved.
     * \param[in,out] target_cluster The cluster that receives the groups of user bins.
     * \details Moving a cluster into itself is a no-op.
     */
    void move_to(MultiCluster & target_cluster)
    {
        // Inserting a vector's own range into itself is undefined behaviour, and a cluster that is marked as
        // moved to itself could never be resolved to a representative.
        if (this == &target_cluster)
            return;

        // The groups are discarded from this cluster right afterwards, so hand them over instead of copying.
        target_cluster.user_bins.insert(target_cluster.user_bins.end(),
                                        std::make_move_iterator(user_bins.begin()),
                                        std::make_move_iterator(user_bins.end()));
        user_bins.clear();
        moved_id = target_cluster.id();
    }

    /*!\brief Sorts the user bins within each group by descending cardinality and the groups by descending size.
     * \param[in] cardinalities Indexed by user bin id; must cover every contained user bin.
     */
    void sort_by_cardinality(std::vector<size_t> const & cardinalities)
    {
        for (std::vector<size_t> & user_bin_cluster : user_bins)
        {
            std::sort(user_bin_cluster.begin(),
                      user_bin_cluster.end(),
                      [&cardinalities](size_t const lhs, size_t const rhs)
                      {
                          return cardinalities[lhs] > cardinalities[rhs];
                      });
        }

        std::sort(user_bins.begin(),
                  user_bins.end(),
                  [](std::vector<size_t> const & lhs, std::vector<size_t> const & rhs)
                  {
                      return lhs.size() > rhs.size();
                  });
    }
};

/*!\brief Follows the chain of moved clusters until it reaches a cluster that has not been moved.
 * \tparam cluster_type A cluster type that provides `is_valid(id)`, `has_been_moved()` and
 *                      `moved_to_cluster_id()`.
 * \param[in] clusters The clusters, indexed by cluster id.
 * \param[in] current_id The position of the cluster to start from.
 * \returns The position of the representative cluster, i.e. the first cluster on the chain for which
 *          `has_been_moved()` is `false`.
 * \details
 * A cluster that has not been moved is its own representative. A moved cluster has been joined into
 * another cluster and refers to it via `moved_to_cluster_id()`. Every visited cluster is asserted to be
 * valid for its position.
 */
template <typename cluster_type>
size_t LSH_find_representative_cluster(std::vector<cluster_type> const & clusters, size_t current_id)
{
    cluster_type const * cursor = &clusters[current_id];

    assert(cursor->is_valid(current_id));

    while (cursor->has_been_moved())
    {
        // hop to the cluster that received the user bins of the current one
        current_id = cursor->moved_to_cluster_id();
        cursor = &clusters[current_id];
        assert(cursor->is_valid(current_id));
    }

    return current_id;
}

} // namespace chopper
19 changes: 15 additions & 4 deletions src/chopper_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser)

std::vector<std::vector<std::string>> filenames{};
std::vector<seqan::hibf::sketch::hyperloglog> sketches{};
std::vector<seqan::hibf::sketch::minhashes> minHash_sketches{};

if (input_is_a_sketch_file)
{
Expand All @@ -106,6 +107,7 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser)

filenames = std::move(sin.filenames); // No need to call check_filenames because the files are not read.
sketches = std::move(sin.hll_sketches);
minHash_sketches = std::move(sin.minHash_sketches);
validate_configuration(parser, config, sin.chopper_config);
}
else
Expand All @@ -128,17 +130,18 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser)
if (!input_is_a_sketch_file)
{
config.compute_sketches_timer.start();
seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches);
seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches, minHash_sketches);
config.compute_sketches_timer.stop();
}

exit_code |= chopper::layout::execute(config, filenames, sketches);
exit_code |= chopper::layout::execute(config, filenames, sketches, minHash_sketches);

if (!config.disable_sketch_output)
{
chopper::sketch::sketch_file sout{.chopper_config = config,
.filenames = std::move(filenames),
.hll_sketches = std::move(sketches)};
.hll_sketches = std::move(sketches),
.minHash_sketches = std::move(minHash_sketches)};
std::ofstream os{config.sketch_directory, std::ios::binary};
cereal::BinaryOutputArchive oarchive{os};
oarchive(sout);
Expand All @@ -151,11 +154,19 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser)
output_stream << "sketching_in_seconds\t"
<< "layouting_in_seconds\t"
<< "union_estimation_in_seconds\t"
<< "rearrangement_in_seconds\n";
<< "rearrangement_in_seconds\t"
<< "lsh_in_seconds\t"
<< "intital_partition_timer_in_seconds\t"
<< "small_layouts_timer_in_seconds\t"
<< "search_best_p_in_seconds\n";
output_stream << config.compute_sketches_timer.in_seconds() << '\t';
output_stream << config.dp_algorithm_timer.in_seconds() << '\t';
output_stream << config.union_estimation_timer.in_seconds() << '\t';
output_stream << config.rearrangement_timer.in_seconds() << '\t';
output_stream << config.lsh_algorithm_timer.in_seconds() << '\t';
output_stream << config.intital_partition_timer.in_seconds() << '\t';
output_stream << config.small_layouts_timer.in_seconds() << '\t';
output_stream << config.search_partition_algorithm_timer.in_seconds() << '\n';
}

return exit_code;
Expand Down
10 changes: 8 additions & 2 deletions src/layout/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,14 @@ if (TARGET chopper::layout)
return ()
endif ()

add_library (chopper_layout STATIC determine_best_number_of_technical_bins.cpp execute.cpp hibf_statistics.cpp
ibf_query_cost.cpp input.cpp output.cpp
add_library (chopper_layout STATIC
determine_best_number_of_technical_bins.cpp
determine_split_bins.cpp
execute.cpp
hibf_statistics.cpp
ibf_query_cost.cpp
input.cpp
output.cpp
)
target_link_libraries (chopper_layout PUBLIC chopper::shared)
add_library (chopper::layout ALIAS chopper_layout)
Loading
Loading