From 9acc43d682c76f89ffc1273517f1048adda18fdb Mon Sep 17 00:00:00 2001
From: tonibohnlein
Date: Wed, 19 Nov 2025 11:54:37 +0100
Subject: [PATCH 1/3] compute_comm_affinity added test kl_bsp_cost more tests
 for max_comm_datastructure update node affinity added test for affinity
 update test update affinity tests affinity tests update small performance
 optimization more test, pre move data update pre_move_comm_data new unit test
 update test update tests update lambda container, bsp_cost function cost
 function correction max_comm_datastructures fix fix kl_bsp_cost test update
 more tests update comm update added more tests, compute working update comm
 affinity implementation comm_affinity optimizations enable tests update test
 debug output update unit tests passing cleaning update datastructure update
 numa added mt test
---
 .../auxiliary/io/hdag_graph_file_reader.hpp   |    1 +
 .../comm_cost_modules/kl_bsp_comm_cost.hpp    |  675 +++++++--
 .../kl_hyper_total_comm_cost.hpp              |   11 +-
 .../comm_cost_modules/kl_total_comm_cost.hpp  |    6 +
 .../comm_cost_modules/lambda_container.hpp    |  330 +++-
 .../max_comm_datastructure.hpp                |  396 +++--
 .../KernighanLin_v2/kl_improver.hpp           | 1340 +++++++++++------
 .../KernighanLin_v2/kl_improver_test.hpp      |   96 +-
 .../KernighanLin_v2/kl_include.hpp            |   18 +-
 .../KernighanLin_v2/kl_include_mt.hpp         |    5 +
 .../LocalSearch/KernighanLin_v2/kl_util.hpp   |  222 ++-
 tests/CMakeLists.txt                          |    6 +
 tests/kl_bsp_affinity_test.cpp                |  967 ++++++++++++
 tests/kl_bsp_cost.cpp                         | 1086 +++++++++++++
 tests/kl_bsp_improver_test.cpp                |  250 +++
 15 files changed, 4447 insertions(+), 962 deletions(-)
 create mode 100644 tests/kl_bsp_affinity_test.cpp
 create mode 100644 tests/kl_bsp_cost.cpp
 create mode 100644 tests/kl_bsp_improver_test.cpp

diff --git a/include/osp/auxiliary/io/hdag_graph_file_reader.hpp b/include/osp/auxiliary/io/hdag_graph_file_reader.hpp
index 63d04909..a91481a7 100644
--- a/include/osp/auxiliary/io/hdag_graph_file_reader.hpp
+++ b/include/osp/auxiliary/io/hdag_graph_file_reader.hpp
@@ -29,6 +29,7 @@ limitations under the License.
 #include "osp/concepts/computational_dag_concept.hpp"
 #include "osp/graph_algorithms/directed_graph_util.hpp"
 #include "osp/auxiliary/io/filepath_checker.hpp"
+#include "osp/concepts/constructable_computational_dag_concept.hpp"

 namespace osp {
 namespace file_reader {
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp
index 679db815..f6c425bd 100644
--- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp
+++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp
@@ -19,218 +19,649 @@ limitations under the License.

 #pragma once

 #include "../kl_active_schedule.hpp"
-#include "lambda_container.hpp"
+#include "../kl_improver.hpp"
 #include "max_comm_datastructure.hpp"
+#include

 namespace osp {

-template
+// A lightweight helper to track deltas without hash maps or repeated allocations.
+// Uses a dense vector for O(1) lookups and a sparse list for fast iteration/clearing.
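// [Editorial sketch, not part of the patch] Minimal usage of the tracker defined
// below, assuming comm_weight_t = int:
//
//     FastDeltaTracker<int> t;
//     t.initialize(4);          // procs 0..3, all deltas start at 0
//     t.add(2, 5);              // dense_vals[2] = 5, dirty_procs = {2}
//     t.add(2, -5);             // back to 0 -> swap-and-pop, dirty_procs = {}
//     t.add(1, 3);
//     t.add(3, -3);
//     for (unsigned p : t.dirty_procs) { /* visits only procs 1 and 3 */ }
//     t.clear();                // O(|dirty_procs|), not O(num_procs)
//
// The swap-and-pop in add() keeps dirty_procs duplicate-free, so iteration and
// clear() touch only processors whose accumulated delta is currently non-zero.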
+template +struct FastDeltaTracker { + std::vector dense_vals; // Size: num_procs + std::vector dirty_procs; // List of modified indices + std::vector proc_dirty_index; // Map proc -> index in dirty_procs (num_procs if not dirty) + unsigned num_procs = 0; + + void initialize(unsigned n_procs) { + if (n_procs > num_procs) { + num_procs = n_procs; + dense_vals.resize(num_procs, 0); + dirty_procs.reserve(num_procs); + proc_dirty_index.resize(num_procs, num_procs); + } + } + + inline void add(unsigned proc, comm_weight_t val) { + if (val == 0) + return; + + // If currently 0, it is becoming dirty + if (dense_vals[proc] == 0) { + proc_dirty_index[proc] = static_cast(dirty_procs.size()); + dirty_procs.push_back(proc); + } + + dense_vals[proc] += val; + + // If it returns to 0, remove it from dirty list (Swap and Pop for O(1)) + if (dense_vals[proc] == 0) { + unsigned idx = proc_dirty_index[proc]; + unsigned last_proc = dirty_procs.back(); + + // Move last element to the hole + dirty_procs[idx] = last_proc; + proc_dirty_index[last_proc] = idx; + + // Remove last + dirty_procs.pop_back(); + proc_dirty_index[proc] = num_procs; + } + } + + inline comm_weight_t get(unsigned proc) const { + if (proc < dense_vals.size()) + return dense_vals[proc]; + return 0; + } + + inline void clear() { + for (unsigned p : dirty_procs) { + dense_vals[p] = 0; + proc_dirty_index[p] = num_procs; + } + dirty_procs.clear(); + } +}; + +template struct kl_bsp_comm_cost_function { - + using VertexType = vertex_idx_t; using kl_move = kl_move_struct; using kl_gain_update_info = kl_update_info; + using comm_weight_t = v_commw_t; constexpr static unsigned window_range = 2 * window_size + 1; + constexpr static bool is_max_comm_cost_function = true; kl_active_schedule *active_schedule; compatible_processor_range *proc_range; const Graph_t *graph; const BspInstance *instance; - max_comm_datastructure comm_ds; + max_comm_datastructure> comm_ds; inline cost_t get_comm_multiplier() { return 1; } inline cost_t get_max_comm_weight() { return comm_ds.max_comm_weight; } inline cost_t get_max_comm_weight_multiplied() { return comm_ds.max_comm_weight; } inline const std::string name() const { return "bsp_comm"; } - inline bool is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); } - inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return (node_step < window_size + start_step) ? window_size - (node_step - start_step) : 0; } - inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return (node_step + window_size <= end_step) ? window_range : window_range - (node_step + window_size - end_step); } + inline bool is_compatible(VertexType node, unsigned proc) { + return active_schedule->getInstance().isCompatible(node, proc); + } + inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { + return (node_step < window_size + start_step) ? window_size - (node_step - start_step) : 0; + } + inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { + return (node_step + window_size <= end_step) ? 
window_range + : window_range - (node_step + window_size - end_step); + } - void initialize(kl_active_schedule &sched, compatible_processor_range &p_range) { + void initialize(kl_active_schedule &sched, + compatible_processor_range &p_range) { active_schedule = &sched; proc_range = &p_range; instance = &sched.getInstance(); graph = &instance->getComputationalDag(); const unsigned num_steps = active_schedule->num_steps(); - comm_ds.initialize(active_schedule->getSetSchedule(), *instance, num_steps); - comm_ds.set_active_schedule(*active_schedule); + comm_ds.initialize(*active_schedule); + } + + using pre_move_comm_data_t = pre_move_comm_data; + + inline pre_move_comm_data get_pre_move_comm_data(const kl_move &move) { + return comm_ds.get_pre_move_comm_data(move); } - void compute_send_receive_datastructures() { + void compute_send_receive_datastructures() { comm_ds.compute_comm_datastructures(0, active_schedule->num_steps() - 1); } template cost_t compute_schedule_cost() { - if constexpr (compute_datastructures) compute_send_receive_datastructures(); + if constexpr (compute_datastructures) + compute_send_receive_datastructures(); cost_t total_cost = 0; for (unsigned step = 0; step < active_schedule->num_steps(); step++) { total_cost += active_schedule->get_step_max_work(step); total_cost += comm_ds.step_max_comm(step) * instance->communicationCosts(); } - total_cost += static_cast(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); + + if (active_schedule->num_steps() > 1) { + total_cost += static_cast(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); + } + return total_cost; } cost_t compute_schedule_cost_test() { return compute_schedule_cost(); } - void update_datastructure_after_move(const kl_move &move, const unsigned start_step, const unsigned end_step) { + void update_datastructure_after_move(const kl_move &move, const unsigned start_step, const unsigned end_step) { comm_ds.update_datastructure_after_move(move, start_step, end_step); } - template + // Structure to hold thread-local scratchpads to avoid re-allocation. 
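// [Editorial sketch, not part of the patch] Intended lifecycle of the scratchpad
// defined below, as compute_comm_affinity uses it (num_steps and num_procs are
// taken from the active schedule and the instance):
//
//     static thread_local ScratchData scratch;
//     scratch.init(num_steps, num_procs);     // grow-only; reuses prior capacity
//     scratch.clear_all();                    // O(#steps touched last time)
//     ...
//     scratch.mark_active(step);              // record a step before writing deltas
//     scratch.send_deltas[step].add(proc, w);
//
// Because clear_all() walks only active_steps, a candidate evaluation that touches
// few supersteps pays nothing for the untouched ones.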
+ struct ScratchData { + std::vector> send_deltas; // Size: num_steps + std::vector> recv_deltas; // Size: num_steps + + std::vector active_steps; // List of steps touched in current operation + std::vector step_is_active; // Fast lookup for active steps + + std::vector> child_cost_buffer; + + void init(unsigned n_steps, unsigned n_procs) { + if (send_deltas.size() < n_steps) { + send_deltas.resize(n_steps); + recv_deltas.resize(n_steps); + step_is_active.resize(n_steps, false); + active_steps.reserve(n_steps); + } + + for (auto &tracker : send_deltas) + tracker.initialize(n_procs); + for (auto &tracker : recv_deltas) + tracker.initialize(n_procs); + + child_cost_buffer.reserve(n_procs); + } + + void clear_all() { + for (unsigned step : active_steps) { + send_deltas[step].clear(); + recv_deltas[step].clear(); + step_is_active[step] = false; + } + active_steps.clear(); + child_cost_buffer.clear(); + } + + void mark_active(unsigned step) { + if (!step_is_active[step]) { + step_is_active[step] = true; + active_steps.push_back(step); + } + } + }; + + template void compute_comm_affinity(VertexType node, affinity_table_t &affinity_table_node, const cost_t &penalty, const cost_t &reward, const unsigned start_step, const unsigned end_step) { + // Use static thread_local scratchpad to avoid allocation in hot loop + static thread_local ScratchData scratch; + scratch.init(active_schedule->num_steps(), instance->numberOfProcessors()); + scratch.clear_all(); + const unsigned node_step = active_schedule->assigned_superstep(node); const unsigned node_proc = active_schedule->assigned_processor(node); const unsigned window_bound = end_idx(node_step, end_step); const unsigned node_start_idx = start_idx(node_step, start_step); - const cost_t comm_w_node = graph->vertex_comm_weight(node); + for (const auto &target : instance->getComputationalDag().children(node)) { + const unsigned target_step = active_schedule->assigned_superstep(target); + const unsigned target_proc = active_schedule->assigned_processor(target); + + if (target_step < node_step + (target_proc != node_proc)) { + const unsigned diff = node_step - target_step; + const unsigned bound = window_size > diff ? window_size - diff : 0; + unsigned idx = node_start_idx; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] -= reward; + } + } + if (window_size >= diff && is_compatible(node, target_proc)) { + affinity_table_node[target_proc][idx] -= reward; + } + } else { + const unsigned diff = target_step - node_step; + unsigned idx = window_size + diff; + if (idx < window_bound && is_compatible(node, target_proc)) { + affinity_table_node[target_proc][idx] -= penalty; + } + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] += penalty; + } + } + } + } + + for (const auto &source : instance->getComputationalDag().parents(node)) { + const unsigned source_step = active_schedule->assigned_superstep(source); + const unsigned source_proc = active_schedule->assigned_processor(source); + + if (source_step < node_step + (source_proc == node_proc)) { + const unsigned diff = node_step - source_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; + unsigned idx = node_start_idx; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] += penalty; + } + } + if (idx - 1 < bound && is_compatible(node, source_proc)) { + affinity_table_node[source_proc][idx - 1] -= penalty; + } + } else { + const unsigned diff = source_step - node_step; + unsigned idx = std::min(window_size + diff, window_bound); + if (idx < window_bound && is_compatible(node, source_proc)) { + affinity_table_node[source_proc][idx] -= reward; + } + idx++; + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] -= reward; + } + } + } + } + + const comm_weight_t comm_w_node = graph->vertex_comm_weight(node); + const auto ¤t_vec_schedule = active_schedule->getVectorSchedule(); + + auto add_delta = [&](bool is_recv, unsigned step, unsigned proc, comm_weight_t val) { + if (val == 0) + return; + if (step < active_schedule->num_steps()) { + scratch.mark_active(step); + if (is_recv) + scratch.recv_deltas[step].add(proc, val); + else + scratch.send_deltas[step].add(proc, val); + } + }; + + // 1. Remove Node from Current State (Phase 1 - Invariant for all candidates) + + // Outgoing (Children) + // Child stops receiving from node_proc at node_step + auto node_lambda_entries = comm_ds.node_lambda_map.iterate_proc_entries(node); + comm_weight_t total_send_cost_removed = 0; - const auto ¤t_set_schedule = active_schedule->getSetSchedule(); + for (const auto [proc, count] : node_lambda_entries) { + if (proc != node_proc) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(node_proc, proc); + if (cost > 0) { + add_delta(true, node_step, proc, -cost); + total_send_cost_removed += cost; + } + } + } + if (total_send_cost_removed > 0) { + add_delta(false, node_step, node_proc, -total_send_cost_removed); + } + + // Incoming (Parents) + for (const auto &u : graph->parents(node)) { + const unsigned u_proc = active_schedule->assigned_processor(u); + const unsigned u_step = current_vec_schedule.assignedSuperstep(u); + const comm_weight_t comm_w_u = graph->vertex_comm_weight(u); + + if (u_proc != node_proc) { + if (comm_ds.node_lambda_map.get_proc_entry(u, node_proc) == 1) { + const comm_weight_t cost = comm_w_u * instance->sendCosts(u_proc, node_proc); + if (cost > 0) { + add_delta(true, u_step, node_proc, -cost); + add_delta(false, u_step, u_proc, -cost); + } + } + } + } + + // 2. Add Node to Target (Iterate candidates) + + for (const unsigned p_to : proc_range->compatible_processors_vertex(node)) { + + // --- Part A: Incoming Edges (Parents -> p_to) --- + // These updates are specific to p_to but independent of s_to. + // We apply them, run the s_to loop, then revert them. 
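// [Editorial sketch, not part of the patch] The apply/evaluate/revert pattern used
// from here on, in schematic form (names refer to the surrounding code):
//
//     apply Part A deltas for p_to              // parents -> p_to, independent of s_to
//     for each candidate step s_to in the window:
//         apply Part B deltas for (p_to, s_to)  // node -> children
//         gain = sum over scratch.active_steps of calculate_step_cost_change(...)
//         revert Part B deltas                  // exact negatives restore Part A state
//     revert Part A deltas                      // trackers return to the Phase 1 state
//
// Reverting with the exact negative deltas means no candidate ever sees another
// candidate's edits, while the Phase 1 (node removal) deltas stay in place for all.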
+ + for (const auto &u : graph->parents(node)) { + const unsigned u_proc = active_schedule->assigned_processor(u); + const unsigned u_step = current_vec_schedule.assignedSuperstep(u); + const comm_weight_t comm_w_u = graph->vertex_comm_weight(u); + + if (u_proc != p_to) { + bool already_sending_to_p_to = false; + unsigned count_on_p_to = comm_ds.node_lambda_map.get_proc_entry(u, p_to); + + if (p_to == node_proc) { + if (count_on_p_to > 0) + count_on_p_to--; + } + + if (count_on_p_to > 0) { + already_sending_to_p_to = true; + } + + if (!already_sending_to_p_to) { + const comm_weight_t cost = comm_w_u * instance->sendCosts(u_proc, p_to); + if (cost > 0) { + add_delta(true, u_step, p_to, cost); + add_delta(false, u_step, u_proc, cost); + } + } + } + } - for (unsigned p_to = 0; p_to < instance->numberOfProcessors(); ++p_to) { - if (!is_compatible(node, p_to)) continue; + // --- Part B: Outgoing Edges (Node -> Children) --- + // These depend on which processors children are on. + scratch.child_cost_buffer.clear(); + comm_weight_t total_send_cost_added = 0; + + for (const auto [v_proc, count] : comm_ds.node_lambda_map.iterate_proc_entries(node)) { + if (v_proc != p_to) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(p_to, v_proc); + if (cost > 0) { + scratch.child_cost_buffer.push_back({v_proc, cost}); + total_send_cost_added += cost; + } + } + } + // Iterate Window (s_to) for (unsigned s_to_idx = node_start_idx; s_to_idx < window_bound; ++s_to_idx) { unsigned s_to = node_step + s_to_idx - window_size; - cost_t comm_cost_change = 0; - - const auto pre_move_data_from = comm_ds.get_pre_move_comm_data_step(node_step); - const auto pre_move_data_to = comm_ds.get_pre_move_comm_data_step(s_to); - - // --- Outgoing communication from `node` --- - // From - for (const auto [proc, count] : comm_ds.node_lambda_map.iterate_proc_entries(node)) { - comm_cost_change += calculate_comm_cost_change_send(node_step, node_proc, comm_w_node, -1, pre_move_data_from); - } - // To - lambda_vector_container temp_lambda_map; // Use a temporary map for 'to' state - temp_lambda_map.initialize(1, instance->numberOfProcessors()); - for (const auto &v : graph->children(node)) { - const unsigned v_proc = current_set_schedule.assignedProcessor(v); - - if (p_to != v_proc) { - if (temp_lambda_map.increase_proc_count(0, v_proc)) { - comm_cost_change -= calculate_comm_cost_change_send(s_to, p_to, comm_w_node, 1, pre_move_data_to); - comm_cost_change -= calculate_comm_cost_change_receive(s_to, v_proc, comm_w_node, 1, pre_move_data_to); - } + + // Apply Outgoing Deltas for this specific step s_to + for (const auto &[v_proc, cost] : scratch.child_cost_buffer) { + add_delta(true, s_to, v_proc, cost); + } + + if (total_send_cost_added > 0) { + add_delta(false, s_to, p_to, total_send_cost_added); + } + + cost_t total_change = 0; + + // Only check steps that are active (modified in Phase 1, Part A, or Part B) + for (unsigned step : scratch.active_steps) { + // Check if dirty_procs is empty implies no change for this step + // FastDeltaTracker ensures dirty_procs is empty if all deltas summed to 0 + if (!scratch.send_deltas[step].dirty_procs.empty() || + !scratch.recv_deltas[step].dirty_procs.empty()) { + + total_change += + calculate_step_cost_change(step, scratch.send_deltas[step], scratch.recv_deltas[step]); } } - // --- Incoming communication to `node` --- - for (const auto &u : graph->parents(node)) { - const unsigned u_proc = active_schedule->assigned_processor(u); - const unsigned u_step = 
current_set_schedule.assignedSuperstep(u); - const cost_t comm_w_u = graph->vertex_comm_weight(u); - const auto pre_move_data_u = comm_ds.get_pre_move_comm_data_step(u_step); - - // From - if (u_proc != node_proc) { - // Send part (from parent u) & Receive part (at node_proc) // TODO: this is not correct, the lambda map is not updated - if (comm_ds.node_lambda_map.get_proc_entry(u, node_proc) == 1) { // if node is the only child on this proc - comm_cost_change += calculate_comm_cost_change_send(u_step, u_proc, comm_w_u, -1, pre_move_data_u); - comm_cost_change += calculate_comm_cost_change_receive(u_step, node_proc, comm_w_u, -1, pre_move_data_u); - } + affinity_table_node[p_to][s_to_idx] += total_change * instance->communicationCosts(); + + // Revert Outgoing Deltas for s_to (Inverse of Apply) + for (const auto &[v_proc, cost] : scratch.child_cost_buffer) { + add_delta(true, s_to, v_proc, -cost); + } + if (total_send_cost_added > 0) { + add_delta(false, s_to, p_to, -total_send_cost_added); + } + } + + // Revert Incoming Deltas (Inverse of Part A) + for (const auto &u : graph->parents(node)) { + const unsigned u_proc = active_schedule->assigned_processor(u); + const unsigned u_step = current_vec_schedule.assignedSuperstep(u); + const comm_weight_t comm_w_u = graph->vertex_comm_weight(u); + + if (u_proc != p_to) { + bool already_sending_to_p_to = false; + unsigned count_on_p_to = comm_ds.node_lambda_map.get_proc_entry(u, p_to); + if (p_to == node_proc) { + if (count_on_p_to > 0) + count_on_p_to--; } - // To - if (u_proc != p_to) { - // Send part (from parent u) & Receive part (at p_to) - // This logic is complex for an affinity calculation. - // A full recompute for neighbors is a safer bet, which is what update_node_comm_affinity does. // TODO: this is not true anymore - // The following is an approximation. 
- - // if moving node to p_to creates a new communication link for parent u - bool has_other_on_p_to = false; - for(const auto& sibling : graph->children(u)) { - if (sibling != node && active_schedule->assigned_processor(sibling) == p_to) { has_other_on_p_to = true; break; } - } - if (!has_other_on_p_to) { - comm_cost_change -= calculate_comm_cost_change_send(u_step, u_proc, comm_w_u, 1, pre_move_data_u); - comm_cost_change -= calculate_comm_cost_change_receive(u_step, p_to, comm_w_u, 1, pre_move_data_u); + if (count_on_p_to > 0) + already_sending_to_p_to = true; + + if (!already_sending_to_p_to) { + const comm_weight_t cost = comm_w_u * instance->sendCosts(u_proc, p_to); + if (cost > 0) { + add_delta(true, u_step, p_to, -cost); + add_delta(false, u_step, u_proc, -cost); } } } - affinity_table_node[p_to][s_to_idx] += comm_cost_change * instance->communicationCosts(); } } } - cost_t calculate_comm_cost_change_send(unsigned step, unsigned p_send, cost_t comm_w, int sign, const pre_move_comm_data& pre_move_data) { - cost_t old_max = pre_move_data.from_step_max_comm; + comm_weight_t calculate_step_cost_change(unsigned step, const FastDeltaTracker &delta_send, + const FastDeltaTracker &delta_recv) { - cost_t new_send = comm_ds.step_proc_send(step, p_send) + sign * comm_w; - cost_t new_max_send = comm_ds.step_max_send(step); - if (new_send > new_max_send) new_max_send = new_send; - else if (comm_ds.step_proc_send(step, p_send) == new_max_send) { - if (sign < 0 && comm_ds.step_max_send_processor_count[step] == 1) { - new_max_send = comm_ds.step_second_max_send(step); - } else { - new_max_send = new_send; - } - } + comm_weight_t old_max = comm_ds.step_max_comm(step); + comm_weight_t second_max = comm_ds.step_second_max_comm(step); + unsigned old_max_count = comm_ds.step_max_comm_count(step); - return std::max(new_max_send, comm_ds.step_max_receive(step)) - old_max; - } + comm_weight_t new_global_max = 0; + unsigned reduced_max_instances = 0; - cost_t calculate_comm_cost_change_receive(unsigned step, unsigned p_receive, cost_t comm_w, int sign, const pre_move_comm_data& pre_move_data) { - cost_t old_max = pre_move_data.from_step_max_comm; + // 1. Check modified sends (Iterate sparse dirty list) + for (unsigned proc : delta_send.dirty_procs) { + comm_weight_t delta = delta_send.get(proc); + // delta cannot be 0 here due to FastDeltaTracker invariant - cost_t new_receive = comm_ds.step_proc_receive(step, p_receive) + sign * comm_w; + comm_weight_t current_val = comm_ds.step_proc_send(step, proc); + comm_weight_t new_val = current_val + delta; - cost_t new_max_receive = comm_ds.step_max_receive(step); - if (new_receive > new_max_receive) new_max_receive = new_receive; - else if (comm_ds.step_proc_receive(step, p_receive) == new_max_receive) { - if (sign < 0 && comm_ds.step_max_receive_processor_count[step] == 1) { - new_max_receive = comm_ds.step_second_max_receive(step); - } else { - new_max_receive = new_receive; - } + if (new_val > new_global_max) + new_global_max = new_val; + if (delta < 0 && current_val == old_max) + reduced_max_instances++; } - return std::max(comm_ds.step_max_send(step), new_max_receive) - old_max; - } + // 2. 
Check modified receives (Iterate sparse dirty list) + for (unsigned proc : delta_recv.dirty_procs) { + comm_weight_t delta = delta_recv.get(proc); + + comm_weight_t current_val = comm_ds.step_proc_receive(step, proc); + comm_weight_t new_val = current_val + delta; + + if (new_val > new_global_max) + new_global_max = new_val; + if (delta < 0 && current_val == old_max) + reduced_max_instances++; + } - cost_t calculate_comm_cost_change(unsigned step, unsigned p_send, unsigned p_receive, cost_t comm_w, int sign) { - const auto pre_move_data = comm_ds.get_pre_move_comm_data_step(step); - cost_t change = 0; - change += calculate_comm_cost_change_send(step, p_send, comm_w, sign, pre_move_data); - comm_ds.step_proc_send(step, p_send) += sign * comm_w; - change += calculate_comm_cost_change_receive(step, p_receive, comm_w, sign, pre_move_data); - comm_ds.step_proc_send(step, p_send) -= sign * comm_w; // revert for next calculation - return change; + // 3. Determine result + if (new_global_max > old_max) { + return new_global_max - old_max; + } + if (reduced_max_instances < old_max_count) { + return 0; + } + return std::max(new_global_max, second_max) - old_max; } - template + template void update_node_comm_affinity(const kl_move &move, thread_data_t &thread_data, const cost_t &penalty, - const cost_t &reward, std::map &max_gain_recompute, + const cost_t &reward, std::map &, std::vector &new_nodes) { - // For simplicity and correctness, we will do a full recompute for neighbors. - // A fully incremental update is very complex for this cost function. - auto process_neighbor = [&](VertexType neighbor) { - if (thread_data.lock_manager.is_locked(neighbor)) return; - if (not thread_data.affinity_table.is_selected(neighbor)) { - new_nodes.push_back(neighbor); - return; + + const unsigned start_step = thread_data.start_step; + const unsigned end_step = thread_data.end_step; + + for (const auto &target : instance->getComputationalDag().children(move.node)) { + const unsigned target_step = active_schedule->assigned_superstep(target); + if (target_step < start_step || target_step > end_step) + continue; + + if (thread_data.lock_manager.is_locked(target)) + continue; + + if (not thread_data.affinity_table.is_selected(target)) { + new_nodes.push_back(target); + continue; } - if (max_gain_recompute.find(neighbor) == max_gain_recompute.end()) { - max_gain_recompute[neighbor] = kl_gain_update_info(neighbor, true); + + const unsigned target_proc = active_schedule->assigned_processor(target); + const unsigned target_start_idx = start_idx(target_step, start_step); + auto &affinity_table = thread_data.affinity_table.at(target); + + if (move.from_step < target_step + (move.from_proc == target_proc)) { + const unsigned diff = target_step - move.from_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; + unsigned idx = target_start_idx; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + affinity_table[p][idx] -= penalty; + } + } + + if (idx - 1 < bound && is_compatible(target, move.from_proc)) { + affinity_table[move.from_proc][idx - 1] += penalty; + } + } else { - max_gain_recompute[neighbor].full_update = true; + const unsigned diff = move.from_step - target_step; + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + + if (idx < window_bound && is_compatible(target, move.from_proc)) { + affinity_table[move.from_proc][idx] += reward; + } + + idx++; + + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + affinity_table[p][idx] += reward; + } + } } - }; - for (const auto &target : graph->children(move.node)) { - process_neighbor(target); + if (move.to_step < target_step + (move.to_proc == target_proc)) { + unsigned idx = target_start_idx; + const unsigned diff = target_step - move.to_step; + const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + affinity_table[p][idx] += penalty; + } + } + + if (idx - 1 < bound && is_compatible(target, move.to_proc)) { + affinity_table[move.to_proc][idx - 1] -= penalty; + } + + } else { + const unsigned diff = move.to_step - target_step; + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + + if (idx < window_bound && is_compatible(target, move.to_proc)) { + affinity_table[move.to_proc][idx] -= reward; + } + + idx++; + + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + affinity_table[p][idx] -= reward; + } + } + } } - for (const auto &source : graph->parents(move.node)) { - process_neighbor(source); + + for (const auto &source : instance->getComputationalDag().parents(move.node)) { + const unsigned source_step = active_schedule->assigned_superstep(source); + if (source_step < start_step || source_step > end_step) + continue; + + if (thread_data.lock_manager.is_locked(source)) + continue; + + if (not thread_data.affinity_table.is_selected(source)) { + new_nodes.push_back(source); + continue; + } + + const unsigned source_proc = active_schedule->assigned_processor(source); + const unsigned source_start_idx = start_idx(source_step, start_step); + const unsigned window_bound = end_idx(source_step, end_step); + auto &affinity_table_source = thread_data.affinity_table.at(source); + + if (move.from_step < source_step + (move.from_proc != source_proc)) { + const unsigned diff = source_step - move.from_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; + unsigned idx = source_start_idx; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + affinity_table_source[p][idx] += reward; + } + } + + if (window_size >= diff && is_compatible(source, move.from_proc)) { + affinity_table_source[move.from_proc][idx] += reward; + } + + } else { + const unsigned diff = move.from_step - source_step; + unsigned idx = window_size + diff; + + if (idx < window_bound && is_compatible(source, move.from_proc)) { + affinity_table_source[move.from_proc][idx] += penalty; + } + + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + affinity_table_source[p][idx] -= penalty; + } + } + } + + if (move.to_step < source_step + (move.to_proc != source_proc)) { + const unsigned diff = source_step - move.to_step; + const unsigned bound = window_size > diff ? window_size - diff : 0; + unsigned idx = source_start_idx; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + affinity_table_source[p][idx] -= reward; + } + } + + if (window_size >= diff && is_compatible(source, move.to_proc)) { + affinity_table_source[move.to_proc][idx] -= reward; + } + + } else { + const unsigned diff = move.to_step - source_step; + unsigned idx = window_size + diff; + + if (idx < window_bound && is_compatible(source, move.to_proc)) { + affinity_table_source[move.to_proc][idx] -= penalty; + } + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + affinity_table_source[p][idx] += penalty; + } + } + } } } }; diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp index 6b6f25b5..50384c72 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp @@ -32,7 +32,8 @@ struct kl_hyper_total_comm_cost_function { using kl_gain_update_info = kl_update_info; constexpr static unsigned window_range = 2 * window_size + 1; - + constexpr static bool is_max_comm_cost_function = false; + kl_active_schedule *active_schedule; compatible_processor_range *proc_range; @@ -43,7 +44,7 @@ struct kl_hyper_total_comm_cost_function { cost_t comm_multiplier = 1; cost_t max_comm_weight = 0; - lambda_vector_container node_lambda_map; + lambda_vector_container node_lambda_map; inline cost_t get_comm_multiplier() { return comm_multiplier; } inline cost_t get_max_comm_weight() { return max_comm_weight; } @@ -60,6 +61,12 @@ struct kl_hyper_total_comm_cost_function { node_lambda_map.initialize(graph->num_vertices(), instance->numberOfProcessors()); } + struct empty_struct {}; + + using pre_move_comm_data_t = empty_struct; + + inline empty_struct get_pre_move_comm_data(const kl_move& ) { return empty_struct(); } + cost_t compute_schedule_cost() { cost_t work_costs = 0; for (unsigned step = 0; step < active_schedule->num_steps(); step++) { diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp index 7d0d61ea..be7c627c 100644 --- 
a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp @@ -29,6 +29,8 @@ struct kl_total_comm_cost_function { using kl_move = kl_move_struct; using kl_gain_update_info = kl_update_info; + constexpr static bool is_max_comm_cost_function = false; + constexpr static unsigned window_range = 2 * window_size + 1; constexpr static bool use_node_communication_costs = use_node_communication_costs_arg || not has_edge_weights_v; @@ -58,6 +60,10 @@ struct kl_total_comm_cost_function { comm_multiplier = 1.0 / instance->numberOfProcessors(); } + struct empty_struct {}; + using pre_move_comm_data_t = empty_struct; + inline empty_struct get_pre_move_comm_data(const kl_move& ) { return empty_struct(); } + cost_t compute_schedule_cost_test() { return compute_schedule_cost(); } diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/lambda_container.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/lambda_container.hpp index fd126699..0eccc815 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/lambda_container.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/lambda_container.hpp @@ -18,24 +18,101 @@ limitations under the License. #pragma once -#include -#include #include +#include +#include namespace osp { +/** + * @brief Container for tracking child processor assignments in a BSP schedule using hash maps. + * + * This structure tracks how many children a node has that are assigned to each processor. + * It uses unordered_map for sparse data representation. + * + * For each node, the map stores the count of children assigned to each processor, which is + * important for computing communication costs in BSP scheduling. + */ +template struct lambda_map_container { - std::vector> node_lambda_map; + /// Vector of maps: for each node, maps processor ID to assignment count + std::vector> node_lambda_map; + + /** + * @brief Initialize the container for a given number of vertices. + * @param num_vertices Number of nodes in the schedule + * @param (unused) Number of processors (not needed for map-based implementation) + */ + inline void initialize(const vertex_idx_t num_vertices, const unsigned) { node_lambda_map.resize(num_vertices); } + + /** + * @brief Reset all processor assignments for a specific node. + * @param node Node index to reset + */ + inline void reset_node(const vertex_idx_t node) { node_lambda_map[node].clear(); } - inline void initialize(const size_t num_vertices, const unsigned) { node_lambda_map.resize(num_vertices); } - inline void reset_node(const size_t node) { node_lambda_map[node].clear(); } + /** + * @brief Clear all data from the container. + */ inline void clear() { node_lambda_map.clear(); } - inline bool has_proc_entry(const size_t node, const unsigned proc) const { return (node_lambda_map[node].find(proc) != node_lambda_map[node].end()); } - inline bool has_no_proc_entry(const size_t node, const unsigned proc) const { return (node_lambda_map[node].find(proc) == node_lambda_map[node].end()); } - inline unsigned & get_proc_entry(const size_t node, const unsigned proc) { return node_lambda_map[node][proc]; } - inline bool increase_proc_count(const size_t node, const unsigned proc) { + /** + * @brief Check if a processor has an entry for a given node. 
+ * @param node Node index + * @param proc Processor ID + * @return true if the processor has at least one assignment to the node + */ + inline bool has_proc_entry(const vertex_idx_t node, const unsigned proc) const { + return (node_lambda_map[node].find(proc) != node_lambda_map[node].end()); + } + + /** + * @brief Check if a processor has no entry for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if the processor has no assignments to the node + */ + inline bool has_no_proc_entry(const vertex_idx_t node, const unsigned proc) const { + return (node_lambda_map[node].find(proc) == node_lambda_map[node].end()); + } + + /** + * @brief Get a reference to the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return Reference to the count (creates entry if it doesn't exist) + */ + inline unsigned &get_proc_entry(const vertex_idx_t node, const unsigned proc) { return node_lambda_map[node][proc]; } + + /** + * @brief Get the processor count for a given node (const version). + * @param node Node index + * @param proc Processor ID + * @return The count value for the processor at the node + * @pre has_proc_entry(node, proc) must be true + */ + inline unsigned get_proc_entry(const vertex_idx_t node, const unsigned proc) const { + assert(has_proc_entry(node, proc)); + return node_lambda_map[node].at(proc); + } + + /** + * @brief Get the number of different processors to which a node has children assigned. + * @param node Node index + * @return The count of different processors the node is sending to + */ + inline unsigned get_proc_count(const vertex_idx_t node) const { + return static_cast(node_lambda_map[node].size()); + } + + /** + * @brief Increase the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if this is the first assignment of this processor to the node + */ + inline bool increase_proc_count(const vertex_idx_t node, const unsigned proc) { if (has_proc_entry(node, proc)) { node_lambda_map[node][proc]++; return false; @@ -45,7 +122,14 @@ struct lambda_map_container { } } - inline bool decrease_proc_count(const size_t node, const unsigned proc) { + /** + * @brief Decrease the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if this was the last assignment of this processor to the node + * @pre has_proc_entry(node, proc) must be true + */ + inline bool decrease_proc_count(const vertex_idx_t node, const unsigned proc) { assert(has_proc_entry(node, proc)); if (node_lambda_map[node][proc] == 1) { node_lambda_map[node].erase(proc); @@ -56,40 +140,80 @@ struct lambda_map_container { } } - inline const auto & iterate_proc_entries(const size_t node) { - return node_lambda_map[node]; - } + /** + * @brief Get an iterable view of all processor entries for a node. + * @param node Node index + * @return Reference to the unordered_map of processor assignments for the node + */ + inline const auto &iterate_proc_entries(const vertex_idx_t node) { return node_lambda_map[node]; } }; +/** + * @brief Container for tracking child processor assignments in a BSP schedule using vectors. + * + * This structure tracks how many children a node has that are assigned to each processor. + * It uses a 2D vector for dense data, making it efficient when most processors may have + * children of nodes assigned to them, or when the processor count is relatively small. 
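 *
 * Editorial example (not part of the patch), assuming the template parameter is the
 * vertex index type: with 4 processors, a node v whose three children sit on
 * processors 0, 0 and 2 is stored as the row [2, 0, 1, 0]:
 *
 *     lambda_vector_container<std::size_t> lambda;
 *     lambda.initialize(num_vertices, 4);
 *     lambda.increase_proc_count(v, 0);   // true  (first child of v on proc 0)
 *     lambda.increase_proc_count(v, 0);   // false (count becomes 2)
 *     lambda.increase_proc_count(v, 2);   // true
 *     for (auto [proc, count] : lambda.iterate_proc_entries(v)) {
 *         // yields (0, 2) then (2, 1); zero entries are skipped
 *     }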
+ * + * For each node, the vector stores the count of children assigned to each processor, which is + * important for computing communication costs in BSP scheduling. + */ +template struct lambda_vector_container { - + + /** + * @brief Range adapter for iterating over non-zero processor entries. + * + * Provides a range-based for loop interface that automatically skips processors + * with zero assignments. + */ class lambda_vector_range { - private: - const std::vector & vec_; + private: + const std::vector &vec_; - public: + public: + /** + * @brief Iterator that skips zero entries in the lambda vector. + * + * Implements an input iterator that yields pairs of (processor_id, count) + * for all processors with non-zero assignment counts. + */ class lambda_vector_iterator { - + using iterator_category = std::input_iterator_tag; using value_type = std::pair; using difference_type = std::ptrdiff_t; - using pointer = value_type*; - using reference = value_type&; - private: - const std::vector& vec_; - size_t index_; - public: - - lambda_vector_iterator(const std::vector& vec) : vec_(vec), index_(0) { - // Advance to the first valid entry - while (index_ < vec_.size() && vec_[index_] == 0) { - ++index_; + using pointer = value_type *; + using reference = value_type &; + + private: + const std::vector &vec_; + unsigned index_; + + public: + /** + * @brief Construct iterator at the beginning, skipping initial zeros. + * @param vec Reference to the vector to iterate over + */ + lambda_vector_iterator(const std::vector &vec) : vec_(vec), index_(0) { + // Advance to the first valid entry + while (index_ < vec_.size() && vec_[index_] == 0) { + ++index_; + } } - } - lambda_vector_iterator(const std::vector& vec, size_t index) : vec_(vec), index_(index) {} + /** + * @brief Construct iterator at a specific position. + * @param vec Reference to the vector to iterate over + * @param index Starting index + */ + lambda_vector_iterator(const std::vector &vec, unsigned index) : vec_(vec), index_(index) {} - lambda_vector_iterator& operator++() { + /** + * @brief Advance to the next non-zero entry. + * @return Reference to this iterator + */ + lambda_vector_iterator &operator++() { ++index_; while (index_ < vec_.size() && vec_[index_] == 0) { ++index_; @@ -97,58 +221,152 @@ struct lambda_vector_container { return *this; } - value_type operator*() const { - return std::make_pair(static_cast(index_), vec_[index_]); - } + /** + * @brief Dereference to get (processor_id, count) pair. + * @return Pair of processor ID and its count + */ + value_type operator*() const { return std::make_pair(index_, vec_[index_]); } - bool operator==(const lambda_vector_iterator& other) const { - return index_ == other.index_; - } + /** + * @brief Check equality with another iterator. + * @param other Iterator to compare with + * @return true if both iterators point to the same position + */ + bool operator==(const lambda_vector_iterator &other) const { return index_ == other.index_; } - bool operator!=(const lambda_vector_iterator& other) const { - return !(*this == other); - } + /** + * @brief Check inequality with another iterator. + * @param other Iterator to compare with + * @return true if iterators point to different positions + */ + bool operator!=(const lambda_vector_iterator &other) const { return !(*this == other); } }; - lambda_vector_range(const std::vector& vec) : vec_(vec) {} + /** + * @brief Construct a range from a vector. 
+ * @param vec Reference to the vector to create range over + */ + lambda_vector_range(const std::vector &vec) : vec_(vec) {} + /// Get iterator to the first non-zero entry lambda_vector_iterator begin() { return lambda_vector_iterator(vec_); } - lambda_vector_iterator end() { return lambda_vector_iterator(vec_, vec_.size()); } + + /// Get iterator to the end + lambda_vector_iterator end() { return lambda_vector_iterator(vec_, static_cast(vec_.size())); } }; + /// 2D vector: for each node, stores processor assignment counts std::vector> node_lambda_vec; + + /// Number of processors in the system unsigned num_procs_ = 0; - inline void initialize(const size_t num_vertices, const unsigned num_procs) { - node_lambda_vec.assign(num_vertices, {num_procs}); - num_procs_ = num_procs; + /** + * @brief Initialize the container for a given number of vertices and processors. + * @param num_vertices Number of nodes in the schedule + * @param num_procs Number of processors in the system + */ + inline void initialize(const vertex_idx_t num_vertices, const unsigned num_procs) { + node_lambda_vec.assign(num_vertices, std::vector(num_procs, 0)); + num_procs_ = num_procs; } - inline void reset_node(const size_t node) { node_lambda_vec[node].assign(num_procs_, 0); } + /** + * @brief Reset all processor assignments for a specific node. + * @param node Node index to reset + */ + inline void reset_node(const vertex_idx_t node) { node_lambda_vec[node].assign(num_procs_, 0); } + + /** + * @brief Clear all data from the container. + */ inline void clear() { node_lambda_vec.clear(); } - inline bool has_proc_entry(const size_t node, const unsigned proc) const { return node_lambda_vec[node][proc] > 0; } - inline bool has_no_proc_entry(const size_t node, const unsigned proc) const { return node_lambda_vec[node][proc] == 0; } - inline unsigned & get_proc_entry(const size_t node, const unsigned proc) { return node_lambda_vec[node][proc]; } - inline unsigned get_proc_entry(const size_t node, const unsigned proc) const { + /** + * @brief Check if a processor has an entry for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if the processor has at least one assignment to the node + */ + inline bool has_proc_entry(const vertex_idx_t node, const unsigned proc) const { return node_lambda_vec[node][proc] > 0; } + + /** + * @brief Check if a processor has no entry for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if the processor has no assignments to the node + */ + inline bool has_no_proc_entry(const vertex_idx_t node, const unsigned proc) const { + return node_lambda_vec[node][proc] == 0; + } + + /** + * @brief Get a reference to the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return Reference to the count (allows modification) + */ + inline unsigned &get_proc_entry(const vertex_idx_t node, const unsigned proc) { return node_lambda_vec[node][proc]; } + + /** + * @brief Get the processor count for a given node (const version). 
+ * @param node Node index + * @param proc Processor ID + * @return The count value for the processor at the node + * @pre has_proc_entry(node, proc) must be true + */ + inline unsigned get_proc_entry(const vertex_idx_t node, const unsigned proc) const { assert(has_proc_entry(node, proc)); return node_lambda_vec[node][proc]; } - inline bool increase_proc_count(const size_t node, const unsigned proc) { + /** + * @brief Get the processor count for a given node (alias for compatibility). + * @param node Node index + * @param proc Processor ID + * @return The count value for the processor at the node + * @pre has_proc_entry(node, proc) must be true + */ + inline unsigned get_proc_count(const vertex_idx_t node) const { + unsigned count = 0; + for (unsigned proc = 0; proc < num_procs_; ++proc) { + if (node_lambda_vec[node][proc] > 0) { + ++count; + } + } + return count; + } + + /** + * @brief Increase the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if this is the first assignment of this processor to the node + */ + inline bool increase_proc_count(const vertex_idx_t node, const unsigned proc) { node_lambda_vec[node][proc]++; return node_lambda_vec[node][proc] == 1; } - inline bool decrease_proc_count(const size_t node, const unsigned proc) { + /** + * @brief Decrease the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if this was the last assignment of this processor to the node + * @pre has_proc_entry(node, proc) must be true + */ + inline bool decrease_proc_count(const vertex_idx_t node, const unsigned proc) { assert(has_proc_entry(node, proc)); node_lambda_vec[node][proc]--; return node_lambda_vec[node][proc] == 0; } - inline auto iterate_proc_entries(const size_t node) { - return lambda_vector_range(node_lambda_vec[node]); - } + /** + * @brief Get an iterable range over all non-zero processor entries for a node. + * @param node Node index + * @return Range object that can be used in range-based for loops + */ + inline auto iterate_proc_entries(const vertex_idx_t node) { return lambda_vector_range(node_lambda_vec[node]); } }; } // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp index 82ade586..cc8d8a5a 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp @@ -18,28 +18,39 @@ limitations under the License. 
#pragma once -#include -#include -#include -#include "osp/bsp/model/BspInstance.hpp" #include "lambda_container.hpp" -#include "../kl_active_schedule.hpp" +#include "osp/bsp/model/BspInstance.hpp" +#include +#include +#include + +namespace osp { template struct pre_move_comm_data { - comm_weight_t from_step_max_comm; - comm_weight_t from_step_second_max_comm; + struct step_info { + comm_weight_t max_comm; + comm_weight_t second_max_comm; + unsigned max_comm_count; + }; - comm_weight_t to_step_max_comm; - comm_weight_t to_step_second_max_comm; + std::unordered_map step_data; pre_move_comm_data() = default; - pre_move_comm_data(comm_weight_t from_max, comm_weight_t from_second_max, - comm_weight_t to_max, comm_weight_t to_second_max) - : from_step_max_comm(from_max), from_step_second_max_comm(from_second_max), - to_step_max_comm(to_max), to_step_second_max_comm(to_second_max) {} + void add_step(unsigned step, comm_weight_t max, comm_weight_t second, unsigned count) { + step_data[step] = {max, second, count}; + } + + bool get_step(unsigned step, step_info &info) const { + auto it = step_data.find(step); + if (it != step_data.end()) { + info = it->second; + return true; + } + return false; + } }; template @@ -51,167 +62,292 @@ struct max_comm_datastructure { const BspInstance *instance; const kl_active_schedule_t *active_schedule; - - struct comm_proc { - comm_weight_t comm; - unsigned proc; - - comm_proc() : comm(0), proc(0) {} - comm_proc(comm_weight_t c, unsigned p) : comm(c), proc(p) {} - - bool operator<(comm_proc const &rhs) const { - return (comm > rhs.comm) or (comm == rhs.comm and proc < rhs.proc); - } - }; - - std::vector> step_proc_send_sorted; - std::vector> step_proc_receive_sorted; - std::vector> step_proc_send; - std::vector> step_proc_receive; - - std::vector step_max_send_processor_count; - std::vector step_max_receive_processor_count; + std::vector> step_proc_send_; + std::vector> step_proc_receive_; + // Caches for fast cost calculation (Global Max/Second Max per step) std::vector step_max_comm_cache; std::vector step_second_max_comm_cache; + std::vector step_max_comm_count_cache; comm_weight_t max_comm_weight = 0; - lambda_vector_container node_lambda_map; - - inline comm_weight_t step_proc_send(unsigned step, unsigned proc) const { return step_proc_send[step][proc]; } - inline comm_weight_t& step_proc_send(unsigned step, unsigned proc) { return step_proc_send[step][proc]; } - inline comm_weight_t step_proc_receive(unsigned step, unsigned proc) const { return step_proc_receive[step][proc]; } - inline comm_weight_t& step_proc_receive(unsigned step, unsigned proc) { return step_proc_receive[step][proc]; } + lambda_vector_container node_lambda_map; - inline comm_weight_t step_max_send(unsigned step) const { return step_proc_send_sorted[step][0].comm; } - inline comm_weight_t step_second_max_send(unsigned step) const { - return step_proc_send_sorted[step][step_max_send_processor_count[step]].comm; - } + // Optimization: Scratchpad for update_datastructure_after_move to avoid allocations + std::vector affected_steps_list; + std::vector step_is_affected; - inline comm_weight_t step_max_receive(unsigned step) const { return step_proc_receive_sorted[step][0].comm; } - inline comm_weight_t step_second_max_receive(unsigned step) const { - return step_proc_receive_sorted[step][step_max_receive_processor_count[step]].comm; + inline comm_weight_t step_proc_send(unsigned step, unsigned proc) const { return step_proc_send_[step][proc]; } + inline comm_weight_t &step_proc_send(unsigned 
step, unsigned proc) { return step_proc_send_[step][proc]; } + inline comm_weight_t step_proc_receive(unsigned step, unsigned proc) const { + return step_proc_receive_[step][proc]; } + inline comm_weight_t &step_proc_receive(unsigned step, unsigned proc) { return step_proc_receive_[step][proc]; } inline comm_weight_t step_max_comm(unsigned step) const { return step_max_comm_cache[step]; } - inline comm_weight_t step_second_max_comm(unsigned step) const { - return step_second_max_comm_cache[step]; - } + inline comm_weight_t step_second_max_comm(unsigned step) const { return step_second_max_comm_cache[step]; } + inline unsigned step_max_comm_count(unsigned step) const { return step_max_comm_count_cache[step]; } - template - inline pre_move_comm_data get_pre_move_comm_data(const kl_move_struct& move) { - return pre_move_comm_data( - step_max_comm(move.from_step), step_second_max_comm(move.from_step), - step_max_comm(move.to_step), step_second_max_comm(move.to_step) - ); - } - - template - inline pre_move_comm_data_step get_pre_move_comm_data_step(unsigned step) const { - return pre_move_comm_data_step( - step_max_comm(step), step_second_max_comm(step), 0, 0 - ); - } - - inline void initialize( kl_active_schedule_t &kl_sched) { + inline void initialize(kl_active_schedule_t &kl_sched) { active_schedule = &kl_sched; - instance = & active_schedule->getInstance(); + instance = &active_schedule->getInstance(); const unsigned num_steps = active_schedule->num_steps(); const unsigned num_procs = instance->numberOfProcessors(); max_comm_weight = 0; - step_proc_send.assign(num_steps, std::vector(num_procs, 0)); - step_proc_receive.assign(num_steps, std::vector(num_procs, 0)); - - step_proc_send_sorted.assign(num_steps, std::vector(num_procs)); - step_proc_receive_sorted.assign(num_steps, std::vector(num_procs)); + step_proc_send_.assign(num_steps, std::vector(num_procs, 0)); + step_proc_receive_.assign(num_steps, std::vector(num_procs, 0)); - step_max_send_processor_count.assign(num_steps, 0); - step_max_receive_processor_count.assign(num_steps, 0); step_max_comm_cache.assign(num_steps, 0); step_second_max_comm_cache.assign(num_steps, 0); + step_max_comm_count_cache.assign(num_steps, 0); node_lambda_map.initialize(instance->getComputationalDag().num_vertices(), num_procs); + + // Initialize scratchpad + step_is_affected.assign(num_steps, false); + affected_steps_list.reserve(num_steps); } inline void clear() { - step_proc_send.clear(); - step_proc_receive.clear(); - step_proc_send_sorted.clear(); - step_proc_receive_sorted.clear(); - step_max_send_processor_count.clear(); - step_max_receive_processor_count.clear(); + step_proc_send_.clear(); + step_proc_receive_.clear(); step_max_comm_cache.clear(); step_second_max_comm_cache.clear(); + step_max_comm_count_cache.clear(); node_lambda_map.clear(); + affected_steps_list.clear(); + step_is_affected.clear(); } inline void arrange_superstep_comm_data(const unsigned step) { - for (unsigned p = 0; p < instance->numberOfProcessors(); ++p) { - step_proc_send_sorted[step][p] = {step_proc_send[step][p], p}; - step_proc_receive_sorted[step][p] = {step_proc_receive[step][p], p}; - } - std::sort(step_proc_send_sorted[step].begin(), step_proc_send_sorted[step].end()); - std::sort(step_proc_receive_sorted[step].begin(), step_proc_receive_sorted[step].end()); - - const comm_weight_t max_send = step_proc_send_sorted[step][0].comm; - unsigned send_count = 1; - while (send_count < instance->numberOfProcessors() && step_proc_send_sorted[step][send_count].comm == max_send) { 
- send_count++; + // Linear scan O(P) to find max, second_max and count + + // 1. Analyze Sends + comm_weight_t max_send = 0; + comm_weight_t second_max_send = 0; + unsigned max_send_count = 0; + + const auto &sends = step_proc_send_[step]; + for (const auto val : sends) { + if (val > max_send) { + second_max_send = max_send; + max_send = val; + max_send_count = 1; + } else if (val == max_send) { + max_send_count++; + } else if (val > second_max_send) { + second_max_send = val; + } } - step_max_send_processor_count[step] = send_count; - const comm_weight_t max_receive = step_proc_receive_sorted[step][0].comm; - unsigned receive_count = 1; - while (receive_count < instance->numberOfProcessors() && step_proc_receive_sorted[step][receive_count].comm == max_receive) { - receive_count++; + // 2. Analyze Receives + comm_weight_t max_receive = 0; + comm_weight_t second_max_receive = 0; + unsigned max_receive_count = 0; + + const auto &receives = step_proc_receive_[step]; + for (const auto val : receives) { + if (val > max_receive) { + second_max_receive = max_receive; + max_receive = val; + max_receive_count = 1; + } else if (val == max_receive) { + max_receive_count++; + } else if (val > second_max_receive) { + second_max_receive = val; + } } - step_max_receive_processor_count[step] = receive_count; - step_max_comm_cache[step] = std::max(max_send, max_receive); + // 3. Aggregate Global Stats + const comm_weight_t global_max = std::max(max_send, max_receive); + step_max_comm_cache[step] = global_max; - const comm_weight_t second_max_send = step_proc_send_sorted[step][send_count].comm; - const comm_weight_t second_max_receive = step_proc_receive_sorted[step][receive_count].comm; + unsigned global_count = 0; + if (max_send == global_max) + global_count += max_send_count; + if (max_receive == global_max) + global_count += max_receive_count; + step_max_comm_count_cache[step] = global_count; - step_second_max_comm_cache[step] = std::max(std::max(second_max_send, max_receive), std::max(max_send, second_max_receive)); + // Determine second max + comm_weight_t cand_send = (max_send == global_max) ? second_max_send : max_send; + comm_weight_t cand_recv = (max_receive == global_max) ? second_max_receive : max_receive; + step_second_max_comm_cache[step] = std::max(cand_send, cand_recv); } - void recompute_max_send_receive(unsigned step) { - arrange_superstep_comm_data(step); + void recompute_max_send_receive(unsigned step) { arrange_superstep_comm_data(step); } + + inline pre_move_comm_data get_pre_move_comm_data(const kl_move &move) { + pre_move_comm_data data; + std::unordered_set affected_steps; + + affected_steps.insert(move.from_step); + affected_steps.insert(move.to_step); + + const auto &graph = instance->getComputationalDag(); + + for (const auto &parent : graph.parents(move.node)) { + affected_steps.insert(active_schedule->assigned_superstep(parent)); + } + + for (unsigned step : affected_steps) { + data.add_step(step, step_max_comm(step), step_second_max_comm(step), step_max_comm_count(step)); + } + + return data; } - - void update_datastructure_after_move(const kl_move& move, unsigned start_step, unsigned end_step) { - + + void update_datastructure_after_move(const kl_move &move, unsigned, unsigned) { + const auto &graph = instance->getComputationalDag(); + + // --- 0. 
Prepare Scratchpad (Avoids Allocations) --- + for (unsigned step : affected_steps_list) { + if (step < step_is_affected.size()) + step_is_affected[step] = false; + } + affected_steps_list.clear(); + + auto mark_step = [&](unsigned step) { + if (step < step_is_affected.size() && !step_is_affected[step]) { + step_is_affected[step] = true; + affected_steps_list.push_back(step); + } + }; + + const VertexType node = move.node; + const unsigned from_step = move.from_step; + const unsigned to_step = move.to_step; + const unsigned from_proc = move.from_proc; + const unsigned to_proc = move.to_proc; + const comm_weight_t comm_w_node = graph.vertex_comm_weight(node); + + // --- 1. Handle Node Movement (Outgoing Edges: Node -> Children) --- + + if (from_step != to_step) { + // Case 1: Node changes Step + // Optimization: Fuse the loop to iterate lambda map only once. + + for (const auto [proc, count] : node_lambda_map.iterate_proc_entries(node)) { + // A. Remove Old (Sender: from_proc, Receiver: proc) + if (proc != from_proc) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(from_proc, proc); + // Optimization: check cost > 0 to avoid dirtying cache lines with +0 ops + if (cost > 0) { + step_proc_receive_[from_step][proc] -= cost; + step_proc_send_[from_step][from_proc] -= cost; + } + } + + // B. Add New (Sender: to_proc, Receiver: proc) + if (proc != to_proc) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(to_proc, proc); + if (cost > 0) { + step_proc_receive_[to_step][proc] += cost; + step_proc_send_[to_step][to_proc] += cost; + } + } + } + mark_step(from_step); + mark_step(to_step); + + } else if (from_proc != to_proc) { + // Case 2: Node stays in same Step, but changes Processor + + for (const auto [proc, count] : node_lambda_map.iterate_proc_entries(node)) { + // Remove Old (Sender: from_proc, Receiver: proc) + if (proc != from_proc) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(from_proc, proc); + if (cost > 0) { + step_proc_receive_[from_step][proc] -= cost; + step_proc_send_[from_step][from_proc] -= cost; + } + } + + // Add New (Sender: to_proc, Receiver: proc) + if (proc != to_proc) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(to_proc, proc); + if (cost > 0) { + step_proc_receive_[from_step][proc] += cost; + step_proc_send_[from_step][to_proc] += cost; + } + } + } + mark_step(from_step); + } + + // --- 2. Update Parents' Outgoing Communication (Parents → Node) --- + + if (from_proc != to_proc) { + for (const auto &parent : graph.parents(node)) { + const unsigned parent_step = active_schedule->assigned_superstep(parent); + // Fast boundary check + if (parent_step >= step_proc_send_.size()) + continue; + + const unsigned parent_proc = active_schedule->assigned_processor(parent); + const comm_weight_t comm_w_parent = graph.vertex_comm_weight(parent); + + const bool removed_from_proc = node_lambda_map.decrease_proc_count(parent, from_proc); + const bool added_to_proc = node_lambda_map.increase_proc_count(parent, to_proc); + + // 1. Handle Removal from from_proc + if (removed_from_proc) { + if (from_proc != parent_proc) { + const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, from_proc); + if (cost > 0) { + step_proc_send_[parent_step][parent_proc] -= cost; + step_proc_receive_[parent_step][from_proc] -= cost; + } + } + } + + // 2. 
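Both branches above apply the same delta rule: subtract the old (from_proc -> p) contribution and add the new (to_proc -> p) contribution for every processor p that holds at least one child of the moved node. A condensed standalone form of the same-step case, assuming a per-unit send-cost callback and flat per-processor tallies (all names here are hypothetical):

#include <cstdint>
#include <vector>

using commw = std::uint64_t;

inline void apply_proc_change(std::vector<commw> &send, std::vector<commw> &recv,
                              const std::vector<unsigned> &receiver_procs, // procs holding >= 1 child
                              commw node_comm_weight, unsigned from_proc, unsigned to_proc,
                              commw (*send_cost)(unsigned, unsigned)) {
    for (const unsigned p : receiver_procs) {
        if (p != from_proc) { // retract: from_proc used to send to p
            const commw c = node_comm_weight * send_cost(from_proc, p);
            send[from_proc] -= c;
            recv[p] -= c;
        }
        if (p != to_proc) {   // apply: to_proc now sends to p
            const commw c = node_comm_weight * send_cost(to_proc, p);
            send[to_proc] += c;
            recv[p] += c;
        }
    }
}

The patch additionally skips zero-cost contributions before touching the tables, which avoids dirtying cache lines with no-op updates.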
Handle Addition to to_proc + if (added_to_proc) { + if (to_proc != parent_proc) { + const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, to_proc); + if (cost > 0) { + step_proc_send_[parent_step][parent_proc] += cost; + step_proc_receive_[parent_step][to_proc] += cost; + } + } + } + + mark_step(parent_step); + } + } + + // --- 3. Re-arrange Affected Steps --- + for (unsigned step : affected_steps_list) { + arrange_superstep_comm_data(step); + } } void swap_steps(const unsigned step1, const unsigned step2) { - std::swap(step_proc_send[step1], step_proc_send[step2]); - std::swap(step_proc_receive[step1], step_proc_receive[step2]); - std::swap(step_proc_send_sorted[step1], step_proc_send_sorted[step2]); - std::swap(step_proc_receive_sorted[step1], step_proc_receive_sorted[step2]); - std::swap(step_max_send_processor_count[step1], step_max_send_processor_count[step2]); - std::swap(step_max_receive_processor_count[step1], step_max_receive_processor_count[step2]); + std::swap(step_proc_send_[step1], step_proc_send_[step2]); + std::swap(step_proc_receive_[step1], step_proc_receive_[step2]); std::swap(step_max_comm_cache[step1], step_max_comm_cache[step2]); std::swap(step_second_max_comm_cache[step1], step_second_max_comm_cache[step2]); + std::swap(step_max_comm_count_cache[step1], step_max_comm_count_cache[step2]); } void reset_superstep(unsigned step) { - std::fill(step_proc_send[step].begin(), step_proc_send[step].end(), 0); - std::fill(step_proc_receive[step].begin(), step_proc_receive[step].end(), 0); + std::fill(step_proc_send_[step].begin(), step_proc_send_[step].end(), 0); + std::fill(step_proc_receive_[step].begin(), step_proc_receive_[step].end(), 0); arrange_superstep_comm_data(step); } void compute_comm_datastructures(unsigned start_step, unsigned end_step) { for (unsigned step = start_step; step <= end_step; step++) { - std::fill(step_proc_send[step].begin(), step_proc_send[step].end(), 0); - std::fill(step_proc_receive[step].begin(), step_proc_receive[step].end(), 0); + std::fill(step_proc_send_[step].begin(), step_proc_send_[step].end(), 0); + std::fill(step_proc_receive_[step].begin(), step_proc_receive_[step].end(), 0); } - const auto & vec_sched = active_schedule->getVectorSchedule(); - const auto & graph = instance->getComputationalDag(); + const auto &vec_sched = active_schedule->getVectorSchedule(); + const auto &graph = instance->getComputationalDag(); for (const auto &u : graph.vertices()) { node_lambda_map.reset_node(u); @@ -220,23 +356,29 @@ struct max_comm_datastructure { const comm_weight_t comm_w = graph.vertex_comm_weight(u); max_comm_weight = std::max(max_comm_weight, comm_w); - bool has_child_on_other_proc = false; for (const auto &v : graph.children(u)) { const unsigned v_proc = vec_sched.assignedProcessor(v); - if (u_proc != v_proc) { - if (node_lambda_map.increase_proc_count(u, v_proc)) { - has_child_on_other_proc = true; - step_proc_receive[u_step][v_proc] += comm_w; + const unsigned v_step = vec_sched.assignedSuperstep(v); + const comm_weight_t comm_w_send_cost = (u_proc != v_proc) ? 
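The parent loop above only touches the send/receive tables when decrease_proc_count or increase_proc_count reports a boundary transition. The contract this implies, sketched as a toy counting map (an assumption inferred from the call sites, not the real node_lambda_map):

#include <vector>

struct proc_counts {
    std::vector<unsigned> count; // children of one node, per processor
    explicit proc_counts(unsigned num_procs) : count(num_procs, 0) {}
    bool increase(unsigned p) { return ++count[p] == 1; } // true: p gained its first child
    bool decrease(unsigned p) { return --count[p] == 0; } // true: p lost its last child
};

Only these 0 <-> 1 transitions matter, because a parent pays for an edge to a remote processor once, no matter how many of its children live there.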
comm_w * instance->sendCosts(u_proc, v_proc) : 0; + + if (node_lambda_map.increase_proc_count(u, v_proc)) { + if (u_proc != v_proc && comm_w_send_cost > 0) { + attribute_communication(comm_w_send_cost, u_step, u_proc, v_proc, v_step); } } } - - if(has_child_on_other_proc) - step_proc_send[u_step][u_proc] += comm_w; } - + for (unsigned step = start_step; step <= end_step; step++) { arrange_superstep_comm_data(step); } } -}; \ No newline at end of file + + inline void attribute_communication(const comm_weight_t &comm_w_send_cost, const unsigned u_step, const unsigned u_proc, const unsigned v_proc, + const unsigned) { + step_proc_receive_[u_step][v_proc] += comm_w_send_cost; + step_proc_send_[u_step][u_proc] += comm_w_send_cost; + } +}; + +} // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp index 922e049b..a27ebe9b 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp @@ -31,9 +31,9 @@ limitations under the License. #include "osp/auxiliary/datastructures/heaps/PairingHeap.hpp" #include "osp/auxiliary/misc.hpp" #include "osp/bsp/scheduler/ImprovementScheduler.hpp" +#include "osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp" #include "osp/graph_algorithms/directed_graph_edge_desc_util.hpp" #include "osp/graph_algorithms/directed_graph_util.hpp" -#include "osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp" #include "kl_active_schedule.hpp" #include "kl_util.hpp" @@ -46,12 +46,12 @@ struct kl_parameter { unsigned num_parallel_loops = 4; unsigned max_inner_iterations_reset = 500; - unsigned max_no_improvement_iterations = 50; + unsigned max_no_improvement_iterations = 50; constexpr static unsigned abort_scatter_nodes_violation_threshold = 500; constexpr static unsigned initial_violation_threshold = 250; - unsigned max_no_vioaltions_removed_backtrack_reset; + unsigned max_no_vioaltions_removed_backtrack_reset; unsigned remove_step_epocs; unsigned node_max_step_selection_epochs; unsigned max_no_vioaltions_removed_backtrack_for_remove_step_reset; @@ -61,7 +61,6 @@ struct kl_parameter { unsigned thread_min_range = 8; unsigned thread_range_gap = 0; - }; template @@ -76,12 +75,15 @@ struct kl_update_info { bool update_entire_from_step = false; kl_update_info() = default; - kl_update_info(VertexType n) : node(n), full_update(false), update_entire_to_step(false), update_entire_from_step(false) {} - kl_update_info(VertexType n, bool full) : node(n), full_update(full), update_entire_to_step(false), update_entire_from_step(false) {} + kl_update_info(VertexType n) + : node(n), full_update(false), update_entire_to_step(false), update_entire_from_step(false) {} + kl_update_info(VertexType n, bool full) + : node(n), full_update(full), update_entire_to_step(false), update_entire_from_step(false) {} }; -template +template class kl_improver : public ImprovementScheduler { static_assert(is_directed_graph_edge_desc_v, "Graph_t must satisfy the directed_graph concept"); @@ -89,7 +91,6 @@ class kl_improver : public ImprovementScheduler { static_assert(is_computational_dag_v, "Graph_t must satisfy the computational_dag concept"); protected: - constexpr static unsigned window_range = 2 * window_size + 1; constexpr static bool enable_quick_moves = true; constexpr static bool enable_preresolving_violations = true; @@ -124,7 +125,7 @@ 
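The full rebuild above charges each remote (node, receiving processor) pair exactly once. With hypothetical numbers: comm_weight(u) = 3, sendCosts(p0, p1) = 2, u on p0 with two children on p1. attribute_communication then adds 3 * 2 = 6 once to step_proc_send_[step(u)][p0] and once to step_proc_receive_[step(u)][p1]; the second child on p1 adds nothing, since only the first increase_proc_count for that processor returns true.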
class kl_improver : public ImprovementScheduler { double average_gain = 0.0; unsigned max_inner_iterations = 0; - unsigned no_improvement_iterations_reduce_penalty = 0; + unsigned no_improvement_iterations_reduce_penalty = 0; unsigned min_inner_iter = 0; unsigned no_improvement_iterations_increase_inner_iter = 0; unsigned step_selection_epoch_counter = 0; @@ -136,9 +137,13 @@ class kl_improver : public ImprovementScheduler { unsigned max_no_vioaltions_removed_backtrack = 0; inline unsigned num_steps() const { return end_step - start_step + 1; } - inline unsigned start_idx(const unsigned node_step) const { return node_step < start_step + window_size ? window_size - (node_step - start_step) : 0; } - inline unsigned end_idx(unsigned node_step) const { return node_step + window_size <= end_step ? window_range : window_range - (node_step + window_size - end_step); } - + inline unsigned start_idx(const unsigned node_step) const { + return node_step < start_step + window_size ? window_size - (node_step - start_step) : 0; + } + inline unsigned end_idx(unsigned node_step) const { + return node_step + window_size <= end_step ? window_range + : window_range - (node_step + window_size - end_step); + } }; bool compute_with_time_limit = false; @@ -151,33 +156,38 @@ class kl_improver : public ImprovementScheduler { kl_parameter parameters; std::mt19937 gen; - + active_schedule_t active_schedule; comm_cost_function_t comm_cost_f; std::vector thread_data_vec; std::vector thread_finished_vec; - - inline unsigned rel_step_idx(const unsigned node_step, const unsigned move_step) const { return (move_step >= node_step) ? ((move_step - node_step) + window_size) : (window_size - (node_step - move_step)); } - inline bool is_compatible(VertexType node, unsigned proc) const { return active_schedule.getInstance().isCompatible(node, proc); } - void set_start_step(const unsigned step, ThreadSearchContext& thread_data) { + inline unsigned rel_step_idx(const unsigned node_step, const unsigned move_step) const { + return (move_step >= node_step) ? 
((move_step - node_step) + window_size) + : (window_size - (node_step - move_step)); + } + inline bool is_compatible(VertexType node, unsigned proc) const { + return active_schedule.getInstance().isCompatible(node, proc); + } + + void set_start_step(const unsigned step, ThreadSearchContext &thread_data) { thread_data.start_step = step; thread_data.step_to_remove = step; thread_data.step_selection_counter = step; - + thread_data.average_gain = 0.0; thread_data.max_inner_iterations = parameters.max_inner_iterations_reset; thread_data.no_improvement_iterations_reduce_penalty = parameters.max_no_improvement_iterations / 5; thread_data.min_inner_iter = parameters.min_inner_iter_reset; thread_data.step_selection_epoch_counter = 0; - thread_data.no_improvement_iterations_increase_inner_iter = 10; + thread_data.no_improvement_iterations_increase_inner_iter = 10; thread_data.unlock_edge_backtrack_counter_reset = 0; - thread_data.unlock_edge_backtrack_counter = thread_data.unlock_edge_backtrack_counter_reset; + thread_data.unlock_edge_backtrack_counter = thread_data.unlock_edge_backtrack_counter_reset; thread_data.max_no_vioaltions_removed_backtrack = parameters.max_no_vioaltions_removed_backtrack_reset; } - - kl_move get_best_move(node_selection_container_t & affinity_table, vector_vertex_lock_manger & lock_manager, heap_datastructure & max_gain_heap) { + kl_move get_best_move(node_selection_container_t &affinity_table, + vector_vertex_lock_manger &lock_manager, heap_datastructure &max_gain_heap) { // To introduce non-determinism and help escape local optima, if there are multiple moves with the same // top gain, we randomly select one. We check up to `local_max` ties. const unsigned local_max = 50; @@ -198,24 +208,29 @@ class kl_improver : public ImprovementScheduler { return best_move; } - - inline void process_other_steps_best_move(const unsigned idx, const unsigned node_step, const VertexType& node, const cost_t affinity_current_proc_step, cost_t& max_gain, unsigned& max_proc, unsigned& max_step, const std::vector> &affinity_table_node) const { + + inline void process_other_steps_best_move(const unsigned idx, const unsigned node_step, const VertexType &node, + const cost_t affinity_current_proc_step, cost_t &max_gain, + unsigned &max_proc, unsigned &max_step, + const std::vector> &affinity_table_node) const { for (const unsigned p : proc_range.compatible_processors_vertex(node)) { if constexpr (active_schedule_t::use_memory_constraint) { - if( not active_schedule.memory_constraint.can_move(node, p, node_step + idx - window_size)) continue; + if (not active_schedule.memory_constraint.can_move(node, p, node_step + idx - window_size)) + continue; } const cost_t gain = affinity_current_proc_step - affinity_table_node[p][idx]; if (gain > max_gain) { max_gain = gain; max_proc = p; - max_step = idx; + max_step = idx; } } } template - kl_move compute_best_move(VertexType node, const std::vector> &affinity_table_node, ThreadSearchContext & thread_data) { + kl_move compute_best_move(VertexType node, const std::vector> &affinity_table_node, + ThreadSearchContext &thread_data) { const unsigned node_step = active_schedule.assigned_superstep(node); const unsigned node_proc = active_schedule.assigned_processor(node); @@ -228,7 +243,8 @@ class kl_improver : public ImprovementScheduler { unsigned idx = thread_data.start_idx(node_step); for (; idx < window_size; idx++) { - process_other_steps_best_move(idx, node_step, node, affinity_current_proc_step, max_gain, max_proc, max_step, affinity_table_node); + 
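The index helpers above map a superstep in the window [node_step - window_size, node_step + window_size] onto a slot in [0, window_range). A compile-time worked example for window_size = 2, standalone but mirroring the rel_step_idx shown above:

constexpr unsigned window_size = 2;
constexpr unsigned window_range = 2 * window_size + 1; // 5 slots

constexpr unsigned rel_step_idx(unsigned node_step, unsigned move_step) {
    return (move_step >= node_step) ? (move_step - node_step) + window_size
                                    : window_size - (node_step - move_step);
}

static_assert(rel_step_idx(5, 3) == 0, "two steps earlier -> leftmost slot");
static_assert(rel_step_idx(5, 5) == window_size, "same step -> centre slot");
static_assert(rel_step_idx(5, 7) == window_range - 1, "two steps later -> rightmost slot");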
process_other_steps_best_move(idx, node_step, node, affinity_current_proc_step, max_gain, max_proc, + max_step, affinity_table_node); } if constexpr (move_to_same_super_step) { @@ -237,14 +253,15 @@ class kl_improver : public ImprovementScheduler { continue; if constexpr (active_schedule_t::use_memory_constraint) { - if( not active_schedule.memory_constraint.can_move(node, proc, node_step + idx - window_size)) continue; + if (not active_schedule.memory_constraint.can_move(node, proc, node_step + idx - window_size)) + continue; } const cost_t gain = affinity_current_proc_step - affinity_table_node[proc][window_size]; if (gain > max_gain) { max_gain = gain; max_proc = proc; - max_step = idx; + max_step = idx; } } } @@ -253,20 +270,23 @@ class kl_improver : public ImprovementScheduler { const unsigned bound = thread_data.end_idx(node_step); for (; idx < bound; idx++) { - process_other_steps_best_move(idx, node_step, node, affinity_current_proc_step, max_gain, max_proc, max_step, affinity_table_node); + process_other_steps_best_move(idx, node_step, node, affinity_current_proc_step, max_gain, max_proc, + max_step, affinity_table_node); } return kl_move(node, max_gain, node_proc, node_step, max_proc, node_step + max_step - window_size); } - - kl_gain_update_info update_node_work_affinity_after_move(VertexType node, kl_move move, const pre_move_work_data & prev_work_data, std::vector> &affinity_table_node) { + + kl_gain_update_info update_node_work_affinity_after_move(VertexType node, kl_move move, + const pre_move_work_data &prev_work_data, + std::vector> &affinity_table_node) { const unsigned node_step = active_schedule.assigned_superstep(node); const work_weight_t vertex_weight = graph->vertex_work_weight(node); kl_gain_update_info update_info(node); - if (move.from_step == move.to_step) { - const unsigned lower_bound = move.from_step > window_size ? move.from_step - window_size : 0; + if (move.from_step == move.to_step) { + const unsigned lower_bound = move.from_step > window_size ? move.from_step - window_size : 0; if (lower_bound <= node_step && node_step <= move.from_step + window_size) { update_info.update_from_step = true; update_info.update_to_step = true; @@ -276,147 +296,242 @@ class kl_improver : public ImprovementScheduler { if (node_step == move.from_step) { const unsigned node_proc = active_schedule.assigned_processor(node); - const work_weight_t new_max_weight = active_schedule.get_step_max_work(move.from_step); - const work_weight_t new_second_max_weight = active_schedule.get_step_second_max_work(move.from_step); - const work_weight_t new_step_proc_work = active_schedule.get_step_processor_work(node_step, node_proc); - const work_weight_t prev_step_proc_work = (node_proc == move.from_proc) ? new_step_proc_work + graph->vertex_work_weight(move.node) : (node_proc == move.to_proc) ? new_step_proc_work - graph->vertex_work_weight(move.node) : new_step_proc_work; - const bool prev_is_sole_max_processor = (prev_work_data.from_step_max_work_processor_count == 1) && (prev_max_work == prev_step_proc_work); - const cost_t prev_node_proc_affinity = prev_is_sole_max_processor ? std::min(vertex_weight, prev_max_work - prev_second_max_work) : 0.0; - const bool new_is_sole_max_processor = (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && (new_max_weight == new_step_proc_work); - const cost_t new_node_proc_affinity = new_is_sole_max_processor ? 
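compute_best_move above reduces to a scan over (processor, window slot) candidates, where the gain of a candidate is the affinity of the current placement minus the candidate's affinity. A condensed sketch with a flat, hypothetical signature; it omits the centre-slot special case and the memory-constraint filter of the real code:

#include <limits>
#include <vector>

struct BestCandidate {
    double gain = std::numeric_limits<double>::lowest();
    unsigned proc = 0;
    unsigned slot = 0;
};

inline BestCandidate best_candidate(double affinity_current,
                                    const std::vector<std::vector<double>> &affinity, // [proc][slot]
                                    const std::vector<unsigned> &compatible_procs,
                                    unsigned first_slot, unsigned last_slot) {
    BestCandidate best;
    for (const unsigned p : compatible_procs) {
        for (unsigned s = first_slot; s < last_slot; ++s) {
            const double gain = affinity_current - affinity[p][s];
            if (gain > best.gain)
                best = {gain, p, s};
        }
    }
    return best;
}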
std::min(vertex_weight, new_max_weight - new_second_max_weight) : 0.0; - + const work_weight_t new_max_weight = active_schedule.get_step_max_work(move.from_step); + const work_weight_t new_second_max_weight = + active_schedule.get_step_second_max_work(move.from_step); + const work_weight_t new_step_proc_work = + active_schedule.get_step_processor_work(node_step, node_proc); + const work_weight_t prev_step_proc_work = + (node_proc == move.from_proc) ? new_step_proc_work + graph->vertex_work_weight(move.node) + : (node_proc == move.to_proc) ? new_step_proc_work - graph->vertex_work_weight(move.node) + : new_step_proc_work; + const bool prev_is_sole_max_processor = (prev_work_data.from_step_max_work_processor_count == 1) && + (prev_max_work == prev_step_proc_work); + const cost_t prev_node_proc_affinity = + prev_is_sole_max_processor ? std::min(vertex_weight, prev_max_work - prev_second_max_work) + : 0.0; + const bool new_is_sole_max_processor = + (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && + (new_max_weight == new_step_proc_work); + const cost_t new_node_proc_affinity = + new_is_sole_max_processor ? std::min(vertex_weight, new_max_weight - new_second_max_weight) + : 0.0; + const cost_t diff = new_node_proc_affinity - prev_node_proc_affinity; if (std::abs(diff) > EPSILON) { update_info.full_update = true; affinity_table_node[node_proc][window_size] += diff; // Use the pre-calculated diff - } - + } + if ((prev_max_work != new_max_weight) || update_info.full_update) { update_info.update_entire_from_step = true; - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { - if((proc == node_proc) || (proc == move.from_proc) || (proc == move.to_proc)) { + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + if ((proc == node_proc) || (proc == move.from_proc) || (proc == move.to_proc)) { continue; } - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, proc); - const cost_t prev_other_affinity = compute_same_step_affinity(prev_max_work, new_weight, prev_node_proc_affinity); - const cost_t other_affinity = compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); - - affinity_table_node[proc][window_size] += (other_affinity - prev_other_affinity); + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(node_step, proc); + const cost_t prev_other_affinity = + compute_same_step_affinity(prev_max_work, new_weight, prev_node_proc_affinity); + const cost_t other_affinity = + compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); + + affinity_table_node[proc][window_size] += (other_affinity - prev_other_affinity); } - } - + } + if (node_proc != move.from_proc && is_compatible(node, move.from_proc)) { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move.from_proc) + graph->vertex_work_weight(move.node); - const cost_t prev_other_affinity = compute_same_step_affinity(prev_max_work, prev_new_weight, prev_node_proc_affinity); - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move.from_proc); - const cost_t other_affinity = compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); - affinity_table_node[move.from_proc][window_size] += (other_affinity - prev_other_affinity); - } - + const work_weight_t prev_new_weight = + vertex_weight + 
active_schedule.get_step_processor_work(node_step, move.from_proc) + + graph->vertex_work_weight(move.node); + const cost_t prev_other_affinity = + compute_same_step_affinity(prev_max_work, prev_new_weight, prev_node_proc_affinity); + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(node_step, move.from_proc); + const cost_t other_affinity = + compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); + affinity_table_node[move.from_proc][window_size] += (other_affinity - prev_other_affinity); + } + if (node_proc != move.to_proc && is_compatible(node, move.to_proc)) { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move.to_proc) - graph->vertex_work_weight(move.node); - const cost_t prev_other_affinity = compute_same_step_affinity(prev_max_work, prev_new_weight, prev_node_proc_affinity); - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move.to_proc); - const cost_t other_affinity = compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); - affinity_table_node[move.to_proc][window_size] += (other_affinity - prev_other_affinity); + const work_weight_t prev_new_weight = + vertex_weight + active_schedule.get_step_processor_work(node_step, move.to_proc) - + graph->vertex_work_weight(move.node); + const cost_t prev_other_affinity = + compute_same_step_affinity(prev_max_work, prev_new_weight, prev_node_proc_affinity); + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(node_step, move.to_proc); + const cost_t other_affinity = + compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); + affinity_table_node[move.to_proc][window_size] += (other_affinity - prev_other_affinity); } - } else { + } else { const work_weight_t new_max_weight = active_schedule.get_step_max_work(move.from_step); - const unsigned idx = rel_step_idx(node_step, move.from_step); + const unsigned idx = rel_step_idx(node_step, move.from_step); if (prev_max_work != new_max_weight) { - update_info.update_entire_from_step = true; + update_info.update_entire_from_step = true; // update moving to all procs with special for move.from_proc - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(move.from_step, proc); + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(move.from_step, proc); if (proc == move.from_proc) { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(move.from_step, proc) + graph->vertex_work_weight(move.node); - const cost_t prev_affinity = prev_max_work < prev_new_weight ? static_cast(prev_new_weight) - static_cast(prev_max_work) : 0.0; - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; - affinity_table_node[proc][idx] += new_affinity - prev_affinity; + const work_weight_t prev_new_weight = + vertex_weight + active_schedule.get_step_processor_work(move.from_step, proc) + + graph->vertex_work_weight(move.node); + const cost_t prev_affinity = + prev_max_work < prev_new_weight + ? static_cast(prev_new_weight) - static_cast(prev_max_work) + : 0.0; + const cost_t new_affinity = + new_max_weight < new_weight + ? 
static_cast(new_weight) - static_cast(new_max_weight) + : 0.0; + affinity_table_node[proc][idx] += new_affinity - prev_affinity; } else if (proc == move.to_proc) { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(move.to_step, proc) - graph->vertex_work_weight(move.node); - const cost_t prev_affinity = prev_max_work < prev_new_weight ? static_cast(prev_new_weight) - static_cast(prev_max_work) : 0.0; - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; + const work_weight_t prev_new_weight = + vertex_weight + active_schedule.get_step_processor_work(move.to_step, proc) - + graph->vertex_work_weight(move.node); + const cost_t prev_affinity = + prev_max_work < prev_new_weight + ? static_cast(prev_new_weight) - static_cast(prev_max_work) + : 0.0; + const cost_t new_affinity = + new_max_weight < new_weight + ? static_cast(new_weight) - static_cast(new_max_weight) + : 0.0; affinity_table_node[proc][idx] += new_affinity - prev_affinity; } else { - const cost_t prev_affinity = prev_max_work < new_weight ? static_cast(new_weight) - static_cast(prev_max_work) : 0.0; - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; - affinity_table_node[proc][idx] += new_affinity - prev_affinity; + const cost_t prev_affinity = + prev_max_work < new_weight + ? static_cast(new_weight) - static_cast(prev_max_work) + : 0.0; + const cost_t new_affinity = + new_max_weight < new_weight + ? static_cast(new_weight) - static_cast(new_max_weight) + : 0.0; + affinity_table_node[proc][idx] += new_affinity - prev_affinity; } - } + } } else { // update only move.from_proc and move.to_proc if (is_compatible(node, move.from_proc)) { - const work_weight_t from_new_weight = vertex_weight + active_schedule.get_step_processor_work(move.from_step, move.from_proc); - const work_weight_t from_prev_new_weight = from_new_weight + graph->vertex_work_weight(move.node); - const cost_t from_prev_affinity = prev_max_work < from_prev_new_weight ? static_cast(from_prev_new_weight) - static_cast(prev_max_work) : 0.0; - - const cost_t from_new_affinity = new_max_weight < from_new_weight ? static_cast(from_new_weight) - static_cast(new_max_weight) : 0.0; + const work_weight_t from_new_weight = + vertex_weight + active_schedule.get_step_processor_work(move.from_step, move.from_proc); + const work_weight_t from_prev_new_weight = + from_new_weight + graph->vertex_work_weight(move.node); + const cost_t from_prev_affinity = + prev_max_work < from_prev_new_weight + ? static_cast(from_prev_new_weight) - static_cast(prev_max_work) + : 0.0; + + const cost_t from_new_affinity = + new_max_weight < from_new_weight + ? static_cast(from_new_weight) - static_cast(new_max_weight) + : 0.0; affinity_table_node[move.from_proc][idx] += from_new_affinity - from_prev_affinity; } if (is_compatible(node, move.to_proc)) { - const work_weight_t to_new_weight = vertex_weight + active_schedule.get_step_processor_work(move.to_step, move.to_proc); - const work_weight_t to_prev_new_weight = to_new_weight - graph->vertex_work_weight(move.node); - const cost_t to_prev_affinity = prev_max_work < to_prev_new_weight ? static_cast(to_prev_new_weight) - static_cast(prev_max_work) : 0.0; - - const cost_t to_new_affinity = new_max_weight < to_new_weight ? 
static_cast(to_new_weight) - static_cast(new_max_weight) : 0.0; + const work_weight_t to_new_weight = + vertex_weight + active_schedule.get_step_processor_work(move.to_step, move.to_proc); + const work_weight_t to_prev_new_weight = + to_new_weight - graph->vertex_work_weight(move.node); + const cost_t to_prev_affinity = + prev_max_work < to_prev_new_weight + ? static_cast(to_prev_new_weight) - static_cast(prev_max_work) + : 0.0; + + const cost_t to_new_affinity = + new_max_weight < to_new_weight + ? static_cast(to_new_weight) - static_cast(new_max_weight) + : 0.0; affinity_table_node[move.to_proc][idx] += to_new_affinity - to_prev_affinity; } } } } - - } else { + + } else { const unsigned node_proc = active_schedule.assigned_processor(node); - process_work_update_step(node, node_step, node_proc, vertex_weight, move.from_step, move.from_proc, graph->vertex_work_weight(move.node), prev_work_data.from_step_max_work, prev_work_data.from_step_second_max_work, prev_work_data.from_step_max_work_processor_count, update_info.update_from_step, update_info.update_entire_from_step, update_info.full_update, affinity_table_node); - process_work_update_step(node, node_step, node_proc, vertex_weight, move.to_step, move.to_proc, -graph->vertex_work_weight(move.node), prev_work_data.to_step_max_work, prev_work_data.to_step_second_max_work, prev_work_data.to_step_max_work_processor_count, update_info.update_to_step, update_info.update_entire_to_step, update_info.full_update, affinity_table_node); + process_work_update_step(node, node_step, node_proc, vertex_weight, move.from_step, move.from_proc, + graph->vertex_work_weight(move.node), prev_work_data.from_step_max_work, + prev_work_data.from_step_second_max_work, + prev_work_data.from_step_max_work_processor_count, update_info.update_from_step, + update_info.update_entire_from_step, update_info.full_update, affinity_table_node); + process_work_update_step(node, node_step, node_proc, vertex_weight, move.to_step, move.to_proc, + -graph->vertex_work_weight(move.node), prev_work_data.to_step_max_work, + prev_work_data.to_step_second_max_work, + prev_work_data.to_step_max_work_processor_count, update_info.update_to_step, + update_info.update_entire_to_step, update_info.full_update, affinity_table_node); } return update_info; } - void process_work_update_step(VertexType node, unsigned node_step, unsigned node_proc, work_weight_t vertex_weight, unsigned move_step, unsigned move_proc, work_weight_t move_correction_node_weight, const work_weight_t prev_move_step_max_work, const work_weight_t prev_move_step_second_max_work, unsigned prev_move_step_max_work_processor_count, bool & update_step, bool & update_entire_step, bool & full_update, std::vector> &affinity_table_node); - void update_node_work_affinity(node_selection_container_t &nodes, kl_move move, const pre_move_work_data & prev_work_data, std::map &recompute_max_gain); - void update_best_move(VertexType node, unsigned step, unsigned proc, node_selection_container_t &affinity_table, ThreadSearchContext & thread_data); - void update_best_move(VertexType node, unsigned step, node_selection_container_t &affinity_table, ThreadSearchContext & thread_data); - void update_max_gain(kl_move move, std::map &recompute_max_gain, ThreadSearchContext & thread_data); - void compute_work_affinity(VertexType node, std::vector> & affinity_table_node, ThreadSearchContext & thread_data); - - inline void recompute_node_max_gain(VertexType node, node_selection_container_t &affinity_table, ThreadSearchContext & thread_data) { + void 
process_work_update_step(VertexType node, unsigned node_step, unsigned node_proc, work_weight_t vertex_weight, + unsigned move_step, unsigned move_proc, work_weight_t move_correction_node_weight, + const work_weight_t prev_move_step_max_work, + const work_weight_t prev_move_step_second_max_work, + unsigned prev_move_step_max_work_processor_count, bool &update_step, + bool &update_entire_step, bool &full_update, + std::vector> &affinity_table_node); + void update_node_work_affinity(node_selection_container_t &nodes, kl_move move, + const pre_move_work_data &prev_work_data, + std::map &recompute_max_gain); + void update_best_move(VertexType node, unsigned step, unsigned proc, node_selection_container_t &affinity_table, + ThreadSearchContext &thread_data); + void update_best_move(VertexType node, unsigned step, node_selection_container_t &affinity_table, + ThreadSearchContext &thread_data); + void update_max_gain(kl_move move, std::map &recompute_max_gain, + ThreadSearchContext &thread_data); + void compute_work_affinity(VertexType node, std::vector> &affinity_table_node, + ThreadSearchContext &thread_data); + + inline void recompute_node_max_gain(VertexType node, node_selection_container_t &affinity_table, + ThreadSearchContext &thread_data) { const auto best_move = compute_best_move(node, affinity_table[node], thread_data); - thread_data.max_gain_heap.update(node, best_move); + thread_data.max_gain_heap.update(node, best_move); } - inline cost_t compute_same_step_affinity(const work_weight_t &max_work_for_step, const work_weight_t &new_weight, const cost_t &node_proc_affinity) { + inline cost_t compute_same_step_affinity(const work_weight_t &max_work_for_step, const work_weight_t &new_weight, + const cost_t &node_proc_affinity) { const cost_t max_work_after_removal = static_cast(max_work_for_step) - node_proc_affinity; if (new_weight > max_work_after_removal) { return new_weight - max_work_after_removal; } return 0.0; } - - inline cost_t apply_move(kl_move move, ThreadSearchContext & thread_data) { + + inline cost_t apply_move(kl_move move, ThreadSearchContext &thread_data) { active_schedule.apply_move(move, thread_data.active_schedule_data); - comm_cost_f.update_datastructure_after_move(move, thread_data.start_step, thread_data.end_step); + comm_cost_f.update_datastructure_after_move(move, thread_data.start_step, thread_data.end_step); cost_t change_in_cost = -move.gain; - change_in_cost += static_cast(thread_data.active_schedule_data.resolved_violations.size()) * thread_data.reward_penalty_strat.reward; - change_in_cost -= static_cast(thread_data.active_schedule_data.new_violations.size()) * thread_data.reward_penalty_strat.penalty; - + change_in_cost += static_cast(thread_data.active_schedule_data.resolved_violations.size()) * + thread_data.reward_penalty_strat.reward; + change_in_cost -= static_cast(thread_data.active_schedule_data.new_violations.size()) * + thread_data.reward_penalty_strat.penalty; + #ifdef KL_DEBUG - std::cout << "penalty: " << thread_data.reward_penalty_strat.penalty << " num violations: " << thread_data.active_schedule_data.current_violations.size() << " num new violations: " << thread_data.active_schedule_data.new_violations.size() << ", num resolved violations: " << thread_data.active_schedule_data.resolved_violations.size() << ", reward: " << thread_data.reward_penalty_strat.reward << std::endl; - std::cout << "apply move, previous cost: " << thread_data.active_schedule_data.cost << ", new cost: " << thread_data.active_schedule_data.cost + change_in_cost << ", " 
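The bookkeeping in apply_move above can be read as a single formula: the tracked cost changes by -gain, corrected by the reward for each resolved violation and the penalty for each newly created one. As a standalone check with made-up numbers:

#include <cstddef>

inline double cost_change(double gain, std::size_t resolved, std::size_t created,
                          double reward, double penalty) {
    return -gain + static_cast<double>(resolved) * reward - static_cast<double>(created) * penalty;
}

// e.g. gain = 4, two violations resolved at reward 3, one created at penalty 5:
// cost_change(4, 2, 1, 3, 5) == -4 + 6 - 5 == -3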
<< (thread_data.active_schedule_data.feasible ? "feasible," : "infeasible,") << std::endl; + std::cout << "penalty: " << thread_data.reward_penalty_strat.penalty + << " num violations: " << thread_data.active_schedule_data.current_violations.size() + << " num new violations: " << thread_data.active_schedule_data.new_violations.size() + << ", num resolved violations: " << thread_data.active_schedule_data.resolved_violations.size() + << ", reward: " << thread_data.reward_penalty_strat.reward << std::endl; + std::cout << "apply move, previous cost: " << thread_data.active_schedule_data.cost + << ", new cost: " << thread_data.active_schedule_data.cost + change_in_cost << ", " + << (thread_data.active_schedule_data.feasible ? "feasible," : "infeasible,") << std::endl; #endif - + thread_data.active_schedule_data.update_cost(change_in_cost); - + return change_in_cost; - } + } - void run_quick_moves(unsigned & inner_iter, ThreadSearchContext & thread_data, const cost_t change_in_cost, const VertexType best_move_node) { + void run_quick_moves(unsigned &inner_iter, ThreadSearchContext &thread_data, const cost_t change_in_cost, + const VertexType best_move_node) { #ifdef KL_DEBUG std::cout << "Starting quick moves sequence." << std::endl; #endif @@ -430,7 +545,7 @@ class kl_improver : public ImprovementScheduler { std::vector quick_moves_stack; quick_moves_stack.reserve(10 + thread_data.active_schedule_data.new_violations.size() * 2); - for (const auto& key_value_pair : thread_data.active_schedule_data.new_violations) { + for (const auto &key_value_pair : thread_data.active_schedule_data.new_violations) { const auto &key = key_value_pair.first; quick_moves_stack.push_back(key); } @@ -439,10 +554,12 @@ class kl_improver : public ImprovementScheduler { auto next_node_to_move = quick_moves_stack.back(); quick_moves_stack.pop_back(); - - thread_data.reward_penalty_strat.init_reward_penalty(static_cast(thread_data.active_schedule_data.current_violations.size()) + 1.0); + + thread_data.reward_penalty_strat.init_reward_penalty( + static_cast(thread_data.active_schedule_data.current_violations.size()) + 1.0); compute_node_affinities(next_node_to_move, thread_data.local_affinity_table, thread_data); - kl_move best_quick_move = compute_best_move(next_node_to_move, thread_data.local_affinity_table, thread_data); + kl_move best_quick_move = + compute_best_move(next_node_to_move, thread_data.local_affinity_table, thread_data); local_lock.insert(next_node_to_move); if (best_quick_move.gain <= std::numeric_limits::lowest()) { @@ -450,25 +567,28 @@ class kl_improver : public ImprovementScheduler { } #ifdef KL_DEBUG - std::cout << " >>> move node " << best_quick_move.node << " with gain " << best_quick_move.gain << ", from proc|step: " << best_quick_move.from_proc << "|" << best_quick_move.from_step << " to: " << best_quick_move.to_proc << "|" << best_quick_move.to_step << std::endl; + std::cout << " >>> move node " << best_quick_move.node << " with gain " << best_quick_move.gain + << ", from proc|step: " << best_quick_move.from_proc << "|" << best_quick_move.from_step + << " to: " << best_quick_move.to_proc << "|" << best_quick_move.to_step << std::endl; #endif - apply_move(best_quick_move, thread_data); + apply_move(best_quick_move, thread_data); inner_iter++; if (thread_data.active_schedule_data.new_violations.size() > 0) { bool abort = false; - for (const auto& key_value_pair : thread_data.active_schedule_data.new_violations) { + for (const auto &key_value_pair : thread_data.active_schedule_data.new_violations) { 
const auto &key = key_value_pair.first; - if(local_lock.find(key) != local_lock.end()) { + if (local_lock.find(key) != local_lock.end()) { abort = true; break; - } + } quick_moves_stack.push_back(key); } - if (abort) break; + if (abort) + break; } else if (thread_data.active_schedule_data.feasible) { break; @@ -476,11 +596,13 @@ class kl_improver : public ImprovementScheduler { } if (!thread_data.active_schedule_data.feasible) { - active_schedule.revert_schedule_to_bound(num_applied_moves, saved_cost ,true, comm_cost_f, thread_data.active_schedule_data, thread_data.start_step, thread_data.end_step); + active_schedule.revert_schedule_to_bound(num_applied_moves, saved_cost, true, comm_cost_f, + thread_data.active_schedule_data, thread_data.start_step, + thread_data.end_step); #ifdef KL_DEBUG std::cout << "Ending quick moves sequence with infeasible solution." << std::endl; #endif - } + } #ifdef KL_DEBUG else { std::cout << "Ending quick moves sequence with feasible solution." << std::endl; @@ -493,18 +615,19 @@ class kl_improver : public ImprovementScheduler { insert_gain_heap(thread_data); // Re-initialize the heap with the current state } - void resolve_violations(ThreadSearchContext & thread_data) { - auto & current_violations = thread_data.active_schedule_data.current_violations; + void resolve_violations(ThreadSearchContext &thread_data) { + auto ¤t_violations = thread_data.active_schedule_data.current_violations; unsigned num_violations = static_cast(current_violations.size()); if (num_violations > 0) { #ifdef KL_DEBUG_1 - std::cout << "thread " << thread_data.thread_id << ", Starting preresolving violations with " << num_violations << " initial violations" << std::endl; + std::cout << "thread " << thread_data.thread_id << ", Starting preresolving violations with " + << num_violations << " initial violations" << std::endl; #endif thread_data.reward_penalty_strat.init_reward_penalty(static_cast(num_violations) + 1.0); - std::unordered_set local_lock; + std::unordered_set local_lock; unsigned num_iter = 0; - const unsigned min_iter = num_violations / 4; + const unsigned min_iter = num_violations / 4; while (not current_violations.empty()) { std::uniform_int_distribution dis(0, current_violations.size() - 1); auto it = current_violations.begin(); @@ -514,14 +637,14 @@ class kl_improver : public ImprovementScheduler { const VertexType target_v = target(next_edge, *graph); const bool source_locked = local_lock.find(source_v) != local_lock.end(); const bool target_locked = local_lock.find(target_v) != local_lock.end(); - + if (source_locked && target_locked) { #ifdef KL_DEBUG_1 std::cout << "source, target locked" << std::endl; #endif break; } - + kl_move best_move; if (source_locked || target_locked) { const VertexType node = source_locked ? target_v : source_v; @@ -529,25 +652,32 @@ class kl_improver : public ImprovementScheduler { best_move = compute_best_move(node, thread_data.local_affinity_table, thread_data); } else { compute_node_affinities(source_v, thread_data.local_affinity_table, thread_data); - kl_move best_source_v_move = compute_best_move(source_v, thread_data.local_affinity_table, thread_data); + kl_move best_source_v_move = + compute_best_move(source_v, thread_data.local_affinity_table, thread_data); compute_node_affinities(target_v, thread_data.local_affinity_table, thread_data); - kl_move best_target_v_move = compute_best_move(target_v, thread_data.local_affinity_table, thread_data); - best_move = best_target_v_move.gain > best_source_v_move.gain ? 
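resolve_violations above draws a violated edge uniformly at random; the distribution over [0, size - 1] and the begin() iterator are visible in the loop. A generic sketch of such a draw for any forward-iterable container; the std::advance step is an assumption, as the advancing line itself is not part of this hunk:

#include <cstddef>
#include <iterator>
#include <random>

// precondition: !c.empty()
template <typename Container>
typename Container::const_iterator pick_uniform(const Container &c, std::mt19937 &gen) {
    std::uniform_int_distribution<std::size_t> dis(0, c.size() - 1);
    auto it = c.begin();
    std::advance(it, dis(gen)); // linear in the offset for forward iterators
    return it;
}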
std::move(best_target_v_move) : std::move(best_source_v_move); + kl_move best_target_v_move = + compute_best_move(target_v, thread_data.local_affinity_table, thread_data); + best_move = best_target_v_move.gain > best_source_v_move.gain ? std::move(best_target_v_move) + : std::move(best_source_v_move); } local_lock.insert(best_move.node); - if (best_move.gain <= std::numeric_limits::lowest()) continue; + if (best_move.gain <= std::numeric_limits::lowest()) + continue; apply_move(best_move, thread_data); thread_data.affinity_table.insert(best_move.node); #ifdef KL_DEBUG_1 - std::cout << "move node " << best_move.node << " with gain " << best_move.gain << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step << " to: " << best_move.to_proc << "|" << best_move.to_step << std::endl; + std::cout << "move node " << best_move.node << " with gain " << best_move.gain + << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step + << " to: " << best_move.to_proc << "|" << best_move.to_step << std::endl; #endif const unsigned new_num_violations = static_cast(current_violations.size()); - if (new_num_violations == 0) break; + if (new_num_violations == 0) + break; - if (thread_data.active_schedule_data.new_violations.size() > 0) { - for (const auto & vertex_edge_pair : thread_data.active_schedule_data.new_violations) { + if (thread_data.active_schedule_data.new_violations.size() > 0) { + for (const auto &vertex_edge_pair : thread_data.active_schedule_data.new_violations) { const auto &vertex = vertex_edge_pair.first; thread_data.affinity_table.insert(vertex); } @@ -557,20 +687,24 @@ class kl_improver : public ImprovementScheduler { num_violations = new_num_violations; update_avg_gain(gain, num_iter++, thread_data.average_gain); #ifdef KL_DEBUG_1 - std::cout << "thread " << thread_data.thread_id << ", preresolving violations with " << num_violations << " violations, " << num_iter << " #iterations, " << thread_data.average_gain << " average gain" << std::endl; + std::cout << "thread " << thread_data.thread_id << ", preresolving violations with " << num_violations + << " violations, " << num_iter << " #iterations, " << thread_data.average_gain + << " average gain" << std::endl; #endif if (num_iter > min_iter && thread_data.average_gain < 0.0) { break; } } thread_data.average_gain = 0.0; - } + } } - void run_local_search(ThreadSearchContext & thread_data) { + void run_local_search(ThreadSearchContext &thread_data) { #ifdef KL_DEBUG_1 - std::cout << "thread " << thread_data.thread_id << ", start local search, initial schedule cost: " << thread_data.active_schedule_data.cost << " with " << thread_data.num_steps() << " supersteps." << std::endl; + std::cout << "thread " << thread_data.thread_id + << ", start local search, initial schedule cost: " << thread_data.active_schedule_data.cost + << " with " << thread_data.num_steps() << " supersteps." 
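Both the violation pre-resolver above and the main loop steer on average_gain via update_avg_gain(gain, iteration, average_gain). Its body is not part of this hunk; a standard incremental mean consistent with that call shape would be (an assumption, for illustration only):

inline void update_avg_gain(double gain, unsigned iter, double &average_gain) {
    // running mean over iterations 0..iter: avg' = avg + (x - avg) / (n + 1)
    average_gain += (gain - average_gain) / static_cast<double>(iter + 1);
}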
<< std::endl; #endif std::vector new_nodes; std::vector unlock_nodes; @@ -584,74 +718,90 @@ class kl_improver : public ImprovementScheduler { for (; outer_iter < parameters.max_outer_iterations; outer_iter++) { cost_t initial_inner_iter_cost = thread_data.active_schedule_data.cost; - reset_inner_search_structures(thread_data); - select_active_nodes(thread_data); - thread_data.reward_penalty_strat.init_reward_penalty(static_cast(thread_data.active_schedule_data.current_violations.size()) + 1.0); + reset_inner_search_structures(thread_data); + select_active_nodes(thread_data); + thread_data.reward_penalty_strat.init_reward_penalty( + static_cast(thread_data.active_schedule_data.current_violations.size()) + 1.0); insert_gain_heap(thread_data); - + unsigned inner_iter = 0; unsigned violation_removed_count = 0; unsigned reset_counter = 0; bool iter_inital_feasible = thread_data.active_schedule_data.feasible; - + #ifdef KL_DEBUG std::cout << "------ start inner loop ------" << std::endl; std::cout << "initial node selection: {"; - for (size_t i = 0; i < thread_data.affinity_table.size() ; ++i) { + for (size_t i = 0; i < thread_data.affinity_table.size(); ++i) { std::cout << thread_data.affinity_table.get_selected_nodes()[i] << ", "; } std::cout << "}" << std::endl; #endif #ifdef KL_DEBUG_1 if (not iter_inital_feasible) { - std::cout << "initial solution not feasible, num violations: " << thread_data.active_schedule_data.current_violations.size() << ". Penalty: " << thread_data.reward_penalty_strat.penalty << ", reward: " << thread_data.reward_penalty_strat.reward << std::endl; + std::cout << "initial solution not feasible, num violations: " + << thread_data.active_schedule_data.current_violations.size() + << ". Penalty: " << thread_data.reward_penalty_strat.penalty + << ", reward: " << thread_data.reward_penalty_strat.reward << std::endl; } #endif #ifdef KL_DEBUG_COST_CHECK - active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); - if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) { - std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; - std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl; - } - if constexpr (active_schedule_t::use_memory_constraint) { - if ( not active_schedule.memory_constraint.satisfied_memory_constraint()) - std::cout << "memory constraint not satisfied" << std::endl; - } + active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); + if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001) { + std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() + << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; + std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" + << std::endl; + } + if constexpr (active_schedule_t::use_memory_constraint) { + if (not active_schedule.memory_constraint.satisfied_memory_constraint()) + std::cout << "memory constraint not satisfied" << std::endl; + } #endif - while (inner_iter < thread_data.max_inner_iterations && thread_data.max_gain_heap.size() > 0) { - kl_move best_move = get_best_move(thread_data.affinity_table, thread_data.lock_manager, thread_data.max_gain_heap); // locks best_move.node and removes it from node_selection + kl_move best_move 
=
+                get_best_move(thread_data.affinity_table, thread_data.lock_manager,
+                              thread_data.max_gain_heap); // locks best_move.node and removes it from node_selection
 
             if (best_move.gain <= std::numeric_limits<cost_t>::lowest()) {
                 break;
-            }  
+            }
 
             update_avg_gain(best_move.gain, inner_iter, thread_data.average_gain);
 
 #ifdef KL_DEBUG
-            std::cout << " >>> move node " << best_move.node << " with gain " << best_move.gain << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step << " to: " << best_move.to_proc << "|" << best_move.to_step << ",avg gain: " << thread_data.average_gain << std::endl;
+            std::cout << " >>> move node " << best_move.node << " with gain " << best_move.gain
+                      << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step
+                      << " to: " << best_move.to_proc << "|" << best_move.to_step
+                      << ",avg gain: " << thread_data.average_gain << std::endl;
 #endif
 
             if (inner_iter > thread_data.min_inner_iter && thread_data.average_gain < 0.0) {
 #ifdef KL_DEBUG
-                std::cout << "Negative average gain: " << thread_data.average_gain << ", end local search" << std::endl;
+                std::cout << "Negative average gain: " << thread_data.average_gain << ", end local search"
+                          << std::endl;
 #endif
-                break; 
+                break;
             }
 
 #ifdef KL_DEBUG
-            if (not active_schedule.getInstance().isCompatible(best_move.node, best_move.to_proc)) {
-                std::cout << "move to incompatibe node" << std::endl;
-            }
+                if (not active_schedule.getInstance().isCompatible(best_move.node, best_move.to_proc)) {
+                    std::cout << "move to incompatible node" << std::endl;
+                }
 #endif
 
             const auto prev_work_data = active_schedule.get_pre_move_work_data(best_move);
+            const typename comm_cost_function_t::pre_move_comm_data_t prev_comm_data =
+                comm_cost_f.get_pre_move_comm_data(best_move);
             const cost_t change_in_cost = apply_move(best_move, thread_data);
 
 #ifdef KL_DEBUG_COST_CHECK
             active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps();
-            if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) {
-                std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << thread_data.active_schedule_data.cost << std::endl;
-                std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl;
+            if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) >
+                0.00001) {
+                std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test()
+                          << ", current cost: " << thread_data.active_schedule_data.cost << std::endl;
+                std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<"
+                          << std::endl;
             }
             if constexpr (active_schedule_t::use_memory_constraint) {
-                if ( not active_schedule.memory_constraint.satisfied_memory_constraint())
+                if (not active_schedule.memory_constraint.satisfied_memory_constraint())
                     std::cout << "memory constraint not satisfied" << std::endl;
             }
 #endif
@@ -659,17 +809,21 @@ class kl_improver : public ImprovementScheduler {
             if (iter_inital_feasible && thread_data.active_schedule_data.new_violations.size() > 0) {
                 run_quick_moves(inner_iter, thread_data, change_in_cost, best_move.node);
 #ifdef KL_DEBUG_COST_CHECK
-                active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps();
-                if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) {
-                    std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << 
thread_data.active_schedule_data.cost << std::endl; - std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl; - } - if constexpr (active_schedule_t::use_memory_constraint) { - if ( not active_schedule.memory_constraint.satisfied_memory_constraint()) - std::cout << "memory constraint not satisfied" << std::endl; - } + active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); + if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > + 0.00001) { + std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() + << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; + std::cout + << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" + << std::endl; + } + if constexpr (active_schedule_t::use_memory_constraint) { + if (not active_schedule.memory_constraint.satisfied_memory_constraint()) + std::cout << "memory constraint not satisfied" << std::endl; + } #endif - continue; + continue; } } @@ -680,40 +834,47 @@ class kl_improver : public ImprovementScheduler { violation_removed_count++; if (violation_removed_count > 3) { - if (reset_counter < thread_data.max_no_vioaltions_removed_backtrack && ((not iter_inital_feasible) || (thread_data.active_schedule_data.cost < thread_data.active_schedule_data.best_cost))) { + if (reset_counter < thread_data.max_no_vioaltions_removed_backtrack && + ((not iter_inital_feasible) || (thread_data.active_schedule_data.cost < + thread_data.active_schedule_data.best_cost))) { thread_data.affinity_table.reset_node_selection(); thread_data.max_gain_heap.clear(); thread_data.lock_manager.clear(); - thread_data.selection_strategy.select_nodes_violations(thread_data.affinity_table, thread_data.active_schedule_data.current_violations, thread_data.start_step, thread_data.end_step); + thread_data.selection_strategy.select_nodes_violations( + thread_data.affinity_table, thread_data.active_schedule_data.current_violations, + thread_data.start_step, thread_data.end_step); #ifdef KL_DEBUG - std::cout << "Infeasible, and no violations resolved for 5 iterations, reset node selection" << std::endl; + std::cout + << "Infeasible, and no violations resolved for 5 iterations, reset node selection" + << std::endl; #endif - thread_data.reward_penalty_strat.init_reward_penalty(static_cast(thread_data.active_schedule_data.current_violations.size())); + thread_data.reward_penalty_strat.init_reward_penalty( + static_cast(thread_data.active_schedule_data.current_violations.size())); insert_gain_heap(thread_data); reset_counter++; inner_iter++; - continue; + continue; } else { #ifdef KL_DEBUG - std::cout << "Infeasible, and no violations resolved for 5 iterations, end local search" << std::endl; + std::cout << "Infeasible, and no violations resolved for 5 iterations, end local search" + << std::endl; #endif - break; + break; } } } } - - if(is_local_search_blocked(thread_data)) { + + if (is_local_search_blocked(thread_data)) { if (not blocked_edge_strategy(best_move.node, unlock_nodes, thread_data)) { - break; + break; } } thread_data.affinity_table.trim(); - - update_node_work_affinity(thread_data.affinity_table, best_move, prev_work_data, recompute_max_gain); - comm_cost_f.update_node_comm_affinity(best_move, thread_data, thread_data.reward_penalty_strat.penalty, thread_data.reward_penalty_strat.reward, recompute_max_gain, new_nodes); + update_affinities(best_move, thread_data, recompute_max_gain, new_nodes, 
             for (const auto v : unlock_nodes) {
                 thread_data.lock_manager.unlock(v);
@@ -736,12 +897,15 @@ class kl_improver : public ImprovementScheduler {
 #endif
 #ifdef KL_DEBUG_COST_CHECK
             active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps();
-            if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) {
-                std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << thread_data.active_schedule_data.cost << std::endl;
-                std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl;
+            if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) >
+                0.00001) {
+                std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test()
+                          << ", current cost: " << thread_data.active_schedule_data.cost << std::endl;
+                std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<"
+                          << std::endl;
             }
             if constexpr (active_schedule_t::use_memory_constraint) {
-                if ( not active_schedule.memory_constraint.satisfied_memory_constraint())
+                if (not active_schedule.memory_constraint.satisfied_memory_constraint())
                     std::cout << "memory constraint not satisfied" << std::endl;
             }
 #endif
@@ -755,31 +919,40 @@ class kl_improver : public ImprovementScheduler {
         }

 #ifdef KL_DEBUG
-        std::cout << "--- end inner loop after " << inner_iter << " inner iterations, gain heap size: " << thread_data.max_gain_heap.size() << ", outer iteraion " << outer_iter << "/" << parameters.max_outer_iterations << ", current cost: " << thread_data.active_schedule_data.cost << ", " << (thread_data.active_schedule_data.feasible ? "feasible" : "infeasible") << std::endl;
+        std::cout << "--- end inner loop after " << inner_iter
+                  << " inner iterations, gain heap size: " << thread_data.max_gain_heap.size()
+                  << ", outer iteration " << outer_iter << "/" << parameters.max_outer_iterations
+                  << ", current cost: " << thread_data.active_schedule_data.cost << ", "
+                  << (thread_data.active_schedule_data.feasible ? "feasible" : "infeasible") << std::endl;
"feasible" : "infeasible") << std::endl; #endif #ifdef KL_DEBUG_1 - const unsigned num_steps_tmp = thread_data.end_step; + const unsigned num_steps_tmp = thread_data.end_step; #endif - active_schedule.revert_to_best_schedule(thread_data.local_search_start_step, thread_data.step_to_remove, comm_cost_f, thread_data.active_schedule_data, thread_data.start_step, thread_data.end_step); + active_schedule.revert_to_best_schedule(thread_data.local_search_start_step, thread_data.step_to_remove, + comm_cost_f, thread_data.active_schedule_data, + thread_data.start_step, thread_data.end_step); #ifdef KL_DEBUG_1 if (thread_data.local_search_start_step > 0) { - if(num_steps_tmp == thread_data.end_step) { - std::cout << "thread " << thread_data.thread_id << ", removing step " << thread_data.step_to_remove << " succeded " << std::endl; + if (num_steps_tmp == thread_data.end_step) { + std::cout << "thread " << thread_data.thread_id << ", removing step " << thread_data.step_to_remove + << " succeded " << std::endl; } else { - std::cout << "thread " << thread_data.thread_id << ", removing step " << thread_data.step_to_remove << " failed " << std::endl; + std::cout << "thread " << thread_data.thread_id << ", removing step " << thread_data.step_to_remove + << " failed " << std::endl; } - } + } #endif - #ifdef KL_DEBUG_COST_CHECK active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); - if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) { - std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; - std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl; + if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001) { + std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() + << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; + std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" + << std::endl; } if constexpr (active_schedule_t::use_memory_constraint) { - if ( not active_schedule.memory_constraint.satisfied_memory_constraint()) + if (not active_schedule.memory_constraint.satisfied_memory_constraint()) std::cout << "memory constraint not satisfied" << std::endl; } #endif @@ -791,10 +964,11 @@ class kl_improver : public ImprovementScheduler { break; } } - + if (other_threads_finished(thread_data.thread_id)) { #ifdef KL_DEBUG_1 - std::cout << "thread " << thread_data.thread_id << ", other threads finished, end local search" << std::endl; + std::cout << "thread " << thread_data.thread_id << ", other threads finished, end local search" + << std::endl; #endif break; } @@ -804,38 +978,148 @@ class kl_improver : public ImprovementScheduler { if (no_improvement_iter_counter >= parameters.max_no_improvement_iterations) { #ifdef KL_DEBUG_1 - std::cout << "thread " << thread_data.thread_id << ", no improvement for " << parameters.max_no_improvement_iterations - << " iterations, end local search" << std::endl; + std::cout << "thread " << thread_data.thread_id << ", no improvement for " + << parameters.max_no_improvement_iterations << " iterations, end local search" + << std::endl; #endif break; - } + } } else { no_improvement_iter_counter = 0; - } - + } + adjust_local_search_parameters(outer_iter, no_improvement_iter_counter, thread_data); } #ifdef KL_DEBUG_1 - 
std::cout << "thread " << thread_data.thread_id << ", local search end after " << outer_iter << " outer iterations, current cost: " << thread_data.active_schedule_data.cost << " with " << thread_data.num_steps() << " supersteps, vs serial cost " << active_schedule.get_total_work_weight() << "." << std::endl; + std::cout << "thread " << thread_data.thread_id << ", local search end after " << outer_iter + << " outer iterations, current cost: " << thread_data.active_schedule_data.cost << " with " + << thread_data.num_steps() << " supersteps, vs serial cost " + << active_schedule.get_total_work_weight() << "." << std::endl; #endif thread_finished_vec[thread_data.thread_id] = true; - } bool other_threads_finished(const unsigned thread_id) { const size_t num_threads = thread_finished_vec.size(); - if(num_threads == 1) + if (num_threads == 1) return false; for (size_t i = 0; i < num_threads; i++) { - if (i != thread_id && !thread_finished_vec[i]) + if (i != thread_id && !thread_finished_vec[i]) return false; } return true; } - inline bool blocked_edge_strategy(VertexType node, std::vector & unlock_nodes, ThreadSearchContext & thread_data) { + inline void update_affinities(const kl_move &best_move, ThreadSearchContext &thread_data, + std::map &recompute_max_gain, + std::vector &new_nodes, + const pre_move_work_data> &prev_work_data, + const typename comm_cost_function_t::pre_move_comm_data_t &prev_comm_data) { + + if constexpr (comm_cost_function_t::is_max_comm_cost_function) { + comm_cost_f.update_node_comm_affinity( + best_move, thread_data, thread_data.reward_penalty_strat.penalty, + thread_data.reward_penalty_strat.reward, recompute_max_gain, + new_nodes); // this only updated reward/penalty, collects new_nodes, and fills recompute_max_gain + + // Determine the steps where max/second_max/max_count for work/comm changed + std::unordered_set changed_steps; + + // Check work changes for from_step + if (best_move.from_step == best_move.to_step) { + // Same step - check if max/second_max changed + const auto current_max = active_schedule.get_step_max_work(best_move.from_step); + const auto current_second_max = active_schedule.get_step_second_max_work(best_move.from_step); + const auto current_count = active_schedule.get_step_max_work_processor_count()[best_move.from_step]; + if (current_max != prev_work_data.from_step_max_work || + current_second_max != prev_work_data.from_step_second_max_work || + current_count != prev_work_data.from_step_max_work_processor_count) { + changed_steps.insert(best_move.from_step); + } + } else { + // Different steps - check both + const auto current_from_max = active_schedule.get_step_max_work(best_move.from_step); + const auto current_from_second_max = active_schedule.get_step_second_max_work(best_move.from_step); + const auto current_from_count = + active_schedule.get_step_max_work_processor_count()[best_move.from_step]; + if (current_from_max != prev_work_data.from_step_max_work || + current_from_second_max != prev_work_data.from_step_second_max_work || + current_from_count != prev_work_data.from_step_max_work_processor_count) { + changed_steps.insert(best_move.from_step); + } + + const auto current_to_max = active_schedule.get_step_max_work(best_move.to_step); + const auto current_to_second_max = active_schedule.get_step_second_max_work(best_move.to_step); + const auto current_to_count = active_schedule.get_step_max_work_processor_count()[best_move.to_step]; + if (current_to_max != prev_work_data.to_step_max_work || + current_to_second_max != 
prev_work_data.to_step_second_max_work || + current_to_count != prev_work_data.to_step_max_work_processor_count) { + changed_steps.insert(best_move.to_step); + } + } + + for (const auto &[step, step_info] : prev_comm_data.step_data) { + typename comm_cost_function_t::pre_move_comm_data_t::step_info current_info; + // Query current values + const auto current_max = comm_cost_f.comm_ds.step_max_comm(step); + const auto current_second_max = comm_cost_f.comm_ds.step_second_max_comm(step); + const auto current_count = comm_cost_f.comm_ds.step_max_comm_count(step); + + if (current_max != step_info.max_comm || current_second_max != step_info.second_max_comm || + current_count != step_info.max_comm_count) { + changed_steps.insert(step); + } + } + + // Recompute affinities for all active nodes + const size_t active_count = thread_data.affinity_table.size(); + for (size_t i = 0; i < active_count; ++i) { + const VertexType node = thread_data.affinity_table.get_selected_nodes()[i]; + + // Determine if this node needs affinity recomputation + // A node needs recomputation if it's in or adjacent to changed steps + const unsigned node_step = active_schedule.assigned_superstep(node); + + // Calculate window bounds for this node once + const int node_lower_bound = static_cast(node_step) - static_cast(window_size); + const unsigned node_upper_bound = node_step + window_size; + + bool needs_update = false; + // Check if any changed step falls within the node's window + for (unsigned step : changed_steps) { + if (static_cast(step) >= node_lower_bound && step <= node_upper_bound) { + needs_update = true; + break; + } + } + + if (needs_update) { + auto &affinity_table_node = thread_data.affinity_table.get_affinity_table(node); + + // Reset affinity table entries to zero + const unsigned num_procs = active_schedule.getInstance().numberOfProcessors(); + for (unsigned p = 0; p < num_procs; ++p) { + for (unsigned idx = 0; idx < affinity_table_node[p].size(); ++idx) { + affinity_table_node[p][idx] = 0; + } + } + + compute_node_affinities(node, affinity_table_node, thread_data); + recompute_max_gain[node] = kl_gain_update_info(node, true); + } + } + } else { + update_node_work_affinity(thread_data.affinity_table, best_move, prev_work_data, recompute_max_gain); + comm_cost_f.update_node_comm_affinity(best_move, thread_data, thread_data.reward_penalty_strat.penalty, + thread_data.reward_penalty_strat.reward, recompute_max_gain, + new_nodes); + } + } + + inline bool blocked_edge_strategy(VertexType node, std::vector &unlock_nodes, + ThreadSearchContext &thread_data) { if (thread_data.unlock_edge_backtrack_counter > 1) { for (const auto vertex_edge_pair : thread_data.active_schedule_data.new_violations) { const auto &e = vertex_edge_pair.second; @@ -849,7 +1133,8 @@ class kl_improver : public ImprovementScheduler { } } #ifdef KL_DEBUG - std::cout << "Nodes of violated edge locked, backtrack counter: " << thread_data.unlock_edge_backtrack_counter << std::endl; + std::cout << "Nodes of violated edge locked, backtrack counter: " + << thread_data.unlock_edge_backtrack_counter << std::endl; #endif thread_data.unlock_edge_backtrack_counter--; return true; @@ -857,94 +1142,114 @@ class kl_improver : public ImprovementScheduler { #ifdef KL_DEBUG std::cout << "Nodes of violated edge locked, end local search" << std::endl; #endif - return false; //or reset local search and initalize with violating nodes + return false; // or reset local search and initalize with violating nodes } } - inline void 
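The containment test inside update_affinities above is what keeps the incremental update cheap: a selected node is only recomputed when some changed step lies in its ±window_size window. A standalone sketch of the same predicate; the helper name is hypothetical, window_size mirrors the class template parameter:

    // True iff `step` lies within [node_step - window_size, node_step + window_size];
    // the lower bound is computed in signed arithmetic so steps near 0 do not underflow.
    template <unsigned window_size>
    bool step_in_window(unsigned node_step, unsigned step) {
        const int lower = static_cast<int>(node_step) - static_cast<int>(window_size);
        return static_cast<int>(step) >= lower && step <= node_step + window_size;
    }
    // e.g. with window_size = 1: step_in_window<1>(0, 1) holds, step_in_window<1>(0, 2) does not.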
-    inline void adjust_local_search_parameters(unsigned outer_iter, unsigned no_imp_counter, ThreadSearchContext & thread_data) {
-        if (no_imp_counter >= thread_data.no_improvement_iterations_reduce_penalty && thread_data.reward_penalty_strat.initial_penalty > 1.0) {
-            thread_data.reward_penalty_strat.initial_penalty = std::floor(std::sqrt(thread_data.reward_penalty_strat.initial_penalty));
+    inline void adjust_local_search_parameters(unsigned outer_iter, unsigned no_imp_counter,
+                                               ThreadSearchContext &thread_data) {
+        if (no_imp_counter >= thread_data.no_improvement_iterations_reduce_penalty &&
+            thread_data.reward_penalty_strat.initial_penalty > 1.0) {
+            thread_data.reward_penalty_strat.initial_penalty =
+                static_cast<cost_t>(std::floor(std::sqrt(thread_data.reward_penalty_strat.initial_penalty)));
             thread_data.unlock_edge_backtrack_counter_reset += 1;
             thread_data.no_improvement_iterations_reduce_penalty += 15;
 #ifdef KL_DEBUG_1
-            std::cout << "thread " << thread_data.thread_id << ", no improvement for " << thread_data.no_improvement_iterations_reduce_penalty
-                      << " iterations, reducing initial penalty to " << thread_data.reward_penalty_strat.initial_penalty << std::endl;
-#endif
-        }
+            std::cout << "thread " << thread_data.thread_id << ", no improvement for "
+                      << thread_data.no_improvement_iterations_reduce_penalty
+                      << " iterations, reducing initial penalty to " << thread_data.reward_penalty_strat.initial_penalty
+                      << std::endl;
+#endif
+        }

-        if (parameters.try_remove_step_after_num_outer_iterations > 0 && ((outer_iter + 1) % parameters.try_remove_step_after_num_outer_iterations) == 0) {
-            thread_data.step_selection_epoch_counter = 0;;
+        if (parameters.try_remove_step_after_num_outer_iterations > 0 &&
+            ((outer_iter + 1) % parameters.try_remove_step_after_num_outer_iterations) == 0) {
+            thread_data.step_selection_epoch_counter = 0;
 #ifdef KL_DEBUG
             std::cout << "reset remove epoch counter after " << outer_iter << " iterations." << std::endl;
 #endif
         }

-        if (no_imp_counter >= thread_data.no_improvement_iterations_increase_inner_iter ) {
+        if (no_imp_counter >= thread_data.no_improvement_iterations_increase_inner_iter) {
             thread_data.min_inner_iter = static_cast<unsigned>(std::ceil(thread_data.min_inner_iter * 2.2));
             thread_data.no_improvement_iterations_increase_inner_iter += 20;
 #ifdef KL_DEBUG_1
-            std::cout << "thread " << thread_data.thread_id << ", no improvement for " << thread_data.no_improvement_iterations_increase_inner_iter
-                      << " iterations, increasing min inner iter to " << thread_data.min_inner_iter << std::endl;
+            std::cout << "thread " << thread_data.thread_id << ", no improvement for "
+                      << thread_data.no_improvement_iterations_increase_inner_iter
+                      << " iterations, increasing min inner iter to " << thread_data.min_inner_iter << std::endl;
 #endif
         }
-    }
-
-    bool is_local_search_blocked(ThreadSearchContext & thread_data);
+    }
+
+    bool is_local_search_blocked(ThreadSearchContext &thread_data);
     void set_parameters(vertex_idx_t<Graph_t> num_nodes);
-    void reset_inner_search_structures(ThreadSearchContext & thread_data) const;
+    void reset_inner_search_structures(ThreadSearchContext &thread_data) const;
     void initialize_datastructures(BspSchedule<Graph_t> &schedule);
-    void print_heap(heap_datastructure & max_gain_heap) const;
+    void print_heap(heap_datastructure &max_gain_heap) const;
     void cleanup_datastructures();
-    void update_avg_gain(const cost_t gain, const unsigned num_iter, cost_t & average_gain);
-    void insert_gain_heap(ThreadSearchContext & thread_data);
-    void insert_new_nodes_gain_heap(std::vector<VertexType>& new_nodes, node_selection_container_t &nodes, ThreadSearchContext & thread_data);
+    void update_avg_gain(const cost_t gain, const unsigned num_iter, double &average_gain);
+    void insert_gain_heap(ThreadSearchContext &thread_data);
+    void insert_new_nodes_gain_heap(std::vector<VertexType> &new_nodes, node_selection_container_t &nodes,
+                                    ThreadSearchContext &thread_data);

-    inline void compute_node_affinities(VertexType node, std::vector<std::vector<cost_t>> & affinity_table_node, ThreadSearchContext & thread_data) {
+    inline void compute_node_affinities(VertexType node, std::vector<std::vector<cost_t>> &affinity_table_node,
+                                        ThreadSearchContext &thread_data) {
         compute_work_affinity(node, affinity_table_node, thread_data);
-        comm_cost_f.compute_comm_affinity(node, affinity_table_node, thread_data.reward_penalty_strat.penalty, thread_data.reward_penalty_strat.reward, thread_data.start_step, thread_data.end_step);
+        comm_cost_f.compute_comm_affinity(node, affinity_table_node, thread_data.reward_penalty_strat.penalty,
+                                          thread_data.reward_penalty_strat.reward, thread_data.start_step,
+                                          thread_data.end_step);
     }
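compute_node_affinities fills, for one node, a processors × (2·window_size + 1) table whose centre column (index window_size) holds the cost contribution of leaving the node in place; every gain query later reduces to a difference of two table entries. A minimal sketch of that lookup, assuming the row layout used above (the free function is illustrative only, not part of the patch):

    #include <vector>

    // gain(node -> (p, s)) = affinity[current_proc][window_size] - affinity[p][rel_idx(s)];
    // a positive value means moving the node to processor p in step s lowers the cost.
    template <typename cost_t>
    cost_t move_gain(const std::vector<std::vector<cost_t>> &affinity_table_node,
                     unsigned current_proc, unsigned window_size,
                     unsigned target_proc, unsigned target_idx) {
        return affinity_table_node[current_proc][window_size] - affinity_table_node[target_proc][target_idx];
    }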
-    void select_active_nodes(ThreadSearchContext & thread_data) {
+    void select_active_nodes(ThreadSearchContext &thread_data) {
         if (select_nodes_check_remove_superstep(thread_data.step_to_remove, thread_data)) {
             active_schedule.swap_empty_step_fwd(thread_data.step_to_remove, thread_data.end_step);
             thread_data.end_step--;
             thread_data.local_search_start_step = static_cast<unsigned>(thread_data.active_schedule_data.applied_moves.size());
-            thread_data.active_schedule_data.update_cost(-1.0 * static_cast<cost_t>(instance->synchronisationCosts()));
+            thread_data.active_schedule_data.update_cost(static_cast<cost_t>(-1.0 * instance->synchronisationCosts()));

             if constexpr (enable_preresolving_violations) {
                 resolve_violations(thread_data);
             }

             if (thread_data.active_schedule_data.current_violations.size() > parameters.initial_violation_threshold) {
-                active_schedule.revert_to_best_schedule(thread_data.local_search_start_step, thread_data.step_to_remove, comm_cost_f, thread_data.active_schedule_data, thread_data.start_step, thread_data.end_step);
+                active_schedule.revert_to_best_schedule(thread_data.local_search_start_step, thread_data.step_to_remove,
+                                                        comm_cost_f, thread_data.active_schedule_data,
+                                                        thread_data.start_step, thread_data.end_step);
             } else {
-                thread_data.unlock_edge_backtrack_counter = static_cast<unsigned>(thread_data.active_schedule_data.current_violations.size());
-                thread_data.max_inner_iterations = std::max(thread_data.unlock_edge_backtrack_counter * 5u, parameters.max_inner_iterations_reset);
-                thread_data.max_no_vioaltions_removed_backtrack = parameters.max_no_vioaltions_removed_backtrack_for_remove_step_reset;
-                #ifdef KL_DEBUG_1
-                std::cout << "thread " << thread_data.thread_id << ", Trying to remove step " << thread_data.step_to_remove << std::endl;
-                #endif
-                return;
-            }
+                thread_data.unlock_edge_backtrack_counter =
+                    static_cast<unsigned>(thread_data.active_schedule_data.current_violations.size());
+                thread_data.max_inner_iterations =
+                    std::max(thread_data.unlock_edge_backtrack_counter * 5u, parameters.max_inner_iterations_reset);
+                thread_data.max_no_vioaltions_removed_backtrack =
+                    parameters.max_no_vioaltions_removed_backtrack_for_remove_step_reset;
+#ifdef KL_DEBUG_1
+                std::cout << "thread " << thread_data.thread_id << ", Trying to remove step "
+                          << thread_data.step_to_remove << std::endl;
+#endif
+                return;
+            }
         }

-        //thread_data.step_to_remove = thread_data.start_step;
+        // thread_data.step_to_remove = thread_data.start_step;
         thread_data.local_search_start_step = 0;
-        thread_data.selection_strategy.select_active_nodes(thread_data.affinity_table, thread_data.start_step, thread_data.end_step);
+        thread_data.selection_strategy.select_active_nodes(thread_data.affinity_table, thread_data.start_step,
+                                                           thread_data.end_step);
     }

     bool check_remove_superstep(unsigned step);
-    bool select_nodes_check_remove_superstep(unsigned & step, ThreadSearchContext & thread_data);
+    bool select_nodes_check_remove_superstep(unsigned &step, ThreadSearchContext &thread_data);

-    bool scatter_nodes_superstep(unsigned step, ThreadSearchContext & thread_data) {
+    bool scatter_nodes_superstep(unsigned step, ThreadSearchContext &thread_data) {
         assert(step <= thread_data.end_step && thread_data.start_step <= step);
         bool abort = false;

-        for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
-            const std::vector<VertexType> step_proc_node_vec(active_schedule.getSetSchedule().step_processor_vertices[step][proc].begin(),active_schedule.getSetSchedule().step_processor_vertices[step][proc].end());
-            for (const auto &node : step_proc_node_vec) {
-
-                thread_data.reward_penalty_strat.init_reward_penalty(static_cast<cost_t>(thread_data.active_schedule_data.current_violations.size()) + 1.0);
+        for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
+            const std::vector<VertexType> step_proc_node_vec(
+                active_schedule.getSetSchedule().step_processor_vertices[step][proc].begin(),
+                active_schedule.getSetSchedule().step_processor_vertices[step][proc].end());
+            for (const auto &node : step_proc_node_vec) {
+
+                thread_data.reward_penalty_strat.init_reward_penalty(
+                    static_cast<cost_t>(thread_data.active_schedule_data.current_violations.size()) + 1.0);
                 compute_node_affinities(node, thread_data.local_affinity_table, thread_data);
                 kl_move best_move = compute_best_move(node, thread_data.local_affinity_table, thread_data);
@@ -954,37 +1259,43 @@ class kl_improver : public 
ImprovementScheduler { } apply_move(best_move, thread_data); - if (thread_data.active_schedule_data.current_violations.size() > parameters.abort_scatter_nodes_violation_threshold) { + if (thread_data.active_schedule_data.current_violations.size() > + parameters.abort_scatter_nodes_violation_threshold) { abort = true; break; } thread_data.affinity_table.insert(node); - //thread_data.selection_strategy.add_neighbours_to_selection(node, thread_data.affinity_table, thread_data.start_step, thread_data.end_step); + // thread_data.selection_strategy.add_neighbours_to_selection(node, thread_data.affinity_table, + // thread_data.start_step, thread_data.end_step); if (thread_data.active_schedule_data.new_violations.size() > 0) { - - for (const auto & vertex_edge_pair : thread_data.active_schedule_data.new_violations) { + + for (const auto &vertex_edge_pair : thread_data.active_schedule_data.new_violations) { const auto &vertex = vertex_edge_pair.first; thread_data.affinity_table.insert(vertex); } } #ifdef KL_DEBUG - std::cout << "move node " << best_move.node << " with gain " << best_move.gain << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step << " to: " << best_move.to_proc << "|" << best_move.to_step << std::endl; + std::cout << "move node " << best_move.node << " with gain " << best_move.gain + << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step + << " to: " << best_move.to_proc << "|" << best_move.to_step << std::endl; #endif #ifdef KL_DEBUG_COST_CHECK active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); - if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) { - std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; - std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl; + if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > + 0.00001) { + std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() + << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; + std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" + << std::endl; } if constexpr (active_schedule_t::use_memory_constraint) { - if ( not active_schedule.memory_constraint.satisfied_memory_constraint()) - std::cout << "memory constraint not satisfied" << std::endl; + if (not active_schedule.memory_constraint.satisfied_memory_constraint()) + std::cout << "memory constraint not satisfied" << std::endl; } #endif - } if (abort) { @@ -993,7 +1304,8 @@ class kl_improver : public ImprovementScheduler { } if (abort) { - active_schedule.revert_to_best_schedule(0, 0, comm_cost_f, thread_data.active_schedule_data, thread_data.start_step, thread_data.end_step); + active_schedule.revert_to_best_schedule(0, 0, comm_cost_f, thread_data.active_schedule_data, + thread_data.start_step, thread_data.end_step); thread_data.affinity_table.reset_node_selection(); return false; } @@ -1004,12 +1316,12 @@ class kl_improver : public ImprovementScheduler { if (num_threads == 1) { // single thread case active_schedule.set_cost(thread_data_vec[0].active_schedule_data.cost); active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); - return; + return; } unsigned write_cursor = thread_data_vec[0].end_step + 1; for (unsigned i = 1; i 
< num_threads; ++i) {
-            auto& thread = thread_data_vec[i];
+            auto &thread = thread_data_vec[i];
             if (thread.start_step <= thread.end_step) {
                 for (unsigned j = thread.start_step; j <= thread.end_step; ++j) {
                     if (j != write_cursor) {
@@ -1030,9 +1342,7 @@ class kl_improver : public ImprovementScheduler {
         gen = std::mt19937(rd());
     }

-    explicit kl_improver(unsigned seed) : ImprovementScheduler() {
-        gen = std::mt19937(seed);
-    }
+    explicit kl_improver(unsigned seed) : ImprovementScheduler() { gen = std::mt19937(seed); }

     virtual ~kl_improver() = default;
@@ -1041,24 +1351,24 @@ class kl_improver : public ImprovementScheduler {
             return RETURN_STATUS::BEST_FOUND;

         const unsigned num_threads = 1;
-
-        thread_data_vec.resize(num_threads);
+
+        thread_data_vec.resize(num_threads);
         thread_finished_vec.assign(num_threads, true);
         set_parameters(schedule.getInstance().numberOfVertices());
-        initialize_datastructures(schedule);
-        const cost_t initial_cost = active_schedule.get_cost();
+        initialize_datastructures(schedule);
+        const cost_t initial_cost = active_schedule.get_cost();
         const unsigned num_steps = schedule.numberOfSupersteps();
         set_start_step(0, thread_data_vec[0]);
-        thread_data_vec[0].end_step = (num_steps > 0) ? num_steps - 1 : 0;
+        thread_data_vec[0].end_step = (num_steps > 0) ? num_steps - 1 : 0;

-        auto & thread_data = this->thread_data_vec[0];
+        auto &thread_data = this->thread_data_vec[0];
         thread_data.active_schedule_data.initialize_cost(active_schedule.get_cost());
         thread_data.selection_strategy.setup(thread_data.start_step, thread_data.end_step);
-        run_local_search(thread_data);
-
-        synchronize_active_schedule(num_threads);
+        run_local_search(thread_data);
+
+        synchronize_active_schedule(num_threads);

         if (initial_cost > active_schedule.get_cost()) {
             active_schedule.write_schedule(schedule);
@@ -1076,81 +1386,100 @@ class kl_improver : public ImprovementScheduler {
     }

     virtual void setTimeQualityParameter(const double time_quality) { this->parameters.time_quality = time_quality; }
-    virtual void setSuperstepRemoveStrengthParameter(const double superstep_remove_strength) { this->parameters.superstep_remove_strength = superstep_remove_strength; }
-
-    virtual std::string getScheduleName() const {
-        return "kl_improver_" + comm_cost_f.name();
+    virtual void setSuperstepRemoveStrengthParameter(const double superstep_remove_strength) {
+        this->parameters.superstep_remove_strength = superstep_remove_strength;
     }
+
+    virtual std::string getScheduleName() const { return "kl_improver_" + comm_cost_f.name(); }
};

-template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
-void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::set_parameters(vertex_idx_t<Graph_t> num_nodes) {
-    const unsigned log_num_nodes = (num_nodes > 1) ? static_cast<unsigned>(std::log(num_nodes)) : 1;
+template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
+void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::set_parameters(
+    vertex_idx_t<Graph_t> num_nodes) {
+    const unsigned log_num_nodes = (num_nodes > 1) ? static_cast<unsigned>(std::log(num_nodes)) : 1;

     // Total number of outer iterations. Proportional to sqrt N.
-    parameters.max_outer_iterations = static_cast<unsigned>(std::sqrt(num_nodes) * (parameters.time_quality * 10.0) / parameters.num_parallel_loops);
+    parameters.max_outer_iterations =
+        static_cast<unsigned>(std::sqrt(num_nodes) * (parameters.time_quality * 10.0) / parameters.num_parallel_loops);

     // Number of times to reset the search for violations before giving up.
-    parameters.max_no_vioaltions_removed_backtrack_reset = parameters.time_quality < 0.75 ? 1 : parameters.time_quality < 1.0 ? 2 : 3;
+    parameters.max_no_vioaltions_removed_backtrack_reset = parameters.time_quality < 0.75 ? 1
+                                                           : parameters.time_quality < 1.0 ? 2
+                                                                                           : 3;

     // Parameters for the superstep removal heuristic.
-    parameters.max_no_vioaltions_removed_backtrack_for_remove_step_reset = 3 + static_cast<unsigned>(parameters.superstep_remove_strength * 7);
-    parameters.node_max_step_selection_epochs = parameters.superstep_remove_strength < 0.75 ? 1 : parameters.superstep_remove_strength < 1.0 ? 2 : 3;
+    parameters.max_no_vioaltions_removed_backtrack_for_remove_step_reset =
+        3 + static_cast<unsigned>(parameters.superstep_remove_strength * 7);
+    parameters.node_max_step_selection_epochs = parameters.superstep_remove_strength < 0.75 ? 1
+                                                : parameters.superstep_remove_strength < 1.0 ? 2
+                                                                                             : 3;
     parameters.remove_step_epocs = static_cast<unsigned>(parameters.superstep_remove_strength * 4.0);
-    parameters.min_inner_iter_reset = static_cast<unsigned>(log_num_nodes + log_num_nodes * (1.0 + parameters.time_quality));
-
+    parameters.min_inner_iter_reset =
+        static_cast<unsigned>(log_num_nodes + log_num_nodes * (1.0 + parameters.time_quality));
+
     if (parameters.remove_step_epocs > 0) {
-        parameters.try_remove_step_after_num_outer_iterations = parameters.max_outer_iterations / parameters.remove_step_epocs;
+        parameters.try_remove_step_after_num_outer_iterations =
+            parameters.max_outer_iterations / parameters.remove_step_epocs;
     } else {
         // Effectively disable superstep removal if remove_step_epocs is 0.
         parameters.try_remove_step_after_num_outer_iterations = parameters.max_outer_iterations + 1;
     }
-
+
     unsigned i = 0;
-    for (auto & thread : thread_data_vec) {
+    for (auto &thread : thread_data_vec) {
         thread.thread_id = i++;
         // The number of nodes to consider in each inner iteration. Proportional to log(N).
-        thread.selection_strategy.selection_threshold = static_cast<unsigned>(std::ceil(parameters.time_quality * 10 * log_num_nodes + log_num_nodes));
+        thread.selection_strategy.selection_threshold =
+            static_cast<unsigned>(std::ceil(parameters.time_quality * 10 * log_num_nodes + log_num_nodes));
    }

-    #ifdef KL_DEBUG_1
-    std::cout << "kl set parameter, number of nodes: " << num_nodes << std::endl;
-    std::cout << "max outer iterations: " << parameters.max_outer_iterations << std::endl;
-    std::cout << "max inner iterations: " << parameters.max_inner_iterations_reset << std::endl;
-    std::cout << "no improvement iterations reduce penalty: " << thread_data_vec[0].no_improvement_iterations_reduce_penalty << std::endl;
-    std::cout << "selction threshold: " << thread_data_vec[0].selection_strategy.selection_threshold << std::endl;
-    std::cout << "remove step epocs: " << parameters.remove_step_epocs << std::endl;
-    std::cout << "try remove step after num outer iterations: " << parameters.try_remove_step_after_num_outer_iterations << std::endl;
-    std::cout << "number of parallel loops: " << parameters.num_parallel_loops << std::endl;
-    #endif
+#ifdef KL_DEBUG_1
+    std::cout << "kl set parameter, number of nodes: " << num_nodes << std::endl;
+    std::cout << "max outer iterations: " << parameters.max_outer_iterations << std::endl;
+    std::cout << "max inner iterations: " << parameters.max_inner_iterations_reset << std::endl;
+    std::cout << "no improvement iterations reduce penalty: "
+              << thread_data_vec[0].no_improvement_iterations_reduce_penalty << std::endl;
+    std::cout << "selection threshold: " << thread_data_vec[0].selection_strategy.selection_threshold << std::endl;
+    std::cout << "remove step epocs: " << parameters.remove_step_epocs << std::endl;
+    std::cout << "try remove step after num outer iterations: " << parameters.try_remove_step_after_num_outer_iterations
+              << std::endl;
+    std::cout << "number of parallel loops: " << parameters.num_parallel_loops << std::endl;
+#endif
}
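For a feel of the scaling chosen above, a hand-evaluated example (rounded; assumes time_quality = 1.0 and num_parallel_loops = 1, other values exactly as set_parameters computes them):

    // N = 1'000'000, time_quality = 1.0, num_parallel_loops = 1:
    //   max_outer_iterations = sqrt(1e6) * (1.0 * 10.0) / 1 = 10'000
    //   log_num_nodes        = (unsigned)std::log(1e6)      = 13      (ln 1e6 ~ 13.8, truncated)
    //   selection_threshold  = ceil(1.0 * 10 * 13 + 13)     = 143
    // i.e. each outer iteration starts from roughly 143 selected nodes, and the local
    // search runs for at most 10'000 outer iterations.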
-template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
-void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::update_node_work_affinity(node_selection_container_t &nodes, kl_move move, const pre_move_work_data<v_workw_t<Graph_t>> & prev_work_data, std::map<VertexType, kl_gain_update_info> &recompute_max_gain) {
+template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
+void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::update_node_work_affinity(
+    node_selection_container_t &nodes, kl_move move, const pre_move_work_data<v_workw_t<Graph_t>> &prev_work_data,
+    std::map<VertexType, kl_gain_update_info> &recompute_max_gain) {
     const size_t active_count = nodes.size();
     for (size_t i = 0; i < active_count; ++i) {
         const VertexType node = nodes.get_selected_nodes()[i];
-
-        kl_gain_update_info update_info = update_node_work_affinity_after_move(node, move, prev_work_data, nodes.at(node));
+
+        kl_gain_update_info update_info =
+            update_node_work_affinity_after_move(node, move, prev_work_data, nodes.at(node));
         if (update_info.update_from_step || update_info.update_to_step) {
             recompute_max_gain[node] = update_info;
-        }
+        }
    }
}

-template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
-void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::update_max_gain(kl_move move, std::map<VertexType, kl_gain_update_info> &recompute_max_gain, ThreadSearchContext & thread_data) {
-    for (auto& pair : recompute_max_gain) {
+template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
+void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::update_max_gain(
+    kl_move move, std::map<VertexType, kl_gain_update_info> &recompute_max_gain, ThreadSearchContext &thread_data) {
+    for (auto &pair : recompute_max_gain) {
        if (pair.second.full_update) {
-            recompute_node_max_gain(pair.first, thread_data.affinity_table, thread_data);
+            recompute_node_max_gain(pair.first, thread_data.affinity_table, thread_data);
        } else {
            if (pair.second.update_entire_from_step) {
                update_best_move(pair.first, move.from_step, thread_data.affinity_table, thread_data);
            } else if (pair.second.update_from_step && is_compatible(pair.first, move.from_proc)) {
                update_best_move(pair.first, move.from_step, move.from_proc, thread_data.affinity_table, thread_data);
-            }
+            }

            if (move.from_step != move.to_step || not pair.second.update_entire_from_step) {
                if (pair.second.update_entire_to_step) {
@@ -1159,12 +1488,14 @@ void kl_improver
-void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::compute_work_affinity(VertexType node, std::vector<std::vector<cost_t>> & affinity_table_node, ThreadSearchContext & thread_data) {
+template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
+void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::compute_work_affinity(
+    VertexType node, std::vector<std::vector<cost_t>> &affinity_table_node, ThreadSearchContext &thread_data) {
    const unsigned node_step = active_schedule.assigned_superstep(node);
    const work_weight_t vertex_weight = graph->vertex_work_weight(node);
@@ -1176,108 +1507,157 @@ void kl_improver
        const cost_t max_work_for_step = static_cast<cost_t>(active_schedule.get_step_max_work(step));
-        for (const unsigned proc : proc_range.compatible_processors_vertex(node)) {
+        for (const unsigned proc : proc_range.compatible_processors_vertex(node)) {
            const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(step, proc);
            const cost_t work_diff = static_cast<cost_t>(new_weight) - max_work_for_step;
            affinity_table_node[proc][idx] = std::max(0.0, work_diff);
        }
    }

-    const unsigned node_proc = active_schedule.assigned_processor(node);
+    const unsigned node_proc = active_schedule.assigned_processor(node);
    const work_weight_t max_work_for_step = active_schedule.get_step_max_work(node_step);
-    const bool is_sole_max_processor = (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && (max_work_for_step == active_schedule.get_step_processor_work(node_step, node_proc));
-
-    const cost_t node_proc_affinity = is_sole_max_processor ? 
std::min(vertex_weight, max_work_for_step - active_schedule.get_step_second_max_work(node_step)) : 0.0; + const bool is_sole_max_processor = + (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && + (max_work_for_step == active_schedule.get_step_processor_work(node_step, node_proc)); + + const cost_t node_proc_affinity = + is_sole_max_processor + ? std::min(vertex_weight, max_work_for_step - active_schedule.get_step_second_max_work(node_step)) + : 0.0; affinity_table_node[node_proc][window_size] = node_proc_affinity; - - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { - if(proc == node_proc) + + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + if (proc == node_proc) continue; - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, proc); - affinity_table_node[proc][window_size] = compute_same_step_affinity(max_work_for_step, new_weight, node_proc_affinity); + const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, proc); + affinity_table_node[proc][window_size] = + compute_same_step_affinity(max_work_for_step, new_weight, node_proc_affinity); } -} +} -template -void kl_improver::process_work_update_step(VertexType node, unsigned node_step, unsigned node_proc, work_weight_t vertex_weight, unsigned move_step, unsigned move_proc, work_weight_t move_correction_node_weight, const work_weight_t prev_move_step_max_work, const work_weight_t prev_move_step_second_max_work, unsigned prev_move_step_max_work_processor_count, bool & update_step, bool & update_entire_step, bool & full_update, std::vector> & affinity_table_node) { - const unsigned lower_bound = move_step > window_size ? move_step - window_size : 0; +template +void kl_improver::process_work_update_step( + VertexType node, unsigned node_step, unsigned node_proc, work_weight_t vertex_weight, unsigned move_step, + unsigned move_proc, work_weight_t move_correction_node_weight, const work_weight_t prev_move_step_max_work, + const work_weight_t prev_move_step_second_max_work, unsigned prev_move_step_max_work_processor_count, + bool &update_step, bool &update_entire_step, bool &full_update, + std::vector> &affinity_table_node) { + const unsigned lower_bound = move_step > window_size ? move_step - window_size : 0; if (lower_bound <= node_step && node_step <= move_step + window_size) { update_step = true; - if (node_step == move_step) { - const work_weight_t new_max_weight = active_schedule.get_step_max_work(move_step); + if (node_step == move_step) { + const work_weight_t new_max_weight = active_schedule.get_step_max_work(move_step); const work_weight_t new_second_max_weight = active_schedule.get_step_second_max_work(move_step); const work_weight_t new_step_proc_work = active_schedule.get_step_processor_work(node_step, node_proc); - const work_weight_t prev_step_proc_work = (node_proc == move_proc) ? new_step_proc_work + move_correction_node_weight : new_step_proc_work; - const bool prev_is_sole_max_processor = (prev_move_step_max_work_processor_count == 1) && (prev_move_step_max_work == prev_step_proc_work); - const cost_t prev_node_proc_affinity = prev_is_sole_max_processor ? std::min(vertex_weight, prev_move_step_max_work - prev_move_step_second_max_work) : 0.0; + const work_weight_t prev_step_proc_work = + (node_proc == move_proc) ? 
new_step_proc_work + move_correction_node_weight : new_step_proc_work; + const bool prev_is_sole_max_processor = + (prev_move_step_max_work_processor_count == 1) && (prev_move_step_max_work == prev_step_proc_work); + const cost_t prev_node_proc_affinity = + prev_is_sole_max_processor + ? std::min(vertex_weight, prev_move_step_max_work - prev_move_step_second_max_work) + : 0.0; + + const bool new_is_sole_max_processor = + (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && + (new_max_weight == new_step_proc_work); + const cost_t new_node_proc_affinity = + new_is_sole_max_processor ? std::min(vertex_weight, new_max_weight - new_second_max_weight) : 0.0; - const bool new_is_sole_max_processor = (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && (new_max_weight == new_step_proc_work); - const cost_t new_node_proc_affinity = new_is_sole_max_processor ? std::min(vertex_weight, new_max_weight - new_second_max_weight) : 0.0; - const cost_t diff = new_node_proc_affinity - prev_node_proc_affinity; - const bool update_node_proc_affinity = std::abs(diff) > EPSILON; + const bool update_node_proc_affinity = std::abs(diff) > EPSILON; if (update_node_proc_affinity) { full_update = true; affinity_table_node[node_proc][window_size] += diff; } - + if ((prev_move_step_max_work != new_max_weight) || update_node_proc_affinity) { update_entire_step = true; - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { - if((proc == node_proc) || (proc == move_proc)) + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + if ((proc == node_proc) || (proc == move_proc)) continue; - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, proc); - const cost_t prev_other_affinity = compute_same_step_affinity(prev_move_step_max_work, new_weight, prev_node_proc_affinity); - const cost_t other_affinity = compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); - - affinity_table_node[proc][window_size] += (other_affinity - prev_other_affinity); + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(node_step, proc); + const cost_t prev_other_affinity = + compute_same_step_affinity(prev_move_step_max_work, new_weight, prev_node_proc_affinity); + const cost_t other_affinity = + compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); + + affinity_table_node[proc][window_size] += (other_affinity - prev_other_affinity); } } - + if (node_proc != move_proc && is_compatible(node, move_proc)) { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move_proc) + move_correction_node_weight; - const cost_t prev_other_affinity = compute_same_step_affinity(prev_move_step_max_work, prev_new_weight, prev_node_proc_affinity); - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move_proc); - const cost_t other_affinity = compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); - - affinity_table_node[move_proc][window_size] += (other_affinity - prev_other_affinity); - } + const work_weight_t prev_new_weight = vertex_weight + + active_schedule.get_step_processor_work(node_step, move_proc) + + move_correction_node_weight; + const cost_t prev_other_affinity = + compute_same_step_affinity(prev_move_step_max_work, prev_new_weight, prev_node_proc_affinity); + const work_weight_t new_weight = + vertex_weight + 
active_schedule.get_step_processor_work(node_step, move_proc); + const cost_t other_affinity = + compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); + + affinity_table_node[move_proc][window_size] += (other_affinity - prev_other_affinity); + } } else { const work_weight_t new_max_weight = active_schedule.get_step_max_work(move_step); const unsigned idx = rel_step_idx(node_step, move_step); - if (prev_move_step_max_work != new_max_weight) { + if (prev_move_step_max_work != new_max_weight) { update_entire_step = true; // update moving to all procs with special for move_proc - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(move_step, proc); + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(move_step, proc); if (proc != move_proc) { - const cost_t prev_affinity = prev_move_step_max_work < new_weight ? static_cast(new_weight) - static_cast(prev_move_step_max_work) : 0.0; - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; - affinity_table_node[proc][idx] += new_affinity - prev_affinity; + const cost_t prev_affinity = + prev_move_step_max_work < new_weight + ? static_cast(new_weight) - static_cast(prev_move_step_max_work) + : 0.0; + const cost_t new_affinity = + new_max_weight < new_weight + ? static_cast(new_weight) - static_cast(new_max_weight) + : 0.0; + affinity_table_node[proc][idx] += new_affinity - prev_affinity; } else { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(move_step, proc) + move_correction_node_weight; - const cost_t prev_affinity = prev_move_step_max_work < prev_new_weight ? static_cast(prev_new_weight) - static_cast(prev_move_step_max_work) : 0.0; - - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; + const work_weight_t prev_new_weight = vertex_weight + + active_schedule.get_step_processor_work(move_step, proc) + + move_correction_node_weight; + const cost_t prev_affinity = + prev_move_step_max_work < prev_new_weight + ? static_cast(prev_new_weight) - static_cast(prev_move_step_max_work) + : 0.0; + + const cost_t new_affinity = + new_max_weight < new_weight + ? static_cast(new_weight) - static_cast(new_max_weight) + : 0.0; affinity_table_node[proc][idx] += new_affinity - prev_affinity; } - } + } } else { // update only move_proc if (is_compatible(node, move_proc)) { - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(move_step, move_proc); + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(move_step, move_proc); const work_weight_t prev_new_weight = new_weight + move_correction_node_weight; - const cost_t prev_affinity = prev_move_step_max_work < prev_new_weight ? static_cast(prev_new_weight) - static_cast(prev_move_step_max_work) : 0.0; - - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; + const cost_t prev_affinity = + prev_move_step_max_work < prev_new_weight + ? static_cast(prev_new_weight) - static_cast(prev_move_step_max_work) + : 0.0; + + const cost_t new_affinity = new_max_weight < new_weight ? 
static_cast(new_weight) - + static_cast(new_max_weight) + : 0.0; affinity_table_node[move_proc][idx] += new_affinity - prev_affinity; } } @@ -1285,20 +1665,25 @@ void kl_improver -bool kl_improver::select_nodes_check_remove_superstep(unsigned & step_to_remove, ThreadSearchContext & thread_data) { - if (thread_data.step_selection_epoch_counter >= parameters.node_max_step_selection_epochs || thread_data.num_steps() < 3) { +template +bool kl_improver::select_nodes_check_remove_superstep(unsigned &step_to_remove, + ThreadSearchContext &thread_data) { + if (thread_data.step_selection_epoch_counter >= parameters.node_max_step_selection_epochs || + thread_data.num_steps() < 3) { return false; } - - for (step_to_remove = thread_data.step_selection_counter; step_to_remove <= thread_data.end_step; step_to_remove++) { - assert(step_to_remove >= thread_data.start_step && step_to_remove <= thread_data.end_step); + + for (step_to_remove = thread_data.step_selection_counter; step_to_remove <= thread_data.end_step; + step_to_remove++) { + assert(step_to_remove >= thread_data.start_step && step_to_remove <= thread_data.end_step); #ifdef KL_DEBUG - std::cout << "Checking to remove step " << step_to_remove << "/" << thread_data.end_step << std::endl; + std::cout << "Checking to remove step " << step_to_remove << "/" << thread_data.end_step << std::endl; #endif if (check_remove_superstep(step_to_remove)) { #ifdef KL_DEBUG - std::cout << "Checking to scatter step " << step_to_remove << "/" << thread_data.end_step << std::endl; + std::cout << "Checking to scatter step " << step_to_remove << "/" << thread_data.end_step << std::endl; #endif assert(step_to_remove >= thread_data.start_step && step_to_remove <= thread_data.end_step); if (scatter_nodes_superstep(step_to_remove, thread_data)) { @@ -1318,19 +1703,23 @@ bool kl_improver -bool kl_improver::check_remove_superstep(unsigned step) { - if (active_schedule.num_steps() < 2) +template +bool kl_improver::check_remove_superstep( + unsigned step) { + if (active_schedule.num_steps() < 2) return false; - + if (active_schedule.get_step_max_work(step) < instance->synchronisationCosts()) return true; return false; } -template -void kl_improver::reset_inner_search_structures(ThreadSearchContext & thread_data) const { +template +void kl_improver::reset_inner_search_structures( + ThreadSearchContext &thread_data) const { thread_data.unlock_edge_backtrack_counter = thread_data.unlock_edge_backtrack_counter_reset; thread_data.max_inner_iterations = parameters.max_inner_iterations_reset; thread_data.max_no_vioaltions_removed_backtrack = parameters.max_no_vioaltions_removed_backtrack_reset; @@ -1340,18 +1729,22 @@ void kl_improver -bool kl_improver::is_local_search_blocked(ThreadSearchContext & thread_data) { - for (const auto& pair : thread_data.active_schedule_data.new_violations) { +template +bool kl_improver::is_local_search_blocked( + ThreadSearchContext &thread_data) { + for (const auto &pair : thread_data.active_schedule_data.new_violations) { if (thread_data.lock_manager.is_locked(pair.first)) { - return true; + return true; } } return false; } -template -void kl_improver::initialize_datastructures(BspSchedule &schedule) { +template +void kl_improver::initialize_datastructures( + BspSchedule &schedule) { input_schedule = &schedule; instance = &schedule.getInstance(); graph = &instance->getComputationalDag(); @@ -1363,54 +1756,64 @@ void kl_improvernum_vertices()); - t_data.reward_penalty_strat.initialize(active_schedule, comm_cost_f.get_max_comm_weight_multiplied(), 
active_schedule.get_max_work_weight()); + t_data.lock_manager.initialize(graph->num_vertices()); + t_data.reward_penalty_strat.initialize(active_schedule, comm_cost_f.get_max_comm_weight_multiplied(), + active_schedule.get_max_work_weight()); t_data.selection_strategy.initialize(active_schedule, gen, t_data.start_step, t_data.end_step); - + t_data.local_affinity_table.resize(instance->numberOfProcessors()); for (unsigned i = 0; i < instance->numberOfProcessors(); ++i) { t_data.local_affinity_table[i].resize(window_range); } - } + } } -template -void kl_improver::update_avg_gain(const cost_t gain, const unsigned num_iter, cost_t & average_gain) { +template +void kl_improver::update_avg_gain( + const cost_t gain, const unsigned num_iter, double &average_gain) { average_gain = static_cast((average_gain * num_iter + gain)) / (num_iter + 1.0); } -template -void kl_improver::insert_gain_heap(ThreadSearchContext & thread_data) { +template +void kl_improver::insert_gain_heap( + ThreadSearchContext &thread_data) { const size_t active_count = thread_data.affinity_table.size(); for (size_t i = 0; i < active_count; ++i) { - const VertexType node = thread_data.affinity_table.get_selected_nodes()[i]; + const VertexType node = thread_data.affinity_table.get_selected_nodes()[i]; compute_node_affinities(node, thread_data.affinity_table.at(node), thread_data); const auto best_move = compute_best_move(node, thread_data.affinity_table[node], thread_data); thread_data.max_gain_heap.push(node, best_move); } } -template -void kl_improver::insert_new_nodes_gain_heap(std::vector& new_nodes, node_selection_container_t &nodes, ThreadSearchContext & thread_data) { +template +void kl_improver::insert_new_nodes_gain_heap( + std::vector &new_nodes, node_selection_container_t &nodes, ThreadSearchContext &thread_data) { for (const auto &node : new_nodes) { nodes.insert(node); compute_node_affinities(node, thread_data.affinity_table.at(node), thread_data); const auto best_move = compute_best_move(node, thread_data.affinity_table[node], thread_data); - thread_data.max_gain_heap.push(node, best_move); + thread_data.max_gain_heap.push(node, best_move); } } -template +template void kl_improver::cleanup_datastructures() { thread_data_vec.clear(); - active_schedule.clear(); + active_schedule.clear(); } -template -void kl_improver::print_heap(heap_datastructure & max_gain_heap) const { +template +void kl_improver::print_heap( + heap_datastructure &max_gain_heap) const { if (max_gain_heap.is_empty()) { std::cout << "heap is empty" << std::endl; @@ -1419,29 +1822,32 @@ void kl_improver -void kl_improver::update_best_move(VertexType node, unsigned step, unsigned proc, node_selection_container_t &affinity_table, ThreadSearchContext & thread_data) { +template +void kl_improver::update_best_move( + VertexType node, unsigned step, unsigned proc, node_selection_container_t &affinity_table, + ThreadSearchContext &thread_data) { const unsigned node_proc = active_schedule.assigned_processor(node); const unsigned node_step = active_schedule.assigned_superstep(node); - if((node_proc == proc) && (node_step == step)) + if ((node_proc == proc) && (node_step == step)) return; kl_move node_move = thread_data.max_gain_heap.get_value(node); cost_t max_gain = node_move.gain; - + unsigned max_proc = node_move.to_proc; unsigned max_step = node_move.to_step; @@ -1449,69 +1855,75 @@ void kl_improver max_gain) { max_gain = gain; max_proc = proc; - max_step = step; - } - + max_step = step; + } + const cost_t diff = max_gain - node_move.gain; if 
((std::abs(diff) > EPSILON) || (max_proc != node_move.to_proc) || (max_step != node_move.to_step)) { node_move.gain = max_gain; node_move.to_proc = max_proc; node_move.to_step = max_step; thread_data.max_gain_heap.update(node, node_move); - } + } } } - -template -void kl_improver::update_best_move(VertexType node, unsigned step, node_selection_container_t &affinity_table, ThreadSearchContext & thread_data) { - + +template +void kl_improver::update_best_move( + VertexType node, unsigned step, node_selection_container_t &affinity_table, ThreadSearchContext &thread_data) { + const unsigned node_proc = active_schedule.assigned_processor(node); const unsigned node_step = active_schedule.assigned_superstep(node); kl_move node_move = thread_data.max_gain_heap.get_value(node); cost_t max_gain = node_move.gain; - + unsigned max_proc = node_move.to_proc; unsigned max_step = node_move.to_step; if (max_step == step) { - recompute_node_max_gain(node, affinity_table, thread_data); - } else { + recompute_node_max_gain(node, affinity_table, thread_data); + } else { if (node_step != step) { const unsigned idx = rel_step_idx(node_step, step); - for (const unsigned p : proc_range.compatible_processors_vertex(node)) { + for (const unsigned p : proc_range.compatible_processors_vertex(node)) { if constexpr (active_schedule_t::use_memory_constraint) { - if( not active_schedule.memory_constraint.can_move(node, p, step)) continue; + if (not active_schedule.memory_constraint.can_move(node, p, step)) + continue; } - const cost_t gain = affinity_table[node][node_proc][window_size] - affinity_table[node][p][idx]; + const cost_t gain = affinity_table[node][node_proc][window_size] - affinity_table[node][p][idx]; if (gain > max_gain) { max_gain = gain; max_proc = p; - max_step = step; + max_step = step; } } } else { - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { if (proc == node_proc) continue; if constexpr (active_schedule_t::use_memory_constraint) { - if( not active_schedule.memory_constraint.can_move(node, proc, step)) continue; + if (not active_schedule.memory_constraint.can_move(node, proc, step)) + continue; } - const cost_t gain = affinity_table[node][node_proc][window_size] - affinity_table[node][proc][window_size]; + const cost_t gain = + affinity_table[node][node_proc][window_size] - affinity_table[node][proc][window_size]; if (gain > max_gain) { max_gain = gain; max_proc = proc; - max_step = step; + max_step = step; } } - } + } const cost_t diff = max_gain - node_move.gain; if ((std::abs(diff) > EPSILON) || (max_proc != node_move.to_proc) || (max_step != node_move.to_step)) { @@ -1519,8 +1931,8 @@ void kl_improver +template class kl_improver_test : public kl_improver { - + using VertexType = vertex_idx_t; using kl_move = kl_move_struct; using heap_datastructure = MaxPairingHeap; @@ -33,8 +34,7 @@ class kl_improver_test : public kl_improver; using node_selection_container_t = adaptive_affinity_table; - public: - + public: kl_improver_test() : kl_improver() { this->thread_data_vec.resize(1); this->thread_finished_vec.assign(1, true); @@ -42,18 +42,11 @@ class kl_improver_test : public kl_improveractive_schedule; } - active_schedule_t& get_active_schedule() { - return this->active_schedule; - } - - auto & get_affinity_table() { - return this->thread_data_vec[0].affinity_table; - } + auto &get_affinity_table() { return this->thread_data_vec[0].affinity_table; } - auto & get_comm_cost_f() { - return 
this->comm_cost_f; - } + auto &get_comm_cost_f() { return this->comm_cost_f; } void setup_schedule(BspSchedule &schedule) { this->thread_data_vec.resize(1); @@ -63,39 +56,33 @@ class kl_improver_test : public kl_improverthread_data_vec[0].active_schedule_data.initialize_cost(this->active_schedule.get_cost()); } - void apply_move_test(kl_move move) { - this->apply_move(move, this->thread_data_vec[0]); - } + void apply_move_test(kl_move move) { this->apply_move(move, this->thread_data_vec[0]); } - auto & get_max_gain_heap() { - return this->thread_data_vec[0].max_gain_heap; - } + auto &get_max_gain_heap() { return this->thread_data_vec[0].max_gain_heap; } - auto get_current_cost() { - return this->thread_data_vec[0].active_schedule_data.cost; - } + auto get_current_cost() { return this->thread_data_vec[0].active_schedule_data.cost; } - bool is_feasible() { - return this->thread_data_vec[0].active_schedule_data.feasible; - } + bool is_feasible() { return this->thread_data_vec[0].active_schedule_data.feasible; } void compute_violations_test() { this->active_schedule.compute_violations(this->thread_data_vec[0].active_schedule_data); } - node_selection_container_t& insert_gain_heap_test(const std::vector& n) { - this->thread_data_vec[0].affinity_table.initialize(this->active_schedule, n.size()); + node_selection_container_t &insert_gain_heap_test(const std::vector &n) { + this->thread_data_vec[0].reward_penalty_strat.penalty = 0.0; + this->thread_data_vec[0].reward_penalty_strat.reward = 0.0; + this->thread_data_vec[0].affinity_table.initialize(this->active_schedule, n.size()); for (const auto &node : n) { this->thread_data_vec[0].affinity_table.insert(node); } this->insert_gain_heap(this->thread_data_vec[0]); - - return this->thread_data_vec[0].affinity_table; + + return this->thread_data_vec[0].affinity_table; } - node_selection_container_t& insert_gain_heap_test_penalty(const std::vector& n) { + node_selection_container_t &insert_gain_heap_test_penalty(const std::vector &n) { this->thread_data_vec[0].affinity_table.initialize(this->active_schedule, n.size()); for (const auto &node : n) { this->thread_data_vec[0].affinity_table.insert(node); @@ -105,34 +92,35 @@ class kl_improver_test : public kl_improverinsert_gain_heap(this->thread_data_vec[0]); - return this->thread_data_vec[0].affinity_table; + return this->thread_data_vec[0].affinity_table; } - node_selection_container_t& insert_gain_heap_test_penalty_reward(const std::vector& n) { + node_selection_container_t &insert_gain_heap_test_penalty_reward(const std::vector &n) { this->thread_data_vec[0].affinity_table.initialize(this->active_schedule, n.size()); for (const auto &node : n) { this->thread_data_vec[0].affinity_table.insert(node); } - + this->thread_data_vec[0].reward_penalty_strat.init_reward_penalty(); this->thread_data_vec[0].reward_penalty_strat.reward = 15.0; this->insert_gain_heap(this->thread_data_vec[0]); - return this->thread_data_vec[0].affinity_table; + return this->thread_data_vec[0].affinity_table; } - void update_affinity_table_test(kl_move best_move, node_selection_container_t & node_selection) { + void update_affinity_table_test(kl_move best_move, node_selection_container_t &node_selection) { std::map recompute_max_gain; std::vector new_nodes; const auto prev_work_data = this->active_schedule.get_pre_move_work_data(best_move); + const auto prev_comm_data = this->comm_cost_f.get_pre_move_comm_data(best_move); this->apply_move(best_move, this->thread_data_vec[0]); - - this->update_node_work_affinity(node_selection, 
best_move, prev_work_data, recompute_max_gain); - this->comm_cost_f.update_node_comm_affinity(best_move, this->thread_data_vec[0], this->thread_data_vec[0].reward_penalty_strat.penalty, this->thread_data_vec[0].reward_penalty_strat.reward, recompute_max_gain, new_nodes); - } + this->thread_data_vec[0].affinity_table.trim(); + this->update_affinities(best_move, this->thread_data_vec[0], recompute_max_gain, new_nodes, prev_work_data, + prev_comm_data); + } auto run_inner_iteration_test() { @@ -141,25 +129,30 @@ class kl_improver_test : public kl_improverprint_heap(this->thread_data_vec[0].max_gain_heap); - kl_move best_move = this->get_best_move(this->thread_data_vec[0].affinity_table, this->thread_data_vec[0].lock_manager, this->thread_data_vec[0].max_gain_heap); // locks best_move.node and removes it from node_selection - + kl_move best_move = this->get_best_move( + this->thread_data_vec[0].affinity_table, this->thread_data_vec[0].lock_manager, + this->thread_data_vec[0].max_gain_heap); // locks best_move.node and removes it from node_selection + #ifdef KL_DEBUG - std::cout << "Best move: " << best_move.node << " gain: " << best_move.gain << ", from: " << best_move.from_step << "|" << best_move.from_proc << " to: " << best_move.to_step << "|" << best_move.to_proc << std::endl; + std::cout << "Best move: " << best_move.node << " gain: " << best_move.gain << ", from: " << best_move.from_step + << "|" << best_move.from_proc << " to: " << best_move.to_step << "|" << best_move.to_proc + << std::endl; #endif const auto prev_work_data = this->active_schedule.get_pre_move_work_data(best_move); + const auto prev_comm_data = this->comm_cost_f.get_pre_move_comm_data(best_move); this->apply_move(best_move, this->thread_data_vec[0]); this->thread_data_vec[0].affinity_table.trim(); - this->update_node_work_affinity(this->thread_data_vec[0].affinity_table, best_move, prev_work_data, recompute_max_gain); - this->comm_cost_f.update_node_comm_affinity(best_move, this->thread_data_vec[0], this->thread_data_vec[0].reward_penalty_strat.penalty, this->thread_data_vec[0].reward_penalty_strat.reward, recompute_max_gain, new_nodes); + this->update_affinities(best_move, this->thread_data_vec[0], recompute_max_gain, new_nodes, prev_work_data, + prev_comm_data); #ifdef KL_DEBUG - std::cout << "New nodes: { "; + std::cout << "New nodes: { "; for (const auto v : new_nodes) { std::cout << v << " "; - } - std::cout << "}" << std::endl; + } + std::cout << "}" << std::endl; #endif this->update_max_gain(best_move, recompute_max_gain, this->thread_data_vec[0]); @@ -168,10 +161,9 @@ class kl_improver_test : public kl_improver &schedule) { - this->active_schedule.write_schedule(schedule); - } + bool is_node_locked(VertexType node) const { return this->thread_data_vec[0].lock_manager.is_locked(node); } + void get_active_schedule_test(BspSchedule &schedule) { this->active_schedule.write_schedule(schedule); } }; } // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp index 12e0cfa6..80ed0e48 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp @@ -26,21 +26,31 @@ limitations under the License. 
#include "kl_improver.hpp" #include "comm_cost_modules/kl_total_comm_cost.hpp" #include "comm_cost_modules/kl_hyper_total_comm_cost.hpp" +#include "comm_cost_modules/kl_bsp_comm_cost.hpp" #include "osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp" namespace osp { +using double_cost_t = double; + template -using kl_total_comm_improver = kl_improver, MemoryConstraint_t, window_size, double>; +using kl_total_comm_improver = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; template, unsigned window_size = 1, bool use_node_communication_costs_arg = true> -using kl_total_comm_improver_local_mem_constr = kl_improver, MemoryConstraint_t, window_size, double>; +using kl_total_comm_improver_local_mem_constr = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; + +template +using kl_total_lambda_comm_improver = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; + +template, unsigned window_size = 1> +using kl_total_lambda_comm_improver_local_mem_constr = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; template -using kl_total_lambda_comm_improver = kl_improver, MemoryConstraint_t, window_size, double>; +using kl_bsp_comm_improver = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; template, unsigned window_size = 1> -using kl_total_lambda_comm_improver_local_mem_constr = kl_improver, MemoryConstraint_t, window_size, double>; +using kl_bsp_comm_improver_local_mem_constr = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; + } // namespace osp diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp index e87d7dbb..5946c7e5 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp @@ -23,6 +23,7 @@ limitations under the License. #include "kl_improver_mt.hpp" #include "comm_cost_modules/kl_total_comm_cost.hpp" #include "comm_cost_modules/kl_hyper_total_comm_cost.hpp" +#include "comm_cost_modules/kl_bsp_comm_cost.hpp" namespace osp { @@ -32,5 +33,9 @@ using kl_total_comm_improver_mt = kl_improver_mt using kl_total_lambda_comm_improver_mt = kl_improver_mt, MemoryConstraint_t, window_size, double>; +template +using kl_bsp_comm_improver_mt = kl_improver_mt, MemoryConstraint_t, window_size, double>; + + } // namespace osp diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp index 24337f05..7f3bb29d 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp @@ -18,32 +18,32 @@ limitations under the License. 
#pragma once -#include #include "kl_active_schedule.hpp" +#include namespace osp { template struct reward_penalty_strategy { - + kl_active_schedule_t *active_schedule; cost_t max_weight; unsigned violations_threshold = 0; cost_t initial_penalty = 10.0; cost_t penalty = 0; - cost_t reward = 0; + cost_t reward = 0; - void initialize(kl_active_schedule_t & sched, const cost_t max_comm, const cost_t max_work) { + void initialize(kl_active_schedule_t &sched, const cost_t max_comm, const cost_t max_work) { max_weight = std::max(max_work, max_comm * sched.getInstance().communicationCosts()); active_schedule = &sched; - initial_penalty = std::sqrt(max_weight); + initial_penalty = static_cast(std::sqrt(max_weight)); } - + void init_reward_penalty(double multiplier = 1.0) { - multiplier = std::min(multiplier, 10.0); - penalty = initial_penalty * multiplier; - reward = max_weight * multiplier; + multiplier = std::min(multiplier, 10.0); + penalty = static_cast(initial_penalty * multiplier); + reward = static_cast(max_weight * multiplier); } }; @@ -52,23 +52,15 @@ struct set_vertex_lock_manger { std::unordered_set locked_nodes; - void initialize(size_t ) {} + void initialize(size_t) {} - void lock(VertexType node) { - locked_nodes.insert(node); - } + void lock(VertexType node) { locked_nodes.insert(node); } - void unlock(VertexType node) { - locked_nodes.erase(node); - } + void unlock(VertexType node) { locked_nodes.erase(node); } - bool is_locked(VertexType node) { - return locked_nodes.find(node) != locked_nodes.end(); - } + bool is_locked(VertexType node) { return locked_nodes.find(node) != locked_nodes.end(); } - void clear() { - locked_nodes.clear(); - } + void clear() { locked_nodes.clear(); } }; template @@ -76,25 +68,15 @@ struct vector_vertex_lock_manger { std::vector locked_nodes; - void initialize(size_t num_nodes) { - locked_nodes.resize(num_nodes); - } + void initialize(size_t num_nodes) { locked_nodes.resize(num_nodes); } - void lock(VertexType node) { - locked_nodes[node] = true; - } + void lock(VertexType node) { locked_nodes[node] = true; } - void unlock(VertexType node) { - locked_nodes[node] = false; - } + void unlock(VertexType node) { locked_nodes[node] = false; } - bool is_locked(VertexType node) { - return locked_nodes[node]; - } + bool is_locked(VertexType node) { return locked_nodes[node]; } - void clear() { - locked_nodes.assign(locked_nodes.size(), false); - } + void clear() { locked_nodes.assign(locked_nodes.size(), false); } }; template @@ -102,9 +84,9 @@ struct adaptive_affinity_table { constexpr static unsigned window_range = 2 * window_size + 1; using VertexType = vertex_idx_t; -private: + private: const kl_active_schedule_t *active_schedule; - const Graph_t * graph; + const Graph_t *graph; std::vector node_is_selected; std::vector selected_nodes_idx; @@ -115,14 +97,13 @@ struct adaptive_affinity_table { std::vector gaps; size_t last_idx; -public: - - void initialize(const kl_active_schedule_t & sche_, const std::size_t initial_table_size) { + public: + void initialize(const kl_active_schedule_t &sche_, const std::size_t initial_table_size) { active_schedule = &sche_; graph = &(sche_.getInstance().getComputationalDag()); last_idx = 0; - + node_is_selected.resize(graph->num_vertices()); selected_nodes_idx.resize(graph->num_vertices()); selected_nodes.resize(initial_table_size); @@ -136,49 +117,37 @@ struct adaptive_affinity_table { for (auto &row : table) { row.resize(window_range); } - } + } } - inline std::vector& get_selected_nodes() { - return selected_nodes; - } + 
inline std::vector &get_selected_nodes() { return selected_nodes; } - inline const std::vector& get_selected_nodes() const { - return selected_nodes; - } + inline const std::vector &get_selected_nodes() const { return selected_nodes; } - inline size_t size() const { - return last_idx - gaps.size(); - } + inline size_t size() const { return last_idx - gaps.size(); } - inline bool is_selected(VertexType node) const { - return node_is_selected[node]; - } + inline bool is_selected(VertexType node) const { return node_is_selected[node]; } - inline const std::vector & get_selected_nodes_indices() const { - return selected_nodes_idx; - } + inline const std::vector &get_selected_nodes_indices() const { return selected_nodes_idx; } - inline size_t get_selected_nodes_idx(VertexType node) const { - return selected_nodes_idx[node]; - } + inline size_t get_selected_nodes_idx(VertexType node) const { return selected_nodes_idx[node]; } - inline std::vector> & operator[](VertexType node) { + inline std::vector> &operator[](VertexType node) { assert(node_is_selected[node]); return affinity_table[selected_nodes_idx[node]]; } - inline std::vector> & at(VertexType node) { + inline std::vector> &at(VertexType node) { assert(node_is_selected[node]); return affinity_table[selected_nodes_idx[node]]; } - inline const std::vector> & at(VertexType node) const { + inline const std::vector> &at(VertexType node) const { assert(node_is_selected[node]); return affinity_table[selected_nodes_idx[node]]; } - inline std::vector> & get_affinity_table(VertexType node) { + inline std::vector> &get_affinity_table(VertexType node) { assert(node_is_selected[node]); return affinity_table[selected_nodes_idx[node]]; } @@ -193,11 +162,11 @@ struct adaptive_affinity_table { gaps.pop_back(); } else { insert_location = last_idx; - + if (insert_location >= selected_nodes.size()) { const size_t old_size = selected_nodes.size(); const size_t new_size = std::min(old_size * 2, static_cast(graph->num_vertices())); - + selected_nodes.resize(new_size); affinity_table.resize(new_size); @@ -215,7 +184,7 @@ struct adaptive_affinity_table { node_is_selected[node] = true; selected_nodes_idx[node] = insert_location; selected_nodes[insert_location] = node; - + return true; } @@ -225,13 +194,13 @@ struct adaptive_affinity_table { gaps.push_back(selected_nodes_idx[node]); } - + void reset_node_selection() { - node_is_selected.assign(node_is_selected.size(), false); + node_is_selected.assign(node_is_selected.size(), false); gaps.clear(); last_idx = 0; } - + void clear() { node_is_selected.clear(); selected_nodes_idx.clear(); @@ -242,7 +211,7 @@ struct adaptive_affinity_table { } void trim() { - while (!gaps.empty() && last_idx > 0) { + while (!gaps.empty() && last_idx > 0) { size_t last_element_idx = last_idx - 1; // The last element could be a gap itself. If so, just shrink the size. 
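The gap bookkeeping above is what trim() compacts: remove() punches holes into the dense
slot array, and trim() shrinks it again so that iterating selected_nodes stays tight. A
minimal sketch of that compaction idea, assuming `gaps` holds the indices of removed slots
and `last_idx` is one past the last occupied slot; the real trim() additionally relocates
the corresponding affinity_table row and updates selected_nodes_idx, which this sketch omits:

    // Requires <vector>, <algorithm>, <cstddef>.
    inline void trim_sketch(std::vector<std::size_t> &gaps, std::size_t &last_idx,
                            std::vector<int> &slots /* stands in for selected_nodes */) {
        std::sort(gaps.begin(), gaps.end());
        while (!gaps.empty() && last_idx > 0) {
            const std::size_t last = last_idx - 1;
            if (gaps.back() == last) {
                gaps.pop_back(); // the last slot is itself a gap: just shrink
            } else {
                slots[gaps.front()] = slots[last]; // move the last live entry into the hole
                gaps.erase(gaps.begin());
            }
            --last_idx;
        }
    }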
@@ -277,17 +246,16 @@ struct static_affinity_table { constexpr static unsigned window_range = 2 * window_size + 1; using VertexType = vertex_idx_t; -private: + private: const kl_active_schedule_t *active_schedule; - const Graph_t * graph; + const Graph_t *graph; - std::unordered_set selected_nodes; + std::unordered_set selected_nodes; std::vector>> affinity_table; -public: - - void initialize(const kl_active_schedule_t & sche_, const std::size_t ) { + public: + void initialize(const kl_active_schedule_t &sche_, const std::size_t) { active_schedule = &sche_; graph = &(sche_.getInstance().getComputationalDag()); @@ -298,50 +266,32 @@ struct static_affinity_table { for (auto &row : table) { row.resize(window_range); } - } + } } - inline std::vector get_selected_nodes() const { - return {selected_nodes.begin(), selected_nodes.end()}; - } + inline std::vector get_selected_nodes() const { return {selected_nodes.begin(), selected_nodes.end()}; } - inline size_t size() const { - return selected_nodes.size(); - } + inline size_t size() const { return selected_nodes.size(); } - inline bool is_selected(VertexType node) const { - return selected_nodes.find(node) != selected_nodes.end(); - } + inline bool is_selected(VertexType node) const { return selected_nodes.find(node) != selected_nodes.end(); } - inline std::vector> & operator[](VertexType node) { - return affinity_table[node]; - } + inline std::vector> &operator[](VertexType node) { return affinity_table[node]; } - inline std::vector> & at(VertexType node) { - return affinity_table[node]; - } + inline std::vector> &at(VertexType node) { return affinity_table[node]; } - inline const std::vector> & at(VertexType node) const { - return affinity_table[node]; - } + inline const std::vector> &at(VertexType node) const { return affinity_table[node]; } - inline std::vector> & get_affinity_table(VertexType node) { - return affinity_table[node]; - } + inline std::vector> &get_affinity_table(VertexType node) { return affinity_table[node]; } bool insert(VertexType node) { const auto pair = selected_nodes.insert(node); return pair.second; } - void remove(VertexType node) { - selected_nodes.erase(node); - } - - void reset_node_selection() { - selected_nodes.clear(); - } - + void remove(VertexType node) { selected_nodes.erase(node); } + + void reset_node_selection() { selected_nodes.clear(); } + void clear() { affinity_table.clear(); selected_nodes.clear(); @@ -356,8 +306,8 @@ struct vertex_selection_strategy { using EdgeType = edge_desc_t; const kl_active_schedule_t *active_schedule; - const Graph_t * graph; - std::mt19937 * gen; + const Graph_t *graph; + std::mt19937 *gen; std::size_t selection_threshold = 0; unsigned strategy_counter = 0; @@ -366,9 +316,10 @@ struct vertex_selection_strategy { unsigned max_work_counter = 0; - inline void initialize(const kl_active_schedule_t & sche_, std::mt19937 & gen_, const unsigned start_step, const unsigned end_step) { + inline void initialize(const kl_active_schedule_t &sche_, std::mt19937 &gen_, const unsigned start_step, + const unsigned end_step) { active_schedule = &sche_; - graph = &(sche_.getInstance().getComputationalDag()); + graph = &(sche_.getInstance().getComputationalDag()); gen = &gen_; permutation.reserve(graph->num_vertices() / active_schedule->num_steps() * (end_step - start_step)); @@ -381,7 +332,7 @@ struct vertex_selection_strategy { const unsigned num_procs = active_schedule->getInstance().numberOfProcessors(); for (unsigned step = start_step; step <= end_step; ++step) { - const auto & 
processor_vertices = active_schedule->getSetSchedule().step_processor_vertices[step]; + const auto &processor_vertices = active_schedule->getSetSchedule().step_processor_vertices[step]; for (unsigned proc = 0; proc < num_procs; ++proc) { for (const auto node : processor_vertices[proc]) { permutation.push_back(node); @@ -393,11 +344,11 @@ struct vertex_selection_strategy { std::shuffle(permutation.begin(), permutation.end(), *gen); } - - void add_neighbours_to_selection(vertex_idx_t node, container_t &nodes, const unsigned start_step, const unsigned end_step) { + void add_neighbours_to_selection(vertex_idx_t node, container_t &nodes, const unsigned start_step, + const unsigned end_step) { for (const auto parent : graph->parents(node)) { const unsigned parent_step = active_schedule->assigned_superstep(parent); - if (parent_step >= start_step && parent_step <= end_step) + if (parent_step >= start_step && parent_step <= end_step) nodes.insert(parent); } @@ -408,37 +359,38 @@ struct vertex_selection_strategy { } } - inline void select_active_nodes(container_t & node_selection, const unsigned start_step, const unsigned end_step) { + inline void select_active_nodes(container_t &node_selection, const unsigned start_step, const unsigned end_step) { if (strategy_counter < 3) { - select_nodes_permutation_threshold(selection_threshold, node_selection); + select_nodes_permutation_threshold(selection_threshold, node_selection); } else if (strategy_counter == 4) { select_nodes_max_work_proc(selection_threshold, node_selection, start_step, end_step); - } + } strategy_counter++; strategy_counter %= 5; } - void select_nodes_violations(container_t & node_selection, std::unordered_set& current_violations, const unsigned start_step, const unsigned end_step) { - for (const auto & edge : current_violations) { + void select_nodes_violations(container_t &node_selection, std::unordered_set ¤t_violations, + const unsigned start_step, const unsigned end_step) { + for (const auto &edge : current_violations) { const auto source_v = source(edge, *graph); const auto target_v = target(edge, *graph); - + const unsigned source_step = active_schedule->assigned_superstep(source_v); if (source_step >= start_step && source_step <= end_step) node_selection.insert(source_v); - + const unsigned target_step = active_schedule->assigned_superstep(target_v); if (target_step >= start_step && target_step <= end_step) node_selection.insert(target_v); } } - void select_nodes_permutation_threshold(const std::size_t & threshold, container_t & node_selection) { + void select_nodes_permutation_threshold(const std::size_t &threshold, container_t &node_selection) { const size_t bound = std::min(threshold + permutation_idx, permutation.size()); - for (std::size_t i = permutation_idx; i < bound; i++) { - node_selection.insert(permutation[i]); + for (std::size_t i = permutation_idx; i < bound; i++) { + node_selection.insert(permutation[i]); } permutation_idx = bound; @@ -448,7 +400,8 @@ struct vertex_selection_strategy { } } - void select_nodes_max_work_proc(const std::size_t & threshold, container_t & node_selection, const unsigned start_step, const unsigned end_step) { + void select_nodes_max_work_proc(const std::size_t &threshold, container_t &node_selection, + const unsigned start_step, const unsigned end_step) { while (node_selection.size() < threshold) { if (max_work_counter > end_step) { max_work_counter = start_step; // wrap around @@ -460,18 +413,17 @@ struct vertex_selection_strategy { } } - void select_nodes_max_work_proc_helper(const 
std::size_t & threshold, unsigned step, container_t & node_selection) { + void select_nodes_max_work_proc_helper(const std::size_t &threshold, unsigned step, container_t &node_selection) { const unsigned num_max_work_proc = active_schedule->work_datastructures.step_max_work_processor_count[step]; for (unsigned idx = 0; idx < num_max_work_proc; idx++) { const unsigned proc = active_schedule->work_datastructures.step_processor_work_[step][idx].proc; - const std::unordered_set> step_proc_vert = active_schedule->getSetSchedule().step_processor_vertices[step][proc]; - const size_t num_insert = std::min(threshold - node_selection.size(), step_proc_vert.size()); + const std::unordered_set> step_proc_vert = + active_schedule->getSetSchedule().step_processor_vertices[step][proc]; + const size_t num_insert = std::min(threshold - node_selection.size(), step_proc_vert.size()); auto end_it = step_proc_vert.begin(); std::advance(end_it, num_insert); - std::for_each(step_proc_vert.begin(), end_it, [&](const auto& val) { - node_selection.insert(val); - }); - } + std::for_each(step_proc_vert.begin(), end_it, [&](const auto &val) { node_selection.insert(val); }); + } } }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 74cac6c7..8a6260bd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -60,6 +60,12 @@ _add_test( kl_lambda ) _add_test( kl_util ) +_add_test( kl_bsp_cost ) + +_add_test( kl_bsp_improver_test ) + +_add_test( kl_bsp_affinity_test ) + _add_test( heaps ) _add_test( kl_mem_constr ) diff --git a/tests/kl_bsp_affinity_test.cpp b/tests/kl_bsp_affinity_test.cpp new file mode 100644 index 00000000..9d67de8e --- /dev/null +++ b/tests/kl_bsp_affinity_test.cpp @@ -0,0 +1,967 @@ + +#define BOOST_TEST_MODULE kl_bsp_affinity +#include + +#include "osp/bsp/model/BspSchedule.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp" +#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" +#include "test_graphs.hpp" + +using namespace osp; +using graph = computational_dag_edge_idx_vector_impl_def_int_t; +using kl_active_schedule_t = kl_active_schedule; + +BOOST_AUTO_TEST_CASE(simple_parent_child_test) { + using VertexType = graph::vertex_idx; + + graph dag; + const VertexType v0 = dag.add_vertex(10, 5, 2); // work=10, mem=5, comm=2 + const VertexType v1 = dag.add_vertex(8, 4, 1); // work=8, mem=4, comm=1 + dag.add_edge(v0, v1, 3); // edge weight=3 + + BspArchitecture arch; + arch.setNumberOfProcessors(2); + + BspInstance instance(dag, arch); + instance.setCommunicationCosts(10); // comm multiplier + instance.setSynchronisationCosts(5); + + BspSchedule schedule(instance); + schedule.setAssignedProcessors({0, 1}); // v0 on p0, v1 on p1 + schedule.setAssignedSupersteps({0, 1}); // v0 in step 0, v1 in step 1 + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + // Insert only v0 into gain heap to control which node moves + auto node_selection = kl.insert_gain_heap_test({0}); + + // Run one iteration - this will move v0 to its best position + auto recompute_max_gain = kl.run_inner_iteration_test(); + + // Compare costs after move + double after_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_tracked = kl.get_current_cost(); + + 
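+    // Invariant under test: the cost tracked incrementally through apply_move must
+    // match a full recomputation of the BSP cost on the post-move schedule.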
BOOST_CHECK_CLOSE(after_recomputed, after_tracked, 0.00001); +} + +/** + * Helper to validate comm datastructures by comparing with freshly computed ones + */ +template +bool validate_comm_datastructures( + const max_comm_datastructure &comm_ds_incremental, + kl_active_schedule_t &active_sched, const BspInstance &instance, const std::string &context) { + + // 1. Clone Schedule + BspSchedule current_schedule(instance); + active_sched.write_schedule(current_schedule); + + // 2. Fresh Computation + kl_active_schedule_t kl_sched_fresh; + kl_sched_fresh.initialize(current_schedule); + + max_comm_datastructure comm_ds_fresh; + comm_ds_fresh.initialize(kl_sched_fresh); + + // Compute for all steps + unsigned max_step = current_schedule.numberOfSupersteps(); + comm_ds_fresh.compute_comm_datastructures(0, max_step > 0 ? max_step - 1 : 0); + + bool all_match = true; + // std::cout << "\nValidating comm datastructures " << context << ":" << std::endl; + + // 3. Validate Comm Costs + for (unsigned step = 0; step < max_step; ++step) { + for (unsigned p = 0; p < instance.numberOfProcessors(); ++p) { + auto send_inc = comm_ds_incremental.step_proc_send(step, p); + auto send_fresh = comm_ds_fresh.step_proc_send(step, p); + auto recv_inc = comm_ds_incremental.step_proc_receive(step, p); + auto recv_fresh = comm_ds_fresh.step_proc_receive(step, p); + + if (std::abs(send_inc - send_fresh) > 1e-6 || std::abs(recv_inc - recv_fresh) > 1e-6) { + all_match = false; + std::cout << " MISMATCH at step " << step << " proc " << p << ":" << std::endl; + std::cout << " Incremental: send=" << send_inc << ", recv=" << recv_inc << std::endl; + std::cout << " Fresh: send=" << send_fresh << ", recv=" << recv_fresh << std::endl; + } + } + } + + // 4. Validate Lambda Maps + for (const auto v : instance.vertices()) { + for (unsigned p = 0; p < instance.numberOfProcessors(); ++p) { + unsigned count_inc = 0; + if (comm_ds_incremental.node_lambda_map.has_proc_entry(v, p)) { + count_inc = comm_ds_incremental.node_lambda_map.get_proc_entry(v, p); + } + + unsigned count_fresh = 0; + if (comm_ds_fresh.node_lambda_map.has_proc_entry(v, p)) { + count_fresh = comm_ds_fresh.node_lambda_map.get_proc_entry(v, p); + } + + if (count_inc != count_fresh) { + all_match = false; + std::cout << " LAMBDA MISMATCH at node " << v << " proc " << p << ":" << std::endl; + std::cout << " Incremental: " << count_inc << std::endl; + std::cout << " Fresh: " << count_fresh << std::endl; + } + } + } + + return all_match; +} + +/** + * Helper to validate affinity tables by comparing with freshly computed ones + */ +template +bool validate_affinity_tables( + kl_improver_test &kl_incremental, + const BspInstance &instance, const std::string &context) { + + // 1. Get current schedule from incremental + BspSchedule current_schedule(instance); + kl_incremental.get_active_schedule_test(current_schedule); + + // 2. 
Create fresh kl_improver and compute all affinities from scratch + kl_improver_test kl_fresh; + kl_fresh.setup_schedule(current_schedule); + + // Get selected nodes from incremental + std::vector> selected_nodes; + + const size_t active_count = kl_incremental.get_affinity_table().size(); + for (size_t i = 0; i < active_count; ++i) { + selected_nodes.push_back(kl_incremental.get_affinity_table().get_selected_nodes()[i]); + } + + + std::cout << "\n [" << context << "] Validating " << selected_nodes.size() << " selected nodes: { "; + for (const auto n : selected_nodes) { + std::cout << n << " "; + } + std::cout << "}" << std::endl; + + // Compute affinities for all selected nodes + kl_fresh.insert_gain_heap_test(selected_nodes); + + bool all_match = true; + const unsigned num_procs = instance.numberOfProcessors(); + const unsigned num_steps = kl_incremental.get_active_schedule().num_steps(); + + // 3. Compare affinity tables for each selected node + + for (const auto & node : selected_nodes) { + + const auto &affinity_inc = kl_incremental.get_affinity_table().get_affinity_table(node); + const auto &affinity_fresh = kl_fresh.get_affinity_table().get_affinity_table(node); + + unsigned node_step = kl_incremental.get_active_schedule().assigned_superstep(node); + + for (unsigned p = 0; p < num_procs; ++p) { + if (p >= affinity_inc.size() || p >= affinity_fresh.size()) + continue; + + for (unsigned idx = 0; idx < affinity_inc[p].size() && idx < affinity_fresh[p].size(); ++idx) { + int step_offset = static_cast(idx) - static_cast(window_size); + int target_step_signed = static_cast(node_step) + step_offset; + + // Skip affinities for supersteps that don't exist + if (target_step_signed < 0 || target_step_signed >= static_cast(num_steps)) { + continue; + } + + double val_inc = affinity_inc[p][idx]; + double val_fresh = affinity_fresh[p][idx]; + + if (std::abs(val_inc - val_fresh) > 1e-4) { + all_match = false; + + std::cout << " AFFINITY MISMATCH [" << context << "]: node=" << node << " to P" << p << " S" + << target_step_signed << " (offset=" << step_offset << ")" << std::endl; + std::cout << " Incremental: " << val_inc << std::endl; + std::cout << " Fresh: " << val_fresh << std::endl; + std::cout << " Difference: " << (val_inc - val_fresh) << std::endl; + } + } + } + } + + return all_match; +} + +BOOST_AUTO_TEST_CASE(test_update_datastructure_after_move) { + graph dag; + + // Create 6 vertices with specific comm weights + dag.add_vertex(1, 10, 1); // 0 + dag.add_vertex(1, 1, 1); // 1 + dag.add_vertex(1, 5, 1); // 2 + dag.add_vertex(1, 1, 1); // 3 + dag.add_vertex(1, 2, 1); // 4 + dag.add_vertex(1, 1, 1); // 5 + + // Add edges + dag.add_edge(0, 1, 1); + dag.add_edge(2, 3, 1); + dag.add_edge(4, 5, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(3); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Schedule: + // Proc 0: Node 0, 4, 5 + // Proc 1: Node 1, 2 + // Proc 2: Node 3 + schedule.setAssignedProcessors({0, 1, 1, 2, 0, 0}); + // Steps: 0, 1, 0, 1, 0, 0 + schedule.setAssignedSupersteps({0, 1, 0, 1, 0, 0}); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({0}); + kl.run_inner_iteration_test(); + + double after_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_tracked = kl.get_current_cost(); + + 
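+    // validate_comm_datastructures (defined above) rebuilds the comm datastructures
+    // from a fresh copy of the current schedule and compares the per-step
+    // send/receive volumes and the node lambda maps against the incrementally
+    // maintained ones.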
BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                         "test_update_datastructure_after_move"));
+    BOOST_CHECK_CLOSE(after_recomputed, after_tracked, 0.00001);
+}
+
+BOOST_AUTO_TEST_CASE(test_multiple_sequential_moves) {
+    graph dag;
+
+    // Create a linear chain: 0 -> 1 -> 2 -> 3
+    dag.add_vertex(1, 10, 1); // 0
+    dag.add_vertex(1, 8, 1);  // 1
+    dag.add_vertex(1, 6, 1);  // 2
+    dag.add_vertex(1, 4, 1);  // 3
+
+    dag.add_edge(0, 1, 1);
+    dag.add_edge(1, 2, 1);
+    dag.add_edge(2, 3, 1);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(4);
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    schedule.setAssignedProcessors({0, 1, 2, 3});
+    schedule.setAssignedSupersteps({0, 0, 0, 0});
+    schedule.updateNumberOfSupersteps();
+
+    using comm_cost_t = kl_bsp_comm_cost_function;
+    using kl_improver_test = kl_improver_test;
+
+    kl_improver_test kl;
+    kl.setup_schedule(schedule);
+
+    kl.insert_gain_heap_test({1});
+    kl.run_inner_iteration_test();
+
+    double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move1_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_multiple_sequential_moves_1"));
+    BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001);
+
+    kl.run_inner_iteration_test();
+
+    double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move2_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_multiple_sequential_moves_2"));
+    BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001);
+
+    kl.run_inner_iteration_test();
+
+    double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move3_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_multiple_sequential_moves_3"));
+    BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001);
+
+    // After the three moves the whole chain is colocated on one processor:
+    // no communication remains, so the cost equals the total work of 4.
+    BOOST_CHECK_CLOSE(after_move3_tracked, 4.0, 0.00001);
+}
+
+BOOST_AUTO_TEST_CASE(test_node_with_multiple_children) {
+    graph dag;
+
+    // Tree structure: Node 0 has three children (1, 2, 3)
+    dag.add_vertex(1, 1, 1); // 0
+    dag.add_vertex(1, 1, 1); // 1
+    dag.add_vertex(1, 1, 1); // 2
+    dag.add_vertex(1, 1, 1); // 3
+
+    dag.add_edge(0, 1, 1);
+    dag.add_edge(0, 2, 1);
+    dag.add_edge(0, 3, 1);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(4);
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    schedule.setAssignedProcessors({0, 1, 2, 3});
+    schedule.setAssignedSupersteps({0, 0, 0, 0});
+    schedule.updateNumberOfSupersteps();
+
+    using comm_cost_t = kl_bsp_comm_cost_function;
+    using kl_improver_test = kl_improver_test;
+
+    kl_improver_test kl;
+    kl.setup_schedule(schedule);
+
+    kl.insert_gain_heap_test({1});
+    kl.get_comm_cost_f().compute_schedule_cost();
+    kl.run_inner_iteration_test();
+
+    double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move1_tracked = kl.get_current_cost();
+
BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                         "test_node_with_multiple_children"));
+    BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001);
+
+    kl.run_inner_iteration_test();
+
+    double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move2_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_node_with_multiple_children_2"));
+    BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001);
+
+    kl.run_inner_iteration_test();
+
+    double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move3_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_node_with_multiple_children_3"));
+    BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001);
+
+    // After the three moves node 0 and all three children share one processor:
+    // send cost is 0 (every edge is local) and the cost equals the total work of 4.
+    BOOST_CHECK_CLOSE(after_move3_tracked, 4.0, 0.00001);
+}
+
+BOOST_AUTO_TEST_CASE(test_cross_step_moves) {
+    graph dag;
+
+    // 0 -> 1 -> 2
+    dag.add_vertex(1, 10, 1); // 0
+    dag.add_vertex(1, 8, 1);  // 1
+    dag.add_vertex(1, 6, 1);  // 2
+
+    dag.add_edge(0, 1, 1);
+    dag.add_edge(1, 2, 1);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(2);
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    schedule.setAssignedProcessors({0, 1, 0});
+    schedule.setAssignedSupersteps({0, 1, 2});
+    schedule.updateNumberOfSupersteps();
+
+    using comm_cost_t = kl_bsp_comm_cost_function;
+    using kl_improver_test = kl_improver_test;
+
+    kl_improver_test kl;
+    kl.setup_schedule(schedule);
+
+    kl.insert_gain_heap_test({1});
+    kl.run_inner_iteration_test();
+
+    double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move1_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_cross_step_moves_1"));
+    BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001);
+}
+
+BOOST_AUTO_TEST_CASE(test_complex_scenario) {
+    std::cout << "Test case complex scenario" << std::endl;
+    graph dag;
+
+    const auto v1 = dag.add_vertex(2, 9, 2);
+    const auto v2 = dag.add_vertex(3, 8, 4);
+    const auto v3 = dag.add_vertex(4, 7, 3);
+    const auto v4 = dag.add_vertex(5, 6, 2);
+    const auto v5 = dag.add_vertex(6, 5, 6);
+    const auto v6 = dag.add_vertex(7, 4, 2);
+    dag.add_vertex(8, 3, 4);                 // v7 (index 6)
+    const auto v8 = dag.add_vertex(9, 2, 1); // v8 (index 7)
+
+    dag.add_edge(v1, v2, 2);
+    dag.add_edge(v1, v3, 2);
+    dag.add_edge(v1, v4, 2);
+    dag.add_edge(v2, v5, 12);
+    dag.add_edge(v3, v5, 6);
+    dag.add_edge(v3, v6, 7);
+    dag.add_edge(v5, v8, 9);
+    dag.add_edge(v4, v8, 9);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(2); // P0, P1
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    schedule.setAssignedProcessors({1, 1, 0, 0, 1, 0, 0, 1});
+    schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3});
+    schedule.updateNumberOfSupersteps();
+
+    using comm_cost_t = kl_bsp_comm_cost_function;
+    using kl_improver_test = kl_improver_test;
+
+    kl_improver_test kl;
+    kl.setup_schedule(schedule);
+
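+    // Seed the gain heap with v3 and v1 only (as in the earlier tests, this pins
+    // down which nodes may move first); each run_inner_iteration_test call applies
+    // the current best move and updates the affinities incrementally.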
kl.insert_gain_heap_test({v3, v1}); + kl.run_inner_iteration_test(); + + double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move1_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move1")); + BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move2_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move2")); + BOOST_CHECK(validate_affinity_tables(kl, instance, "complex_move2")); + BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move3_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move3")); + BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move4_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move4_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move4")); + BOOST_CHECK_CLOSE(after_move4_recomputed, after_move4_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move5_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move5_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move5")); + BOOST_CHECK_CLOSE(after_move5_recomputed, after_move5_tracked, 0.00001); +} + +BOOST_AUTO_TEST_CASE(test_complex_scenario_only_compute) { + graph dag; + + const auto v1 = dag.add_vertex(2, 9, 2); + const auto v2 = dag.add_vertex(3, 8, 4); + const auto v3 = dag.add_vertex(4, 7, 3); + const auto v4 = dag.add_vertex(5, 6, 2); + const auto v5 = dag.add_vertex(6, 5, 6); + const auto v6 = dag.add_vertex(7, 4, 2); + const auto v7 = dag.add_vertex(8, 3, 4); // v7 (index 6) + const auto v8 = dag.add_vertex(9, 2, 1); // v8 (index 7) + + dag.add_edge(v1, v2, 2); + dag.add_edge(v1, v3, 2); + dag.add_edge(v1, v4, 2); + dag.add_edge(v2, v5, 12); + dag.add_edge(v3, v5, 6); + dag.add_edge(v3, v6, 7); + dag.add_edge(v5, v8, 9); + dag.add_edge(v4, v8, 9); + + BspArchitecture arch; + arch.setNumberOfProcessors(2); // P0, P1 + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + schedule.setAssignedProcessors({1, 1, 0, 0, 1, 0, 0, 1}); + schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({v1}); + kl.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move1")); + BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); + + kl_improver_test kl2; + 
kl2.setup_schedule(schedule); + + kl2.insert_gain_heap_test({v2}); + kl2.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl2.get_comm_cost_f().comm_ds, kl2.get_active_schedule(), instance, + "complex_move2")); + BOOST_CHECK_CLOSE(kl2.get_comm_cost_f().compute_schedule_cost_test(), kl2.get_current_cost(), 0.00001); + + kl_improver_test kl3; + kl3.setup_schedule(schedule); + + kl3.insert_gain_heap_test({v3}); + kl3.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl3.get_comm_cost_f().comm_ds, kl3.get_active_schedule(), instance, + "complex_move3")); + BOOST_CHECK_CLOSE(kl3.get_comm_cost_f().compute_schedule_cost_test(), kl3.get_current_cost(), 0.00001); + + kl_improver_test kl4; + kl4.setup_schedule(schedule); + + kl4.insert_gain_heap_test({v4}); + kl4.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl4.get_comm_cost_f().comm_ds, kl4.get_active_schedule(), instance, + "complex_move4")); + BOOST_CHECK_CLOSE(kl4.get_comm_cost_f().compute_schedule_cost_test(), kl4.get_current_cost(), 0.00001); + + kl_improver_test kl5; + kl5.setup_schedule(schedule); + + kl5.insert_gain_heap_test({v5}); + kl5.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl5.get_comm_cost_f().comm_ds, kl5.get_active_schedule(), instance, + "complex_move5")); + BOOST_CHECK_CLOSE(kl5.get_comm_cost_f().compute_schedule_cost_test(), kl5.get_current_cost(), 0.00001); + + kl_improver_test kl6; + kl6.setup_schedule(schedule); + + kl6.insert_gain_heap_test({v6}); + kl6.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl6.get_comm_cost_f().comm_ds, kl6.get_active_schedule(), instance, + "complex_move6")); + BOOST_CHECK_CLOSE(kl6.get_comm_cost_f().compute_schedule_cost_test(), kl6.get_current_cost(), 0.00001); + + kl_improver_test kl7; + kl7.setup_schedule(schedule); + + kl7.insert_gain_heap_test({v7}); + kl7.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl7.get_comm_cost_f().comm_ds, kl7.get_active_schedule(), instance, + "complex_move7")); + BOOST_CHECK_CLOSE(kl7.get_comm_cost_f().compute_schedule_cost_test(), kl7.get_current_cost(), 0.00001); + + kl_improver_test kl8; + kl8.setup_schedule(schedule); + + kl8.insert_gain_heap_test({v8}); + kl8.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl8.get_comm_cost_f().comm_ds, kl8.get_active_schedule(), instance, + "complex_move8")); + BOOST_CHECK_CLOSE(kl8.get_comm_cost_f().compute_schedule_cost_test(), kl8.get_current_cost(), 0.00001); +} + +BOOST_AUTO_TEST_CASE(test_complex_scenario_only_compute_2) { + graph dag; + + const auto v1 = dag.add_vertex(2, 9, 2); + const auto v2 = dag.add_vertex(3, 8, 4); + const auto v3 = dag.add_vertex(4, 7, 3); + const auto v4 = dag.add_vertex(5, 6, 2); + const auto v5 = dag.add_vertex(6, 5, 6); + const auto v6 = dag.add_vertex(7, 4, 2); + const auto v7 = dag.add_vertex(8, 3, 4); // v7 (index 6) + const auto v8 = dag.add_vertex(9, 2, 1); // v8 (index 7) + + dag.add_edge(v1, v2, 2); + dag.add_edge(v1, v5, 2); + dag.add_edge(v1, v6, 2); + dag.add_edge(v1, v3, 2); + dag.add_edge(v1, v4, 2); + dag.add_edge(v2, v5, 12); + dag.add_edge(v2, v6, 2); + dag.add_edge(v2, v7, 2); + dag.add_edge(v2, v8, 2); + dag.add_edge(v3, v5, 6); + dag.add_edge(v3, v6, 7); + dag.add_edge(v3, v7, 2); + dag.add_edge(v3, v8, 2); + dag.add_edge(v5, v8, 9); + dag.add_edge(v4, v8, 9); + dag.add_edge(v5, v7, 2); + dag.add_edge(v6, v7, 2); + dag.add_edge(v7, v8, 2); + + BspArchitecture arch; + 
arch.setNumberOfProcessors(2); // P0, P1 + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + schedule.setAssignedProcessors({1, 1, 0, 0, 1, 0, 0, 1}); + schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({v1}); + kl.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move1")); + BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); + + kl_improver_test kl2; + kl2.setup_schedule(schedule); + + kl2.insert_gain_heap_test({v2}); + kl2.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl2.get_comm_cost_f().comm_ds, kl2.get_active_schedule(), instance, + "complex_move2")); + BOOST_CHECK_CLOSE(kl2.get_comm_cost_f().compute_schedule_cost_test(), kl2.get_current_cost(), 0.00001); + + kl_improver_test kl3; + kl3.setup_schedule(schedule); + + kl3.insert_gain_heap_test({v3}); + kl3.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl3.get_comm_cost_f().comm_ds, kl3.get_active_schedule(), instance, + "complex_move3")); + BOOST_CHECK_CLOSE(kl3.get_comm_cost_f().compute_schedule_cost_test(), kl3.get_current_cost(), 0.00001); + + kl_improver_test kl4; + kl4.setup_schedule(schedule); + + kl4.insert_gain_heap_test({v4}); + kl4.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl4.get_comm_cost_f().comm_ds, kl4.get_active_schedule(), instance, + "complex_move4")); + BOOST_CHECK_CLOSE(kl4.get_comm_cost_f().compute_schedule_cost_test(), kl4.get_current_cost(), 0.00001); + + kl_improver_test kl5; + kl5.setup_schedule(schedule); + + kl5.insert_gain_heap_test({v5}); + kl5.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl5.get_comm_cost_f().comm_ds, kl5.get_active_schedule(), instance, + "complex_move5")); + BOOST_CHECK_CLOSE(kl5.get_comm_cost_f().compute_schedule_cost_test(), kl5.get_current_cost(), 0.00001); + + kl_improver_test kl6; + kl6.setup_schedule(schedule); + + kl6.insert_gain_heap_test({v6}); + kl6.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl6.get_comm_cost_f().comm_ds, kl6.get_active_schedule(), instance, + "complex_move6")); + BOOST_CHECK_CLOSE(kl6.get_comm_cost_f().compute_schedule_cost_test(), kl6.get_current_cost(), 0.00001); + + kl_improver_test kl7; + kl7.setup_schedule(schedule); + + kl7.insert_gain_heap_test({v7}); + kl7.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl7.get_comm_cost_f().comm_ds, kl7.get_active_schedule(), instance, + "complex_move7")); + BOOST_CHECK_CLOSE(kl7.get_comm_cost_f().compute_schedule_cost_test(), kl7.get_current_cost(), 0.00001); + + kl_improver_test kl8; + kl8.setup_schedule(schedule); + + kl8.insert_gain_heap_test({v8}); + kl8.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl8.get_comm_cost_f().comm_ds, kl8.get_active_schedule(), instance, + "complex_move8")); + BOOST_CHECK_CLOSE(kl8.get_comm_cost_f().compute_schedule_cost_test(), kl8.get_current_cost(), 0.00001); +} + +BOOST_AUTO_TEST_CASE(test_grid_graph_complex_moves) { + // Construct 5x5 Grid Graph (25 nodes, indices 0-24) + graph dag = osp::construct_grid_dag(5, 5); + + 
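+    // Assumption: construct_grid_dag(5, 5) from test_graphs.hpp numbers nodes in
+    // row-major order with edges to the right and lower neighbours, so the
+    // row-based processor assignment below makes only the edges between row
+    // blocks cross processors (plus those touched by the node-7 override).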
BspArchitecture arch; + arch.setNumberOfProcessors(4); // P0..P3 + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Assign Processors and Supersteps + std::vector procs(25); + std::vector steps(25); + + for (unsigned r = 0; r < 5; ++r) { + for (unsigned c = 0; c < 5; ++c) { + unsigned idx = r * 5 + c; + if (r < 2) { + procs[idx] = 0; + steps[idx] = (c < 3) ? 0 : 1; + } else if (r < 4) { + procs[idx] = 1; + steps[idx] = (c < 3) ? 2 : 3; + } else { + procs[idx] = 2; + steps[idx] = (c < 3) ? 4 : 5; + } + } + } + + // Override: Node 7 (1,2) to P3, S1. + procs[7] = 3; + steps[7] = 1; + + schedule.setAssignedProcessors(procs); + schedule.setAssignedSupersteps(steps); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({12, 8, 7}); + kl.run_inner_iteration_test(); + + double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move1_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "grid_move1")); + BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move2_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "grid_move2")); + BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move3_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "grid_move3")); + BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move4_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move4_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "grid_move4")); + BOOST_CHECK_CLOSE(after_move4_recomputed, after_move4_tracked, 0.00001); +} + +BOOST_AUTO_TEST_CASE(test_butterfly_graph_moves) { + // Stages=2 -> 3 levels of 4 nodes each = 12 nodes. + // Level 0: 0-3. Level 1: 4-7. Level 2: 8-11. 
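+    // Assumption (for reading the expected comm pattern): construct_butterfly_dag(2)
+    // from test_graphs.hpp builds the standard FFT butterfly, where each node at
+    // level l feeds two nodes at level l + 1; with the level-based assignment
+    // below, every edge therefore crosses between P0 and P1.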
+ graph dag = osp::construct_butterfly_dag(2); + + BspArchitecture arch; + arch.setNumberOfProcessors(2); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Assign: + // Level 0: P0, Step 0 + // Level 1: P1, Step 1 + // Level 2: P0, Step 2 + std::vector procs(12); + std::vector steps(12); + for (unsigned i = 0; i < 12; ++i) { + if (i < 4) { + procs[i] = 0; + steps[i] = 0; + } else if (i < 8) { + procs[i] = 1; + steps[i] = 1; + } else { + procs[i] = 0; + steps[i] = 2; + } + } + + schedule.setAssignedProcessors(procs); + schedule.setAssignedSupersteps(steps); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({4, 6, 0}); + kl.run_inner_iteration_test(); + + double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move1_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "butterfly_move1")); + BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move2_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "butterfly_move2")); + BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move3_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "butterfly_move3")); + BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move4_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move4_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "butterfly_move4")); + BOOST_CHECK_CLOSE(after_move4_recomputed, after_move4_tracked, 0.00001); +} + +BOOST_AUTO_TEST_CASE(test_ladder_graph_moves) { + // Ladder with 5 rungs -> 6 pairs of nodes = 12 nodes. + // Pairs: (0,1), (2,3), ... (10,11). + graph dag = osp::construct_ladder_dag(5); + + BspArchitecture arch; + arch.setNumberOfProcessors(2); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Assign: + // Even nodes (Left rail): P0 + // Odd nodes (Right rail): P1 + // Steps: Pair i at Step i. 
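+    // Assumption: construct_ladder_dag(5) from test_graphs.hpp links consecutive
+    // pairs along the two rails and adds rung edges between them; with the rails
+    // split across P0/P1 below, the rung edges are exactly the cross-processor edges.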
+ std::vector procs(12); + std::vector steps(12); + for (unsigned i = 0; i < 6; ++i) { + procs[2 * i] = 0; + steps[2 * i] = i; + procs[2 * i + 1] = 1; + steps[2 * i + 1] = i; + } + + schedule.setAssignedProcessors(procs); + schedule.setAssignedSupersteps(steps); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({1, 3, 0, 2}); + kl.run_inner_iteration_test(); + + double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move1_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "ladder_move1")); + BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move2_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "ladder_move2")); + BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move3_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "ladder_move3")); + BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move4_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move4_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "ladder_move4")); + BOOST_CHECK_CLOSE(after_move4_recomputed, after_move4_tracked, 0.00001); +} \ No newline at end of file diff --git a/tests/kl_bsp_cost.cpp b/tests/kl_bsp_cost.cpp new file mode 100644 index 00000000..36e999ff --- /dev/null +++ b/tests/kl_bsp_cost.cpp @@ -0,0 +1,1086 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner +*/ + +#define BOOST_TEST_MODULE kl_bsp_cost +#include + +#include "test_graphs.hpp" +#include "osp/bsp/model/BspSchedule.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_active_schedule.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp" +#include "osp/concepts/graph_traits.hpp" +#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" + +using namespace osp; +using graph = computational_dag_edge_idx_vector_impl_def_int_t; +using kl_active_schedule_t = kl_active_schedule; + +BOOST_AUTO_TEST_CASE(test_arrange_superstep_comm_data) { + graph dag; + + dag.add_vertex(1, 1, 1); + dag.add_vertex(1, 1, 1); + dag.add_vertex(1, 1, 1); + dag.add_vertex(1, 1, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(4); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Initialize schedule with 1 step + schedule.setAssignedProcessors({0, 1, 2, 3}); + schedule.setAssignedSupersteps({0, 0, 0, 0}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + + unsigned step = 0; + + // Case 1: Unique Max + comm_ds.step_proc_send(step, 0) = 10; + comm_ds.step_proc_send(step, 1) = 5; + comm_ds.step_proc_send(step, 2) = 2; + comm_ds.step_proc_send(step, 3) = 1; + + comm_ds.step_proc_receive(step, 0) = 8; + comm_ds.step_proc_receive(step, 1) = 8; + comm_ds.step_proc_receive(step, 2) = 2; + comm_ds.step_proc_receive(step, 3) = 1; + + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 1); // Only proc 0 has 10 + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 8); // Next highest is 8 (from recv) + + // Case 2: Shared Max + comm_ds.reset_superstep(step); + comm_ds.step_proc_send(step, 0) = 10; // Need to re-set this as reset clears it + comm_ds.step_proc_send(step, 1) = 10; + comm_ds.step_proc_send(step, 2) = 2; + comm_ds.step_proc_send(step, 3) = 1; + + comm_ds.step_proc_receive(step, 0) = 5; + comm_ds.step_proc_receive(step, 1) = 5; + comm_ds.step_proc_receive(step, 2) = 2; + comm_ds.step_proc_receive(step, 3) = 1; + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 2); // Proc 0 and 1 + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 5); // Next highest is 5 (from recv) + + // Case 3: Max in Recv + comm_ds.reset_superstep(step); + + comm_ds.step_proc_send(step, 0) = 5; + comm_ds.step_proc_send(step, 1) = 5; + comm_ds.step_proc_send(step, 2) = 2; + comm_ds.step_proc_send(step, 3) = 1; + + comm_ds.step_proc_receive(step, 0) = 12; + comm_ds.step_proc_receive(step, 1) = 8; + comm_ds.step_proc_receive(step, 2) = 2; + comm_ds.step_proc_receive(step, 3) = 1; + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 12); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 1); + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 8); + + // Case 4: All same + comm_ds.reset_superstep(step); + // Send: 10, 10, 10, 10 + // Recv: 10, 10, 10, 10 + for (unsigned i = 0; i < 4; 
++i) { + comm_ds.step_proc_send(step, i) = 10; + comm_ds.step_proc_receive(step, i) = 10; + } + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 8); // 4 sends + 4 recvs + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 0); // If all removed, 0. + + // Case 5: Max removed, second max is from same type (Send) + comm_ds.reset_superstep(step); + comm_ds.step_proc_send(step, 0) = 10; + comm_ds.step_proc_send(step, 1) = 8; + comm_ds.step_proc_send(step, 2) = 2; + comm_ds.step_proc_send(step, 3) = 1; + + for (unsigned i = 0; i < 4; ++i) + comm_ds.step_proc_receive(step, i) = 5; + + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 1); + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 8); + + // Case 6: Max removed, second max is from other type (Recv) + comm_ds.reset_superstep(step); + + comm_ds.step_proc_send(step, 0) = 10; + comm_ds.step_proc_send(step, 1) = 4; + comm_ds.step_proc_send(step, 2) = 2; + comm_ds.step_proc_send(step, 3) = 1; + + comm_ds.step_proc_receive(step, 0) = 8; + comm_ds.step_proc_receive(step, 1) = 5; + comm_ds.step_proc_receive(step, 2) = 2; + comm_ds.step_proc_receive(step, 3) = 1; + + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 1); + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 8); +} + +BOOST_AUTO_TEST_CASE(test_compute_comm_datastructures) { + graph dag; + + // Create 6 vertices with specific comm weights + // Node 0: weight 10 (sends to 1) + dag.add_vertex(1, 10, 1); + // Node 1: weight 1 + dag.add_vertex(1, 1, 1); + // Node 2: weight 5 (sends to 3) + dag.add_vertex(1, 5, 1); + // Node 3: weight 1 + dag.add_vertex(1, 1, 1); + // Node 4: weight 2 (local to 5) + dag.add_vertex(1, 2, 1); + // Node 5: weight 1 + dag.add_vertex(1, 1, 1); + + // Add edges + // 0 -> 1 + dag.add_edge(0, 1, 1); // Edge weight ignored by max_comm_datastructure + // 2 -> 3 + dag.add_edge(2, 3, 1); + // 4 -> 5 + dag.add_edge(4, 5, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(3); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Schedule: + // Proc 0: Node 0, 4, 5 + // Proc 1: Node 1, 2 + // Proc 2: Node 3 + schedule.setAssignedProcessors({0, 1, 1, 2, 0, 0}); + schedule.setAssignedSupersteps({0, 1, 0, 1, 0, 0}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + + // Compute for steps 0 and 1 + comm_ds.compute_comm_datastructures(0, 1); + + unsigned step = 0; + + // Expected Step 0: + // Proc 0 sends: 10 (Node 0 -> Node 1 on Proc 1) + // Proc 1 receives: 10 (from Proc 0) + // Proc 1 sends: 5 (Node 2 -> Node 3 on Proc 2) + // Proc 2 receives: 5 (from Proc 1) + // Proc 2 sends: 0 + // Proc 0 receives: 0 + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 1), 5); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 2), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 2), 5); + + // Max Comm Calculation Step 0 + // Send Max: 10 (P0) + // Recv Max: 10 (P1) 
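+    // (The per-step maximum ranges over all per-processor send AND receive totals,
+    //  and the count tallies send and receive entries separately, so one processor
+    //  can contribute twice -- see Case 4 above, where 4 sends + 4 recvs give count 8.)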
+ // Global Max: 10 + // Count: 2 (P0 send, P1 recv) + // Second Max: 5 (P1 send, P2 recv) + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 2); + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 5); + + // Verify Step 1 (Should be empty as Nodes 1 and 3 are leaves) + step = 1; + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 2), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 2), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 0); +} + +/** + * Helper to validate comm datastructures by comparing with freshly computed ones + */ +template +bool validate_comm_datastructures( + const max_comm_datastructure &comm_ds_incremental, + kl_active_schedule_t &active_sched, const BspInstance &instance, const std::string &context) { + + // 1. Clone Schedule + BspSchedule current_schedule(instance); + active_sched.write_schedule(current_schedule); + + // 2. Fresh Computation + kl_active_schedule_t kl_sched_fresh; + kl_sched_fresh.initialize(current_schedule); + + max_comm_datastructure comm_ds_fresh; + comm_ds_fresh.initialize(kl_sched_fresh); + + // Compute for all steps + unsigned max_step = current_schedule.numberOfSupersteps(); + comm_ds_fresh.compute_comm_datastructures(0, max_step > 0 ? max_step - 1 : 0); + + bool all_match = true; + // std::cout << "\nValidating comm datastructures " << context << ":" << std::endl; + + // 3. Validate Comm Costs + for (unsigned step = 0; step < max_step; ++step) { + for (unsigned p = 0; p < instance.numberOfProcessors(); ++p) { + auto send_inc = comm_ds_incremental.step_proc_send(step, p); + auto send_fresh = comm_ds_fresh.step_proc_send(step, p); + auto recv_inc = comm_ds_incremental.step_proc_receive(step, p); + auto recv_fresh = comm_ds_fresh.step_proc_receive(step, p); + + if (std::abs(send_inc - send_fresh) > 1e-6 || std::abs(recv_inc - recv_fresh) > 1e-6) { + all_match = false; + std::cout << " MISMATCH at step " << step << " proc " << p << ":" << std::endl; + std::cout << " Incremental: send=" << send_inc << ", recv=" << recv_inc << std::endl; + std::cout << " Fresh: send=" << send_fresh << ", recv=" << recv_fresh << std::endl; + } + } + } + + // 4. 
Validate Lambda Maps + for (const auto v : instance.vertices()) { + for (unsigned p = 0; p < instance.numberOfProcessors(); ++p) { + unsigned count_inc = 0; + if (comm_ds_incremental.node_lambda_map.has_proc_entry(v, p)) { + count_inc = comm_ds_incremental.node_lambda_map.get_proc_entry(v, p); + } + + unsigned count_fresh = 0; + if (comm_ds_fresh.node_lambda_map.has_proc_entry(v, p)) { + count_fresh = comm_ds_fresh.node_lambda_map.get_proc_entry(v, p); + } + + if (count_inc != count_fresh) { + all_match = false; + std::cout << " LAMBDA MISMATCH at node " << v << " proc " << p << ":" << std::endl; + std::cout << " Incremental: " << count_inc << std::endl; + std::cout << " Fresh: " << count_fresh << std::endl; + } + } + } + + return all_match; +} + +BOOST_AUTO_TEST_CASE(test_update_datastructure_after_move) { + graph dag; + + // Create 6 vertices with specific comm weights + dag.add_vertex(1, 10, 1); // 0 + dag.add_vertex(1, 1, 1); // 1 + dag.add_vertex(1, 5, 1); // 2 + dag.add_vertex(1, 1, 1); // 3 + dag.add_vertex(1, 2, 1); // 4 + dag.add_vertex(1, 1, 1); // 5 + + // Add edges + dag.add_edge(0, 1, 1); + dag.add_edge(2, 3, 1); + dag.add_edge(4, 5, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(3); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Schedule: + // Proc 0: Node 0, 4, 5 + // Proc 1: Node 1, 2 + // Proc 2: Node 3 + schedule.setAssignedProcessors({0, 1, 1, 2, 0, 0}); + // Steps: 0, 1, 0, 1, 0, 0 + schedule.setAssignedSupersteps({0, 1, 0, 1, 0, 0}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 1); + + // Move Node 0 from Proc 0 (Step 0) to Proc 2 (Step 0) + // kl_move_struct(node, gain, from_proc, from_step, to_proc, to_step) + using kl_move = kl_move_struct; + kl_move move(0, 0.0, 0, 0, 2, 0); + + // Apply the move to the schedule first + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + kl_sched.apply_move(move, active_schedule_data); + + // Then update the communication datastructures + comm_ds.update_datastructure_after_move(move, 0, 1); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_update_datastructure_after_move")); + + unsigned step = 0; + + // Expected Changes: + // Node 0 (was P0 -> P1) is now (P2 -> P1). 
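+    // (Node 0's only child, node 1, stays on P1 at step 1, so the edge remains off-processor.)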
+ // P0 Send: 10 -> 0 + // P2 Send: 0 -> 10 + // P1 Recv: 10 -> 10 (Source changed, but destination same) + + // Others unchanged: + // P1 Send: 5 + // P2 Recv: 5 + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 1), 5); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 2), 10); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 2), 5); + + // Max Comm: + // Send Max: 10 (P2) + // Recv Max: 10 (P1) + // Global Max: 10 + // Count: 2 (P2 send, P1 recv) + // Second Max: 5 (P1 send, P2 recv) + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 2); + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 5); +} + +BOOST_AUTO_TEST_CASE(test_multiple_sequential_moves) { + graph dag; + + // Create a linear chain: 0 -> 1 -> 2 -> 3 + dag.add_vertex(1, 10, 1); // 0 + dag.add_vertex(1, 8, 1); // 1 + dag.add_vertex(1, 6, 1); // 2 + dag.add_vertex(1, 4, 1); // 3 + + dag.add_edge(0, 1, 1); + dag.add_edge(1, 2, 1); + dag.add_edge(2, 3, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(4); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Initial: All at step 0, on different processors + // 0@P0, 1@P1, 2@P2, 3@P3 + schedule.setAssignedProcessors({0, 1, 2, 3}); + schedule.setAssignedSupersteps({0, 0, 0, 0}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 0); + + // Initial state: + // P0 sends to P1 (10), P1 sends to P2 (8), P2 sends to P3 (6) + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 8); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 2), 6); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 3), 0); + + using kl_move = kl_move_struct; + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + + // Move 1: Move node 1 from P1 to P0 (make 0->1 local) + kl_move move1(1, 0.0, 1, 0, 0, 0); + kl_sched.apply_move(move1, active_schedule_data); + comm_ds.update_datastructure_after_move(move1, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_multiple_sequential_moves_1")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 8); // Node 1 sends + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0); // Node was moved away + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 0); // No receives at P0 + + // Move 2: Move node 2 from P2 to P0 (chain more local) + kl_move move2(2, 0.0, 2, 0, 0, 0); + kl_sched.apply_move(move2, active_schedule_data); + comm_ds.update_datastructure_after_move(move2, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_multiple_sequential_moves_2")); + + // After move2: Nodes 0,1,2 all at P0, only 3 at P3 + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 6); // Only node 2 sends off-proc + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 2), 0); // Node moved away + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 3), 6); // P3 receives from node 2 + + // Move 3: Move node 3 to P0 (everything local) + kl_move move3(3, 0.0, 3, 0, 0, 0); + kl_sched.apply_move(move3, active_schedule_data); + 
comm_ds.update_datastructure_after_move(move3, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_multiple_sequential_moves_3")); + + // After move3: All nodes at P0, all communication is local + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); // All local + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(0), 0); // No communication cost +} + +BOOST_AUTO_TEST_CASE(test_node_with_multiple_children) { + graph dag; + + // Tree structure: Node 0 has three children (1, 2, 3) + dag.add_vertex(1, 10, 1); // 0 + dag.add_vertex(1, 1, 1); // 1 + dag.add_vertex(1, 1, 1); // 2 + dag.add_vertex(1, 1, 1); // 3 + + dag.add_edge(0, 1, 1); + dag.add_edge(0, 2, 1); + dag.add_edge(0, 3, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(4); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + schedule.setAssignedProcessors({0, 1, 2, 3}); + schedule.setAssignedSupersteps({0, 0, 0, 0}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 0); + + // Initial: Node 0 has 3 children on P1, P2, P3 (3 unique off-proc) + // Send cost = 10 * 3 = 30 + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 30); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 2), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 3), 10); + + using kl_move = kl_move_struct; + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + + // Move child 1 to P0 (same as parent) + kl_move move1(1, 0.0, 1, 0, 0, 0); + kl_sched.apply_move(move1, active_schedule_data); + comm_ds.update_datastructure_after_move(move1, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_node_with_multiple_children")); + + // After: Node 0 has 1 local child, 2 off-proc (P2, P3) + // Send cost = 10 * 2 = 20 + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 20); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 0); // No longer receives + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 2), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 3), 10); + + kl_move move2(2, 0.0, 2, 0, 0, 0); + kl_sched.apply_move(move2, active_schedule_data); + comm_ds.update_datastructure_after_move(move2, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_node_with_multiple_children_2")); + + // After: Node 0 has 2 local children, 1 off-proc (P3) + // Send cost = 10 * 1 = 10 + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 2), 0); // No longer receives + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 3), 10); + + // Move child 3 to P0 (all local) + kl_move move3(3, 0.0, 3, 0, 0, 0); + kl_sched.apply_move(move3, active_schedule_data); + comm_ds.update_datastructure_after_move(move3, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_node_with_multiple_children_3")); + + // After: Node 0 has 3 local children + // Send cost = 10 * 0 = 0 (all local) + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 3), 0); // No longer receives +} + +BOOST_AUTO_TEST_CASE(test_cross_step_moves) { + graph dag; + + // 0 -> 1 -> 2 + dag.add_vertex(1, 10, 1); // 0 + dag.add_vertex(1, 8, 1); // 1 + dag.add_vertex(1, 6, 
1); // 2 + + dag.add_edge(0, 1, 1); + dag.add_edge(1, 2, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(2); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + schedule.setAssignedProcessors({0, 1, 0}); + schedule.setAssignedSupersteps({0, 1, 2}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 2); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 8); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 0), 8); + + using kl_move = kl_move_struct; + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + + // Move node 1 from (P1, step1) to (P0, step1) + // This makes 0->1 edge stay cross-step but changes processor + kl_move move1(1, 0.0, 1, 1, 0, 1); + kl_sched.apply_move(move1, active_schedule_data); + comm_ds.update_datastructure_after_move(move1, 0, 2); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); // Local (same processor) + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 0); // No receive needed + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0); // Local (same processor) + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 0); // Node moved away + + kl_move move2(1, 0.0, 0, 1, 0, 0); + kl_sched.apply_move(move2, active_schedule_data); + comm_ds.update_datastructure_after_move(move2, 0, 2); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); // All local at P0 +} + +BOOST_AUTO_TEST_CASE(test_complex_scenario_user_provided) { + graph dag; + + // Vertices from user request + // v1(0): w=2, c=9, m=2 + const auto v1 = dag.add_vertex(2, 9, 2); + const auto v2 = dag.add_vertex(3, 8, 4); + const auto v3 = dag.add_vertex(4, 7, 3); + const auto v4 = dag.add_vertex(5, 6, 2); + const auto v5 = dag.add_vertex(6, 5, 6); + const auto v6 = dag.add_vertex(7, 4, 2); + dag.add_vertex(8, 3, 4); // v7 (index 6) + const auto v8 = dag.add_vertex(9, 2, 1); // v8 (index 7) + + // Edges + dag.add_edge(v1, v2, 2); + dag.add_edge(v1, v3, 2); + dag.add_edge(v1, v4, 2); + dag.add_edge(v2, v5, 12); + dag.add_edge(v3, v5, 6); + dag.add_edge(v3, v6, 7); + dag.add_edge(v5, v8, 9); + dag.add_edge(v4, v8, 9); + + BspArchitecture arch; + arch.setNumberOfProcessors(2); // P0, P1 + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Schedule: {1, 1, 0, 0, 1, 0, 0, 1} + // v1@P1, v2@P1, v3@P0, v4@P0, v5@P1, v6@P0, v7@P0, v8@P1 + schedule.setAssignedProcessors({1, 1, 0, 0, 1, 0, 0, 1}); + + // Supersteps: {0, 0, 1, 1, 2, 2, 3, 3} + schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 3); + + // === Initial State Verification === + // ... (Same as before) ... 
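+    // Derivation of the expected values below (the comm_ds uses vertex comm weights):
+    // Step 0: v1@P1 sends c(v1)=9 to v3/v4 on P0 (v2 is local), so P1 sends 9 and P0 receives 9.
+    // Step 1: v3@P0 sends c(v3)=7 to v5@P1 (v6 is local) and v4@P0 sends c(v4)=6 to v8@P1,
+    //         so P0 sends 7 + 6 = 13 and P1 receives 13.
+    // Step 2: v5@P1 -> v8@P1 is local, and v6/v7 have no outgoing edges, so no communication.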
+ BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 9); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 9); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(0), 9); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 13); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 1), 13); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(1), 13); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(2), 0); + + using kl_move = kl_move_struct; + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + + // === Move 1: Move v3 from P0 to P1 (at Step 1) === + kl_move move1(v3, 0.0, 0, 1, 1, 1); + kl_sched.apply_move(move1, active_schedule_data); + comm_ds.update_datastructure_after_move(move1, 0, 3); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "complex_move1")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 9); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 6); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 7); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(1), 7); + + // === Move 2: Move v4 from P0 to P1 (at Step 1) === + kl_move move2(v4, 0.0, 0, 1, 1, 1); + kl_sched.apply_move(move2, active_schedule_data); + comm_ds.update_datastructure_after_move(move2, 0, 3); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "complex_move2")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 7); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(1), 7); + + // === Move 3: Move v5 from P1 to P0 (at Step 2) === + kl_move move3(v5, 0.0, 1, 2, 0, 2); + kl_sched.apply_move(move3, active_schedule_data); + comm_ds.update_datastructure_after_move(move3, 0, 3); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "complex_move3")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 8); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(0), 8); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 7); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 0), 5); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(2), 5); + + // === Move 4: Move v6 from P0 to P1 (at Step 2) === + // v6 is child of v3 (P1, S1). + // Before: v3(P1) -> v6(P0). Off-proc. + // After: v3(P1) -> v6(P1). Local. + // v3 also sends to v5(P0). + // So v3 targets: {P0}. Count = 1. + // Send Cost v3 = 7. Unchanged. + kl_move move4(v6, 0.0, 0, 2, 1, 2); + kl_sched.apply_move(move4, active_schedule_data); + comm_ds.update_datastructure_after_move(move4, 0, 3); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "complex_move4")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 7); + + // === Move 5: Move v5 from P0 to P1 (at Step 2) === + // v5 moves back to P1. + // v3(P1) -> v5(P1), v6(P1). All local. + // Send Cost v3 = 0. + kl_move move5(v5, 0.0, 0, 2, 1, 2); + kl_sched.apply_move(move5, active_schedule_data); + comm_ds.update_datastructure_after_move(move5, 0, 3); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "complex_move5")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(1), 0); +} + +/** + * Test: Grid Graph Complex Moves + * Uses a 5x5 Grid Graph (25 nodes) with 6 Supersteps and 4 Processors. + * Performs various moves to verify incremental updates in a dense graph. 
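+ * Layout: rows 0-1 on P0, rows 2-3 on P1, row 4 on P2; within each band, columns 0-2
+ * take the earlier superstep and columns 3-4 the later; node 7 (row 1, col 2) is
+ * overridden to P3 at step 1.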
+ */
+BOOST_AUTO_TEST_CASE(test_grid_graph_complex_moves) {
+    // Construct 5x5 Grid Graph (25 nodes, indices 0-24)
+    graph dag = osp::construct_grid_dag(5, 5);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(4); // P0..P3
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    // Assign Processors and Supersteps
+    std::vector<unsigned> procs(25);
+    std::vector<unsigned> steps(25);
+
+    for (unsigned r = 0; r < 5; ++r) {
+        for (unsigned c = 0; c < 5; ++c) {
+            unsigned idx = r * 5 + c;
+            if (r < 2) {
+                procs[idx] = 0;
+                steps[idx] = (c < 3) ? 0 : 1;
+            } else if (r < 4) {
+                procs[idx] = 1;
+                steps[idx] = (c < 3) ? 2 : 3;
+            } else {
+                procs[idx] = 2;
+                steps[idx] = (c < 3) ? 4 : 5;
+            }
+        }
+    }
+
+    // Override: Node 7 (1,2) to P3, S1.
+    procs[7] = 3;
+    steps[7] = 1;
+
+    schedule.setAssignedProcessors(procs);
+    schedule.setAssignedSupersteps(steps);
+    schedule.updateNumberOfSupersteps();
+
+    kl_active_schedule_t kl_sched;
+    kl_sched.initialize(schedule);
+
+    max_comm_datastructure comm_ds;
+    comm_ds.initialize(kl_sched);
+    comm_ds.compute_comm_datastructures(0, 5);
+
+    // Initial check: node 7 (P3, S1) sends to node 8 (P0, S1) and node 12 (P1, S2),
+    // i.e. two distinct foreign processors, so with unit comm weights P3 sends 2.
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 3), 2);
+
+    using kl_move = kl_move_struct;
+    thread_local_active_schedule_data active_schedule_data;
+    active_schedule_data.initialize_cost(0.0);
+
+    // === Move 1: Node 12 (P1->P0) ===
+    // Node 7's foreign targets collapse to {P0}, so P3's send drops to 1.
+    kl_move move1(12, 0.0, 1, 2, 0, 2);
+    kl_sched.apply_move(move1, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move1, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "grid_move1"));
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 3), 1);
+
+    // === Move 2: Node 8 (P0->P3) ===
+    // Node 8 joins node 7 on P3 and now sends off-proc itself (to P0 and P1),
+    // so P3's send at step 1 becomes 1 (node 7) + 2 (node 8) = 3.
+    kl_move move2(8, 0.0, 0, 1, 3, 1);
+    kl_sched.apply_move(move2, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move2, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "grid_move2"));
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 3), 3);
+
+    // === Move 3: Node 12 (P0->P3) ===
+    // Both of node 7's children are now local to P3; node 8 still targets P0 and P1.
+    kl_move move3(12, 0.0, 0, 2, 3, 2);
+    kl_sched.apply_move(move3, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move3, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "grid_move3"));
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 3), 2);
+
+    // === Move 4: Node 7 (P3->P0) ===
+    kl_move move4(7, 0.0, 3, 1, 0, 1);
+    kl_sched.apply_move(move4, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move4, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "grid_move4"));
+
+    // Check P0 send contribution from Node 7.
+    // With unit comm weights, node 7 (now on P0, step 1) contributes at least 1,
+    // so P0's send volume at step 1 must be >= 1.
+    BOOST_CHECK_GE(comm_ds.step_proc_send(1, 0), 1);
+}
+
+/**
+ * Test: Butterfly Graph Moves
+ * Uses a Butterfly Graph (FFT pattern) to test structured communication patterns.
+ * Stages = 2 (12 nodes). 3 Supersteps. 2 Processors.
+ */
+BOOST_AUTO_TEST_CASE(test_butterfly_graph_moves) {
+    // Stages=2 -> 3 levels of 4 nodes each = 12 nodes.
+    // Level 0: 0-3. Level 1: 4-7. Level 2: 8-11.
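+    // Each level-1 node has two parents in level 0 (e.g. node 4 is fed by nodes 0
+    // and 2), so a single move can change the send sets of several producers at once.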
+    graph dag = osp::construct_butterfly_dag(2);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(2);
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    // Assign:
+    // Level 0: P0, Step 0
+    // Level 1: P1, Step 1
+    // Level 2: P0, Step 2
+    std::vector<unsigned> procs(12);
+    std::vector<unsigned> steps(12);
+    for (unsigned i = 0; i < 12; ++i) {
+        if (i < 4) {
+            procs[i] = 0;
+            steps[i] = 0;
+        } else if (i < 8) {
+            procs[i] = 1;
+            steps[i] = 1;
+        } else {
+            procs[i] = 0;
+            steps[i] = 2;
+        }
+    }
+
+    schedule.setAssignedProcessors(procs);
+    schedule.setAssignedSupersteps(steps);
+    schedule.updateNumberOfSupersteps();
+
+    kl_active_schedule_t kl_sched;
+    kl_sched.initialize(schedule);
+
+    max_comm_datastructure comm_ds;
+    comm_ds.initialize(kl_sched);
+    comm_ds.compute_comm_datastructures(0, 2);
+
+    // Initial State (unit comm weights: each sender pays 1 per distinct foreign target proc):
+    // Step 0 (P0): Nodes 0-3 send to Level 1 (P1).
+    // Each node in the butterfly connects to 2 nodes in the next level.
+    // 0 -> 4, 6. (Both P1). Count=1. Cost=1.
+    // 1 -> 5, 7. (Both P1). Count=1. Cost=1.
+    // ... All 4 nodes send to P1. Total P0 Send = 4.
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 4);
+
+    // Step 1 (P1): Nodes 4-7 send to Level 2 (P0).
+    // All 4 nodes send to P0. Total P1 Send = 4.
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 4);
+
+    using kl_move = kl_move_struct;
+    thread_local_active_schedule_data active_schedule_data;
+    active_schedule_data.initialize_cost(0.0);
+
+    // === Move 1: Move Node 4 (Level 1) P1 -> P0 ===
+    // Node 4 moves to P0.
+    // Impact on Step 0 (Node 4's parents are Nodes 0 and 2):
+    // Node 0 -> 4(P0), 6(P1). P0 is local, so targets shrink to {P1}. Count=1.
+    // Node 2 -> 4(P0), 6(P1). Likewise targets {P1}. Count=1.
+    // Step 0 Send Cost unchanged (still 4).
+    kl_move move1(4, 0.0, 1, 1, 0, 1);
+    kl_sched.apply_move(move1, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move1, 0, 2);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "butterfly_move1"));
+
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 4);
+
+    // Impact on Step 1 (Node 4):
+    // Node 4 (P0) -> 8(P0), 10(P0). All local.
+    // Node 4 stops sending (was 1).
+    // P1 Send decreases by 1 -> 3.
+    // P0 Send increases by 0 (all local).
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 3);
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0);
+
+    // === Move 2: Move Node 6 (Level 1) P1 -> P0 ===
+    // Node 6 moves to P0.
+    // Impact on Step 0 (Node 6's parents are Nodes 0 and 2):
+    // Node 0 -> 4(P0), 6(P0). All local.
+    // Node 2 -> 4(P0), 6(P0). All local.
+    // Both stop sending (each was 1), so P0 Send decreases by 2 -> 2.
+    kl_move move2(6, 0.0, 1, 1, 0, 1);
+    kl_sched.apply_move(move2, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move2, 0, 2);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "butterfly_move2"));
+
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 2);
+
+    // Impact on Step 1 (Node 6):
+    // Node 6 (P0) -> 8(P0), 10(P0). All local.
+    // P1 Send decreases by 1 -> 2.
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 2);
+
+    // === Move 3: Move Node 0 (Level 0) P0 -> P1 ===
+    // Node 0 moves to P1.
+    // Impact on Step 0:
+    // Node 0 (P1) -> 4(P0), 6(P0). Targets {P0}. Count=1. Cost=1.
+    // Node 1 (P0) -> 5(P1), 7(P1). Targets {P1}. Count=1. Cost=1.
+    // Node 3 (P0) behaves like Node 1.
+    // P0 Send: 2 (Nodes 1 and 3).
+    // P1 Send: 1 (Node 0).
+    kl_move move3(0, 0.0, 0, 0, 1, 0);
+    kl_sched.apply_move(move3, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move3, 0, 2);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "butterfly_move3"));
+
+    // === Move 4: Move Node 8 (Level 2) P0 -> P1 ===
+    // Node 8 moves to P1.
+    // Impact on Step 1:
+    // Node 4 (P0) -> 8(P1), 10(P0). Targets {P1}. Count=1. Cost=1.
+    // Node 6 (P0) -> 8(P1), 10(P0). Targets {P1}. Count=1. Cost=1.
+    // P0 Send at Step 1: 0 -> 2.
+    kl_move move4(8, 0.0, 0, 2, 1, 2);
+    kl_sched.apply_move(move4, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move4, 0, 2);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "butterfly_move4"));
+}
+
+/**
+ * Test: Ladder Graph Moves
+ * Uses a Ladder Graph (Rungs=5 -> 12 nodes).
+ * Tests moving rungs between processors.
+ */
+BOOST_AUTO_TEST_CASE(test_ladder_graph_moves) {
+    // Ladder with 5 rungs -> 6 pairs of nodes = 12 nodes.
+    // Pairs: (0,1), (2,3), ... (10,11).
+    graph dag = osp::construct_ladder_dag(5);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(2);
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    // Assign:
+    // Even nodes (Left rail): P0
+    // Odd nodes (Right rail): P1
+    // Steps: Pair i at Step i.
+    std::vector<unsigned> procs(12);
+    std::vector<unsigned> steps(12);
+    for (unsigned i = 0; i < 6; ++i) {
+        procs[2 * i] = 0;
+        steps[2 * i] = i;
+        procs[2 * i + 1] = 1;
+        steps[2 * i + 1] = i;
+    }
+
+    schedule.setAssignedProcessors(procs);
+    schedule.setAssignedSupersteps(steps);
+    schedule.updateNumberOfSupersteps();
+
+    kl_active_schedule_t kl_sched;
+    kl_sched.initialize(schedule);
+
+    max_comm_datastructure comm_ds;
+    comm_ds.initialize(kl_sched);
+    comm_ds.compute_comm_datastructures(0, 5);
+
+    // Initial State (unit comm weights, as the checks below confirm):
+    // Rung i (u1, v1) connects to Rung i+1 (u2, v2).
+    // u1(P0) -> u2(P0), v2(P1). Targets {P1}. Count=1. Cost=1.
+    // v1(P1) -> u2(P0), v2(P1). Targets {P0}. Count=1. Cost=1.
+    // This applies for Steps 0 to 4.
+
+    for (unsigned s = 0; s < 5; ++s) {
+        BOOST_CHECK_EQUAL(comm_ds.step_proc_send(s, 0), 1);
+        BOOST_CHECK_EQUAL(comm_ds.step_proc_send(s, 1), 1);
+    }
+
+    using kl_move = kl_move_struct;
+    thread_local_active_schedule_data active_schedule_data;
+    active_schedule_data.initialize_cost(0.0);
+
+    // === Move 1: Move Node 1 (Rung 0, Right) P1 -> P0 ===
+    // Node 1 moves to P0.
+    // Rung 0 is now (0, 1) both at P0.
+    // Impact on Step 0:
+    // u1(0) -> u2(2, P0), v2(3, P1). Targets {P1}. Cost=1. (Unchanged)
+    // v1(1) -> u2(2, P0), v2(3, P1). Targets {P1}. Cost=1.
+    // P0 Send = 1 + 1 = 2.
+    // P1 Send = 0.
+    kl_move move1(1, 0.0, 1, 0, 0, 0);
+    kl_sched.apply_move(move1, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move1, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "ladder_move1"));
+
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 2);
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0);
+
+    // === Move 2: Move Node 3 (Rung 1, Right) P1 -> P0 ===
+    // Node 3 moves to P0.
+    // Rung 1 is now (2, 3) both at P0.
+    // Impact on Step 0 (Parents 0, 1):
+    // u1(0) -> u2(2, P0), v2(3, P0). All local. Cost=0.
+    // v1(1) -> u2(2, P0), v2(3, P0). All local. Cost=0.
+    // P0 Send at Step 0 = 0.
+    kl_move move2(3, 0.0, 1, 1, 0, 1);
+    kl_sched.apply_move(move2, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move2, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "ladder_move2"));
+
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0);
+
+    // Impact on Step 1 (Nodes 2, 3):
+    // u2(2, P0) -> u3(4, P0), v3(5, P1). Targets {P1}. Cost=1.
+    // v2(3, P0) -> u3(4, P0), v3(5, P1). Targets {P1}. Cost=1.
+    // P0 Send at Step 1 = 2.
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 2);
+
+    // === Move 3: Move Node 0 (Rung 0, Left) P0 -> P1 ===
+    // Node 0 moves to P1.
+    // Rung 0 is now (0@P1, 1@P0). Split again.
+    // Impact on Step 0:
+    // u1(0, P1) -> u2(2, P0), v2(3, P0). Targets {P0}. Cost=1.
+    // v1(1, P0) -> u2(2, P0), v2(3, P0). All local. Cost=0.
+    // P0 Send: 0.
+    // P1 Send: 1.
+    kl_move move3(0, 0.0, 0, 0, 1, 0);
+    kl_sched.apply_move(move3, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move3, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "ladder_move3"));
+
+    // === Move 4: Move Node 2 (Rung 1, Left) P0 -> P1 ===
+    // Node 2 moves to P1.
+    // Rung 1 is now (2@P1, 3@P0). Split again.
+    // Impact on Step 0 (Parents 0, 1):
+    // u1(0, P1) -> u2(2, P1), v2(3, P0). Targets {P0}. Cost=1.
+    // v1(1, P0) -> u2(2, P1), v2(3, P0). Targets {P1}. Cost=1.
+    // P0 Send: 1.
+    // P1 Send: 1.
+    kl_move move4(2, 0.0, 0, 1, 1, 1);
+    kl_sched.apply_move(move4, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move4, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "ladder_move4"));
+}
\ No newline at end of file
diff --git a/tests/kl_bsp_improver_test.cpp b/tests/kl_bsp_improver_test.cpp
new file mode 100644
index 00000000..250dfa18
--- /dev/null
+++ b/tests/kl_bsp_improver_test.cpp
@@ -0,0 +1,250 @@
+
+#define BOOST_TEST_MODULE kl_bsp_improver
+#include <boost/test/unit_test.hpp>
+
+#include "osp/auxiliary/io/arch_file_reader.hpp"
+#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
+#include "osp/bsp/scheduler/CoarsenRefineSchedulers/MultiLevelHillClimbing.hpp"
+#include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp"
+#include "osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp"
+#include "osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing_for_comm_schedule.hpp"
+#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp"
+#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp"
+#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp"
+#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp"
+#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp"
+#include "test_graphs.hpp"
+
+using namespace osp;
+
+template<typename Graph_t>
+void add_mem_weights(Graph_t &dag) {
+
+    // Running counters produce small deterministic weights in [2, 11].
+    int mem_weight = 1;
+    int comm_weight = 7;
+
+    for (const auto &v : dag.vertices()) {
+
+        dag.set_vertex_work_weight(v, static_cast<v_workw_t<Graph_t>>(mem_weight++ % 10 + 2));
+        dag.set_vertex_mem_weight(v, static_cast<v_memw_t<Graph_t>>(mem_weight++ % 10 + 2));
+        dag.set_vertex_comm_weight(v, static_cast<v_commw_t<Graph_t>>(comm_weight++ % 10 + 2));
+    }
+}
+
+BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) {
+
+    using graph = computational_dag_edge_idx_vector_impl_def_int_t;
+    using VertexType = graph::vertex_idx;
+
+    graph dag;
+
+    const VertexType v1 = dag.add_vertex(2, 9, 2);
+    const VertexType v2 = dag.add_vertex(3, 8, 4);
+    const VertexType v3 = dag.add_vertex(4, 7, 3);
+    const VertexType v4 = dag.add_vertex(5, 6,
2); + const VertexType v5 = dag.add_vertex(6, 5, 6); + const VertexType v6 = dag.add_vertex(7, 4, 2); + dag.add_vertex(8, 3, 4); + const VertexType v8 = dag.add_vertex(9, 2, 1); + + dag.add_edge(v1, v2, 2); + dag.add_edge(v1, v3, 2); + dag.add_edge(v1, v4, 2); + dag.add_edge(v2, v5, 12); + dag.add_edge(v3, v5, 6); + dag.add_edge(v3, v6, 7); + dag.add_edge(v5, v8, 9); + dag.add_edge(v4, v8, 9); + + BspArchitecture arch; + + BspInstance instance(dag, arch); + + BspSchedule schedule(instance); + + schedule.setAssignedProcessors({1, 1, 0, 0, 1, 0, 0, 1}); + schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); + + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + + kl.setup_schedule(schedule); + + auto &kl_active_schedule = kl.get_active_schedule(); + + // Verify work datastructures are set up correctly + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(0), 5.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(0), 0.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(1), 9.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(1), 0.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(2), 7.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); + + BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); + BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true); + + // Check initial cost consistency + double initial_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double initial_tracked = kl.get_current_cost(); + BOOST_CHECK_CLOSE(initial_recomputed, initial_tracked, 0.00001); + + // Insert nodes into gain heap + auto node_selection = kl.insert_gain_heap_test_penalty({2, 3}); + + // Run first iteration and check cost consistency + auto recompute_max_gain = kl.run_inner_iteration_test(); + + double iter1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double iter1_tracked = kl.get_current_cost(); + BOOST_CHECK_CLOSE(iter1_recomputed, iter1_tracked, 0.00001); + + // Run second iteration + auto &node3_affinity = kl.get_affinity_table()[3]; + + recompute_max_gain = kl.run_inner_iteration_test(); + + double iter2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double iter2_tracked = kl.get_current_cost(); + + BOOST_CHECK_CLOSE(iter2_recomputed, iter2_tracked, 0.00001); + + // Run third iteration + recompute_max_gain = kl.run_inner_iteration_test(); + + double iter3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double iter3_tracked = kl.get_current_cost(); + BOOST_CHECK_CLOSE(iter3_recomputed, iter3_tracked, 0.00001); + + // Run fourth iteration + recompute_max_gain = kl.run_inner_iteration_test(); + + double iter4_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double iter4_tracked = kl.get_current_cost(); + BOOST_CHECK_CLOSE(iter4_recomputed, iter4_tracked, 0.00001); +} + +BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs) { + std::vector filenames_graph = large_spaa_graphs(); + using graph = computational_dag_edge_idx_vector_impl_def_int_t; + // Getting root git directory + std::filesystem::path cwd = std::filesystem::current_path(); + std::cout << 
cwd << std::endl;
+    while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) {
+        cwd = cwd.parent_path();
+        std::cout << cwd << std::endl;
+    }
+
+    for (auto &filename_graph : filenames_graph) {
+        GreedyBspScheduler test_scheduler;
+        BspInstance instance;
+        bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(),
+                                                                              instance.getComputationalDag());
+
+        instance.getArchitecture().setSynchronisationCosts(500);
+        instance.getArchitecture().setCommunicationCosts(5);
+        instance.getArchitecture().setNumberOfProcessors(4);
+
+        // NUMA-style send costs: cheap links within {P0, P1} and {P2, P3}, expensive across islands.
+        std::vector<std::vector<unsigned>> send_cost = {{0, 1, 4, 4}, {1, 0, 4, 4}, {4, 4, 0, 1}, {4, 4, 1, 0}};
+
+        instance.getArchitecture().setSendCosts(send_cost);
+
+        if (!status_graph) {
+            std::cout << "Reading files failed." << std::endl;
+            BOOST_CHECK(false);
+        }
+
+        add_mem_weights(instance.getComputationalDag());
+
+        BspSchedule schedule(instance);
+        const auto result = test_scheduler.computeSchedule(schedule);
+
+        schedule.updateNumberOfSupersteps();
+
+        std::cout << "initial schedule with costs: " << schedule.computeCosts() << " and "
+                  << schedule.numberOfSupersteps() << " number of supersteps" << std::endl;
+
+        BspSchedule schedule_2(schedule);
+
+        BOOST_CHECK_EQUAL(RETURN_STATUS::OSP_SUCCESS, result);
+        BOOST_CHECK_EQUAL(&schedule.getInstance(), &instance);
+        BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
+
+        kl_total_lambda_comm_improver kl_total_lambda;
+        auto start_time = std::chrono::high_resolution_clock::now();
+        auto status = kl_total_lambda.improveSchedule(schedule);
+        auto finish_time = std::chrono::high_resolution_clock::now();
+        auto duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
+
+        std::cout << "kl lambda new finished in " << duration << " seconds, costs: " << schedule.computeCosts()
+                  << " and lambda costs: " << schedule.computeTotalLambdaCosts() << " with "
+                  << schedule.numberOfSupersteps() << " number of supersteps" << std::endl;
+
+        BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
+        BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true);
+
+        kl_bsp_comm_improver_mt kl;
+        kl.setTimeQualityParameter(5.0);
+        start_time = std::chrono::high_resolution_clock::now();
+        status = kl.improveSchedule(schedule);
+        finish_time = std::chrono::high_resolution_clock::now();
+        duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
+
+        std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeCosts() << " with "
+                  << schedule.numberOfSupersteps() << " number of supersteps" << std::endl;
+
+        BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
+        BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true);
+
+        BspScheduleCS schedule_cs(schedule);
+
+        HillClimbingForCommSteps hc_comm_steps;
+        start_time = std::chrono::high_resolution_clock::now();
+        status = hc_comm_steps.improveSchedule(schedule_cs);
+        finish_time = std::chrono::high_resolution_clock::now();
+
+        duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
+
+        std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs.computeCosts()
+                  << " with " << schedule_cs.numberOfSupersteps() << " number of supersteps" << std::endl;
+
+        BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
+        BOOST_CHECK_EQUAL(schedule_cs.satisfiesPrecedenceConstraints(), true);
+
+        kl_total_lambda.improveSchedule(schedule_2);
+
+        // Baseline comparison on the untouched copy: plain hill climbing, then comm-step refinement.
+        
HillClimbingScheduler hc; + + start_time = std::chrono::high_resolution_clock::now(); + status = hc.improveSchedule(schedule_2); + finish_time = std::chrono::high_resolution_clock::now(); + + duration = std::chrono::duration_cast(finish_time - start_time).count(); + + std::cout << "hc finished in " << duration << " seconds, costs: " << schedule_2.computeCosts() << " with " + << schedule_2.numberOfSupersteps() << " number of supersteps" << std::endl; + + BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); + BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); + + BspScheduleCS schedule_cs_2(schedule_2); + + start_time = std::chrono::high_resolution_clock::now(); + status = hc_comm_steps.improveSchedule(schedule_cs_2); + finish_time = std::chrono::high_resolution_clock::now(); + + duration = std::chrono::duration_cast(finish_time - start_time).count(); + + std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs_2.computeCosts() + << " with " << schedule_cs_2.numberOfSupersteps() << " number of supersteps" << std::endl; + + BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); + BOOST_CHECK_EQUAL(schedule_cs_2.satisfiesPrecedenceConstraints(), true); + } +} \ No newline at end of file From e0ad973ec734745f87d2b17f298c11bfb674239d Mon Sep 17 00:00:00 2001 From: tonibohnlein Date: Tue, 25 Nov 2025 16:05:58 +0100 Subject: [PATCH 2/3] adding more new_nodes after move comments update update --- .../comm_cost_modules/comm_cost_policies.hpp | 457 ++++++++++++++++++ .../generic_lambda_container.hpp | 124 +++++ .../max_comm_datastructure.hpp | 88 ++-- .../KernighanLin_v2/kl_improver.hpp | 45 +- tests/kl_bsp_cost.cpp | 192 +++++++- tests/kl_bsp_improver_test.cpp | 170 +++---- 6 files changed, 943 insertions(+), 133 deletions(-) create mode 100644 include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/comm_cost_policies.hpp create mode 100644 include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/comm_cost_policies.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/comm_cost_policies.hpp new file mode 100644 index 00000000..8fb1ceff --- /dev/null +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/comm_cost_policies.hpp @@ -0,0 +1,457 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner +*/ + +#pragma once + +#include +#include +#include + +namespace osp { + +struct EagerCommCostPolicy { + using ValueType = unsigned; + + template + static inline void attribute_communication(DS &ds, const comm_weight_t &cost, const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + ds.step_proc_receive(u_step, v_proc) += cost; + ds.step_proc_send(u_step, u_proc) += cost; + } + + template + static inline void unattribute_communication(DS &ds, const comm_weight_t &cost, const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + ds.step_proc_receive(u_step, v_proc) -= cost; + ds.step_proc_send(u_step, u_proc) -= cost; + } + + static inline bool add_child(ValueType &val, unsigned step) { + val++; + return val == 1; + } + + static inline bool remove_child(ValueType &val, unsigned step) { + val--; + return val == 0; + } + + static inline void reset(ValueType &val) { val = 0; } + + static inline bool has_entry(const ValueType &val) { return val > 0; } + + static inline bool is_single_entry(const ValueType &val) { return val == 1; } + + template + static inline void calculate_delta_remove(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + if (val == 1) { + dt.add(true, parent_step, child_proc, -cost); + dt.add(false, parent_step, parent_proc, -cost); + } + } + + template + static inline void calculate_delta_add(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + if (val == 0) { + dt.add(true, parent_step, child_proc, cost); + dt.add(false, parent_step, parent_proc, cost); + } + } + + template + static inline void calculate_delta_outgoing(const ValueType &val, unsigned node_step, unsigned node_proc, + unsigned child_proc, comm_weight_t cost, DeltaTracker &dt) { + if (val > 0) { + comm_weight_t total_cost = cost * val; + dt.add(true, node_step, child_proc, total_cost); + dt.add(false, node_step, node_proc, total_cost); + } + } +}; + +struct LazyCommCostPolicy { + using ValueType = std::vector; + + template + static inline void attribute_communication(DS &ds, const comm_weight_t &cost, const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + // val contains v_step (already added). + // Check if v_step is the new minimum. + unsigned min_step = std::numeric_limits::max(); + for (unsigned s : val) + min_step = std::min(min_step, s); + + if (min_step == v_step) { + // Check if it was strictly smaller than previous min. + unsigned prev_min = std::numeric_limits::max(); + for (size_t i = 0; i < val.size() - 1; ++i) { + prev_min = std::min(prev_min, val[i]); + } + + if (v_step < prev_min) { + if (prev_min != std::numeric_limits::max() && prev_min > 0) { + ds.step_proc_receive(prev_min - 1, v_proc) -= cost; + ds.step_proc_send(prev_min - 1, u_proc) -= cost; + } + if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) += cost; + ds.step_proc_send(v_step - 1, u_proc) += cost; + } + } + } + } + + template + static inline void unattribute_communication(DS &ds, const comm_weight_t &cost, const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + // val is state AFTER removal. + + if (val.empty()) { + // Removed the last child. 
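+            // Both sides were attributed to the step just before this child's
+            // superstep; with no children left, drop the receive and the matching send.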
+ if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) -= cost; + ds.step_proc_send(v_step - 1, u_proc) -= cost; + } + } else { + // Check if v_step was the unique minimum. + unsigned new_min = val[0]; + for (unsigned s : val) + new_min = std::min(new_min, s); + + if (v_step < new_min) { + // v_step was the unique minimum. + if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) -= cost; + ds.step_proc_send(v_step - 1, u_proc) -= cost; + } + if (new_min > 0) { + ds.step_proc_receive(new_min - 1, v_proc) += cost; + ds.step_proc_send(new_min - 1, u_proc) += cost; + } + } + } + } + + static inline bool add_child(ValueType &val, unsigned step) { + val.push_back(step); + if (val.size() == 1) + return true; + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + return step == min_s; + } + + static inline bool remove_child(ValueType &val, unsigned step) { + auto it = std::find(val.begin(), val.end(), step); + if (it != val.end()) { + val.erase(it); + if (val.empty()) { + return true; + } + unsigned new_min = val[0]; + for (unsigned s : val) + new_min = std::min(new_min, s); + bool res = step < new_min; + return res; + } + return false; + } + + static inline void reset(ValueType &val) { val.clear(); } + + static inline bool has_entry(const ValueType &val) { return !val.empty(); } + + static inline bool is_single_entry(const ValueType &val) { return val.size() == 1; } + + template + static inline void calculate_delta_remove(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + if (val.empty()) + return; + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + + if (child_step == min_s) { + int count = 0; + for (unsigned s : val) + if (s == min_s) + count++; + + if (count == 1) { + if (min_s > 0) { + dt.add(true, min_s - 1, child_proc, -cost); + dt.add(false, min_s - 1, parent_proc, -cost); + } + if (val.size() > 1) { + unsigned next_min = std::numeric_limits::max(); + for (unsigned s : val) { + if (s != min_s) + next_min = std::min(next_min, s); + } + if (next_min != std::numeric_limits::max() && next_min > 0) { + dt.add(true, next_min - 1, child_proc, cost); + dt.add(false, next_min - 1, parent_proc, cost); + } + } + } + } + } + + template + static inline void calculate_delta_add(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + if (val.empty()) { + if (child_step > 0) { + dt.add(true, child_step - 1, child_proc, cost); + dt.add(false, child_step - 1, parent_proc, cost); + } + } else { + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + + if (child_step < min_s) { + if (min_s > 0) { + dt.add(true, min_s - 1, child_proc, -cost); + dt.add(false, min_s - 1, parent_proc, -cost); + } + if (child_step > 0) { + dt.add(true, child_step - 1, child_proc, cost); + dt.add(false, child_step - 1, parent_proc, cost); + } + } + } + } + + template + static inline void calculate_delta_outgoing(const ValueType &val, unsigned node_step, unsigned node_proc, + unsigned child_proc, comm_weight_t cost, DeltaTracker &dt) { + for (unsigned s : val) { + if (s > 0) { + dt.add(true, s - 1, child_proc, cost); + dt.add(false, s - 1, node_proc, cost); + } + } + } +}; + +struct BufferedCommCostPolicy { + using ValueType = std::vector; + + template + static inline void attribute_communication(DS &ds, const comm_weight_t &cost, 
const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + // Buffered: Send at u_step, Receive at v_step - 1. + + unsigned min_step = std::numeric_limits::max(); + for (unsigned s : val) + min_step = std::min(min_step, s); + + if (min_step == v_step) { + unsigned prev_min = std::numeric_limits::max(); + for (size_t i = 0; i < val.size() - 1; ++i) + prev_min = std::min(prev_min, val[i]); + + if (v_step < prev_min) { + if (prev_min != std::numeric_limits::max() && prev_min > 0) { + ds.step_proc_receive(prev_min - 1, v_proc) -= cost; + } + if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) += cost; + } + } + } + + // Send side logic (u_step) + // If this is the FIRST child on this proc, add send cost. + if (val.size() == 1) { + ds.step_proc_send(u_step, u_proc) += cost; + } + } + + template + static inline void unattribute_communication(DS &ds, const comm_weight_t &cost, const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + // val is state AFTER removal. + + if (val.empty()) { + // Removed last child. + ds.step_proc_send(u_step, u_proc) -= cost; // Send side + if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) -= cost; // Recv side + } + } else { + // Check if v_step was unique minimum for Recv side. + unsigned new_min = val[0]; + for (unsigned s : val) + new_min = std::min(new_min, s); + + if (v_step < new_min) { + if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) -= cost; + } + if (new_min > 0) { + ds.step_proc_receive(new_min - 1, v_proc) += cost; + } + } + // Send side remains (val not empty). + } + } + + static inline bool add_child(ValueType &val, unsigned step) { + val.push_back(step); + if (val.size() == 1) + return true; // Need update for send side + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + return step == min_s; // Need update for recv side + } + + static inline bool remove_child(ValueType &val, unsigned step) { + auto it = std::find(val.begin(), val.end(), step); + if (it != val.end()) { + val.erase(it); + if (val.empty()) + return true; // Need update for send side + unsigned new_min = val[0]; + for (unsigned s : val) + new_min = std::min(new_min, s); + return step < new_min; // Need update for recv side + } + return false; + } + + static inline void reset(ValueType &val) { val.clear(); } + static inline bool has_entry(const ValueType &val) { return !val.empty(); } + static inline bool is_single_entry(const ValueType &val) { return val.size() == 1; } + + template + static inline void calculate_delta_remove(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + // Lazy: Send and Recv are both at min(child_steps) - 1. + + if (val.empty()) + return; + + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + + if (child_step == min_s) { + int count = 0; + for (unsigned s : val) + if (s == min_s) + count++; + + if (count == 1) { + // Unique min being removed. 
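+                // Shift the attributed send/receive pair off the old minimum step;
+                // if other child steps remain, re-attribute it at the next-earliest one below.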
+ if (min_s > 0) { + dt.add(true, min_s - 1, child_proc, -cost); // Remove Recv + dt.add(false, min_s - 1, parent_proc, -cost); // Remove Send + } + + if (val.size() > 1) { + unsigned next_min = std::numeric_limits::max(); + for (unsigned s : val) + if (s != min_s) + next_min = std::min(next_min, s); + + if (next_min != std::numeric_limits::max() && next_min > 0) { + dt.add(true, next_min - 1, child_proc, cost); // Add Recv at new min + dt.add(false, next_min - 1, parent_proc, cost); // Add Send at new min + } + } + } + } + } + + template + static inline void calculate_delta_add(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + // Lazy: Send and Recv are both at min(child_steps) - 1. + + if (val.empty()) { + // First child. + if (child_step > 0) { + dt.add(true, child_step - 1, child_proc, cost); + dt.add(false, child_step - 1, parent_proc, cost); + } + } else { + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + + if (child_step < min_s) { + // New global minimum. + if (min_s > 0) { + dt.add(true, min_s - 1, child_proc, -cost); // Remove old Recv + dt.add(false, min_s - 1, parent_proc, -cost); // Remove old Send + } + if (child_step > 0) { + dt.add(true, child_step - 1, child_proc, cost); // Add new Recv + dt.add(false, child_step - 1, parent_proc, cost); // Add new Send + } + } + } + } + + template + static inline void calculate_delta_outgoing(const ValueType &val, unsigned node_step, unsigned node_proc, + unsigned child_proc, comm_weight_t cost, DeltaTracker &dt) { + // Buffered Outgoing (Node -> Children) + // Node is parent (sender). Pays at node_step. + // Children are receivers. Pay at child_step - 1. + + // Send side: node_step. + // If val is not empty, we pay send cost ONCE. + if (!val.empty()) { + dt.add(false, node_step, node_proc, cost); + } + + // Recv side: iterate steps in val (child steps). + // But we only pay at min(val) - 1. + if (!val.empty()) { + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + + if (min_s > 0) { + dt.add(true, min_s - 1, child_proc, cost); + } + } + } +}; + +} // namespace osp diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp new file mode 100644 index 00000000..623d51d8 --- /dev/null +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp @@ -0,0 +1,124 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp
new file mode 100644
index 00000000..623d51d8
--- /dev/null
+++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp
@@ -0,0 +1,125 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include <iterator>
+#include <utility>
+#include <vector>
+
+namespace osp {
+
+template <typename T>
+struct DefaultHasEntry {
+    static inline bool has_entry(const T &val) { return val != 0; }
+};
+
+template <typename T>
+struct DefaultHasEntry<std::vector<T>> {
+    static inline bool has_entry(const std::vector<T> &val) { return !val.empty(); }
+};
+
+/**
+ * @brief Generic container for tracking child processor assignments in a BSP schedule using vectors.
+ *
+ * This structure tracks information about children assigned to each processor.
+ * It uses a 2D vector for dense data.
+ */
+template <typename vertex_idx_t, typename ValueType, typename HasEntry = DefaultHasEntry<ValueType>>
+struct generic_lambda_vector_container {
+
+    /**
+     * @brief Range adapter for iterating over non-zero/non-empty processor entries.
+     */
+    class lambda_vector_range {
+      private:
+        const std::vector<ValueType> &vec_;
+
+      public:
+        class lambda_vector_iterator {
+          public:
+            using iterator_category = std::input_iterator_tag;
+            using value_type = std::pair<unsigned, ValueType>;
+            using difference_type = std::ptrdiff_t;
+            using pointer = value_type *;
+            using reference = value_type &;
+
+          private:
+            const std::vector<ValueType> &vec_;
+            unsigned index_;
+
+          public:
+            lambda_vector_iterator(const std::vector<ValueType> &vec) : vec_(vec), index_(0) {
+                while (index_ < vec_.size() && !HasEntry::has_entry(vec_[index_])) {
+                    ++index_;
+                }
+            }
+
+            lambda_vector_iterator(const std::vector<ValueType> &vec, unsigned index) : vec_(vec), index_(index) {}
+
+            lambda_vector_iterator &operator++() {
+                ++index_;
+                while (index_ < vec_.size() && !HasEntry::has_entry(vec_[index_])) {
+                    ++index_;
+                }
+                return *this;
+            }
+
+            value_type operator*() const { return std::make_pair(index_, vec_[index_]); }
+
+            bool operator==(const lambda_vector_iterator &other) const { return index_ == other.index_; }
+            bool operator!=(const lambda_vector_iterator &other) const { return !(*this == other); }
+        };
+
+        lambda_vector_range(const std::vector<ValueType> &vec) : vec_(vec) {}
+
+        lambda_vector_iterator begin() { return lambda_vector_iterator(vec_); }
+        lambda_vector_iterator end() { return lambda_vector_iterator(vec_, static_cast<unsigned>(vec_.size())); }
+    };
+
+    /// 2D vector: for each node, stores processor assignment info
+    std::vector<std::vector<ValueType>> node_lambda_vec;
+
+    /// Number of processors in the system
+    unsigned num_procs_ = 0;
+
+    inline void initialize(const vertex_idx_t num_vertices, const unsigned num_procs) {
+        node_lambda_vec.assign(num_vertices, std::vector<ValueType>(num_procs));
+        num_procs_ = num_procs;
+    }
+
+    inline void reset_node(const vertex_idx_t node) { node_lambda_vec[node].assign(num_procs_, ValueType()); }
+
+    inline void clear() { node_lambda_vec.clear(); }
+
+    inline bool has_proc_entry(const vertex_idx_t node, const unsigned proc) const {
+        return HasEntry::has_entry(node_lambda_vec[node][proc]);
+    }
+
+    inline ValueType &get_proc_entry(const vertex_idx_t node, const unsigned proc) {
+        return node_lambda_vec[node][proc];
+    }
+
+    inline const ValueType &get_proc_entry(const vertex_idx_t node, const unsigned proc) const {
+        return node_lambda_vec[node][proc];
+    }
+
+    inline auto iterate_proc_entries(const vertex_idx_t node) { return lambda_vector_range(node_lambda_vec[node]); }
+};
+
+} // namespace osp
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp
index cc8d8a5a..b3820231 100644
--- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp
+++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp
@@ -18,9 +18,12
@@ limitations under the License. #pragma once +#include "comm_cost_policies.hpp" +#include "generic_lambda_container.hpp" #include "lambda_container.hpp" #include "osp/bsp/model/BspInstance.hpp" #include +#include #include #include @@ -53,7 +56,7 @@ struct pre_move_comm_data { } }; -template +template struct max_comm_datastructure { using comm_weight_t = v_commw_t; @@ -73,7 +76,13 @@ struct max_comm_datastructure { comm_weight_t max_comm_weight = 0; - lambda_vector_container node_lambda_map; + // Select the appropriate container type based on the policy's ValueType + using ContainerType = + typename std::conditional::value, + lambda_vector_container, + generic_lambda_vector_container>::type; + + ContainerType node_lambda_map; // Optimization: Scratchpad for update_datastructure_after_move to avoid allocations std::vector affected_steps_list; @@ -123,9 +132,6 @@ struct max_comm_datastructure { } inline void arrange_superstep_comm_data(const unsigned step) { - // Linear scan O(P) to find max, second_max and count - - // 1. Analyze Sends comm_weight_t max_send = 0; comm_weight_t second_max_send = 0; unsigned max_send_count = 0; @@ -143,7 +149,6 @@ struct max_comm_datastructure { } } - // 2. Analyze Receives comm_weight_t max_receive = 0; comm_weight_t second_max_receive = 0; unsigned max_receive_count = 0; @@ -161,7 +166,6 @@ struct max_comm_datastructure { } } - // 3. Aggregate Global Stats const comm_weight_t global_max = std::max(max_send, max_receive); step_max_comm_cache[step] = global_max; @@ -172,7 +176,6 @@ struct max_comm_datastructure { global_count += max_receive_count; step_max_comm_count_cache[step] = global_count; - // Determine second max comm_weight_t cand_send = (max_send == global_max) ? second_max_send : max_send; comm_weight_t cand_recv = (max_receive == global_max) ? second_max_receive : max_receive; @@ -204,7 +207,7 @@ struct max_comm_datastructure { void update_datastructure_after_move(const kl_move &move, unsigned, unsigned) { const auto &graph = instance->getComputationalDag(); - // --- 0. Prepare Scratchpad (Avoids Allocations) --- + // Prepare Scratchpad (Avoids Allocations) --- for (unsigned step : affected_steps_list) { if (step < step_is_affected.size()) step_is_affected[step] = false; @@ -225,20 +228,16 @@ struct max_comm_datastructure { const unsigned to_proc = move.to_proc; const comm_weight_t comm_w_node = graph.vertex_comm_weight(node); - // --- 1. Handle Node Movement (Outgoing Edges: Node -> Children) --- + // Handle Node Movement (Outgoing Edges: Node -> Children) if (from_step != to_step) { // Case 1: Node changes Step - // Optimization: Fuse the loop to iterate lambda map only once. - - for (const auto [proc, count] : node_lambda_map.iterate_proc_entries(node)) { + for (const auto [proc, val] : node_lambda_map.iterate_proc_entries(node)) { // A. 
Remove Old (Sender: from_proc, Receiver: proc) if (proc != from_proc) { const comm_weight_t cost = comm_w_node * instance->sendCosts(from_proc, proc); - // Optimization: check cost > 0 to avoid dirtying cache lines with +0 ops - if (cost > 0) { - step_proc_receive_[from_step][proc] -= cost; - step_proc_send_[from_step][from_proc] -= cost; + if (cost > 0) { + CommPolicy::unattribute_communication(*this, cost, from_step, from_proc, proc, 0, val); } } @@ -246,8 +245,7 @@ struct max_comm_datastructure { if (proc != to_proc) { const comm_weight_t cost = comm_w_node * instance->sendCosts(to_proc, proc); if (cost > 0) { - step_proc_receive_[to_step][proc] += cost; - step_proc_send_[to_step][to_proc] += cost; + CommPolicy::attribute_communication(*this, cost, to_step, to_proc, proc, 0, val); } } } @@ -257,13 +255,12 @@ struct max_comm_datastructure { } else if (from_proc != to_proc) { // Case 2: Node stays in same Step, but changes Processor - for (const auto [proc, count] : node_lambda_map.iterate_proc_entries(node)) { + for (const auto [proc, val] : node_lambda_map.iterate_proc_entries(node)) { // Remove Old (Sender: from_proc, Receiver: proc) if (proc != from_proc) { const comm_weight_t cost = comm_w_node * instance->sendCosts(from_proc, proc); if (cost > 0) { - step_proc_receive_[from_step][proc] -= cost; - step_proc_send_[from_step][from_proc] -= cost; + CommPolicy::unattribute_communication(*this, cost, from_step, from_proc, proc, 0, val); } } @@ -271,17 +268,16 @@ struct max_comm_datastructure { if (proc != to_proc) { const comm_weight_t cost = comm_w_node * instance->sendCosts(to_proc, proc); if (cost > 0) { - step_proc_receive_[from_step][proc] += cost; - step_proc_send_[from_step][to_proc] += cost; + CommPolicy::attribute_communication(*this, cost, from_step, to_proc, proc, 0, val); } } } mark_step(from_step); } - // --- 2. Update Parents' Outgoing Communication (Parents → Node) --- + // Update Parents' Outgoing Communication (Parents → Node) - if (from_proc != to_proc) { + if (from_proc != to_proc || from_step != to_step) { for (const auto &parent : graph.parents(node)) { const unsigned parent_step = active_schedule->assigned_superstep(parent); // Fast boundary check @@ -291,27 +287,30 @@ struct max_comm_datastructure { const unsigned parent_proc = active_schedule->assigned_processor(parent); const comm_weight_t comm_w_parent = graph.vertex_comm_weight(parent); - const bool removed_from_proc = node_lambda_map.decrease_proc_count(parent, from_proc); - const bool added_to_proc = node_lambda_map.increase_proc_count(parent, to_proc); + auto &val = node_lambda_map.get_proc_entry(parent, from_proc); + const bool removed_from_proc = CommPolicy::remove_child(val, from_step); // 1. Handle Removal from from_proc if (removed_from_proc) { if (from_proc != parent_proc) { const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, from_proc); if (cost > 0) { - step_proc_send_[parent_step][parent_proc] -= cost; - step_proc_receive_[parent_step][from_proc] -= cost; + CommPolicy::unattribute_communication(*this, cost, parent_step, parent_proc, from_proc, + from_step, val); } } } + auto &val_to = node_lambda_map.get_proc_entry(parent, to_proc); + const bool added_to_proc = CommPolicy::add_child(val_to, to_step); + // 2. 
Handle Addition to to_proc if (added_to_proc) { if (to_proc != parent_proc) { const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, to_proc); if (cost > 0) { - step_proc_send_[parent_step][parent_proc] += cost; - step_proc_receive_[parent_step][to_proc] += cost; + CommPolicy::attribute_communication(*this, cost, parent_step, parent_proc, to_proc, to_step, + val_to); } } } @@ -320,7 +319,7 @@ struct max_comm_datastructure { } } - // --- 3. Re-arrange Affected Steps --- + // Re-arrange Affected Steps for (unsigned step : affected_steps_list) { arrange_superstep_comm_data(step); } @@ -358,27 +357,28 @@ struct max_comm_datastructure { for (const auto &v : graph.children(u)) { const unsigned v_proc = vec_sched.assignedProcessor(v); - const unsigned v_step = vec_sched.assignedSuperstep(v); - const comm_weight_t comm_w_send_cost = (u_proc != v_proc) ? comm_w * instance->sendCosts(u_proc, v_proc) : 0; - - if (node_lambda_map.increase_proc_count(u, v_proc)) { + const unsigned v_step = vec_sched.assignedSuperstep(v); + + const comm_weight_t comm_w_send_cost = + (u_proc != v_proc) ? comm_w * instance->sendCosts(u_proc, v_proc) : 0; + + auto &val = node_lambda_map.get_proc_entry(u, v_proc); + if (CommPolicy::add_child(val, v_step)) { if (u_proc != v_proc && comm_w_send_cost > 0) { - attribute_communication(comm_w_send_cost, u_step, u_proc, v_proc, v_step); + CommPolicy::attribute_communication(*this, comm_w_send_cost, u_step, u_proc, v_proc, v_step, + val); } } } } for (unsigned step = start_step; step <= end_step; step++) { + if (step >= step_proc_send_.size()) { + continue; + } arrange_superstep_comm_data(step); } } - - inline void attribute_communication(const comm_weight_t &comm_w_send_cost, const unsigned u_step, const unsigned u_proc, const unsigned v_proc, - const unsigned) { - step_proc_receive_[u_step][v_proc] += comm_w_send_cost; - step_proc_send_[u_step][u_proc] += comm_w_send_cost; - } }; } // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp index a27ebe9b..97bd35a7 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp @@ -1024,6 +1024,45 @@ class kl_improver : public ImprovementScheduler { thread_data.reward_penalty_strat.reward, recompute_max_gain, new_nodes); // this only updated reward/penalty, collects new_nodes, and fills recompute_max_gain + // Add nodes from affected steps to new_nodes + // { + // std::unordered_set steps_to_check; + // const unsigned num_steps = active_schedule.num_steps(); + + // auto add_steps_range = [&](unsigned center_step) { + // unsigned start = (center_step > window_size) ? 
center_step - window_size : 0; + // unsigned end = std::min(center_step + window_size, num_steps - 1); + + // // Constrain to thread range + // if (start < thread_data.start_step) + // start = thread_data.start_step; + // if (end > thread_data.end_step) + // end = thread_data.end_step; + + // for (unsigned s = start; s <= end; ++s) { + // steps_to_check.insert(s); + // } + // }; + + // add_steps_range(best_move.from_step); + // add_steps_range(best_move.to_step); + + // for (unsigned step : steps_to_check) { + // for (unsigned proc = 0; proc < instance->numberOfProcessors(); ++proc) { + // const auto &nodes_in_step = active_schedule.getSetSchedule().step_processor_vertices[step][proc]; + // for (const auto &node : nodes_in_step) { + // if (!thread_data.affinity_table.is_selected(node) && !thread_data.lock_manager.is_locked(node)) { + // new_nodes.push_back(node); + // } + // } + // } + // } + + // // Deduplicate new_nodes + // std::sort(new_nodes.begin(), new_nodes.end()); + // new_nodes.erase(std::unique(new_nodes.begin(), new_nodes.end()), new_nodes.end()); + // } + // Determine the steps where max/second_max/max_count for work/comm changed std::unordered_set changed_steps; @@ -1150,7 +1189,8 @@ class kl_improver : public ImprovementScheduler { ThreadSearchContext &thread_data) { if (no_imp_counter >= thread_data.no_improvement_iterations_reduce_penalty && thread_data.reward_penalty_strat.initial_penalty > 1.0) { - thread_data.reward_penalty_strat.initial_penalty = static_cast(std::floor(std::sqrt(thread_data.reward_penalty_strat.initial_penalty))); + thread_data.reward_penalty_strat.initial_penalty = + static_cast(std::floor(std::sqrt(thread_data.reward_penalty_strat.initial_penalty))); thread_data.unlock_edge_backtrack_counter_reset += 1; thread_data.no_improvement_iterations_reduce_penalty += 15; #ifdef KL_DEBUG_1 @@ -1204,7 +1244,8 @@ class kl_improver : public ImprovementScheduler { if (select_nodes_check_remove_superstep(thread_data.step_to_remove, thread_data)) { active_schedule.swap_empty_step_fwd(thread_data.step_to_remove, thread_data.end_step); thread_data.end_step--; - thread_data.local_search_start_step = static_cast(thread_data.active_schedule_data.applied_moves.size()); + thread_data.local_search_start_step = + static_cast(thread_data.active_schedule_data.applied_moves.size()); thread_data.active_schedule_data.update_cost(static_cast(-1.0 * instance->synchronisationCosts())); if constexpr (enable_preresolving_violations) { diff --git a/tests/kl_bsp_cost.cpp b/tests/kl_bsp_cost.cpp index 36e999ff..05a5882c 100644 --- a/tests/kl_bsp_cost.cpp +++ b/tests/kl_bsp_cost.cpp @@ -19,7 +19,6 @@ limitations under the License. #define BOOST_TEST_MODULE kl_bsp_cost #include -#include "test_graphs.hpp" #include "osp/bsp/model/BspSchedule.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp" @@ -27,6 +26,7 @@ limitations under the License. 
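Before the test changes below, a quick numeric illustration of the penalty decay applied in the kl_improver.hpp hunk above: initial_penalty is repeatedly replaced by floor(sqrt(initial_penalty)) while it exceeds 1.0. The improver applies one such step each time the no-improvement threshold is hit; the loop below simply prints the full trajectory for an assumed starting value of 256 (a value chosen for illustration only).

    #include <cmath>
    #include <iostream>

    int main() {
        double p = 256.0;    // assumed starting value of initial_penalty
        while (p > 1.0) {
            p = std::floor(std::sqrt(p));    // 256 -> 16 -> 4 -> 2 -> 1
            std::cout << p << '\n';
        }
        return 0;
    }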
#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp" #include "osp/concepts/graph_traits.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" +#include "test_graphs.hpp" using namespace osp; using graph = computational_dag_edge_idx_vector_impl_def_int_t; @@ -319,7 +319,7 @@ bool validate_comm_datastructures( } } } - + return all_match; } @@ -1083,4 +1083,192 @@ BOOST_AUTO_TEST_CASE(test_ladder_graph_moves) { kl_sched.apply_move(move4, active_schedule_data); comm_ds.update_datastructure_after_move(move4, 0, 5); BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "ladder_move4")); +} + +BOOST_AUTO_TEST_CASE(test_lazy_and_buffered_modes) { + std::cout << "Setup Graph" << std::endl; + graph instance; + instance.add_vertex(1, 10, 1); + instance.add_vertex(1, 10, 1); + instance.add_vertex(1, 10, 1); + + instance.add_edge(0, 1, 1); + instance.add_edge(0, 2, 1); + + std::cout << "Setup Arch" << std::endl; + osp::BspArchitecture arch; + arch.setNumberOfProcessors(2); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(0); + + std::cout << "Setup BspInstance" << std::endl; + osp::BspInstance bsp_instance(instance, arch); + + std::cout << "Setup Schedule" << std::endl; + osp::BspSchedule schedule(bsp_instance); + schedule.setAssignedProcessor(0, 0); + schedule.setAssignedProcessor(1, 1); + schedule.setAssignedProcessor(2, 1); + + schedule.setAssignedSuperstep(0, 0); + schedule.setAssignedSuperstep(1, 2); + schedule.setAssignedSuperstep(2, 4); + + schedule.updateNumberOfSupersteps(); + + std::cout << "Setup KL Sched" << std::endl; + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + + std::cout << "Setup Complete" << std::endl; + std::cout << "Num Vertices: " << instance.num_vertices() << std::endl; + std::cout << "Num Procs: " << arch.numberOfProcessors() << std::endl; + + std::cout << "Start Eager Test" << std::endl; + { + using CommPolicy = osp::EagerCommCostPolicy; + osp::max_comm_datastructure comm_ds; + std::cout << "Initialize Eager Comm DS" << std::endl; + comm_ds.initialize(kl_sched); + + std::cout << "Checking node_lambda_map" << std::endl; + std::cout << "node_lambda_vec size: " << comm_ds.node_lambda_map.node_lambda_vec.size() << std::endl; + if (comm_ds.node_lambda_map.node_lambda_vec.size() > 0) { + std::cout << "node_lambda_vec[0] size: " << comm_ds.node_lambda_map.node_lambda_vec[0].size() << std::endl; + } + + std::cout << "Compute Eager Comm DS" << std::endl; + comm_ds.compute_comm_datastructures(0, 4); + std::cout << "Eager Done" << std::endl; + } + + std::cout << "Start Lazy Test" << std::endl; + // --- Test Lazy Policy --- + { + using CommPolicy = osp::LazyCommCostPolicy; + osp::max_comm_datastructure comm_ds; + std::cout << "Initialize Comm DS" << std::endl; + comm_ds.initialize(kl_sched); + std::cout << "Compute Comm DS" << std::endl; + comm_ds.compute_comm_datastructures(0, 4); + + // Expected Behavior for Lazy: + // Node 0 (P0) sends to P1. + // Children on P1 are at Step 2 and Step 4. + // Lazy policy should attribute cost to min(2, 4) - 1 = Step 1. + // Cost = 10 * 1.0 = 10. + + // Lazy: Send and Recv at min(2, 4) - 1 = Step 1. 
+ BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 1), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 1), 0); + + using kl_move = osp::kl_move_struct; + kl_move move(1, 0.0, 1, 2, 1, 3); // Node 1, Step 2->3, Proc 1->1 + kl_sched.apply_move(move, active_schedule_data); + comm_ds.update_datastructure_after_move(move, 0, 4); + + // After move: Children at {3, 4}. Min = 3. Send/Recv at Step 2. + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 1), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 1), 0); + + // Reset Node 1 to Step 2 for next test + kl_move move_back(1, 0.0, 1, 3, 1, 2); + kl_sched.apply_move(move_back, active_schedule_data); + } + + // --- Test Buffered Policy --- + { + using CommPolicy = osp::BufferedCommCostPolicy; + osp::max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 4); + + // Buffered: Send at Step 0. Recv at min(2, 4) - 1 = Step 1. 
+ BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 1), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 1), 0); + + using kl_move = osp::kl_move_struct; + kl_move move(1, 0.0, 1, 2, 1, 3); // Node 1, Step 2->3, Proc 1->1 + kl_sched.apply_move(move, active_schedule_data); + comm_ds.update_datastructure_after_move(move, 0, 4); + + // After move: Children at {3, 4}. Min = 3. Recv at Step 2. Send still at Step 0. + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 1), 0); + } } \ No newline at end of file diff --git a/tests/kl_bsp_improver_test.cpp b/tests/kl_bsp_improver_test.cpp index 250dfa18..df3ac3f1 100644 --- a/tests/kl_bsp_improver_test.cpp +++ b/tests/kl_bsp_improver_test.cpp @@ -129,122 +129,122 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) { BOOST_CHECK_CLOSE(iter4_recomputed, iter4_tracked, 0.00001); } -BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs) { - std::vector filenames_graph = large_spaa_graphs(); - using graph = computational_dag_edge_idx_vector_impl_def_int_t; - // Getting root git directory - std::filesystem::path cwd = std::filesystem::current_path(); - std::cout << cwd << std::endl; - while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) { - cwd = cwd.parent_path(); - std::cout << cwd << std::endl; - } +// BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs) { +// std::vector filenames_graph = large_spaa_graphs(); +// using graph = computational_dag_edge_idx_vector_impl_def_int_t; +// // Getting root git directory +// std::filesystem::path cwd = std::filesystem::current_path(); +// std::cout << cwd << std::endl; +// while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) { +// cwd = cwd.parent_path(); +// std::cout << cwd << std::endl; +// } - for (auto &filename_graph : filenames_graph) { - GreedyBspScheduler test_scheduler; - BspInstance instance; - bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(), - instance.getComputationalDag()); +// for (auto &filename_graph : filenames_graph) { +// 
GreedyBspScheduler test_scheduler; +// BspInstance instance; +// bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(), +// instance.getComputationalDag()); - instance.getArchitecture().setSynchronisationCosts(500); - instance.getArchitecture().setCommunicationCosts(5); - instance.getArchitecture().setNumberOfProcessors(4); +// instance.getArchitecture().setSynchronisationCosts(500); +// instance.getArchitecture().setCommunicationCosts(5); +// instance.getArchitecture().setNumberOfProcessors(4); - std::vector> send_cost = {{0, 1, 4, 4}, {1, 0, 4, 4}, {4, 4, 0, 1}, {4, 4, 1, 0}}; +// std::vector> send_cost = {{0, 1, 4, 4}, {1, 0, 4, 4}, {4, 4, 0, 1}, {4, 4, 1, 0}}; - instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().setSendCosts(send_cost); - if (!status_graph) { +// if (!status_graph) { - std::cout << "Reading files failed." << std::endl; - BOOST_CHECK(false); - } +// std::cout << "Reading files failed." << std::endl; +// BOOST_CHECK(false); +// } - add_mem_weights(instance.getComputationalDag()); +// add_mem_weights(instance.getComputationalDag()); - BspSchedule schedule(instance); - const auto result = test_scheduler.computeSchedule(schedule); +// BspSchedule schedule(instance); +// const auto result = test_scheduler.computeSchedule(schedule); - schedule.updateNumberOfSupersteps(); +// schedule.updateNumberOfSupersteps(); - std::cout << "initial scedule with costs: " << schedule.computeCosts() << " and " - << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "initial scedule with costs: " << schedule.computeCosts() << " and " +// << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; - BspSchedule schedule_2(schedule); +// BspSchedule schedule_2(schedule); - BOOST_CHECK_EQUAL(RETURN_STATUS::OSP_SUCCESS, result); - BOOST_CHECK_EQUAL(&schedule.getInstance(), &instance); - BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); +// BOOST_CHECK_EQUAL(RETURN_STATUS::OSP_SUCCESS, result); +// BOOST_CHECK_EQUAL(&schedule.getInstance(), &instance); +// BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); - kl_total_lambda_comm_improver kl_total_lambda; - auto start_time = std::chrono::high_resolution_clock::now(); - auto status = kl_total_lambda.improveSchedule(schedule); - auto finish_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(finish_time - start_time).count(); +// kl_total_lambda_comm_improver kl_total_lambda; +// auto start_time = std::chrono::high_resolution_clock::now(); +// auto status = kl_total_lambda.improveSchedule(schedule); +// auto finish_time = std::chrono::high_resolution_clock::now(); +// auto duration = std::chrono::duration_cast(finish_time - start_time).count(); - std::cout << "kl lambda new finished in " << duration << " seconds, costs: " << schedule.computeCosts() - << " and lambda costs: " << schedule.computeTotalLambdaCosts() << " with " - << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "kl lambda new finished in " << duration << " seconds, costs: " << schedule.computeCosts() +// << " and lambda costs: " << schedule.computeTotalLambdaCosts() << " with " +// << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; - BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); - BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); +// BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS 
|| status == RETURN_STATUS::BEST_FOUND); +// BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); - kl_bsp_comm_improver_mt kl; - kl.setTimeQualityParameter(5.0); - start_time = std::chrono::high_resolution_clock::now(); - status = kl.improveSchedule(schedule); - finish_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(finish_time - start_time).count(); +// kl_bsp_comm_improver_mt kl(42); +// kl.setTimeQualityParameter(2.0); +// start_time = std::chrono::high_resolution_clock::now(); +// status = kl.improveSchedule(schedule); +// finish_time = std::chrono::high_resolution_clock::now(); +// duration = std::chrono::duration_cast(finish_time - start_time).count(); - std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeCosts() << " with " - << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeCosts() << " with " +// << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; - BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); - BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); +// BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); +// BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); - BspScheduleCS schedule_cs(schedule); +// BspScheduleCS schedule_cs(schedule); - HillClimbingForCommSteps hc_comm_steps; - start_time = std::chrono::high_resolution_clock::now(); - status = hc_comm_steps.improveSchedule(schedule_cs); - finish_time = std::chrono::high_resolution_clock::now(); +// HillClimbingForCommSteps hc_comm_steps; +// start_time = std::chrono::high_resolution_clock::now(); +// status = hc_comm_steps.improveSchedule(schedule_cs); +// finish_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(finish_time - start_time).count(); +// duration = std::chrono::duration_cast(finish_time - start_time).count(); - std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs.computeCosts() - << " with " << schedule_cs.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs.computeCosts() +// << " with " << schedule_cs.numberOfSupersteps() << " number of supersteps" << std::endl; - BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); - BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); +// BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); +// BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); - kl_total_lambda.improveSchedule(schedule_2); +// kl_total_lambda.improveSchedule(schedule_2); - HillClimbingScheduler hc; +// HillClimbingScheduler hc; - start_time = std::chrono::high_resolution_clock::now(); - status = hc.improveSchedule(schedule_2); - finish_time = std::chrono::high_resolution_clock::now(); +// start_time = std::chrono::high_resolution_clock::now(); +// status = hc.improveSchedule(schedule_2); +// finish_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(finish_time - start_time).count(); +// duration = std::chrono::duration_cast(finish_time - start_time).count(); - std::cout << "hc finished in " << duration << " seconds, costs: " << 
schedule_2.computeCosts() << " with " - << schedule_2.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "hc finished in " << duration << " seconds, costs: " << schedule_2.computeCosts() << " with " +// << schedule_2.numberOfSupersteps() << " number of supersteps" << std::endl; - BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); - BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); +// BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); +// BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); - BspScheduleCS schedule_cs_2(schedule_2); +// BspScheduleCS schedule_cs_2(schedule_2); - start_time = std::chrono::high_resolution_clock::now(); - status = hc_comm_steps.improveSchedule(schedule_cs_2); - finish_time = std::chrono::high_resolution_clock::now(); +// start_time = std::chrono::high_resolution_clock::now(); +// status = hc_comm_steps.improveSchedule(schedule_cs_2); +// finish_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(finish_time - start_time).count(); +// duration = std::chrono::duration_cast(finish_time - start_time).count(); - std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs_2.computeCosts() - << " with " << schedule_cs_2.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs_2.computeCosts() +// << " with " << schedule_cs_2.numberOfSupersteps() << " number of supersteps" << std::endl; - BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); - BOOST_CHECK_EQUAL(schedule_cs_2.satisfiesPrecedenceConstraints(), true); - } -} \ No newline at end of file +// BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); +// BOOST_CHECK_EQUAL(schedule_cs_2.satisfiesPrecedenceConstraints(), true); +// } +// } \ No newline at end of file From 3264e04df55348180abe65454b4539a1bc8441c6 Mon Sep 17 00:00:00 2001 From: tonibohnlein Date: Thu, 27 Nov 2025 15:16:04 +0100 Subject: [PATCH 3/3] simplification --- .../max_comm_datastructure.hpp | 67 +++++++++---------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp index b3820231..236e11cc 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp @@ -277,47 +277,46 @@ struct max_comm_datastructure { // Update Parents' Outgoing Communication (Parents → Node) - if (from_proc != to_proc || from_step != to_step) { - for (const auto &parent : graph.parents(node)) { - const unsigned parent_step = active_schedule->assigned_superstep(parent); - // Fast boundary check - if (parent_step >= step_proc_send_.size()) - continue; - - const unsigned parent_proc = active_schedule->assigned_processor(parent); - const comm_weight_t comm_w_parent = graph.vertex_comm_weight(parent); - - auto &val = node_lambda_map.get_proc_entry(parent, from_proc); - const bool removed_from_proc = CommPolicy::remove_child(val, from_step); - - // 1. 
Handle Removal from from_proc - if (removed_from_proc) { - if (from_proc != parent_proc) { - const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, from_proc); - if (cost > 0) { - CommPolicy::unattribute_communication(*this, cost, parent_step, parent_proc, from_proc, - from_step, val); - } + for (const auto &parent : graph.parents(node)) { + const unsigned parent_step = active_schedule->assigned_superstep(parent); + // Fast boundary check + if (parent_step >= step_proc_send_.size()) + continue; + + const unsigned parent_proc = active_schedule->assigned_processor(parent); + const comm_weight_t comm_w_parent = graph.vertex_comm_weight(parent); + + auto &val = node_lambda_map.get_proc_entry(parent, from_proc); + const bool removed_from_proc = CommPolicy::remove_child(val, from_step); + + // 1. Handle Removal from from_proc + if (removed_from_proc) { + if (from_proc != parent_proc) { + const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, from_proc); + if (cost > 0) { + CommPolicy::unattribute_communication(*this, cost, parent_step, parent_proc, from_proc, + from_step, val); } } + } + + auto &val_to = node_lambda_map.get_proc_entry(parent, to_proc); + const bool added_to_proc = CommPolicy::add_child(val_to, to_step); - auto &val_to = node_lambda_map.get_proc_entry(parent, to_proc); - const bool added_to_proc = CommPolicy::add_child(val_to, to_step); - - // 2. Handle Addition to to_proc - if (added_to_proc) { - if (to_proc != parent_proc) { - const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, to_proc); - if (cost > 0) { - CommPolicy::attribute_communication(*this, cost, parent_step, parent_proc, to_proc, to_step, - val_to); - } + // 2. Handle Addition to to_proc + if (added_to_proc) { + if (to_proc != parent_proc) { + const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, to_proc); + if (cost > 0) { + CommPolicy::attribute_communication(*this, cost, parent_step, parent_proc, to_proc, to_step, + val_to); } } - - mark_step(parent_step); } + + mark_step(parent_step); } + // Re-arrange Affected Steps for (unsigned step : affected_steps_list) {
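// On the guard removal in this final hunk: dropping the former
// `if (from_proc != to_proc || from_step != to_step)` condition is presumably safe
// because every applied kl_move changes the processor or the superstep (or both),
// so the guard never fired false; and even for a degenerate move, remove_child
// followed by add_child restores the lambda entry, so any resulting
// unattribute/attribute pair applies equal and opposite updates that cancel.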