From 9acc43d682c76f89ffc1273517f1048adda18fdb Mon Sep 17 00:00:00 2001
From: tonibohnlein
Date: Wed, 19 Nov 2025 11:54:37 +0100
Subject: [PATCH 1/3] compute_comm_affinity added test kl_bsp_cost more tests
 for max_comm_datastructure update node affinity added test for affinity
 update test update affinity tests affinity tests update small performance
 optimization more test, pre move data update pre_move_comm_data new unit test
 update test update tests update lambda container, bsp_cost function cost
 function correction max_comm_datastructures fix fix kl_bsp_cost test update
 more tests update comm update added more tests, compute working update comm
 affinity implementation comm_affinity optimizations enable tests update test
 debug output update unit tests passing cleaning update datastructure update
 numa added mt test
---
 .../auxiliary/io/hdag_graph_file_reader.hpp   |    1 +
 .../comm_cost_modules/kl_bsp_comm_cost.hpp    |  675 +++++++--
 .../kl_hyper_total_comm_cost.hpp              |   11 +-
 .../comm_cost_modules/kl_total_comm_cost.hpp  |    6 +
 .../comm_cost_modules/lambda_container.hpp    |  330 +++-
 .../max_comm_datastructure.hpp                |  396 +++--
 .../KernighanLin_v2/kl_improver.hpp           | 1340 +++++++++++------
 .../KernighanLin_v2/kl_improver_test.hpp      |   96 +-
 .../KernighanLin_v2/kl_include.hpp            |   18 +-
 .../KernighanLin_v2/kl_include_mt.hpp         |    5 +
 .../LocalSearch/KernighanLin_v2/kl_util.hpp   |  222 ++-
 tests/CMakeLists.txt                          |    6 +
 tests/kl_bsp_affinity_test.cpp                |  967 ++++++++++++
 tests/kl_bsp_cost.cpp                         | 1086 +++++++++++++
 tests/kl_bsp_improver_test.cpp                |  250 +++
 15 files changed, 4447 insertions(+), 962 deletions(-)
 create mode 100644 tests/kl_bsp_affinity_test.cpp
 create mode 100644 tests/kl_bsp_cost.cpp
 create mode 100644 tests/kl_bsp_improver_test.cpp

diff --git a/include/osp/auxiliary/io/hdag_graph_file_reader.hpp b/include/osp/auxiliary/io/hdag_graph_file_reader.hpp
index 63d04909..a91481a7 100644
--- a/include/osp/auxiliary/io/hdag_graph_file_reader.hpp
+++ b/include/osp/auxiliary/io/hdag_graph_file_reader.hpp
@@ -29,6 +29,7 @@ limitations under the License.
 #include "osp/concepts/computational_dag_concept.hpp"
 #include "osp/graph_algorithms/directed_graph_util.hpp"
 #include "osp/auxiliary/io/filepath_checker.hpp"
+#include "osp/concepts/constructable_computational_dag_concept.hpp"

 namespace osp {
 namespace file_reader {
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp
index 679db815..f6c425bd 100644
--- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp
+++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp
@@ -19,218 +19,649 @@ limitations under the License.

 #pragma once

 #include "../kl_active_schedule.hpp"
-#include "lambda_container.hpp"
+#include "../kl_improver.hpp"
 #include "max_comm_datastructure.hpp"
+#include

 namespace osp {

-template
+// A lightweight helper to track deltas without hash maps or repeated allocations.
+// Uses a dense vector for O(1) lookups and a sparse list for fast iteration/clearing.
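// [Editorial sketch, not part of the patch] Minimal usage of the tracker defined
// below, assuming comm_weight_t = int:
//
//     FastDeltaTracker<int> t;
//     t.initialize(4);          // procs 0..3, all deltas start at 0
//     t.add(2, 5);              // dense_vals[2] = 5, dirty_procs = {2}
//     t.add(2, -5);             // back to 0 -> swap-and-pop, dirty_procs = {}
//     t.add(1, 3);
//     t.add(3, -3);
//     for (unsigned p : t.dirty_procs) { /* visits only procs 1 and 3 */ }
//     t.clear();                // O(|dirty_procs|), not O(num_procs)
//
// The swap-and-pop in add() keeps dirty_procs duplicate-free, so iteration and
// clear() touch only processors whose accumulated delta is currently non-zero.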
+template +struct FastDeltaTracker { + std::vector dense_vals; // Size: num_procs + std::vector dirty_procs; // List of modified indices + std::vector proc_dirty_index; // Map proc -> index in dirty_procs (num_procs if not dirty) + unsigned num_procs = 0; + + void initialize(unsigned n_procs) { + if (n_procs > num_procs) { + num_procs = n_procs; + dense_vals.resize(num_procs, 0); + dirty_procs.reserve(num_procs); + proc_dirty_index.resize(num_procs, num_procs); + } + } + + inline void add(unsigned proc, comm_weight_t val) { + if (val == 0) + return; + + // If currently 0, it is becoming dirty + if (dense_vals[proc] == 0) { + proc_dirty_index[proc] = static_cast(dirty_procs.size()); + dirty_procs.push_back(proc); + } + + dense_vals[proc] += val; + + // If it returns to 0, remove it from dirty list (Swap and Pop for O(1)) + if (dense_vals[proc] == 0) { + unsigned idx = proc_dirty_index[proc]; + unsigned last_proc = dirty_procs.back(); + + // Move last element to the hole + dirty_procs[idx] = last_proc; + proc_dirty_index[last_proc] = idx; + + // Remove last + dirty_procs.pop_back(); + proc_dirty_index[proc] = num_procs; + } + } + + inline comm_weight_t get(unsigned proc) const { + if (proc < dense_vals.size()) + return dense_vals[proc]; + return 0; + } + + inline void clear() { + for (unsigned p : dirty_procs) { + dense_vals[p] = 0; + proc_dirty_index[p] = num_procs; + } + dirty_procs.clear(); + } +}; + +template struct kl_bsp_comm_cost_function { - + using VertexType = vertex_idx_t; using kl_move = kl_move_struct; using kl_gain_update_info = kl_update_info; + using comm_weight_t = v_commw_t; constexpr static unsigned window_range = 2 * window_size + 1; + constexpr static bool is_max_comm_cost_function = true; kl_active_schedule *active_schedule; compatible_processor_range *proc_range; const Graph_t *graph; const BspInstance *instance; - max_comm_datastructure comm_ds; + max_comm_datastructure> comm_ds; inline cost_t get_comm_multiplier() { return 1; } inline cost_t get_max_comm_weight() { return comm_ds.max_comm_weight; } inline cost_t get_max_comm_weight_multiplied() { return comm_ds.max_comm_weight; } inline const std::string name() const { return "bsp_comm"; } - inline bool is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); } - inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return (node_step < window_size + start_step) ? window_size - (node_step - start_step) : 0; } - inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return (node_step + window_size <= end_step) ? window_range : window_range - (node_step + window_size - end_step); } + inline bool is_compatible(VertexType node, unsigned proc) { + return active_schedule->getInstance().isCompatible(node, proc); + } + inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { + return (node_step < window_size + start_step) ? window_size - (node_step - start_step) : 0; + } + inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { + return (node_step + window_size <= end_step) ? 
window_range + : window_range - (node_step + window_size - end_step); + } - void initialize(kl_active_schedule &sched, compatible_processor_range &p_range) { + void initialize(kl_active_schedule &sched, + compatible_processor_range &p_range) { active_schedule = &sched; proc_range = &p_range; instance = &sched.getInstance(); graph = &instance->getComputationalDag(); const unsigned num_steps = active_schedule->num_steps(); - comm_ds.initialize(active_schedule->getSetSchedule(), *instance, num_steps); - comm_ds.set_active_schedule(*active_schedule); + comm_ds.initialize(*active_schedule); + } + + using pre_move_comm_data_t = pre_move_comm_data; + + inline pre_move_comm_data get_pre_move_comm_data(const kl_move &move) { + return comm_ds.get_pre_move_comm_data(move); } - void compute_send_receive_datastructures() { + void compute_send_receive_datastructures() { comm_ds.compute_comm_datastructures(0, active_schedule->num_steps() - 1); } template cost_t compute_schedule_cost() { - if constexpr (compute_datastructures) compute_send_receive_datastructures(); + if constexpr (compute_datastructures) + compute_send_receive_datastructures(); cost_t total_cost = 0; for (unsigned step = 0; step < active_schedule->num_steps(); step++) { total_cost += active_schedule->get_step_max_work(step); total_cost += comm_ds.step_max_comm(step) * instance->communicationCosts(); } - total_cost += static_cast(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); + + if (active_schedule->num_steps() > 1) { + total_cost += static_cast(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); + } + return total_cost; } cost_t compute_schedule_cost_test() { return compute_schedule_cost(); } - void update_datastructure_after_move(const kl_move &move, const unsigned start_step, const unsigned end_step) { + void update_datastructure_after_move(const kl_move &move, const unsigned start_step, const unsigned end_step) { comm_ds.update_datastructure_after_move(move, start_step, end_step); } - template + // Structure to hold thread-local scratchpads to avoid re-allocation. 
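// [Editorial sketch, not part of the patch] Intended lifecycle of the scratchpad
// defined below, as compute_comm_affinity uses it (num_steps and num_procs are
// taken from the active schedule and the instance):
//
//     static thread_local ScratchData scratch;
//     scratch.init(num_steps, num_procs);     // grow-only; reuses prior capacity
//     scratch.clear_all();                    // O(#steps touched last time)
//     ...
//     scratch.mark_active(step);              // record a step before writing deltas
//     scratch.send_deltas[step].add(proc, w);
//
// Because clear_all() walks only active_steps, a candidate evaluation that touches
// few supersteps pays nothing for the untouched ones.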
+ struct ScratchData { + std::vector> send_deltas; // Size: num_steps + std::vector> recv_deltas; // Size: num_steps + + std::vector active_steps; // List of steps touched in current operation + std::vector step_is_active; // Fast lookup for active steps + + std::vector> child_cost_buffer; + + void init(unsigned n_steps, unsigned n_procs) { + if (send_deltas.size() < n_steps) { + send_deltas.resize(n_steps); + recv_deltas.resize(n_steps); + step_is_active.resize(n_steps, false); + active_steps.reserve(n_steps); + } + + for (auto &tracker : send_deltas) + tracker.initialize(n_procs); + for (auto &tracker : recv_deltas) + tracker.initialize(n_procs); + + child_cost_buffer.reserve(n_procs); + } + + void clear_all() { + for (unsigned step : active_steps) { + send_deltas[step].clear(); + recv_deltas[step].clear(); + step_is_active[step] = false; + } + active_steps.clear(); + child_cost_buffer.clear(); + } + + void mark_active(unsigned step) { + if (!step_is_active[step]) { + step_is_active[step] = true; + active_steps.push_back(step); + } + } + }; + + template void compute_comm_affinity(VertexType node, affinity_table_t &affinity_table_node, const cost_t &penalty, const cost_t &reward, const unsigned start_step, const unsigned end_step) { + // Use static thread_local scratchpad to avoid allocation in hot loop + static thread_local ScratchData scratch; + scratch.init(active_schedule->num_steps(), instance->numberOfProcessors()); + scratch.clear_all(); + const unsigned node_step = active_schedule->assigned_superstep(node); const unsigned node_proc = active_schedule->assigned_processor(node); const unsigned window_bound = end_idx(node_step, end_step); const unsigned node_start_idx = start_idx(node_step, start_step); - const cost_t comm_w_node = graph->vertex_comm_weight(node); + for (const auto &target : instance->getComputationalDag().children(node)) { + const unsigned target_step = active_schedule->assigned_superstep(target); + const unsigned target_proc = active_schedule->assigned_processor(target); + + if (target_step < node_step + (target_proc != node_proc)) { + const unsigned diff = node_step - target_step; + const unsigned bound = window_size > diff ? window_size - diff : 0; + unsigned idx = node_start_idx; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] -= reward; + } + } + if (window_size >= diff && is_compatible(node, target_proc)) { + affinity_table_node[target_proc][idx] -= reward; + } + } else { + const unsigned diff = target_step - node_step; + unsigned idx = window_size + diff; + if (idx < window_bound && is_compatible(node, target_proc)) { + affinity_table_node[target_proc][idx] -= penalty; + } + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] += penalty; + } + } + } + } + + for (const auto &source : instance->getComputationalDag().parents(node)) { + const unsigned source_step = active_schedule->assigned_superstep(source); + const unsigned source_proc = active_schedule->assigned_processor(source); + + if (source_step < node_step + (source_proc == node_proc)) { + const unsigned diff = node_step - source_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; + unsigned idx = node_start_idx; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] += penalty; + } + } + if (idx - 1 < bound && is_compatible(node, source_proc)) { + affinity_table_node[source_proc][idx - 1] -= penalty; + } + } else { + const unsigned diff = source_step - node_step; + unsigned idx = std::min(window_size + diff, window_bound); + if (idx < window_bound && is_compatible(node, source_proc)) { + affinity_table_node[source_proc][idx] -= reward; + } + idx++; + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] -= reward; + } + } + } + } + + const comm_weight_t comm_w_node = graph->vertex_comm_weight(node); + const auto ¤t_vec_schedule = active_schedule->getVectorSchedule(); + + auto add_delta = [&](bool is_recv, unsigned step, unsigned proc, comm_weight_t val) { + if (val == 0) + return; + if (step < active_schedule->num_steps()) { + scratch.mark_active(step); + if (is_recv) + scratch.recv_deltas[step].add(proc, val); + else + scratch.send_deltas[step].add(proc, val); + } + }; + + // 1. Remove Node from Current State (Phase 1 - Invariant for all candidates) + + // Outgoing (Children) + // Child stops receiving from node_proc at node_step + auto node_lambda_entries = comm_ds.node_lambda_map.iterate_proc_entries(node); + comm_weight_t total_send_cost_removed = 0; - const auto ¤t_set_schedule = active_schedule->getSetSchedule(); + for (const auto [proc, count] : node_lambda_entries) { + if (proc != node_proc) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(node_proc, proc); + if (cost > 0) { + add_delta(true, node_step, proc, -cost); + total_send_cost_removed += cost; + } + } + } + if (total_send_cost_removed > 0) { + add_delta(false, node_step, node_proc, -total_send_cost_removed); + } + + // Incoming (Parents) + for (const auto &u : graph->parents(node)) { + const unsigned u_proc = active_schedule->assigned_processor(u); + const unsigned u_step = current_vec_schedule.assignedSuperstep(u); + const comm_weight_t comm_w_u = graph->vertex_comm_weight(u); + + if (u_proc != node_proc) { + if (comm_ds.node_lambda_map.get_proc_entry(u, node_proc) == 1) { + const comm_weight_t cost = comm_w_u * instance->sendCosts(u_proc, node_proc); + if (cost > 0) { + add_delta(true, u_step, node_proc, -cost); + add_delta(false, u_step, u_proc, -cost); + } + } + } + } + + // 2. Add Node to Target (Iterate candidates) + + for (const unsigned p_to : proc_range->compatible_processors_vertex(node)) { + + // --- Part A: Incoming Edges (Parents -> p_to) --- + // These updates are specific to p_to but independent of s_to. + // We apply them, run the s_to loop, then revert them. 
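// [Editorial sketch, not part of the patch] The apply/evaluate/revert pattern used
// from here on, in schematic form (names refer to the surrounding code):
//
//     apply Part A deltas for p_to              // parents -> p_to, independent of s_to
//     for each candidate step s_to in the window:
//         apply Part B deltas for (p_to, s_to)  // node -> children
//         gain = sum over scratch.active_steps of calculate_step_cost_change(...)
//         revert Part B deltas                  // exact negatives restore Part A state
//     revert Part A deltas                      // trackers return to the Phase 1 state
//
// Reverting with the exact negative deltas means no candidate ever sees another
// candidate's edits, while the Phase 1 (node removal) deltas stay in place for all.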
+ + for (const auto &u : graph->parents(node)) { + const unsigned u_proc = active_schedule->assigned_processor(u); + const unsigned u_step = current_vec_schedule.assignedSuperstep(u); + const comm_weight_t comm_w_u = graph->vertex_comm_weight(u); + + if (u_proc != p_to) { + bool already_sending_to_p_to = false; + unsigned count_on_p_to = comm_ds.node_lambda_map.get_proc_entry(u, p_to); + + if (p_to == node_proc) { + if (count_on_p_to > 0) + count_on_p_to--; + } + + if (count_on_p_to > 0) { + already_sending_to_p_to = true; + } + + if (!already_sending_to_p_to) { + const comm_weight_t cost = comm_w_u * instance->sendCosts(u_proc, p_to); + if (cost > 0) { + add_delta(true, u_step, p_to, cost); + add_delta(false, u_step, u_proc, cost); + } + } + } + } - for (unsigned p_to = 0; p_to < instance->numberOfProcessors(); ++p_to) { - if (!is_compatible(node, p_to)) continue; + // --- Part B: Outgoing Edges (Node -> Children) --- + // These depend on which processors children are on. + scratch.child_cost_buffer.clear(); + comm_weight_t total_send_cost_added = 0; + + for (const auto [v_proc, count] : comm_ds.node_lambda_map.iterate_proc_entries(node)) { + if (v_proc != p_to) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(p_to, v_proc); + if (cost > 0) { + scratch.child_cost_buffer.push_back({v_proc, cost}); + total_send_cost_added += cost; + } + } + } + // Iterate Window (s_to) for (unsigned s_to_idx = node_start_idx; s_to_idx < window_bound; ++s_to_idx) { unsigned s_to = node_step + s_to_idx - window_size; - cost_t comm_cost_change = 0; - - const auto pre_move_data_from = comm_ds.get_pre_move_comm_data_step(node_step); - const auto pre_move_data_to = comm_ds.get_pre_move_comm_data_step(s_to); - - // --- Outgoing communication from `node` --- - // From - for (const auto [proc, count] : comm_ds.node_lambda_map.iterate_proc_entries(node)) { - comm_cost_change += calculate_comm_cost_change_send(node_step, node_proc, comm_w_node, -1, pre_move_data_from); - } - // To - lambda_vector_container temp_lambda_map; // Use a temporary map for 'to' state - temp_lambda_map.initialize(1, instance->numberOfProcessors()); - for (const auto &v : graph->children(node)) { - const unsigned v_proc = current_set_schedule.assignedProcessor(v); - - if (p_to != v_proc) { - if (temp_lambda_map.increase_proc_count(0, v_proc)) { - comm_cost_change -= calculate_comm_cost_change_send(s_to, p_to, comm_w_node, 1, pre_move_data_to); - comm_cost_change -= calculate_comm_cost_change_receive(s_to, v_proc, comm_w_node, 1, pre_move_data_to); - } + + // Apply Outgoing Deltas for this specific step s_to + for (const auto &[v_proc, cost] : scratch.child_cost_buffer) { + add_delta(true, s_to, v_proc, cost); + } + + if (total_send_cost_added > 0) { + add_delta(false, s_to, p_to, total_send_cost_added); + } + + cost_t total_change = 0; + + // Only check steps that are active (modified in Phase 1, Part A, or Part B) + for (unsigned step : scratch.active_steps) { + // Check if dirty_procs is empty implies no change for this step + // FastDeltaTracker ensures dirty_procs is empty if all deltas summed to 0 + if (!scratch.send_deltas[step].dirty_procs.empty() || + !scratch.recv_deltas[step].dirty_procs.empty()) { + + total_change += + calculate_step_cost_change(step, scratch.send_deltas[step], scratch.recv_deltas[step]); } } - // --- Incoming communication to `node` --- - for (const auto &u : graph->parents(node)) { - const unsigned u_proc = active_schedule->assigned_processor(u); - const unsigned u_step = 
current_set_schedule.assignedSuperstep(u); - const cost_t comm_w_u = graph->vertex_comm_weight(u); - const auto pre_move_data_u = comm_ds.get_pre_move_comm_data_step(u_step); - - // From - if (u_proc != node_proc) { - // Send part (from parent u) & Receive part (at node_proc) // TODO: this is not correct, the lambda map is not updated - if (comm_ds.node_lambda_map.get_proc_entry(u, node_proc) == 1) { // if node is the only child on this proc - comm_cost_change += calculate_comm_cost_change_send(u_step, u_proc, comm_w_u, -1, pre_move_data_u); - comm_cost_change += calculate_comm_cost_change_receive(u_step, node_proc, comm_w_u, -1, pre_move_data_u); - } + affinity_table_node[p_to][s_to_idx] += total_change * instance->communicationCosts(); + + // Revert Outgoing Deltas for s_to (Inverse of Apply) + for (const auto &[v_proc, cost] : scratch.child_cost_buffer) { + add_delta(true, s_to, v_proc, -cost); + } + if (total_send_cost_added > 0) { + add_delta(false, s_to, p_to, -total_send_cost_added); + } + } + + // Revert Incoming Deltas (Inverse of Part A) + for (const auto &u : graph->parents(node)) { + const unsigned u_proc = active_schedule->assigned_processor(u); + const unsigned u_step = current_vec_schedule.assignedSuperstep(u); + const comm_weight_t comm_w_u = graph->vertex_comm_weight(u); + + if (u_proc != p_to) { + bool already_sending_to_p_to = false; + unsigned count_on_p_to = comm_ds.node_lambda_map.get_proc_entry(u, p_to); + if (p_to == node_proc) { + if (count_on_p_to > 0) + count_on_p_to--; } - // To - if (u_proc != p_to) { - // Send part (from parent u) & Receive part (at p_to) - // This logic is complex for an affinity calculation. - // A full recompute for neighbors is a safer bet, which is what update_node_comm_affinity does. // TODO: this is not true anymore - // The following is an approximation. 
- - // if moving node to p_to creates a new communication link for parent u - bool has_other_on_p_to = false; - for(const auto& sibling : graph->children(u)) { - if (sibling != node && active_schedule->assigned_processor(sibling) == p_to) { has_other_on_p_to = true; break; } - } - if (!has_other_on_p_to) { - comm_cost_change -= calculate_comm_cost_change_send(u_step, u_proc, comm_w_u, 1, pre_move_data_u); - comm_cost_change -= calculate_comm_cost_change_receive(u_step, p_to, comm_w_u, 1, pre_move_data_u); + if (count_on_p_to > 0) + already_sending_to_p_to = true; + + if (!already_sending_to_p_to) { + const comm_weight_t cost = comm_w_u * instance->sendCosts(u_proc, p_to); + if (cost > 0) { + add_delta(true, u_step, p_to, -cost); + add_delta(false, u_step, u_proc, -cost); } } } - affinity_table_node[p_to][s_to_idx] += comm_cost_change * instance->communicationCosts(); } } } - cost_t calculate_comm_cost_change_send(unsigned step, unsigned p_send, cost_t comm_w, int sign, const pre_move_comm_data& pre_move_data) { - cost_t old_max = pre_move_data.from_step_max_comm; + comm_weight_t calculate_step_cost_change(unsigned step, const FastDeltaTracker &delta_send, + const FastDeltaTracker &delta_recv) { - cost_t new_send = comm_ds.step_proc_send(step, p_send) + sign * comm_w; - cost_t new_max_send = comm_ds.step_max_send(step); - if (new_send > new_max_send) new_max_send = new_send; - else if (comm_ds.step_proc_send(step, p_send) == new_max_send) { - if (sign < 0 && comm_ds.step_max_send_processor_count[step] == 1) { - new_max_send = comm_ds.step_second_max_send(step); - } else { - new_max_send = new_send; - } - } + comm_weight_t old_max = comm_ds.step_max_comm(step); + comm_weight_t second_max = comm_ds.step_second_max_comm(step); + unsigned old_max_count = comm_ds.step_max_comm_count(step); - return std::max(new_max_send, comm_ds.step_max_receive(step)) - old_max; - } + comm_weight_t new_global_max = 0; + unsigned reduced_max_instances = 0; - cost_t calculate_comm_cost_change_receive(unsigned step, unsigned p_receive, cost_t comm_w, int sign, const pre_move_comm_data& pre_move_data) { - cost_t old_max = pre_move_data.from_step_max_comm; + // 1. Check modified sends (Iterate sparse dirty list) + for (unsigned proc : delta_send.dirty_procs) { + comm_weight_t delta = delta_send.get(proc); + // delta cannot be 0 here due to FastDeltaTracker invariant - cost_t new_receive = comm_ds.step_proc_receive(step, p_receive) + sign * comm_w; + comm_weight_t current_val = comm_ds.step_proc_send(step, proc); + comm_weight_t new_val = current_val + delta; - cost_t new_max_receive = comm_ds.step_max_receive(step); - if (new_receive > new_max_receive) new_max_receive = new_receive; - else if (comm_ds.step_proc_receive(step, p_receive) == new_max_receive) { - if (sign < 0 && comm_ds.step_max_receive_processor_count[step] == 1) { - new_max_receive = comm_ds.step_second_max_receive(step); - } else { - new_max_receive = new_receive; - } + if (new_val > new_global_max) + new_global_max = new_val; + if (delta < 0 && current_val == old_max) + reduced_max_instances++; } - return std::max(comm_ds.step_max_send(step), new_max_receive) - old_max; - } + // 2. 
Check modified receives (Iterate sparse dirty list) + for (unsigned proc : delta_recv.dirty_procs) { + comm_weight_t delta = delta_recv.get(proc); + + comm_weight_t current_val = comm_ds.step_proc_receive(step, proc); + comm_weight_t new_val = current_val + delta; + + if (new_val > new_global_max) + new_global_max = new_val; + if (delta < 0 && current_val == old_max) + reduced_max_instances++; + } - cost_t calculate_comm_cost_change(unsigned step, unsigned p_send, unsigned p_receive, cost_t comm_w, int sign) { - const auto pre_move_data = comm_ds.get_pre_move_comm_data_step(step); - cost_t change = 0; - change += calculate_comm_cost_change_send(step, p_send, comm_w, sign, pre_move_data); - comm_ds.step_proc_send(step, p_send) += sign * comm_w; - change += calculate_comm_cost_change_receive(step, p_receive, comm_w, sign, pre_move_data); - comm_ds.step_proc_send(step, p_send) -= sign * comm_w; // revert for next calculation - return change; + // 3. Determine result + if (new_global_max > old_max) { + return new_global_max - old_max; + } + if (reduced_max_instances < old_max_count) { + return 0; + } + return std::max(new_global_max, second_max) - old_max; } - template + template void update_node_comm_affinity(const kl_move &move, thread_data_t &thread_data, const cost_t &penalty, - const cost_t &reward, std::map &max_gain_recompute, + const cost_t &reward, std::map &, std::vector &new_nodes) { - // For simplicity and correctness, we will do a full recompute for neighbors. - // A fully incremental update is very complex for this cost function. - auto process_neighbor = [&](VertexType neighbor) { - if (thread_data.lock_manager.is_locked(neighbor)) return; - if (not thread_data.affinity_table.is_selected(neighbor)) { - new_nodes.push_back(neighbor); - return; + + const unsigned start_step = thread_data.start_step; + const unsigned end_step = thread_data.end_step; + + for (const auto &target : instance->getComputationalDag().children(move.node)) { + const unsigned target_step = active_schedule->assigned_superstep(target); + if (target_step < start_step || target_step > end_step) + continue; + + if (thread_data.lock_manager.is_locked(target)) + continue; + + if (not thread_data.affinity_table.is_selected(target)) { + new_nodes.push_back(target); + continue; } - if (max_gain_recompute.find(neighbor) == max_gain_recompute.end()) { - max_gain_recompute[neighbor] = kl_gain_update_info(neighbor, true); + + const unsigned target_proc = active_schedule->assigned_processor(target); + const unsigned target_start_idx = start_idx(target_step, start_step); + auto &affinity_table = thread_data.affinity_table.at(target); + + if (move.from_step < target_step + (move.from_proc == target_proc)) { + const unsigned diff = target_step - move.from_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; + unsigned idx = target_start_idx; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + affinity_table[p][idx] -= penalty; + } + } + + if (idx - 1 < bound && is_compatible(target, move.from_proc)) { + affinity_table[move.from_proc][idx - 1] += penalty; + } + } else { - max_gain_recompute[neighbor].full_update = true; + const unsigned diff = move.from_step - target_step; + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + + if (idx < window_bound && is_compatible(target, move.from_proc)) { + affinity_table[move.from_proc][idx] += reward; + } + + idx++; + + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + affinity_table[p][idx] += reward; + } + } } - }; - for (const auto &target : graph->children(move.node)) { - process_neighbor(target); + if (move.to_step < target_step + (move.to_proc == target_proc)) { + unsigned idx = target_start_idx; + const unsigned diff = target_step - move.to_step; + const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + affinity_table[p][idx] += penalty; + } + } + + if (idx - 1 < bound && is_compatible(target, move.to_proc)) { + affinity_table[move.to_proc][idx - 1] -= penalty; + } + + } else { + const unsigned diff = move.to_step - target_step; + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + + if (idx < window_bound && is_compatible(target, move.to_proc)) { + affinity_table[move.to_proc][idx] -= reward; + } + + idx++; + + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + affinity_table[p][idx] -= reward; + } + } + } } - for (const auto &source : graph->parents(move.node)) { - process_neighbor(source); + + for (const auto &source : instance->getComputationalDag().parents(move.node)) { + const unsigned source_step = active_schedule->assigned_superstep(source); + if (source_step < start_step || source_step > end_step) + continue; + + if (thread_data.lock_manager.is_locked(source)) + continue; + + if (not thread_data.affinity_table.is_selected(source)) { + new_nodes.push_back(source); + continue; + } + + const unsigned source_proc = active_schedule->assigned_processor(source); + const unsigned source_start_idx = start_idx(source_step, start_step); + const unsigned window_bound = end_idx(source_step, end_step); + auto &affinity_table_source = thread_data.affinity_table.at(source); + + if (move.from_step < source_step + (move.from_proc != source_proc)) { + const unsigned diff = source_step - move.from_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; + unsigned idx = source_start_idx; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + affinity_table_source[p][idx] += reward; + } + } + + if (window_size >= diff && is_compatible(source, move.from_proc)) { + affinity_table_source[move.from_proc][idx] += reward; + } + + } else { + const unsigned diff = move.from_step - source_step; + unsigned idx = window_size + diff; + + if (idx < window_bound && is_compatible(source, move.from_proc)) { + affinity_table_source[move.from_proc][idx] += penalty; + } + + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + affinity_table_source[p][idx] -= penalty; + } + } + } + + if (move.to_step < source_step + (move.to_proc != source_proc)) { + const unsigned diff = source_step - move.to_step; + const unsigned bound = window_size > diff ? window_size - diff : 0; + unsigned idx = source_start_idx; + for (; idx < bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + affinity_table_source[p][idx] -= reward; + } + } + + if (window_size >= diff && is_compatible(source, move.to_proc)) { + affinity_table_source[move.to_proc][idx] -= reward; + } + + } else { + const unsigned diff = move.to_step - source_step; + unsigned idx = window_size + diff; + + if (idx < window_bound && is_compatible(source, move.to_proc)) { + affinity_table_source[move.to_proc][idx] -= penalty; + } + for (; idx < window_bound; idx++) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + affinity_table_source[p][idx] += penalty; + } + } + } } } }; diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp index 6b6f25b5..50384c72 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp @@ -32,7 +32,8 @@ struct kl_hyper_total_comm_cost_function { using kl_gain_update_info = kl_update_info; constexpr static unsigned window_range = 2 * window_size + 1; - + constexpr static bool is_max_comm_cost_function = false; + kl_active_schedule *active_schedule; compatible_processor_range *proc_range; @@ -43,7 +44,7 @@ struct kl_hyper_total_comm_cost_function { cost_t comm_multiplier = 1; cost_t max_comm_weight = 0; - lambda_vector_container node_lambda_map; + lambda_vector_container node_lambda_map; inline cost_t get_comm_multiplier() { return comm_multiplier; } inline cost_t get_max_comm_weight() { return max_comm_weight; } @@ -60,6 +61,12 @@ struct kl_hyper_total_comm_cost_function { node_lambda_map.initialize(graph->num_vertices(), instance->numberOfProcessors()); } + struct empty_struct {}; + + using pre_move_comm_data_t = empty_struct; + + inline empty_struct get_pre_move_comm_data(const kl_move& ) { return empty_struct(); } + cost_t compute_schedule_cost() { cost_t work_costs = 0; for (unsigned step = 0; step < active_schedule->num_steps(); step++) { diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp index 7d0d61ea..be7c627c 100644 --- 
a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp @@ -29,6 +29,8 @@ struct kl_total_comm_cost_function { using kl_move = kl_move_struct; using kl_gain_update_info = kl_update_info; + constexpr static bool is_max_comm_cost_function = false; + constexpr static unsigned window_range = 2 * window_size + 1; constexpr static bool use_node_communication_costs = use_node_communication_costs_arg || not has_edge_weights_v; @@ -58,6 +60,10 @@ struct kl_total_comm_cost_function { comm_multiplier = 1.0 / instance->numberOfProcessors(); } + struct empty_struct {}; + using pre_move_comm_data_t = empty_struct; + inline empty_struct get_pre_move_comm_data(const kl_move& ) { return empty_struct(); } + cost_t compute_schedule_cost_test() { return compute_schedule_cost(); } diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/lambda_container.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/lambda_container.hpp index fd126699..0eccc815 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/lambda_container.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/lambda_container.hpp @@ -18,24 +18,101 @@ limitations under the License. #pragma once -#include -#include #include +#include +#include namespace osp { +/** + * @brief Container for tracking child processor assignments in a BSP schedule using hash maps. + * + * This structure tracks how many children a node has that are assigned to each processor. + * It uses unordered_map for sparse data representation. + * + * For each node, the map stores the count of children assigned to each processor, which is + * important for computing communication costs in BSP scheduling. + */ +template struct lambda_map_container { - std::vector> node_lambda_map; + /// Vector of maps: for each node, maps processor ID to assignment count + std::vector> node_lambda_map; + + /** + * @brief Initialize the container for a given number of vertices. + * @param num_vertices Number of nodes in the schedule + * @param (unused) Number of processors (not needed for map-based implementation) + */ + inline void initialize(const vertex_idx_t num_vertices, const unsigned) { node_lambda_map.resize(num_vertices); } + + /** + * @brief Reset all processor assignments for a specific node. + * @param node Node index to reset + */ + inline void reset_node(const vertex_idx_t node) { node_lambda_map[node].clear(); } - inline void initialize(const size_t num_vertices, const unsigned) { node_lambda_map.resize(num_vertices); } - inline void reset_node(const size_t node) { node_lambda_map[node].clear(); } + /** + * @brief Clear all data from the container. + */ inline void clear() { node_lambda_map.clear(); } - inline bool has_proc_entry(const size_t node, const unsigned proc) const { return (node_lambda_map[node].find(proc) != node_lambda_map[node].end()); } - inline bool has_no_proc_entry(const size_t node, const unsigned proc) const { return (node_lambda_map[node].find(proc) == node_lambda_map[node].end()); } - inline unsigned & get_proc_entry(const size_t node, const unsigned proc) { return node_lambda_map[node][proc]; } - inline bool increase_proc_count(const size_t node, const unsigned proc) { + /** + * @brief Check if a processor has an entry for a given node. 
+ * @param node Node index + * @param proc Processor ID + * @return true if the processor has at least one assignment to the node + */ + inline bool has_proc_entry(const vertex_idx_t node, const unsigned proc) const { + return (node_lambda_map[node].find(proc) != node_lambda_map[node].end()); + } + + /** + * @brief Check if a processor has no entry for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if the processor has no assignments to the node + */ + inline bool has_no_proc_entry(const vertex_idx_t node, const unsigned proc) const { + return (node_lambda_map[node].find(proc) == node_lambda_map[node].end()); + } + + /** + * @brief Get a reference to the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return Reference to the count (creates entry if it doesn't exist) + */ + inline unsigned &get_proc_entry(const vertex_idx_t node, const unsigned proc) { return node_lambda_map[node][proc]; } + + /** + * @brief Get the processor count for a given node (const version). + * @param node Node index + * @param proc Processor ID + * @return The count value for the processor at the node + * @pre has_proc_entry(node, proc) must be true + */ + inline unsigned get_proc_entry(const vertex_idx_t node, const unsigned proc) const { + assert(has_proc_entry(node, proc)); + return node_lambda_map[node].at(proc); + } + + /** + * @brief Get the number of different processors to which a node has children assigned. + * @param node Node index + * @return The count of different processors the node is sending to + */ + inline unsigned get_proc_count(const vertex_idx_t node) const { + return static_cast(node_lambda_map[node].size()); + } + + /** + * @brief Increase the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if this is the first assignment of this processor to the node + */ + inline bool increase_proc_count(const vertex_idx_t node, const unsigned proc) { if (has_proc_entry(node, proc)) { node_lambda_map[node][proc]++; return false; @@ -45,7 +122,14 @@ struct lambda_map_container { } } - inline bool decrease_proc_count(const size_t node, const unsigned proc) { + /** + * @brief Decrease the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if this was the last assignment of this processor to the node + * @pre has_proc_entry(node, proc) must be true + */ + inline bool decrease_proc_count(const vertex_idx_t node, const unsigned proc) { assert(has_proc_entry(node, proc)); if (node_lambda_map[node][proc] == 1) { node_lambda_map[node].erase(proc); @@ -56,40 +140,80 @@ struct lambda_map_container { } } - inline const auto & iterate_proc_entries(const size_t node) { - return node_lambda_map[node]; - } + /** + * @brief Get an iterable view of all processor entries for a node. + * @param node Node index + * @return Reference to the unordered_map of processor assignments for the node + */ + inline const auto &iterate_proc_entries(const vertex_idx_t node) { return node_lambda_map[node]; } }; +/** + * @brief Container for tracking child processor assignments in a BSP schedule using vectors. + * + * This structure tracks how many children a node has that are assigned to each processor. + * It uses a 2D vector for dense data, making it efficient when most processors may have + * children of nodes assigned to them, or when the processor count is relatively small. 
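 *
 * Editorial example (not part of the patch), assuming the template parameter is the
 * vertex index type: with 4 processors, a node v whose three children sit on
 * processors 0, 0 and 2 is stored as the row [2, 0, 1, 0]:
 *
 *     lambda_vector_container<std::size_t> lambda;
 *     lambda.initialize(num_vertices, 4);
 *     lambda.increase_proc_count(v, 0);   // true  (first child of v on proc 0)
 *     lambda.increase_proc_count(v, 0);   // false (count becomes 2)
 *     lambda.increase_proc_count(v, 2);   // true
 *     for (auto [proc, count] : lambda.iterate_proc_entries(v)) {
 *         // yields (0, 2) then (2, 1); zero entries are skipped
 *     }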
+ * + * For each node, the vector stores the count of children assigned to each processor, which is + * important for computing communication costs in BSP scheduling. + */ +template struct lambda_vector_container { - + + /** + * @brief Range adapter for iterating over non-zero processor entries. + * + * Provides a range-based for loop interface that automatically skips processors + * with zero assignments. + */ class lambda_vector_range { - private: - const std::vector & vec_; + private: + const std::vector &vec_; - public: + public: + /** + * @brief Iterator that skips zero entries in the lambda vector. + * + * Implements an input iterator that yields pairs of (processor_id, count) + * for all processors with non-zero assignment counts. + */ class lambda_vector_iterator { - + using iterator_category = std::input_iterator_tag; using value_type = std::pair; using difference_type = std::ptrdiff_t; - using pointer = value_type*; - using reference = value_type&; - private: - const std::vector& vec_; - size_t index_; - public: - - lambda_vector_iterator(const std::vector& vec) : vec_(vec), index_(0) { - // Advance to the first valid entry - while (index_ < vec_.size() && vec_[index_] == 0) { - ++index_; + using pointer = value_type *; + using reference = value_type &; + + private: + const std::vector &vec_; + unsigned index_; + + public: + /** + * @brief Construct iterator at the beginning, skipping initial zeros. + * @param vec Reference to the vector to iterate over + */ + lambda_vector_iterator(const std::vector &vec) : vec_(vec), index_(0) { + // Advance to the first valid entry + while (index_ < vec_.size() && vec_[index_] == 0) { + ++index_; + } } - } - lambda_vector_iterator(const std::vector& vec, size_t index) : vec_(vec), index_(index) {} + /** + * @brief Construct iterator at a specific position. + * @param vec Reference to the vector to iterate over + * @param index Starting index + */ + lambda_vector_iterator(const std::vector &vec, unsigned index) : vec_(vec), index_(index) {} - lambda_vector_iterator& operator++() { + /** + * @brief Advance to the next non-zero entry. + * @return Reference to this iterator + */ + lambda_vector_iterator &operator++() { ++index_; while (index_ < vec_.size() && vec_[index_] == 0) { ++index_; @@ -97,58 +221,152 @@ struct lambda_vector_container { return *this; } - value_type operator*() const { - return std::make_pair(static_cast(index_), vec_[index_]); - } + /** + * @brief Dereference to get (processor_id, count) pair. + * @return Pair of processor ID and its count + */ + value_type operator*() const { return std::make_pair(index_, vec_[index_]); } - bool operator==(const lambda_vector_iterator& other) const { - return index_ == other.index_; - } + /** + * @brief Check equality with another iterator. + * @param other Iterator to compare with + * @return true if both iterators point to the same position + */ + bool operator==(const lambda_vector_iterator &other) const { return index_ == other.index_; } - bool operator!=(const lambda_vector_iterator& other) const { - return !(*this == other); - } + /** + * @brief Check inequality with another iterator. + * @param other Iterator to compare with + * @return true if iterators point to different positions + */ + bool operator!=(const lambda_vector_iterator &other) const { return !(*this == other); } }; - lambda_vector_range(const std::vector& vec) : vec_(vec) {} + /** + * @brief Construct a range from a vector. 
+ * @param vec Reference to the vector to create range over + */ + lambda_vector_range(const std::vector &vec) : vec_(vec) {} + /// Get iterator to the first non-zero entry lambda_vector_iterator begin() { return lambda_vector_iterator(vec_); } - lambda_vector_iterator end() { return lambda_vector_iterator(vec_, vec_.size()); } + + /// Get iterator to the end + lambda_vector_iterator end() { return lambda_vector_iterator(vec_, static_cast(vec_.size())); } }; + /// 2D vector: for each node, stores processor assignment counts std::vector> node_lambda_vec; + + /// Number of processors in the system unsigned num_procs_ = 0; - inline void initialize(const size_t num_vertices, const unsigned num_procs) { - node_lambda_vec.assign(num_vertices, {num_procs}); - num_procs_ = num_procs; + /** + * @brief Initialize the container for a given number of vertices and processors. + * @param num_vertices Number of nodes in the schedule + * @param num_procs Number of processors in the system + */ + inline void initialize(const vertex_idx_t num_vertices, const unsigned num_procs) { + node_lambda_vec.assign(num_vertices, std::vector(num_procs, 0)); + num_procs_ = num_procs; } - inline void reset_node(const size_t node) { node_lambda_vec[node].assign(num_procs_, 0); } + /** + * @brief Reset all processor assignments for a specific node. + * @param node Node index to reset + */ + inline void reset_node(const vertex_idx_t node) { node_lambda_vec[node].assign(num_procs_, 0); } + + /** + * @brief Clear all data from the container. + */ inline void clear() { node_lambda_vec.clear(); } - inline bool has_proc_entry(const size_t node, const unsigned proc) const { return node_lambda_vec[node][proc] > 0; } - inline bool has_no_proc_entry(const size_t node, const unsigned proc) const { return node_lambda_vec[node][proc] == 0; } - inline unsigned & get_proc_entry(const size_t node, const unsigned proc) { return node_lambda_vec[node][proc]; } - inline unsigned get_proc_entry(const size_t node, const unsigned proc) const { + /** + * @brief Check if a processor has an entry for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if the processor has at least one assignment to the node + */ + inline bool has_proc_entry(const vertex_idx_t node, const unsigned proc) const { return node_lambda_vec[node][proc] > 0; } + + /** + * @brief Check if a processor has no entry for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if the processor has no assignments to the node + */ + inline bool has_no_proc_entry(const vertex_idx_t node, const unsigned proc) const { + return node_lambda_vec[node][proc] == 0; + } + + /** + * @brief Get a reference to the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return Reference to the count (allows modification) + */ + inline unsigned &get_proc_entry(const vertex_idx_t node, const unsigned proc) { return node_lambda_vec[node][proc]; } + + /** + * @brief Get the processor count for a given node (const version). 
+ * @param node Node index + * @param proc Processor ID + * @return The count value for the processor at the node + * @pre has_proc_entry(node, proc) must be true + */ + inline unsigned get_proc_entry(const vertex_idx_t node, const unsigned proc) const { assert(has_proc_entry(node, proc)); return node_lambda_vec[node][proc]; } - inline bool increase_proc_count(const size_t node, const unsigned proc) { + /** + * @brief Get the processor count for a given node (alias for compatibility). + * @param node Node index + * @param proc Processor ID + * @return The count value for the processor at the node + * @pre has_proc_entry(node, proc) must be true + */ + inline unsigned get_proc_count(const vertex_idx_t node) const { + unsigned count = 0; + for (unsigned proc = 0; proc < num_procs_; ++proc) { + if (node_lambda_vec[node][proc] > 0) { + ++count; + } + } + return count; + } + + /** + * @brief Increase the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if this is the first assignment of this processor to the node + */ + inline bool increase_proc_count(const vertex_idx_t node, const unsigned proc) { node_lambda_vec[node][proc]++; return node_lambda_vec[node][proc] == 1; } - inline bool decrease_proc_count(const size_t node, const unsigned proc) { + /** + * @brief Decrease the processor count for a given node. + * @param node Node index + * @param proc Processor ID + * @return true if this was the last assignment of this processor to the node + * @pre has_proc_entry(node, proc) must be true + */ + inline bool decrease_proc_count(const vertex_idx_t node, const unsigned proc) { assert(has_proc_entry(node, proc)); node_lambda_vec[node][proc]--; return node_lambda_vec[node][proc] == 0; } - inline auto iterate_proc_entries(const size_t node) { - return lambda_vector_range(node_lambda_vec[node]); - } + /** + * @brief Get an iterable range over all non-zero processor entries for a node. + * @param node Node index + * @return Range object that can be used in range-based for loops + */ + inline auto iterate_proc_entries(const vertex_idx_t node) { return lambda_vector_range(node_lambda_vec[node]); } }; } // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp index 82ade586..cc8d8a5a 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp @@ -18,28 +18,39 @@ limitations under the License. 
#pragma once -#include -#include -#include -#include "osp/bsp/model/BspInstance.hpp" #include "lambda_container.hpp" -#include "../kl_active_schedule.hpp" +#include "osp/bsp/model/BspInstance.hpp" +#include +#include +#include + +namespace osp { template struct pre_move_comm_data { - comm_weight_t from_step_max_comm; - comm_weight_t from_step_second_max_comm; + struct step_info { + comm_weight_t max_comm; + comm_weight_t second_max_comm; + unsigned max_comm_count; + }; - comm_weight_t to_step_max_comm; - comm_weight_t to_step_second_max_comm; + std::unordered_map step_data; pre_move_comm_data() = default; - pre_move_comm_data(comm_weight_t from_max, comm_weight_t from_second_max, - comm_weight_t to_max, comm_weight_t to_second_max) - : from_step_max_comm(from_max), from_step_second_max_comm(from_second_max), - to_step_max_comm(to_max), to_step_second_max_comm(to_second_max) {} + void add_step(unsigned step, comm_weight_t max, comm_weight_t second, unsigned count) { + step_data[step] = {max, second, count}; + } + + bool get_step(unsigned step, step_info &info) const { + auto it = step_data.find(step); + if (it != step_data.end()) { + info = it->second; + return true; + } + return false; + } }; template @@ -51,167 +62,292 @@ struct max_comm_datastructure { const BspInstance *instance; const kl_active_schedule_t *active_schedule; - - struct comm_proc { - comm_weight_t comm; - unsigned proc; - - comm_proc() : comm(0), proc(0) {} - comm_proc(comm_weight_t c, unsigned p) : comm(c), proc(p) {} - - bool operator<(comm_proc const &rhs) const { - return (comm > rhs.comm) or (comm == rhs.comm and proc < rhs.proc); - } - }; - - std::vector> step_proc_send_sorted; - std::vector> step_proc_receive_sorted; - std::vector> step_proc_send; - std::vector> step_proc_receive; - - std::vector step_max_send_processor_count; - std::vector step_max_receive_processor_count; + std::vector> step_proc_send_; + std::vector> step_proc_receive_; + // Caches for fast cost calculation (Global Max/Second Max per step) std::vector step_max_comm_cache; std::vector step_second_max_comm_cache; + std::vector step_max_comm_count_cache; comm_weight_t max_comm_weight = 0; - lambda_vector_container node_lambda_map; - - inline comm_weight_t step_proc_send(unsigned step, unsigned proc) const { return step_proc_send[step][proc]; } - inline comm_weight_t& step_proc_send(unsigned step, unsigned proc) { return step_proc_send[step][proc]; } - inline comm_weight_t step_proc_receive(unsigned step, unsigned proc) const { return step_proc_receive[step][proc]; } - inline comm_weight_t& step_proc_receive(unsigned step, unsigned proc) { return step_proc_receive[step][proc]; } + lambda_vector_container node_lambda_map; - inline comm_weight_t step_max_send(unsigned step) const { return step_proc_send_sorted[step][0].comm; } - inline comm_weight_t step_second_max_send(unsigned step) const { - return step_proc_send_sorted[step][step_max_send_processor_count[step]].comm; - } + // Optimization: Scratchpad for update_datastructure_after_move to avoid allocations + std::vector affected_steps_list; + std::vector step_is_affected; - inline comm_weight_t step_max_receive(unsigned step) const { return step_proc_receive_sorted[step][0].comm; } - inline comm_weight_t step_second_max_receive(unsigned step) const { - return step_proc_receive_sorted[step][step_max_receive_processor_count[step]].comm; + inline comm_weight_t step_proc_send(unsigned step, unsigned proc) const { return step_proc_send_[step][proc]; } + inline comm_weight_t &step_proc_send(unsigned 
step, unsigned proc) { return step_proc_send_[step][proc]; } + inline comm_weight_t step_proc_receive(unsigned step, unsigned proc) const { + return step_proc_receive_[step][proc]; } + inline comm_weight_t &step_proc_receive(unsigned step, unsigned proc) { return step_proc_receive_[step][proc]; } inline comm_weight_t step_max_comm(unsigned step) const { return step_max_comm_cache[step]; } - inline comm_weight_t step_second_max_comm(unsigned step) const { - return step_second_max_comm_cache[step]; - } + inline comm_weight_t step_second_max_comm(unsigned step) const { return step_second_max_comm_cache[step]; } + inline unsigned step_max_comm_count(unsigned step) const { return step_max_comm_count_cache[step]; } - template - inline pre_move_comm_data get_pre_move_comm_data(const kl_move_struct& move) { - return pre_move_comm_data( - step_max_comm(move.from_step), step_second_max_comm(move.from_step), - step_max_comm(move.to_step), step_second_max_comm(move.to_step) - ); - } - - template - inline pre_move_comm_data_step get_pre_move_comm_data_step(unsigned step) const { - return pre_move_comm_data_step( - step_max_comm(step), step_second_max_comm(step), 0, 0 - ); - } - - inline void initialize( kl_active_schedule_t &kl_sched) { + inline void initialize(kl_active_schedule_t &kl_sched) { active_schedule = &kl_sched; - instance = & active_schedule->getInstance(); + instance = &active_schedule->getInstance(); const unsigned num_steps = active_schedule->num_steps(); const unsigned num_procs = instance->numberOfProcessors(); max_comm_weight = 0; - step_proc_send.assign(num_steps, std::vector(num_procs, 0)); - step_proc_receive.assign(num_steps, std::vector(num_procs, 0)); - - step_proc_send_sorted.assign(num_steps, std::vector(num_procs)); - step_proc_receive_sorted.assign(num_steps, std::vector(num_procs)); + step_proc_send_.assign(num_steps, std::vector(num_procs, 0)); + step_proc_receive_.assign(num_steps, std::vector(num_procs, 0)); - step_max_send_processor_count.assign(num_steps, 0); - step_max_receive_processor_count.assign(num_steps, 0); step_max_comm_cache.assign(num_steps, 0); step_second_max_comm_cache.assign(num_steps, 0); + step_max_comm_count_cache.assign(num_steps, 0); node_lambda_map.initialize(instance->getComputationalDag().num_vertices(), num_procs); + + // Initialize scratchpad + step_is_affected.assign(num_steps, false); + affected_steps_list.reserve(num_steps); } inline void clear() { - step_proc_send.clear(); - step_proc_receive.clear(); - step_proc_send_sorted.clear(); - step_proc_receive_sorted.clear(); - step_max_send_processor_count.clear(); - step_max_receive_processor_count.clear(); + step_proc_send_.clear(); + step_proc_receive_.clear(); step_max_comm_cache.clear(); step_second_max_comm_cache.clear(); + step_max_comm_count_cache.clear(); node_lambda_map.clear(); + affected_steps_list.clear(); + step_is_affected.clear(); } inline void arrange_superstep_comm_data(const unsigned step) { - for (unsigned p = 0; p < instance->numberOfProcessors(); ++p) { - step_proc_send_sorted[step][p] = {step_proc_send[step][p], p}; - step_proc_receive_sorted[step][p] = {step_proc_receive[step][p], p}; - } - std::sort(step_proc_send_sorted[step].begin(), step_proc_send_sorted[step].end()); - std::sort(step_proc_receive_sorted[step].begin(), step_proc_receive_sorted[step].end()); - - const comm_weight_t max_send = step_proc_send_sorted[step][0].comm; - unsigned send_count = 1; - while (send_count < instance->numberOfProcessors() && step_proc_send_sorted[step][send_count].comm == max_send) { 
- send_count++; + // Linear scan O(P) to find max, second_max and count + + // 1. Analyze Sends + comm_weight_t max_send = 0; + comm_weight_t second_max_send = 0; + unsigned max_send_count = 0; + + const auto &sends = step_proc_send_[step]; + for (const auto val : sends) { + if (val > max_send) { + second_max_send = max_send; + max_send = val; + max_send_count = 1; + } else if (val == max_send) { + max_send_count++; + } else if (val > second_max_send) { + second_max_send = val; + } } - step_max_send_processor_count[step] = send_count; - const comm_weight_t max_receive = step_proc_receive_sorted[step][0].comm; - unsigned receive_count = 1; - while (receive_count < instance->numberOfProcessors() && step_proc_receive_sorted[step][receive_count].comm == max_receive) { - receive_count++; + // 2. Analyze Receives + comm_weight_t max_receive = 0; + comm_weight_t second_max_receive = 0; + unsigned max_receive_count = 0; + + const auto &receives = step_proc_receive_[step]; + for (const auto val : receives) { + if (val > max_receive) { + second_max_receive = max_receive; + max_receive = val; + max_receive_count = 1; + } else if (val == max_receive) { + max_receive_count++; + } else if (val > second_max_receive) { + second_max_receive = val; + } } - step_max_receive_processor_count[step] = receive_count; - step_max_comm_cache[step] = std::max(max_send, max_receive); + // 3. Aggregate Global Stats + const comm_weight_t global_max = std::max(max_send, max_receive); + step_max_comm_cache[step] = global_max; - const comm_weight_t second_max_send = step_proc_send_sorted[step][send_count].comm; - const comm_weight_t second_max_receive = step_proc_receive_sorted[step][receive_count].comm; + unsigned global_count = 0; + if (max_send == global_max) + global_count += max_send_count; + if (max_receive == global_max) + global_count += max_receive_count; + step_max_comm_count_cache[step] = global_count; - step_second_max_comm_cache[step] = std::max(std::max(second_max_send, max_receive), std::max(max_send, second_max_receive)); + // Determine second max + comm_weight_t cand_send = (max_send == global_max) ? second_max_send : max_send; + comm_weight_t cand_recv = (max_receive == global_max) ? second_max_receive : max_receive; + step_second_max_comm_cache[step] = std::max(cand_send, cand_recv); } - void recompute_max_send_receive(unsigned step) { - arrange_superstep_comm_data(step); + void recompute_max_send_receive(unsigned step) { arrange_superstep_comm_data(step); } + + inline pre_move_comm_data get_pre_move_comm_data(const kl_move &move) { + pre_move_comm_data data; + std::unordered_set affected_steps; + + affected_steps.insert(move.from_step); + affected_steps.insert(move.to_step); + + const auto &graph = instance->getComputationalDag(); + + for (const auto &parent : graph.parents(move.node)) { + affected_steps.insert(active_schedule->assigned_superstep(parent)); + } + + for (unsigned step : affected_steps) { + data.add_step(step, step_max_comm(step), step_second_max_comm(step), step_max_comm_count(step)); + } + + return data; } - - void update_datastructure_after_move(const kl_move& move, unsigned start_step, unsigned end_step) { - + + void update_datastructure_after_move(const kl_move &move, unsigned, unsigned) { + const auto &graph = instance->getComputationalDag(); + + // --- 0. 
Prepare Scratchpad (Avoids Allocations) --- + for (unsigned step : affected_steps_list) { + if (step < step_is_affected.size()) + step_is_affected[step] = false; + } + affected_steps_list.clear(); + + auto mark_step = [&](unsigned step) { + if (step < step_is_affected.size() && !step_is_affected[step]) { + step_is_affected[step] = true; + affected_steps_list.push_back(step); + } + }; + + const VertexType node = move.node; + const unsigned from_step = move.from_step; + const unsigned to_step = move.to_step; + const unsigned from_proc = move.from_proc; + const unsigned to_proc = move.to_proc; + const comm_weight_t comm_w_node = graph.vertex_comm_weight(node); + + // --- 1. Handle Node Movement (Outgoing Edges: Node -> Children) --- + + if (from_step != to_step) { + // Case 1: Node changes Step + // Optimization: Fuse the loop to iterate lambda map only once. + + for (const auto [proc, count] : node_lambda_map.iterate_proc_entries(node)) { + // A. Remove Old (Sender: from_proc, Receiver: proc) + if (proc != from_proc) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(from_proc, proc); + // Optimization: check cost > 0 to avoid dirtying cache lines with +0 ops + if (cost > 0) { + step_proc_receive_[from_step][proc] -= cost; + step_proc_send_[from_step][from_proc] -= cost; + } + } + + // B. Add New (Sender: to_proc, Receiver: proc) + if (proc != to_proc) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(to_proc, proc); + if (cost > 0) { + step_proc_receive_[to_step][proc] += cost; + step_proc_send_[to_step][to_proc] += cost; + } + } + } + mark_step(from_step); + mark_step(to_step); + + } else if (from_proc != to_proc) { + // Case 2: Node stays in same Step, but changes Processor + + for (const auto [proc, count] : node_lambda_map.iterate_proc_entries(node)) { + // Remove Old (Sender: from_proc, Receiver: proc) + if (proc != from_proc) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(from_proc, proc); + if (cost > 0) { + step_proc_receive_[from_step][proc] -= cost; + step_proc_send_[from_step][from_proc] -= cost; + } + } + + // Add New (Sender: to_proc, Receiver: proc) + if (proc != to_proc) { + const comm_weight_t cost = comm_w_node * instance->sendCosts(to_proc, proc); + if (cost > 0) { + step_proc_receive_[from_step][proc] += cost; + step_proc_send_[from_step][to_proc] += cost; + } + } + } + mark_step(from_step); + } + + // --- 2. Update Parents' Outgoing Communication (Parents → Node) --- + + if (from_proc != to_proc) { + for (const auto &parent : graph.parents(node)) { + const unsigned parent_step = active_schedule->assigned_superstep(parent); + // Fast boundary check + if (parent_step >= step_proc_send_.size()) + continue; + + const unsigned parent_proc = active_schedule->assigned_processor(parent); + const comm_weight_t comm_w_parent = graph.vertex_comm_weight(parent); + + const bool removed_from_proc = node_lambda_map.decrease_proc_count(parent, from_proc); + const bool added_to_proc = node_lambda_map.increase_proc_count(parent, to_proc); + + // 1. Handle Removal from from_proc + if (removed_from_proc) { + if (from_proc != parent_proc) { + const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, from_proc); + if (cost > 0) { + step_proc_send_[parent_step][parent_proc] -= cost; + step_proc_receive_[parent_step][from_proc] -= cost; + } + } + } + + // 2. 
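Both branches above apply the same delta rule: subtract the old (from_proc -> p) contribution and add the new (to_proc -> p) contribution for every processor p that holds at least one child of the moved node. A condensed standalone form of the same-step case, assuming a per-unit send-cost callback and flat per-processor tallies (all names here are hypothetical):

#include <cstdint>
#include <vector>

using commw = std::uint64_t;

inline void apply_proc_change(std::vector<commw> &send, std::vector<commw> &recv,
                              const std::vector<unsigned> &receiver_procs, // procs holding >= 1 child
                              commw node_comm_weight, unsigned from_proc, unsigned to_proc,
                              commw (*send_cost)(unsigned, unsigned)) {
    for (const unsigned p : receiver_procs) {
        if (p != from_proc) { // retract: from_proc used to send to p
            const commw c = node_comm_weight * send_cost(from_proc, p);
            send[from_proc] -= c;
            recv[p] -= c;
        }
        if (p != to_proc) {   // apply: to_proc now sends to p
            const commw c = node_comm_weight * send_cost(to_proc, p);
            send[to_proc] += c;
            recv[p] += c;
        }
    }
}

The patch additionally skips zero-cost contributions before touching the tables, which avoids dirtying cache lines with no-op updates.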
Handle Addition to to_proc + if (added_to_proc) { + if (to_proc != parent_proc) { + const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, to_proc); + if (cost > 0) { + step_proc_send_[parent_step][parent_proc] += cost; + step_proc_receive_[parent_step][to_proc] += cost; + } + } + } + + mark_step(parent_step); + } + } + + // --- 3. Re-arrange Affected Steps --- + for (unsigned step : affected_steps_list) { + arrange_superstep_comm_data(step); + } } void swap_steps(const unsigned step1, const unsigned step2) { - std::swap(step_proc_send[step1], step_proc_send[step2]); - std::swap(step_proc_receive[step1], step_proc_receive[step2]); - std::swap(step_proc_send_sorted[step1], step_proc_send_sorted[step2]); - std::swap(step_proc_receive_sorted[step1], step_proc_receive_sorted[step2]); - std::swap(step_max_send_processor_count[step1], step_max_send_processor_count[step2]); - std::swap(step_max_receive_processor_count[step1], step_max_receive_processor_count[step2]); + std::swap(step_proc_send_[step1], step_proc_send_[step2]); + std::swap(step_proc_receive_[step1], step_proc_receive_[step2]); std::swap(step_max_comm_cache[step1], step_max_comm_cache[step2]); std::swap(step_second_max_comm_cache[step1], step_second_max_comm_cache[step2]); + std::swap(step_max_comm_count_cache[step1], step_max_comm_count_cache[step2]); } void reset_superstep(unsigned step) { - std::fill(step_proc_send[step].begin(), step_proc_send[step].end(), 0); - std::fill(step_proc_receive[step].begin(), step_proc_receive[step].end(), 0); + std::fill(step_proc_send_[step].begin(), step_proc_send_[step].end(), 0); + std::fill(step_proc_receive_[step].begin(), step_proc_receive_[step].end(), 0); arrange_superstep_comm_data(step); } void compute_comm_datastructures(unsigned start_step, unsigned end_step) { for (unsigned step = start_step; step <= end_step; step++) { - std::fill(step_proc_send[step].begin(), step_proc_send[step].end(), 0); - std::fill(step_proc_receive[step].begin(), step_proc_receive[step].end(), 0); + std::fill(step_proc_send_[step].begin(), step_proc_send_[step].end(), 0); + std::fill(step_proc_receive_[step].begin(), step_proc_receive_[step].end(), 0); } - const auto & vec_sched = active_schedule->getVectorSchedule(); - const auto & graph = instance->getComputationalDag(); + const auto &vec_sched = active_schedule->getVectorSchedule(); + const auto &graph = instance->getComputationalDag(); for (const auto &u : graph.vertices()) { node_lambda_map.reset_node(u); @@ -220,23 +356,29 @@ struct max_comm_datastructure { const comm_weight_t comm_w = graph.vertex_comm_weight(u); max_comm_weight = std::max(max_comm_weight, comm_w); - bool has_child_on_other_proc = false; for (const auto &v : graph.children(u)) { const unsigned v_proc = vec_sched.assignedProcessor(v); - if (u_proc != v_proc) { - if (node_lambda_map.increase_proc_count(u, v_proc)) { - has_child_on_other_proc = true; - step_proc_receive[u_step][v_proc] += comm_w; + const unsigned v_step = vec_sched.assignedSuperstep(v); + const comm_weight_t comm_w_send_cost = (u_proc != v_proc) ? 
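The parent loop above only touches the send/receive tables when decrease_proc_count or increase_proc_count reports a boundary transition. The contract this implies, sketched as a toy counting map (an assumption inferred from the call sites, not the real node_lambda_map):

#include <vector>

struct proc_counts {
    std::vector<unsigned> count; // children of one node, per processor
    explicit proc_counts(unsigned num_procs) : count(num_procs, 0) {}
    bool increase(unsigned p) { return ++count[p] == 1; } // true: p gained its first child
    bool decrease(unsigned p) { return --count[p] == 0; } // true: p lost its last child
};

Only these 0 <-> 1 transitions matter, because a parent pays for an edge to a remote processor once, no matter how many of its children live there.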
comm_w * instance->sendCosts(u_proc, v_proc) : 0; + + if (node_lambda_map.increase_proc_count(u, v_proc)) { + if (u_proc != v_proc && comm_w_send_cost > 0) { + attribute_communication(comm_w_send_cost, u_step, u_proc, v_proc, v_step); } } } - - if(has_child_on_other_proc) - step_proc_send[u_step][u_proc] += comm_w; } - + for (unsigned step = start_step; step <= end_step; step++) { arrange_superstep_comm_data(step); } } -}; \ No newline at end of file + + inline void attribute_communication(const comm_weight_t &comm_w_send_cost, const unsigned u_step, const unsigned u_proc, const unsigned v_proc, + const unsigned) { + step_proc_receive_[u_step][v_proc] += comm_w_send_cost; + step_proc_send_[u_step][u_proc] += comm_w_send_cost; + } +}; + +} // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp index 922e049b..a27ebe9b 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp @@ -31,9 +31,9 @@ limitations under the License. #include "osp/auxiliary/datastructures/heaps/PairingHeap.hpp" #include "osp/auxiliary/misc.hpp" #include "osp/bsp/scheduler/ImprovementScheduler.hpp" +#include "osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp" #include "osp/graph_algorithms/directed_graph_edge_desc_util.hpp" #include "osp/graph_algorithms/directed_graph_util.hpp" -#include "osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp" #include "kl_active_schedule.hpp" #include "kl_util.hpp" @@ -46,12 +46,12 @@ struct kl_parameter { unsigned num_parallel_loops = 4; unsigned max_inner_iterations_reset = 500; - unsigned max_no_improvement_iterations = 50; + unsigned max_no_improvement_iterations = 50; constexpr static unsigned abort_scatter_nodes_violation_threshold = 500; constexpr static unsigned initial_violation_threshold = 250; - unsigned max_no_vioaltions_removed_backtrack_reset; + unsigned max_no_vioaltions_removed_backtrack_reset; unsigned remove_step_epocs; unsigned node_max_step_selection_epochs; unsigned max_no_vioaltions_removed_backtrack_for_remove_step_reset; @@ -61,7 +61,6 @@ struct kl_parameter { unsigned thread_min_range = 8; unsigned thread_range_gap = 0; - }; template @@ -76,12 +75,15 @@ struct kl_update_info { bool update_entire_from_step = false; kl_update_info() = default; - kl_update_info(VertexType n) : node(n), full_update(false), update_entire_to_step(false), update_entire_from_step(false) {} - kl_update_info(VertexType n, bool full) : node(n), full_update(full), update_entire_to_step(false), update_entire_from_step(false) {} + kl_update_info(VertexType n) + : node(n), full_update(false), update_entire_to_step(false), update_entire_from_step(false) {} + kl_update_info(VertexType n, bool full) + : node(n), full_update(full), update_entire_to_step(false), update_entire_from_step(false) {} }; -template +template class kl_improver : public ImprovementScheduler { static_assert(is_directed_graph_edge_desc_v, "Graph_t must satisfy the directed_graph concept"); @@ -89,7 +91,6 @@ class kl_improver : public ImprovementScheduler { static_assert(is_computational_dag_v, "Graph_t must satisfy the computational_dag concept"); protected: - constexpr static unsigned window_range = 2 * window_size + 1; constexpr static bool enable_quick_moves = true; constexpr static bool enable_preresolving_violations = true; @@ -124,7 +125,7 @@ 
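The full rebuild above charges each remote (node, receiving processor) pair exactly once. With hypothetical numbers: comm_weight(u) = 3, sendCosts(p0, p1) = 2, u on p0 with two children on p1. attribute_communication then adds 3 * 2 = 6 once to step_proc_send_[step(u)][p0] and once to step_proc_receive_[step(u)][p1]; the second child on p1 adds nothing, since only the first increase_proc_count for that processor returns true.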
class kl_improver : public ImprovementScheduler { double average_gain = 0.0; unsigned max_inner_iterations = 0; - unsigned no_improvement_iterations_reduce_penalty = 0; + unsigned no_improvement_iterations_reduce_penalty = 0; unsigned min_inner_iter = 0; unsigned no_improvement_iterations_increase_inner_iter = 0; unsigned step_selection_epoch_counter = 0; @@ -136,9 +137,13 @@ class kl_improver : public ImprovementScheduler { unsigned max_no_vioaltions_removed_backtrack = 0; inline unsigned num_steps() const { return end_step - start_step + 1; } - inline unsigned start_idx(const unsigned node_step) const { return node_step < start_step + window_size ? window_size - (node_step - start_step) : 0; } - inline unsigned end_idx(unsigned node_step) const { return node_step + window_size <= end_step ? window_range : window_range - (node_step + window_size - end_step); } - + inline unsigned start_idx(const unsigned node_step) const { + return node_step < start_step + window_size ? window_size - (node_step - start_step) : 0; + } + inline unsigned end_idx(unsigned node_step) const { + return node_step + window_size <= end_step ? window_range + : window_range - (node_step + window_size - end_step); + } }; bool compute_with_time_limit = false; @@ -151,33 +156,38 @@ class kl_improver : public ImprovementScheduler { kl_parameter parameters; std::mt19937 gen; - + active_schedule_t active_schedule; comm_cost_function_t comm_cost_f; std::vector thread_data_vec; std::vector thread_finished_vec; - - inline unsigned rel_step_idx(const unsigned node_step, const unsigned move_step) const { return (move_step >= node_step) ? ((move_step - node_step) + window_size) : (window_size - (node_step - move_step)); } - inline bool is_compatible(VertexType node, unsigned proc) const { return active_schedule.getInstance().isCompatible(node, proc); } - void set_start_step(const unsigned step, ThreadSearchContext& thread_data) { + inline unsigned rel_step_idx(const unsigned node_step, const unsigned move_step) const { + return (move_step >= node_step) ? 
((move_step - node_step) + window_size) + : (window_size - (node_step - move_step)); + } + inline bool is_compatible(VertexType node, unsigned proc) const { + return active_schedule.getInstance().isCompatible(node, proc); + } + + void set_start_step(const unsigned step, ThreadSearchContext &thread_data) { thread_data.start_step = step; thread_data.step_to_remove = step; thread_data.step_selection_counter = step; - + thread_data.average_gain = 0.0; thread_data.max_inner_iterations = parameters.max_inner_iterations_reset; thread_data.no_improvement_iterations_reduce_penalty = parameters.max_no_improvement_iterations / 5; thread_data.min_inner_iter = parameters.min_inner_iter_reset; thread_data.step_selection_epoch_counter = 0; - thread_data.no_improvement_iterations_increase_inner_iter = 10; + thread_data.no_improvement_iterations_increase_inner_iter = 10; thread_data.unlock_edge_backtrack_counter_reset = 0; - thread_data.unlock_edge_backtrack_counter = thread_data.unlock_edge_backtrack_counter_reset; + thread_data.unlock_edge_backtrack_counter = thread_data.unlock_edge_backtrack_counter_reset; thread_data.max_no_vioaltions_removed_backtrack = parameters.max_no_vioaltions_removed_backtrack_reset; } - - kl_move get_best_move(node_selection_container_t & affinity_table, vector_vertex_lock_manger & lock_manager, heap_datastructure & max_gain_heap) { + kl_move get_best_move(node_selection_container_t &affinity_table, + vector_vertex_lock_manger &lock_manager, heap_datastructure &max_gain_heap) { // To introduce non-determinism and help escape local optima, if there are multiple moves with the same // top gain, we randomly select one. We check up to `local_max` ties. const unsigned local_max = 50; @@ -198,24 +208,29 @@ class kl_improver : public ImprovementScheduler { return best_move; } - - inline void process_other_steps_best_move(const unsigned idx, const unsigned node_step, const VertexType& node, const cost_t affinity_current_proc_step, cost_t& max_gain, unsigned& max_proc, unsigned& max_step, const std::vector> &affinity_table_node) const { + + inline void process_other_steps_best_move(const unsigned idx, const unsigned node_step, const VertexType &node, + const cost_t affinity_current_proc_step, cost_t &max_gain, + unsigned &max_proc, unsigned &max_step, + const std::vector> &affinity_table_node) const { for (const unsigned p : proc_range.compatible_processors_vertex(node)) { if constexpr (active_schedule_t::use_memory_constraint) { - if( not active_schedule.memory_constraint.can_move(node, p, node_step + idx - window_size)) continue; + if (not active_schedule.memory_constraint.can_move(node, p, node_step + idx - window_size)) + continue; } const cost_t gain = affinity_current_proc_step - affinity_table_node[p][idx]; if (gain > max_gain) { max_gain = gain; max_proc = p; - max_step = idx; + max_step = idx; } } } template - kl_move compute_best_move(VertexType node, const std::vector> &affinity_table_node, ThreadSearchContext & thread_data) { + kl_move compute_best_move(VertexType node, const std::vector> &affinity_table_node, + ThreadSearchContext &thread_data) { const unsigned node_step = active_schedule.assigned_superstep(node); const unsigned node_proc = active_schedule.assigned_processor(node); @@ -228,7 +243,8 @@ class kl_improver : public ImprovementScheduler { unsigned idx = thread_data.start_idx(node_step); for (; idx < window_size; idx++) { - process_other_steps_best_move(idx, node_step, node, affinity_current_proc_step, max_gain, max_proc, max_step, affinity_table_node); + 
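The index helpers above map a superstep in the window [node_step - window_size, node_step + window_size] onto a slot in [0, window_range). A compile-time worked example for window_size = 2, standalone but mirroring the rel_step_idx shown above:

constexpr unsigned window_size = 2;
constexpr unsigned window_range = 2 * window_size + 1; // 5 slots

constexpr unsigned rel_step_idx(unsigned node_step, unsigned move_step) {
    return (move_step >= node_step) ? (move_step - node_step) + window_size
                                    : window_size - (node_step - move_step);
}

static_assert(rel_step_idx(5, 3) == 0, "two steps earlier -> leftmost slot");
static_assert(rel_step_idx(5, 5) == window_size, "same step -> centre slot");
static_assert(rel_step_idx(5, 7) == window_range - 1, "two steps later -> rightmost slot");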
process_other_steps_best_move(idx, node_step, node, affinity_current_proc_step, max_gain, max_proc, + max_step, affinity_table_node); } if constexpr (move_to_same_super_step) { @@ -237,14 +253,15 @@ class kl_improver : public ImprovementScheduler { continue; if constexpr (active_schedule_t::use_memory_constraint) { - if( not active_schedule.memory_constraint.can_move(node, proc, node_step + idx - window_size)) continue; + if (not active_schedule.memory_constraint.can_move(node, proc, node_step + idx - window_size)) + continue; } const cost_t gain = affinity_current_proc_step - affinity_table_node[proc][window_size]; if (gain > max_gain) { max_gain = gain; max_proc = proc; - max_step = idx; + max_step = idx; } } } @@ -253,20 +270,23 @@ class kl_improver : public ImprovementScheduler { const unsigned bound = thread_data.end_idx(node_step); for (; idx < bound; idx++) { - process_other_steps_best_move(idx, node_step, node, affinity_current_proc_step, max_gain, max_proc, max_step, affinity_table_node); + process_other_steps_best_move(idx, node_step, node, affinity_current_proc_step, max_gain, max_proc, + max_step, affinity_table_node); } return kl_move(node, max_gain, node_proc, node_step, max_proc, node_step + max_step - window_size); } - - kl_gain_update_info update_node_work_affinity_after_move(VertexType node, kl_move move, const pre_move_work_data & prev_work_data, std::vector> &affinity_table_node) { + + kl_gain_update_info update_node_work_affinity_after_move(VertexType node, kl_move move, + const pre_move_work_data &prev_work_data, + std::vector> &affinity_table_node) { const unsigned node_step = active_schedule.assigned_superstep(node); const work_weight_t vertex_weight = graph->vertex_work_weight(node); kl_gain_update_info update_info(node); - if (move.from_step == move.to_step) { - const unsigned lower_bound = move.from_step > window_size ? move.from_step - window_size : 0; + if (move.from_step == move.to_step) { + const unsigned lower_bound = move.from_step > window_size ? move.from_step - window_size : 0; if (lower_bound <= node_step && node_step <= move.from_step + window_size) { update_info.update_from_step = true; update_info.update_to_step = true; @@ -276,147 +296,242 @@ class kl_improver : public ImprovementScheduler { if (node_step == move.from_step) { const unsigned node_proc = active_schedule.assigned_processor(node); - const work_weight_t new_max_weight = active_schedule.get_step_max_work(move.from_step); - const work_weight_t new_second_max_weight = active_schedule.get_step_second_max_work(move.from_step); - const work_weight_t new_step_proc_work = active_schedule.get_step_processor_work(node_step, node_proc); - const work_weight_t prev_step_proc_work = (node_proc == move.from_proc) ? new_step_proc_work + graph->vertex_work_weight(move.node) : (node_proc == move.to_proc) ? new_step_proc_work - graph->vertex_work_weight(move.node) : new_step_proc_work; - const bool prev_is_sole_max_processor = (prev_work_data.from_step_max_work_processor_count == 1) && (prev_max_work == prev_step_proc_work); - const cost_t prev_node_proc_affinity = prev_is_sole_max_processor ? std::min(vertex_weight, prev_max_work - prev_second_max_work) : 0.0; - const bool new_is_sole_max_processor = (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && (new_max_weight == new_step_proc_work); - const cost_t new_node_proc_affinity = new_is_sole_max_processor ? 
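compute_best_move above reduces to a scan over (processor, window slot) candidates, where the gain of a candidate is the affinity of the current placement minus the candidate's affinity. A condensed sketch with a flat, hypothetical signature; it omits the centre-slot special case and the memory-constraint filter of the real code:

#include <limits>
#include <vector>

struct BestCandidate {
    double gain = std::numeric_limits<double>::lowest();
    unsigned proc = 0;
    unsigned slot = 0;
};

inline BestCandidate best_candidate(double affinity_current,
                                    const std::vector<std::vector<double>> &affinity, // [proc][slot]
                                    const std::vector<unsigned> &compatible_procs,
                                    unsigned first_slot, unsigned last_slot) {
    BestCandidate best;
    for (const unsigned p : compatible_procs) {
        for (unsigned s = first_slot; s < last_slot; ++s) {
            const double gain = affinity_current - affinity[p][s];
            if (gain > best.gain)
                best = {gain, p, s};
        }
    }
    return best;
}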
std::min(vertex_weight, new_max_weight - new_second_max_weight) : 0.0; - + const work_weight_t new_max_weight = active_schedule.get_step_max_work(move.from_step); + const work_weight_t new_second_max_weight = + active_schedule.get_step_second_max_work(move.from_step); + const work_weight_t new_step_proc_work = + active_schedule.get_step_processor_work(node_step, node_proc); + const work_weight_t prev_step_proc_work = + (node_proc == move.from_proc) ? new_step_proc_work + graph->vertex_work_weight(move.node) + : (node_proc == move.to_proc) ? new_step_proc_work - graph->vertex_work_weight(move.node) + : new_step_proc_work; + const bool prev_is_sole_max_processor = (prev_work_data.from_step_max_work_processor_count == 1) && + (prev_max_work == prev_step_proc_work); + const cost_t prev_node_proc_affinity = + prev_is_sole_max_processor ? std::min(vertex_weight, prev_max_work - prev_second_max_work) + : 0.0; + const bool new_is_sole_max_processor = + (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && + (new_max_weight == new_step_proc_work); + const cost_t new_node_proc_affinity = + new_is_sole_max_processor ? std::min(vertex_weight, new_max_weight - new_second_max_weight) + : 0.0; + const cost_t diff = new_node_proc_affinity - prev_node_proc_affinity; if (std::abs(diff) > EPSILON) { update_info.full_update = true; affinity_table_node[node_proc][window_size] += diff; // Use the pre-calculated diff - } - + } + if ((prev_max_work != new_max_weight) || update_info.full_update) { update_info.update_entire_from_step = true; - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { - if((proc == node_proc) || (proc == move.from_proc) || (proc == move.to_proc)) { + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + if ((proc == node_proc) || (proc == move.from_proc) || (proc == move.to_proc)) { continue; } - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, proc); - const cost_t prev_other_affinity = compute_same_step_affinity(prev_max_work, new_weight, prev_node_proc_affinity); - const cost_t other_affinity = compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); - - affinity_table_node[proc][window_size] += (other_affinity - prev_other_affinity); + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(node_step, proc); + const cost_t prev_other_affinity = + compute_same_step_affinity(prev_max_work, new_weight, prev_node_proc_affinity); + const cost_t other_affinity = + compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); + + affinity_table_node[proc][window_size] += (other_affinity - prev_other_affinity); } - } - + } + if (node_proc != move.from_proc && is_compatible(node, move.from_proc)) { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move.from_proc) + graph->vertex_work_weight(move.node); - const cost_t prev_other_affinity = compute_same_step_affinity(prev_max_work, prev_new_weight, prev_node_proc_affinity); - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move.from_proc); - const cost_t other_affinity = compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); - affinity_table_node[move.from_proc][window_size] += (other_affinity - prev_other_affinity); - } - + const work_weight_t prev_new_weight = + vertex_weight + 
active_schedule.get_step_processor_work(node_step, move.from_proc) + + graph->vertex_work_weight(move.node); + const cost_t prev_other_affinity = + compute_same_step_affinity(prev_max_work, prev_new_weight, prev_node_proc_affinity); + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(node_step, move.from_proc); + const cost_t other_affinity = + compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); + affinity_table_node[move.from_proc][window_size] += (other_affinity - prev_other_affinity); + } + if (node_proc != move.to_proc && is_compatible(node, move.to_proc)) { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move.to_proc) - graph->vertex_work_weight(move.node); - const cost_t prev_other_affinity = compute_same_step_affinity(prev_max_work, prev_new_weight, prev_node_proc_affinity); - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move.to_proc); - const cost_t other_affinity = compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); - affinity_table_node[move.to_proc][window_size] += (other_affinity - prev_other_affinity); + const work_weight_t prev_new_weight = + vertex_weight + active_schedule.get_step_processor_work(node_step, move.to_proc) - + graph->vertex_work_weight(move.node); + const cost_t prev_other_affinity = + compute_same_step_affinity(prev_max_work, prev_new_weight, prev_node_proc_affinity); + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(node_step, move.to_proc); + const cost_t other_affinity = + compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); + affinity_table_node[move.to_proc][window_size] += (other_affinity - prev_other_affinity); } - } else { + } else { const work_weight_t new_max_weight = active_schedule.get_step_max_work(move.from_step); - const unsigned idx = rel_step_idx(node_step, move.from_step); + const unsigned idx = rel_step_idx(node_step, move.from_step); if (prev_max_work != new_max_weight) { - update_info.update_entire_from_step = true; + update_info.update_entire_from_step = true; // update moving to all procs with special for move.from_proc - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(move.from_step, proc); + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(move.from_step, proc); if (proc == move.from_proc) { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(move.from_step, proc) + graph->vertex_work_weight(move.node); - const cost_t prev_affinity = prev_max_work < prev_new_weight ? static_cast(prev_new_weight) - static_cast(prev_max_work) : 0.0; - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; - affinity_table_node[proc][idx] += new_affinity - prev_affinity; + const work_weight_t prev_new_weight = + vertex_weight + active_schedule.get_step_processor_work(move.from_step, proc) + + graph->vertex_work_weight(move.node); + const cost_t prev_affinity = + prev_max_work < prev_new_weight + ? static_cast(prev_new_weight) - static_cast(prev_max_work) + : 0.0; + const cost_t new_affinity = + new_max_weight < new_weight + ? 
static_cast(new_weight) - static_cast(new_max_weight) + : 0.0; + affinity_table_node[proc][idx] += new_affinity - prev_affinity; } else if (proc == move.to_proc) { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(move.to_step, proc) - graph->vertex_work_weight(move.node); - const cost_t prev_affinity = prev_max_work < prev_new_weight ? static_cast(prev_new_weight) - static_cast(prev_max_work) : 0.0; - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; + const work_weight_t prev_new_weight = + vertex_weight + active_schedule.get_step_processor_work(move.to_step, proc) - + graph->vertex_work_weight(move.node); + const cost_t prev_affinity = + prev_max_work < prev_new_weight + ? static_cast(prev_new_weight) - static_cast(prev_max_work) + : 0.0; + const cost_t new_affinity = + new_max_weight < new_weight + ? static_cast(new_weight) - static_cast(new_max_weight) + : 0.0; affinity_table_node[proc][idx] += new_affinity - prev_affinity; } else { - const cost_t prev_affinity = prev_max_work < new_weight ? static_cast(new_weight) - static_cast(prev_max_work) : 0.0; - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; - affinity_table_node[proc][idx] += new_affinity - prev_affinity; + const cost_t prev_affinity = + prev_max_work < new_weight + ? static_cast(new_weight) - static_cast(prev_max_work) + : 0.0; + const cost_t new_affinity = + new_max_weight < new_weight + ? static_cast(new_weight) - static_cast(new_max_weight) + : 0.0; + affinity_table_node[proc][idx] += new_affinity - prev_affinity; } - } + } } else { // update only move.from_proc and move.to_proc if (is_compatible(node, move.from_proc)) { - const work_weight_t from_new_weight = vertex_weight + active_schedule.get_step_processor_work(move.from_step, move.from_proc); - const work_weight_t from_prev_new_weight = from_new_weight + graph->vertex_work_weight(move.node); - const cost_t from_prev_affinity = prev_max_work < from_prev_new_weight ? static_cast(from_prev_new_weight) - static_cast(prev_max_work) : 0.0; - - const cost_t from_new_affinity = new_max_weight < from_new_weight ? static_cast(from_new_weight) - static_cast(new_max_weight) : 0.0; + const work_weight_t from_new_weight = + vertex_weight + active_schedule.get_step_processor_work(move.from_step, move.from_proc); + const work_weight_t from_prev_new_weight = + from_new_weight + graph->vertex_work_weight(move.node); + const cost_t from_prev_affinity = + prev_max_work < from_prev_new_weight + ? static_cast(from_prev_new_weight) - static_cast(prev_max_work) + : 0.0; + + const cost_t from_new_affinity = + new_max_weight < from_new_weight + ? static_cast(from_new_weight) - static_cast(new_max_weight) + : 0.0; affinity_table_node[move.from_proc][idx] += from_new_affinity - from_prev_affinity; } if (is_compatible(node, move.to_proc)) { - const work_weight_t to_new_weight = vertex_weight + active_schedule.get_step_processor_work(move.to_step, move.to_proc); - const work_weight_t to_prev_new_weight = to_new_weight - graph->vertex_work_weight(move.node); - const cost_t to_prev_affinity = prev_max_work < to_prev_new_weight ? static_cast(to_prev_new_weight) - static_cast(prev_max_work) : 0.0; - - const cost_t to_new_affinity = new_max_weight < to_new_weight ? 
static_cast(to_new_weight) - static_cast(new_max_weight) : 0.0; + const work_weight_t to_new_weight = + vertex_weight + active_schedule.get_step_processor_work(move.to_step, move.to_proc); + const work_weight_t to_prev_new_weight = + to_new_weight - graph->vertex_work_weight(move.node); + const cost_t to_prev_affinity = + prev_max_work < to_prev_new_weight + ? static_cast(to_prev_new_weight) - static_cast(prev_max_work) + : 0.0; + + const cost_t to_new_affinity = + new_max_weight < to_new_weight + ? static_cast(to_new_weight) - static_cast(new_max_weight) + : 0.0; affinity_table_node[move.to_proc][idx] += to_new_affinity - to_prev_affinity; } } } } - - } else { + + } else { const unsigned node_proc = active_schedule.assigned_processor(node); - process_work_update_step(node, node_step, node_proc, vertex_weight, move.from_step, move.from_proc, graph->vertex_work_weight(move.node), prev_work_data.from_step_max_work, prev_work_data.from_step_second_max_work, prev_work_data.from_step_max_work_processor_count, update_info.update_from_step, update_info.update_entire_from_step, update_info.full_update, affinity_table_node); - process_work_update_step(node, node_step, node_proc, vertex_weight, move.to_step, move.to_proc, -graph->vertex_work_weight(move.node), prev_work_data.to_step_max_work, prev_work_data.to_step_second_max_work, prev_work_data.to_step_max_work_processor_count, update_info.update_to_step, update_info.update_entire_to_step, update_info.full_update, affinity_table_node); + process_work_update_step(node, node_step, node_proc, vertex_weight, move.from_step, move.from_proc, + graph->vertex_work_weight(move.node), prev_work_data.from_step_max_work, + prev_work_data.from_step_second_max_work, + prev_work_data.from_step_max_work_processor_count, update_info.update_from_step, + update_info.update_entire_from_step, update_info.full_update, affinity_table_node); + process_work_update_step(node, node_step, node_proc, vertex_weight, move.to_step, move.to_proc, + -graph->vertex_work_weight(move.node), prev_work_data.to_step_max_work, + prev_work_data.to_step_second_max_work, + prev_work_data.to_step_max_work_processor_count, update_info.update_to_step, + update_info.update_entire_to_step, update_info.full_update, affinity_table_node); } return update_info; } - void process_work_update_step(VertexType node, unsigned node_step, unsigned node_proc, work_weight_t vertex_weight, unsigned move_step, unsigned move_proc, work_weight_t move_correction_node_weight, const work_weight_t prev_move_step_max_work, const work_weight_t prev_move_step_second_max_work, unsigned prev_move_step_max_work_processor_count, bool & update_step, bool & update_entire_step, bool & full_update, std::vector> &affinity_table_node); - void update_node_work_affinity(node_selection_container_t &nodes, kl_move move, const pre_move_work_data & prev_work_data, std::map &recompute_max_gain); - void update_best_move(VertexType node, unsigned step, unsigned proc, node_selection_container_t &affinity_table, ThreadSearchContext & thread_data); - void update_best_move(VertexType node, unsigned step, node_selection_container_t &affinity_table, ThreadSearchContext & thread_data); - void update_max_gain(kl_move move, std::map &recompute_max_gain, ThreadSearchContext & thread_data); - void compute_work_affinity(VertexType node, std::vector> & affinity_table_node, ThreadSearchContext & thread_data); - - inline void recompute_node_max_gain(VertexType node, node_selection_container_t &affinity_table, ThreadSearchContext & thread_data) { + void 
process_work_update_step(VertexType node, unsigned node_step, unsigned node_proc, work_weight_t vertex_weight, + unsigned move_step, unsigned move_proc, work_weight_t move_correction_node_weight, + const work_weight_t prev_move_step_max_work, + const work_weight_t prev_move_step_second_max_work, + unsigned prev_move_step_max_work_processor_count, bool &update_step, + bool &update_entire_step, bool &full_update, + std::vector> &affinity_table_node); + void update_node_work_affinity(node_selection_container_t &nodes, kl_move move, + const pre_move_work_data &prev_work_data, + std::map &recompute_max_gain); + void update_best_move(VertexType node, unsigned step, unsigned proc, node_selection_container_t &affinity_table, + ThreadSearchContext &thread_data); + void update_best_move(VertexType node, unsigned step, node_selection_container_t &affinity_table, + ThreadSearchContext &thread_data); + void update_max_gain(kl_move move, std::map &recompute_max_gain, + ThreadSearchContext &thread_data); + void compute_work_affinity(VertexType node, std::vector> &affinity_table_node, + ThreadSearchContext &thread_data); + + inline void recompute_node_max_gain(VertexType node, node_selection_container_t &affinity_table, + ThreadSearchContext &thread_data) { const auto best_move = compute_best_move(node, affinity_table[node], thread_data); - thread_data.max_gain_heap.update(node, best_move); + thread_data.max_gain_heap.update(node, best_move); } - inline cost_t compute_same_step_affinity(const work_weight_t &max_work_for_step, const work_weight_t &new_weight, const cost_t &node_proc_affinity) { + inline cost_t compute_same_step_affinity(const work_weight_t &max_work_for_step, const work_weight_t &new_weight, + const cost_t &node_proc_affinity) { const cost_t max_work_after_removal = static_cast(max_work_for_step) - node_proc_affinity; if (new_weight > max_work_after_removal) { return new_weight - max_work_after_removal; } return 0.0; } - - inline cost_t apply_move(kl_move move, ThreadSearchContext & thread_data) { + + inline cost_t apply_move(kl_move move, ThreadSearchContext &thread_data) { active_schedule.apply_move(move, thread_data.active_schedule_data); - comm_cost_f.update_datastructure_after_move(move, thread_data.start_step, thread_data.end_step); + comm_cost_f.update_datastructure_after_move(move, thread_data.start_step, thread_data.end_step); cost_t change_in_cost = -move.gain; - change_in_cost += static_cast(thread_data.active_schedule_data.resolved_violations.size()) * thread_data.reward_penalty_strat.reward; - change_in_cost -= static_cast(thread_data.active_schedule_data.new_violations.size()) * thread_data.reward_penalty_strat.penalty; - + change_in_cost += static_cast(thread_data.active_schedule_data.resolved_violations.size()) * + thread_data.reward_penalty_strat.reward; + change_in_cost -= static_cast(thread_data.active_schedule_data.new_violations.size()) * + thread_data.reward_penalty_strat.penalty; + #ifdef KL_DEBUG - std::cout << "penalty: " << thread_data.reward_penalty_strat.penalty << " num violations: " << thread_data.active_schedule_data.current_violations.size() << " num new violations: " << thread_data.active_schedule_data.new_violations.size() << ", num resolved violations: " << thread_data.active_schedule_data.resolved_violations.size() << ", reward: " << thread_data.reward_penalty_strat.reward << std::endl; - std::cout << "apply move, previous cost: " << thread_data.active_schedule_data.cost << ", new cost: " << thread_data.active_schedule_data.cost + change_in_cost << ", " 
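The bookkeeping in apply_move above can be read as a single formula: the tracked cost changes by -gain, corrected by the reward for each resolved violation and the penalty for each newly created one. As a standalone check with made-up numbers:

#include <cstddef>

inline double cost_change(double gain, std::size_t resolved, std::size_t created,
                          double reward, double penalty) {
    return -gain + static_cast<double>(resolved) * reward - static_cast<double>(created) * penalty;
}

// e.g. gain = 4, two violations resolved at reward 3, one created at penalty 5:
// cost_change(4, 2, 1, 3, 5) == -4 + 6 - 5 == -3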
<< (thread_data.active_schedule_data.feasible ? "feasible," : "infeasible,") << std::endl; + std::cout << "penalty: " << thread_data.reward_penalty_strat.penalty + << " num violations: " << thread_data.active_schedule_data.current_violations.size() + << " num new violations: " << thread_data.active_schedule_data.new_violations.size() + << ", num resolved violations: " << thread_data.active_schedule_data.resolved_violations.size() + << ", reward: " << thread_data.reward_penalty_strat.reward << std::endl; + std::cout << "apply move, previous cost: " << thread_data.active_schedule_data.cost + << ", new cost: " << thread_data.active_schedule_data.cost + change_in_cost << ", " + << (thread_data.active_schedule_data.feasible ? "feasible," : "infeasible,") << std::endl; #endif - + thread_data.active_schedule_data.update_cost(change_in_cost); - + return change_in_cost; - } + } - void run_quick_moves(unsigned & inner_iter, ThreadSearchContext & thread_data, const cost_t change_in_cost, const VertexType best_move_node) { + void run_quick_moves(unsigned &inner_iter, ThreadSearchContext &thread_data, const cost_t change_in_cost, + const VertexType best_move_node) { #ifdef KL_DEBUG std::cout << "Starting quick moves sequence." << std::endl; #endif @@ -430,7 +545,7 @@ class kl_improver : public ImprovementScheduler { std::vector quick_moves_stack; quick_moves_stack.reserve(10 + thread_data.active_schedule_data.new_violations.size() * 2); - for (const auto& key_value_pair : thread_data.active_schedule_data.new_violations) { + for (const auto &key_value_pair : thread_data.active_schedule_data.new_violations) { const auto &key = key_value_pair.first; quick_moves_stack.push_back(key); } @@ -439,10 +554,12 @@ class kl_improver : public ImprovementScheduler { auto next_node_to_move = quick_moves_stack.back(); quick_moves_stack.pop_back(); - - thread_data.reward_penalty_strat.init_reward_penalty(static_cast(thread_data.active_schedule_data.current_violations.size()) + 1.0); + + thread_data.reward_penalty_strat.init_reward_penalty( + static_cast(thread_data.active_schedule_data.current_violations.size()) + 1.0); compute_node_affinities(next_node_to_move, thread_data.local_affinity_table, thread_data); - kl_move best_quick_move = compute_best_move(next_node_to_move, thread_data.local_affinity_table, thread_data); + kl_move best_quick_move = + compute_best_move(next_node_to_move, thread_data.local_affinity_table, thread_data); local_lock.insert(next_node_to_move); if (best_quick_move.gain <= std::numeric_limits::lowest()) { @@ -450,25 +567,28 @@ class kl_improver : public ImprovementScheduler { } #ifdef KL_DEBUG - std::cout << " >>> move node " << best_quick_move.node << " with gain " << best_quick_move.gain << ", from proc|step: " << best_quick_move.from_proc << "|" << best_quick_move.from_step << " to: " << best_quick_move.to_proc << "|" << best_quick_move.to_step << std::endl; + std::cout << " >>> move node " << best_quick_move.node << " with gain " << best_quick_move.gain + << ", from proc|step: " << best_quick_move.from_proc << "|" << best_quick_move.from_step + << " to: " << best_quick_move.to_proc << "|" << best_quick_move.to_step << std::endl; #endif - apply_move(best_quick_move, thread_data); + apply_move(best_quick_move, thread_data); inner_iter++; if (thread_data.active_schedule_data.new_violations.size() > 0) { bool abort = false; - for (const auto& key_value_pair : thread_data.active_schedule_data.new_violations) { + for (const auto &key_value_pair : thread_data.active_schedule_data.new_violations) { 
const auto &key = key_value_pair.first; - if(local_lock.find(key) != local_lock.end()) { + if (local_lock.find(key) != local_lock.end()) { abort = true; break; - } + } quick_moves_stack.push_back(key); } - if (abort) break; + if (abort) + break; } else if (thread_data.active_schedule_data.feasible) { break; @@ -476,11 +596,13 @@ class kl_improver : public ImprovementScheduler { } if (!thread_data.active_schedule_data.feasible) { - active_schedule.revert_schedule_to_bound(num_applied_moves, saved_cost ,true, comm_cost_f, thread_data.active_schedule_data, thread_data.start_step, thread_data.end_step); + active_schedule.revert_schedule_to_bound(num_applied_moves, saved_cost, true, comm_cost_f, + thread_data.active_schedule_data, thread_data.start_step, + thread_data.end_step); #ifdef KL_DEBUG std::cout << "Ending quick moves sequence with infeasible solution." << std::endl; #endif - } + } #ifdef KL_DEBUG else { std::cout << "Ending quick moves sequence with feasible solution." << std::endl; @@ -493,18 +615,19 @@ class kl_improver : public ImprovementScheduler { insert_gain_heap(thread_data); // Re-initialize the heap with the current state } - void resolve_violations(ThreadSearchContext & thread_data) { - auto & current_violations = thread_data.active_schedule_data.current_violations; + void resolve_violations(ThreadSearchContext &thread_data) { + auto ¤t_violations = thread_data.active_schedule_data.current_violations; unsigned num_violations = static_cast(current_violations.size()); if (num_violations > 0) { #ifdef KL_DEBUG_1 - std::cout << "thread " << thread_data.thread_id << ", Starting preresolving violations with " << num_violations << " initial violations" << std::endl; + std::cout << "thread " << thread_data.thread_id << ", Starting preresolving violations with " + << num_violations << " initial violations" << std::endl; #endif thread_data.reward_penalty_strat.init_reward_penalty(static_cast(num_violations) + 1.0); - std::unordered_set local_lock; + std::unordered_set local_lock; unsigned num_iter = 0; - const unsigned min_iter = num_violations / 4; + const unsigned min_iter = num_violations / 4; while (not current_violations.empty()) { std::uniform_int_distribution dis(0, current_violations.size() - 1); auto it = current_violations.begin(); @@ -514,14 +637,14 @@ class kl_improver : public ImprovementScheduler { const VertexType target_v = target(next_edge, *graph); const bool source_locked = local_lock.find(source_v) != local_lock.end(); const bool target_locked = local_lock.find(target_v) != local_lock.end(); - + if (source_locked && target_locked) { #ifdef KL_DEBUG_1 std::cout << "source, target locked" << std::endl; #endif break; } - + kl_move best_move; if (source_locked || target_locked) { const VertexType node = source_locked ? target_v : source_v; @@ -529,25 +652,32 @@ class kl_improver : public ImprovementScheduler { best_move = compute_best_move(node, thread_data.local_affinity_table, thread_data); } else { compute_node_affinities(source_v, thread_data.local_affinity_table, thread_data); - kl_move best_source_v_move = compute_best_move(source_v, thread_data.local_affinity_table, thread_data); + kl_move best_source_v_move = + compute_best_move(source_v, thread_data.local_affinity_table, thread_data); compute_node_affinities(target_v, thread_data.local_affinity_table, thread_data); - kl_move best_target_v_move = compute_best_move(target_v, thread_data.local_affinity_table, thread_data); - best_move = best_target_v_move.gain > best_source_v_move.gain ? 
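resolve_violations above draws a violated edge uniformly at random; the distribution over [0, size - 1] and the begin() iterator are visible in the loop. A generic sketch of such a draw for any forward-iterable container; the std::advance step is an assumption, as the advancing line itself is not part of this hunk:

#include <cstddef>
#include <iterator>
#include <random>

// precondition: !c.empty()
template <typename Container>
typename Container::const_iterator pick_uniform(const Container &c, std::mt19937 &gen) {
    std::uniform_int_distribution<std::size_t> dis(0, c.size() - 1);
    auto it = c.begin();
    std::advance(it, dis(gen)); // linear in the offset for forward iterators
    return it;
}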
std::move(best_target_v_move) : std::move(best_source_v_move); + kl_move best_target_v_move = + compute_best_move(target_v, thread_data.local_affinity_table, thread_data); + best_move = best_target_v_move.gain > best_source_v_move.gain ? std::move(best_target_v_move) + : std::move(best_source_v_move); } local_lock.insert(best_move.node); - if (best_move.gain <= std::numeric_limits::lowest()) continue; + if (best_move.gain <= std::numeric_limits::lowest()) + continue; apply_move(best_move, thread_data); thread_data.affinity_table.insert(best_move.node); #ifdef KL_DEBUG_1 - std::cout << "move node " << best_move.node << " with gain " << best_move.gain << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step << " to: " << best_move.to_proc << "|" << best_move.to_step << std::endl; + std::cout << "move node " << best_move.node << " with gain " << best_move.gain + << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step + << " to: " << best_move.to_proc << "|" << best_move.to_step << std::endl; #endif const unsigned new_num_violations = static_cast(current_violations.size()); - if (new_num_violations == 0) break; + if (new_num_violations == 0) + break; - if (thread_data.active_schedule_data.new_violations.size() > 0) { - for (const auto & vertex_edge_pair : thread_data.active_schedule_data.new_violations) { + if (thread_data.active_schedule_data.new_violations.size() > 0) { + for (const auto &vertex_edge_pair : thread_data.active_schedule_data.new_violations) { const auto &vertex = vertex_edge_pair.first; thread_data.affinity_table.insert(vertex); } @@ -557,20 +687,24 @@ class kl_improver : public ImprovementScheduler { num_violations = new_num_violations; update_avg_gain(gain, num_iter++, thread_data.average_gain); #ifdef KL_DEBUG_1 - std::cout << "thread " << thread_data.thread_id << ", preresolving violations with " << num_violations << " violations, " << num_iter << " #iterations, " << thread_data.average_gain << " average gain" << std::endl; + std::cout << "thread " << thread_data.thread_id << ", preresolving violations with " << num_violations + << " violations, " << num_iter << " #iterations, " << thread_data.average_gain + << " average gain" << std::endl; #endif if (num_iter > min_iter && thread_data.average_gain < 0.0) { break; } } thread_data.average_gain = 0.0; - } + } } - void run_local_search(ThreadSearchContext & thread_data) { + void run_local_search(ThreadSearchContext &thread_data) { #ifdef KL_DEBUG_1 - std::cout << "thread " << thread_data.thread_id << ", start local search, initial schedule cost: " << thread_data.active_schedule_data.cost << " with " << thread_data.num_steps() << " supersteps." << std::endl; + std::cout << "thread " << thread_data.thread_id + << ", start local search, initial schedule cost: " << thread_data.active_schedule_data.cost + << " with " << thread_data.num_steps() << " supersteps." 
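Both the violation pre-resolver above and the main loop steer on average_gain via update_avg_gain(gain, iteration, average_gain). Its body is not part of this hunk; a standard incremental mean consistent with that call shape would be (an assumption, for illustration only):

inline void update_avg_gain(double gain, unsigned iter, double &average_gain) {
    // running mean over iterations 0..iter: avg' = avg + (x - avg) / (n + 1)
    average_gain += (gain - average_gain) / static_cast<double>(iter + 1);
}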
<< std::endl; #endif std::vector new_nodes; std::vector unlock_nodes; @@ -584,74 +718,90 @@ class kl_improver : public ImprovementScheduler { for (; outer_iter < parameters.max_outer_iterations; outer_iter++) { cost_t initial_inner_iter_cost = thread_data.active_schedule_data.cost; - reset_inner_search_structures(thread_data); - select_active_nodes(thread_data); - thread_data.reward_penalty_strat.init_reward_penalty(static_cast(thread_data.active_schedule_data.current_violations.size()) + 1.0); + reset_inner_search_structures(thread_data); + select_active_nodes(thread_data); + thread_data.reward_penalty_strat.init_reward_penalty( + static_cast(thread_data.active_schedule_data.current_violations.size()) + 1.0); insert_gain_heap(thread_data); - + unsigned inner_iter = 0; unsigned violation_removed_count = 0; unsigned reset_counter = 0; bool iter_inital_feasible = thread_data.active_schedule_data.feasible; - + #ifdef KL_DEBUG std::cout << "------ start inner loop ------" << std::endl; std::cout << "initial node selection: {"; - for (size_t i = 0; i < thread_data.affinity_table.size() ; ++i) { + for (size_t i = 0; i < thread_data.affinity_table.size(); ++i) { std::cout << thread_data.affinity_table.get_selected_nodes()[i] << ", "; } std::cout << "}" << std::endl; #endif #ifdef KL_DEBUG_1 if (not iter_inital_feasible) { - std::cout << "initial solution not feasible, num violations: " << thread_data.active_schedule_data.current_violations.size() << ". Penalty: " << thread_data.reward_penalty_strat.penalty << ", reward: " << thread_data.reward_penalty_strat.reward << std::endl; + std::cout << "initial solution not feasible, num violations: " + << thread_data.active_schedule_data.current_violations.size() + << ". Penalty: " << thread_data.reward_penalty_strat.penalty + << ", reward: " << thread_data.reward_penalty_strat.reward << std::endl; } #endif #ifdef KL_DEBUG_COST_CHECK - active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); - if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) { - std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; - std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl; - } - if constexpr (active_schedule_t::use_memory_constraint) { - if ( not active_schedule.memory_constraint.satisfied_memory_constraint()) - std::cout << "memory constraint not satisfied" << std::endl; - } + active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); + if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001) { + std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() + << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; + std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" + << std::endl; + } + if constexpr (active_schedule_t::use_memory_constraint) { + if (not active_schedule.memory_constraint.satisfied_memory_constraint()) + std::cout << "memory constraint not satisfied" << std::endl; + } #endif - while (inner_iter < thread_data.max_inner_iterations && thread_data.max_gain_heap.size() > 0) { - kl_move best_move = get_best_move(thread_data.affinity_table, thread_data.lock_manager, thread_data.max_gain_heap); // locks best_move.node and removes it from node_selection + kl_move best_move 
=
+                get_best_move(thread_data.affinity_table, thread_data.lock_manager,
+                              thread_data.max_gain_heap); // locks best_move.node and removes it from node_selection
 
             if (best_move.gain <= std::numeric_limits<cost_t>::lowest()) {
                 break;
-            }  
+            }
 
             update_avg_gain(best_move.gain, inner_iter, thread_data.average_gain);
 
 #ifdef KL_DEBUG
-            std::cout << " >>> move node " << best_move.node << " with gain " << best_move.gain << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step << " to: " << best_move.to_proc << "|" << best_move.to_step << ",avg gain: " << thread_data.average_gain << std::endl;
+            std::cout << " >>> move node " << best_move.node << " with gain " << best_move.gain
+                      << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step
+                      << " to: " << best_move.to_proc << "|" << best_move.to_step
+                      << ",avg gain: " << thread_data.average_gain << std::endl;
 #endif
 
             if (inner_iter > thread_data.min_inner_iter && thread_data.average_gain < 0.0) {
 #ifdef KL_DEBUG
-                std::cout << "Negative average gain: " << thread_data.average_gain << ", end local search" << std::endl;
+                std::cout << "Negative average gain: " << thread_data.average_gain << ", end local search"
+                          << std::endl;
 #endif
-                break; 
+                break;
             }
 
 #ifdef KL_DEBUG
-            if (not active_schedule.getInstance().isCompatible(best_move.node, best_move.to_proc)) {
-                std::cout << "move to incompatibe node" << std::endl;
-            }
+                if (not active_schedule.getInstance().isCompatible(best_move.node, best_move.to_proc)) {
+                    std::cout << "move to incompatible node" << std::endl;
+                }
 #endif
 
             const auto prev_work_data = active_schedule.get_pre_move_work_data(best_move);
+            const typename comm_cost_function_t::pre_move_comm_data_t prev_comm_data =
+                comm_cost_f.get_pre_move_comm_data(best_move);
             const cost_t change_in_cost = apply_move(best_move, thread_data);
 
 #ifdef KL_DEBUG_COST_CHECK
             active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps();
-            if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) {
-                std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << thread_data.active_schedule_data.cost << std::endl;
-                std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl;
+            if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) >
+                0.00001) {
+                std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test()
+                          << ", current cost: " << thread_data.active_schedule_data.cost << std::endl;
+                std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<"
+                          << std::endl;
             }
             if constexpr (active_schedule_t::use_memory_constraint) {
-                if ( not active_schedule.memory_constraint.satisfied_memory_constraint())
+                if (not active_schedule.memory_constraint.satisfied_memory_constraint())
                     std::cout << "memory constraint not satisfied" << std::endl;
             }
 #endif
@@ -659,17 +809,21 @@ class kl_improver : public ImprovementScheduler {
             if (iter_inital_feasible && thread_data.active_schedule_data.new_violations.size() > 0) {
                 run_quick_moves(inner_iter, thread_data, change_in_cost, best_move.node);
 #ifdef KL_DEBUG_COST_CHECK
-                active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps();
-                if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) {
-                    std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << 
thread_data.active_schedule_data.cost << std::endl; - std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl; - } - if constexpr (active_schedule_t::use_memory_constraint) { - if ( not active_schedule.memory_constraint.satisfied_memory_constraint()) - std::cout << "memory constraint not satisfied" << std::endl; - } + active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); + if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > + 0.00001) { + std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() + << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; + std::cout + << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" + << std::endl; + } + if constexpr (active_schedule_t::use_memory_constraint) { + if (not active_schedule.memory_constraint.satisfied_memory_constraint()) + std::cout << "memory constraint not satisfied" << std::endl; + } #endif - continue; + continue; } } @@ -680,40 +834,47 @@ class kl_improver : public ImprovementScheduler { violation_removed_count++; if (violation_removed_count > 3) { - if (reset_counter < thread_data.max_no_vioaltions_removed_backtrack && ((not iter_inital_feasible) || (thread_data.active_schedule_data.cost < thread_data.active_schedule_data.best_cost))) { + if (reset_counter < thread_data.max_no_vioaltions_removed_backtrack && + ((not iter_inital_feasible) || (thread_data.active_schedule_data.cost < + thread_data.active_schedule_data.best_cost))) { thread_data.affinity_table.reset_node_selection(); thread_data.max_gain_heap.clear(); thread_data.lock_manager.clear(); - thread_data.selection_strategy.select_nodes_violations(thread_data.affinity_table, thread_data.active_schedule_data.current_violations, thread_data.start_step, thread_data.end_step); + thread_data.selection_strategy.select_nodes_violations( + thread_data.affinity_table, thread_data.active_schedule_data.current_violations, + thread_data.start_step, thread_data.end_step); #ifdef KL_DEBUG - std::cout << "Infeasible, and no violations resolved for 5 iterations, reset node selection" << std::endl; + std::cout + << "Infeasible, and no violations resolved for 5 iterations, reset node selection" + << std::endl; #endif - thread_data.reward_penalty_strat.init_reward_penalty(static_cast(thread_data.active_schedule_data.current_violations.size())); + thread_data.reward_penalty_strat.init_reward_penalty( + static_cast(thread_data.active_schedule_data.current_violations.size())); insert_gain_heap(thread_data); reset_counter++; inner_iter++; - continue; + continue; } else { #ifdef KL_DEBUG - std::cout << "Infeasible, and no violations resolved for 5 iterations, end local search" << std::endl; + std::cout << "Infeasible, and no violations resolved for 5 iterations, end local search" + << std::endl; #endif - break; + break; } } } } - - if(is_local_search_blocked(thread_data)) { + + if (is_local_search_blocked(thread_data)) { if (not blocked_edge_strategy(best_move.node, unlock_nodes, thread_data)) { - break; + break; } } thread_data.affinity_table.trim(); - - update_node_work_affinity(thread_data.affinity_table, best_move, prev_work_data, recompute_max_gain); - comm_cost_f.update_node_comm_affinity(best_move, thread_data, thread_data.reward_penalty_strat.penalty, thread_data.reward_penalty_strat.reward, recompute_max_gain, new_nodes); + update_affinities(best_move, thread_data, recompute_max_gain, new_nodes, 
             for (const auto v : unlock_nodes) {
                 thread_data.lock_manager.unlock(v);
@@ -736,12 +897,15 @@ class kl_improver : public ImprovementScheduler {
 #endif
 #ifdef KL_DEBUG_COST_CHECK
             active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps();
-            if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) {
-                std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << thread_data.active_schedule_data.cost << std::endl;
-                std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl;
+            if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) >
+                0.00001) {
+                std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test()
+                          << ", current cost: " << thread_data.active_schedule_data.cost << std::endl;
+                std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<"
+                          << std::endl;
             }
             if constexpr (active_schedule_t::use_memory_constraint) {
-                if ( not active_schedule.memory_constraint.satisfied_memory_constraint())
+                if (not active_schedule.memory_constraint.satisfied_memory_constraint())
                     std::cout << "memory constraint not satisfied" << std::endl;
             }
 #endif
@@ -755,31 +919,40 @@ class kl_improver : public ImprovementScheduler {
         }

 #ifdef KL_DEBUG
-        std::cout << "--- end inner loop after " << inner_iter << " inner iterations, gain heap size: " << thread_data.max_gain_heap.size() << ", outer iteraion " << outer_iter << "/" << parameters.max_outer_iterations << ", current cost: " << thread_data.active_schedule_data.cost << ", " << (thread_data.active_schedule_data.feasible ? "feasible" : "infeasible") << std::endl;
+        std::cout << "--- end inner loop after " << inner_iter
+                  << " inner iterations, gain heap size: " << thread_data.max_gain_heap.size()
+                  << ", outer iteration " << outer_iter << "/" << parameters.max_outer_iterations
+                  << ", current cost: " << thread_data.active_schedule_data.cost << ", "
+                  << (thread_data.active_schedule_data.feasible ? "feasible" : "infeasible") << std::endl;
"feasible" : "infeasible") << std::endl; #endif #ifdef KL_DEBUG_1 - const unsigned num_steps_tmp = thread_data.end_step; + const unsigned num_steps_tmp = thread_data.end_step; #endif - active_schedule.revert_to_best_schedule(thread_data.local_search_start_step, thread_data.step_to_remove, comm_cost_f, thread_data.active_schedule_data, thread_data.start_step, thread_data.end_step); + active_schedule.revert_to_best_schedule(thread_data.local_search_start_step, thread_data.step_to_remove, + comm_cost_f, thread_data.active_schedule_data, + thread_data.start_step, thread_data.end_step); #ifdef KL_DEBUG_1 if (thread_data.local_search_start_step > 0) { - if(num_steps_tmp == thread_data.end_step) { - std::cout << "thread " << thread_data.thread_id << ", removing step " << thread_data.step_to_remove << " succeded " << std::endl; + if (num_steps_tmp == thread_data.end_step) { + std::cout << "thread " << thread_data.thread_id << ", removing step " << thread_data.step_to_remove + << " succeded " << std::endl; } else { - std::cout << "thread " << thread_data.thread_id << ", removing step " << thread_data.step_to_remove << " failed " << std::endl; + std::cout << "thread " << thread_data.thread_id << ", removing step " << thread_data.step_to_remove + << " failed " << std::endl; } - } + } #endif - #ifdef KL_DEBUG_COST_CHECK active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); - if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) { - std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; - std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl; + if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001) { + std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() + << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; + std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" + << std::endl; } if constexpr (active_schedule_t::use_memory_constraint) { - if ( not active_schedule.memory_constraint.satisfied_memory_constraint()) + if (not active_schedule.memory_constraint.satisfied_memory_constraint()) std::cout << "memory constraint not satisfied" << std::endl; } #endif @@ -791,10 +964,11 @@ class kl_improver : public ImprovementScheduler { break; } } - + if (other_threads_finished(thread_data.thread_id)) { #ifdef KL_DEBUG_1 - std::cout << "thread " << thread_data.thread_id << ", other threads finished, end local search" << std::endl; + std::cout << "thread " << thread_data.thread_id << ", other threads finished, end local search" + << std::endl; #endif break; } @@ -804,38 +978,148 @@ class kl_improver : public ImprovementScheduler { if (no_improvement_iter_counter >= parameters.max_no_improvement_iterations) { #ifdef KL_DEBUG_1 - std::cout << "thread " << thread_data.thread_id << ", no improvement for " << parameters.max_no_improvement_iterations - << " iterations, end local search" << std::endl; + std::cout << "thread " << thread_data.thread_id << ", no improvement for " + << parameters.max_no_improvement_iterations << " iterations, end local search" + << std::endl; #endif break; - } + } } else { no_improvement_iter_counter = 0; - } - + } + adjust_local_search_parameters(outer_iter, no_improvement_iter_counter, thread_data); } #ifdef KL_DEBUG_1 - 
std::cout << "thread " << thread_data.thread_id << ", local search end after " << outer_iter << " outer iterations, current cost: " << thread_data.active_schedule_data.cost << " with " << thread_data.num_steps() << " supersteps, vs serial cost " << active_schedule.get_total_work_weight() << "." << std::endl; + std::cout << "thread " << thread_data.thread_id << ", local search end after " << outer_iter + << " outer iterations, current cost: " << thread_data.active_schedule_data.cost << " with " + << thread_data.num_steps() << " supersteps, vs serial cost " + << active_schedule.get_total_work_weight() << "." << std::endl; #endif thread_finished_vec[thread_data.thread_id] = true; - } bool other_threads_finished(const unsigned thread_id) { const size_t num_threads = thread_finished_vec.size(); - if(num_threads == 1) + if (num_threads == 1) return false; for (size_t i = 0; i < num_threads; i++) { - if (i != thread_id && !thread_finished_vec[i]) + if (i != thread_id && !thread_finished_vec[i]) return false; } return true; } - inline bool blocked_edge_strategy(VertexType node, std::vector & unlock_nodes, ThreadSearchContext & thread_data) { + inline void update_affinities(const kl_move &best_move, ThreadSearchContext &thread_data, + std::map &recompute_max_gain, + std::vector &new_nodes, + const pre_move_work_data> &prev_work_data, + const typename comm_cost_function_t::pre_move_comm_data_t &prev_comm_data) { + + if constexpr (comm_cost_function_t::is_max_comm_cost_function) { + comm_cost_f.update_node_comm_affinity( + best_move, thread_data, thread_data.reward_penalty_strat.penalty, + thread_data.reward_penalty_strat.reward, recompute_max_gain, + new_nodes); // this only updated reward/penalty, collects new_nodes, and fills recompute_max_gain + + // Determine the steps where max/second_max/max_count for work/comm changed + std::unordered_set changed_steps; + + // Check work changes for from_step + if (best_move.from_step == best_move.to_step) { + // Same step - check if max/second_max changed + const auto current_max = active_schedule.get_step_max_work(best_move.from_step); + const auto current_second_max = active_schedule.get_step_second_max_work(best_move.from_step); + const auto current_count = active_schedule.get_step_max_work_processor_count()[best_move.from_step]; + if (current_max != prev_work_data.from_step_max_work || + current_second_max != prev_work_data.from_step_second_max_work || + current_count != prev_work_data.from_step_max_work_processor_count) { + changed_steps.insert(best_move.from_step); + } + } else { + // Different steps - check both + const auto current_from_max = active_schedule.get_step_max_work(best_move.from_step); + const auto current_from_second_max = active_schedule.get_step_second_max_work(best_move.from_step); + const auto current_from_count = + active_schedule.get_step_max_work_processor_count()[best_move.from_step]; + if (current_from_max != prev_work_data.from_step_max_work || + current_from_second_max != prev_work_data.from_step_second_max_work || + current_from_count != prev_work_data.from_step_max_work_processor_count) { + changed_steps.insert(best_move.from_step); + } + + const auto current_to_max = active_schedule.get_step_max_work(best_move.to_step); + const auto current_to_second_max = active_schedule.get_step_second_max_work(best_move.to_step); + const auto current_to_count = active_schedule.get_step_max_work_processor_count()[best_move.to_step]; + if (current_to_max != prev_work_data.to_step_max_work || + current_to_second_max != 
prev_work_data.to_step_second_max_work || + current_to_count != prev_work_data.to_step_max_work_processor_count) { + changed_steps.insert(best_move.to_step); + } + } + + for (const auto &[step, step_info] : prev_comm_data.step_data) { + typename comm_cost_function_t::pre_move_comm_data_t::step_info current_info; + // Query current values + const auto current_max = comm_cost_f.comm_ds.step_max_comm(step); + const auto current_second_max = comm_cost_f.comm_ds.step_second_max_comm(step); + const auto current_count = comm_cost_f.comm_ds.step_max_comm_count(step); + + if (current_max != step_info.max_comm || current_second_max != step_info.second_max_comm || + current_count != step_info.max_comm_count) { + changed_steps.insert(step); + } + } + + // Recompute affinities for all active nodes + const size_t active_count = thread_data.affinity_table.size(); + for (size_t i = 0; i < active_count; ++i) { + const VertexType node = thread_data.affinity_table.get_selected_nodes()[i]; + + // Determine if this node needs affinity recomputation + // A node needs recomputation if it's in or adjacent to changed steps + const unsigned node_step = active_schedule.assigned_superstep(node); + + // Calculate window bounds for this node once + const int node_lower_bound = static_cast(node_step) - static_cast(window_size); + const unsigned node_upper_bound = node_step + window_size; + + bool needs_update = false; + // Check if any changed step falls within the node's window + for (unsigned step : changed_steps) { + if (static_cast(step) >= node_lower_bound && step <= node_upper_bound) { + needs_update = true; + break; + } + } + + if (needs_update) { + auto &affinity_table_node = thread_data.affinity_table.get_affinity_table(node); + + // Reset affinity table entries to zero + const unsigned num_procs = active_schedule.getInstance().numberOfProcessors(); + for (unsigned p = 0; p < num_procs; ++p) { + for (unsigned idx = 0; idx < affinity_table_node[p].size(); ++idx) { + affinity_table_node[p][idx] = 0; + } + } + + compute_node_affinities(node, affinity_table_node, thread_data); + recompute_max_gain[node] = kl_gain_update_info(node, true); + } + } + } else { + update_node_work_affinity(thread_data.affinity_table, best_move, prev_work_data, recompute_max_gain); + comm_cost_f.update_node_comm_affinity(best_move, thread_data, thread_data.reward_penalty_strat.penalty, + thread_data.reward_penalty_strat.reward, recompute_max_gain, + new_nodes); + } + } + + inline bool blocked_edge_strategy(VertexType node, std::vector &unlock_nodes, + ThreadSearchContext &thread_data) { if (thread_data.unlock_edge_backtrack_counter > 1) { for (const auto vertex_edge_pair : thread_data.active_schedule_data.new_violations) { const auto &e = vertex_edge_pair.second; @@ -849,7 +1133,8 @@ class kl_improver : public ImprovementScheduler { } } #ifdef KL_DEBUG - std::cout << "Nodes of violated edge locked, backtrack counter: " << thread_data.unlock_edge_backtrack_counter << std::endl; + std::cout << "Nodes of violated edge locked, backtrack counter: " + << thread_data.unlock_edge_backtrack_counter << std::endl; #endif thread_data.unlock_edge_backtrack_counter--; return true; @@ -857,94 +1142,114 @@ class kl_improver : public ImprovementScheduler { #ifdef KL_DEBUG std::cout << "Nodes of violated edge locked, end local search" << std::endl; #endif - return false; //or reset local search and initalize with violating nodes + return false; // or reset local search and initalize with violating nodes } } - inline void 
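The containment test inside update_affinities above is what keeps the incremental update cheap: a selected node is only recomputed when some changed step lies in its ±window_size window. A standalone sketch of the same predicate; the helper name is hypothetical, window_size mirrors the class template parameter:

    // True iff `step` lies within [node_step - window_size, node_step + window_size];
    // the lower bound is computed in signed arithmetic so steps near 0 do not underflow.
    template <unsigned window_size>
    bool step_in_window(unsigned node_step, unsigned step) {
        const int lower = static_cast<int>(node_step) - static_cast<int>(window_size);
        return static_cast<int>(step) >= lower && step <= node_step + window_size;
    }
    // e.g. with window_size = 1: step_in_window<1>(0, 1) holds, step_in_window<1>(0, 2) does not.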
-    inline void adjust_local_search_parameters(unsigned outer_iter, unsigned no_imp_counter, ThreadSearchContext & thread_data) {
-        if (no_imp_counter >= thread_data.no_improvement_iterations_reduce_penalty && thread_data.reward_penalty_strat.initial_penalty > 1.0) {
-            thread_data.reward_penalty_strat.initial_penalty = std::floor(std::sqrt(thread_data.reward_penalty_strat.initial_penalty));
+    inline void adjust_local_search_parameters(unsigned outer_iter, unsigned no_imp_counter,
+                                               ThreadSearchContext &thread_data) {
+        if (no_imp_counter >= thread_data.no_improvement_iterations_reduce_penalty &&
+            thread_data.reward_penalty_strat.initial_penalty > 1.0) {
+            thread_data.reward_penalty_strat.initial_penalty =
+                static_cast<cost_t>(std::floor(std::sqrt(thread_data.reward_penalty_strat.initial_penalty)));
             thread_data.unlock_edge_backtrack_counter_reset += 1;
             thread_data.no_improvement_iterations_reduce_penalty += 15;
 #ifdef KL_DEBUG_1
-            std::cout << "thread " << thread_data.thread_id << ", no improvement for " << thread_data.no_improvement_iterations_reduce_penalty
-                      << " iterations, reducing initial penalty to " << thread_data.reward_penalty_strat.initial_penalty << std::endl;
-#endif
-        }
+            std::cout << "thread " << thread_data.thread_id << ", no improvement for "
+                      << thread_data.no_improvement_iterations_reduce_penalty
+                      << " iterations, reducing initial penalty to " << thread_data.reward_penalty_strat.initial_penalty
+                      << std::endl;
+#endif
+        }

-        if (parameters.try_remove_step_after_num_outer_iterations > 0 && ((outer_iter + 1) % parameters.try_remove_step_after_num_outer_iterations) == 0) {
-            thread_data.step_selection_epoch_counter = 0;;
+        if (parameters.try_remove_step_after_num_outer_iterations > 0 &&
+            ((outer_iter + 1) % parameters.try_remove_step_after_num_outer_iterations) == 0) {
+            thread_data.step_selection_epoch_counter = 0;
 #ifdef KL_DEBUG
             std::cout << "reset remove epoch counter after " << outer_iter << " iterations." << std::endl;
 #endif
         }

-        if (no_imp_counter >= thread_data.no_improvement_iterations_increase_inner_iter ) {
+        if (no_imp_counter >= thread_data.no_improvement_iterations_increase_inner_iter) {
             thread_data.min_inner_iter = static_cast<unsigned>(std::ceil(thread_data.min_inner_iter * 2.2));
             thread_data.no_improvement_iterations_increase_inner_iter += 20;
 #ifdef KL_DEBUG_1
-            std::cout << "thread " << thread_data.thread_id << ", no improvement for " << thread_data.no_improvement_iterations_increase_inner_iter
-                      << " iterations, increasing min inner iter to " << thread_data.min_inner_iter << std::endl;
+            std::cout << "thread " << thread_data.thread_id << ", no improvement for "
+                      << thread_data.no_improvement_iterations_increase_inner_iter
+                      << " iterations, increasing min inner iter to " << thread_data.min_inner_iter << std::endl;
 #endif
         }
-    }
-
-    bool is_local_search_blocked(ThreadSearchContext & thread_data);
+    }
+
+    bool is_local_search_blocked(ThreadSearchContext &thread_data);
     void set_parameters(vertex_idx_t<Graph_t> num_nodes);
-    void reset_inner_search_structures(ThreadSearchContext & thread_data) const;
+    void reset_inner_search_structures(ThreadSearchContext &thread_data) const;
     void initialize_datastructures(BspSchedule<Graph_t> &schedule);
-    void print_heap(heap_datastructure & max_gain_heap) const;
+    void print_heap(heap_datastructure &max_gain_heap) const;
     void cleanup_datastructures();
-    void update_avg_gain(const cost_t gain, const unsigned num_iter, cost_t & average_gain);
-    void insert_gain_heap(ThreadSearchContext & thread_data);
-    void insert_new_nodes_gain_heap(std::vector<VertexType>& new_nodes, node_selection_container_t &nodes, ThreadSearchContext & thread_data);
+    void update_avg_gain(const cost_t gain, const unsigned num_iter, double &average_gain);
+    void insert_gain_heap(ThreadSearchContext &thread_data);
+    void insert_new_nodes_gain_heap(std::vector<VertexType> &new_nodes, node_selection_container_t &nodes,
+                                    ThreadSearchContext &thread_data);

-    inline void compute_node_affinities(VertexType node, std::vector<std::vector<cost_t>> & affinity_table_node, ThreadSearchContext & thread_data) {
+    inline void compute_node_affinities(VertexType node, std::vector<std::vector<cost_t>> &affinity_table_node,
+                                        ThreadSearchContext &thread_data) {
         compute_work_affinity(node, affinity_table_node, thread_data);
-        comm_cost_f.compute_comm_affinity(node, affinity_table_node, thread_data.reward_penalty_strat.penalty, thread_data.reward_penalty_strat.reward, thread_data.start_step, thread_data.end_step);
+        comm_cost_f.compute_comm_affinity(node, affinity_table_node, thread_data.reward_penalty_strat.penalty,
+                                          thread_data.reward_penalty_strat.reward, thread_data.start_step,
+                                          thread_data.end_step);
     }
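compute_node_affinities fills, for one node, a processors × (2·window_size + 1) table whose centre column (index window_size) holds the cost contribution of leaving the node in place; every gain query later reduces to a difference of two table entries. A minimal sketch of that lookup, assuming the row layout used above (the free function is illustrative only, not part of the patch):

    #include <vector>

    // gain(node -> (p, s)) = affinity[current_proc][window_size] - affinity[p][rel_idx(s)];
    // a positive value means moving the node to processor p in step s lowers the cost.
    template <typename cost_t>
    cost_t move_gain(const std::vector<std::vector<cost_t>> &affinity_table_node,
                     unsigned current_proc, unsigned window_size,
                     unsigned target_proc, unsigned target_idx) {
        return affinity_table_node[current_proc][window_size] - affinity_table_node[target_proc][target_idx];
    }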
-    void select_active_nodes(ThreadSearchContext & thread_data) {
+    void select_active_nodes(ThreadSearchContext &thread_data) {
         if (select_nodes_check_remove_superstep(thread_data.step_to_remove, thread_data)) {
             active_schedule.swap_empty_step_fwd(thread_data.step_to_remove, thread_data.end_step);
             thread_data.end_step--;
             thread_data.local_search_start_step = static_cast<unsigned>(thread_data.active_schedule_data.applied_moves.size());
-            thread_data.active_schedule_data.update_cost(-1.0 * static_cast<cost_t>(instance->synchronisationCosts()));
+            thread_data.active_schedule_data.update_cost(static_cast<cost_t>(-1.0 * instance->synchronisationCosts()));

             if constexpr (enable_preresolving_violations) {
                 resolve_violations(thread_data);
             }

             if (thread_data.active_schedule_data.current_violations.size() > parameters.initial_violation_threshold) {
-                active_schedule.revert_to_best_schedule(thread_data.local_search_start_step, thread_data.step_to_remove, comm_cost_f, thread_data.active_schedule_data, thread_data.start_step, thread_data.end_step);
+                active_schedule.revert_to_best_schedule(thread_data.local_search_start_step, thread_data.step_to_remove,
+                                                        comm_cost_f, thread_data.active_schedule_data,
+                                                        thread_data.start_step, thread_data.end_step);
             } else {
-                thread_data.unlock_edge_backtrack_counter = static_cast<unsigned>(thread_data.active_schedule_data.current_violations.size());
-                thread_data.max_inner_iterations = std::max(thread_data.unlock_edge_backtrack_counter * 5u, parameters.max_inner_iterations_reset);
-                thread_data.max_no_vioaltions_removed_backtrack = parameters.max_no_vioaltions_removed_backtrack_for_remove_step_reset;
-                #ifdef KL_DEBUG_1
-                std::cout << "thread " << thread_data.thread_id << ", Trying to remove step " << thread_data.step_to_remove << std::endl;
-                #endif
-                return;
-            }
+                thread_data.unlock_edge_backtrack_counter =
+                    static_cast<unsigned>(thread_data.active_schedule_data.current_violations.size());
+                thread_data.max_inner_iterations =
+                    std::max(thread_data.unlock_edge_backtrack_counter * 5u, parameters.max_inner_iterations_reset);
+                thread_data.max_no_vioaltions_removed_backtrack =
+                    parameters.max_no_vioaltions_removed_backtrack_for_remove_step_reset;
+#ifdef KL_DEBUG_1
+                std::cout << "thread " << thread_data.thread_id << ", Trying to remove step "
+                          << thread_data.step_to_remove << std::endl;
+#endif
+                return;
+            }
         }

-        //thread_data.step_to_remove = thread_data.start_step;
+        // thread_data.step_to_remove = thread_data.start_step;
         thread_data.local_search_start_step = 0;
-        thread_data.selection_strategy.select_active_nodes(thread_data.affinity_table, thread_data.start_step, thread_data.end_step);
+        thread_data.selection_strategy.select_active_nodes(thread_data.affinity_table, thread_data.start_step,
+                                                           thread_data.end_step);
     }

     bool check_remove_superstep(unsigned step);
-    bool select_nodes_check_remove_superstep(unsigned & step, ThreadSearchContext & thread_data);
+    bool select_nodes_check_remove_superstep(unsigned &step, ThreadSearchContext &thread_data);

-    bool scatter_nodes_superstep(unsigned step, ThreadSearchContext & thread_data) {
+    bool scatter_nodes_superstep(unsigned step, ThreadSearchContext &thread_data) {
         assert(step <= thread_data.end_step && thread_data.start_step <= step);
         bool abort = false;

-        for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
-            const std::vector<VertexType> step_proc_node_vec(active_schedule.getSetSchedule().step_processor_vertices[step][proc].begin(),active_schedule.getSetSchedule().step_processor_vertices[step][proc].end());
-            for (const auto &node : step_proc_node_vec) {
-
-                thread_data.reward_penalty_strat.init_reward_penalty(static_cast<cost_t>(thread_data.active_schedule_data.current_violations.size()) + 1.0);
+        for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
+            const std::vector<VertexType> step_proc_node_vec(
+                active_schedule.getSetSchedule().step_processor_vertices[step][proc].begin(),
+                active_schedule.getSetSchedule().step_processor_vertices[step][proc].end());
+            for (const auto &node : step_proc_node_vec) {
+
+                thread_data.reward_penalty_strat.init_reward_penalty(
+                    static_cast<cost_t>(thread_data.active_schedule_data.current_violations.size()) + 1.0);
                 compute_node_affinities(node, thread_data.local_affinity_table, thread_data);
                 kl_move best_move = compute_best_move(node, thread_data.local_affinity_table, thread_data);
@@ -954,37 +1259,43 @@ class kl_improver : public 
ImprovementScheduler { } apply_move(best_move, thread_data); - if (thread_data.active_schedule_data.current_violations.size() > parameters.abort_scatter_nodes_violation_threshold) { + if (thread_data.active_schedule_data.current_violations.size() > + parameters.abort_scatter_nodes_violation_threshold) { abort = true; break; } thread_data.affinity_table.insert(node); - //thread_data.selection_strategy.add_neighbours_to_selection(node, thread_data.affinity_table, thread_data.start_step, thread_data.end_step); + // thread_data.selection_strategy.add_neighbours_to_selection(node, thread_data.affinity_table, + // thread_data.start_step, thread_data.end_step); if (thread_data.active_schedule_data.new_violations.size() > 0) { - - for (const auto & vertex_edge_pair : thread_data.active_schedule_data.new_violations) { + + for (const auto &vertex_edge_pair : thread_data.active_schedule_data.new_violations) { const auto &vertex = vertex_edge_pair.first; thread_data.affinity_table.insert(vertex); } } #ifdef KL_DEBUG - std::cout << "move node " << best_move.node << " with gain " << best_move.gain << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step << " to: " << best_move.to_proc << "|" << best_move.to_step << std::endl; + std::cout << "move node " << best_move.node << " with gain " << best_move.gain + << ", from proc|step: " << best_move.from_proc << "|" << best_move.from_step + << " to: " << best_move.to_proc << "|" << best_move.to_step << std::endl; #endif #ifdef KL_DEBUG_COST_CHECK active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); - if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > 0.00001 ) { - std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; - std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" << std::endl; + if (std::abs(comm_cost_f.compute_schedule_cost_test() - thread_data.active_schedule_data.cost) > + 0.00001) { + std::cout << "computed cost: " << comm_cost_f.compute_schedule_cost_test() + << ", current cost: " << thread_data.active_schedule_data.cost << std::endl; + std::cout << ">>>>>>>>>>>>>>>>>>>>>> compute cost not equal to new cost <<<<<<<<<<<<<<<<<<<<" + << std::endl; } if constexpr (active_schedule_t::use_memory_constraint) { - if ( not active_schedule.memory_constraint.satisfied_memory_constraint()) - std::cout << "memory constraint not satisfied" << std::endl; + if (not active_schedule.memory_constraint.satisfied_memory_constraint()) + std::cout << "memory constraint not satisfied" << std::endl; } #endif - } if (abort) { @@ -993,7 +1304,8 @@ class kl_improver : public ImprovementScheduler { } if (abort) { - active_schedule.revert_to_best_schedule(0, 0, comm_cost_f, thread_data.active_schedule_data, thread_data.start_step, thread_data.end_step); + active_schedule.revert_to_best_schedule(0, 0, comm_cost_f, thread_data.active_schedule_data, + thread_data.start_step, thread_data.end_step); thread_data.affinity_table.reset_node_selection(); return false; } @@ -1004,12 +1316,12 @@ class kl_improver : public ImprovementScheduler { if (num_threads == 1) { // single thread case active_schedule.set_cost(thread_data_vec[0].active_schedule_data.cost); active_schedule.getVectorSchedule().number_of_supersteps = thread_data_vec[0].num_steps(); - return; + return; } unsigned write_cursor = thread_data_vec[0].end_step + 1; for (unsigned i = 1; i 
< num_threads; ++i) {
-            auto& thread = thread_data_vec[i];
+            auto &thread = thread_data_vec[i];
             if (thread.start_step <= thread.end_step) {
                 for (unsigned j = thread.start_step; j <= thread.end_step; ++j) {
                     if (j != write_cursor) {
@@ -1030,9 +1342,7 @@ class kl_improver : public ImprovementScheduler {
         gen = std::mt19937(rd());
     }

-    explicit kl_improver(unsigned seed) : ImprovementScheduler() {
-        gen = std::mt19937(seed);
-    }
+    explicit kl_improver(unsigned seed) : ImprovementScheduler() { gen = std::mt19937(seed); }

     virtual ~kl_improver() = default;
@@ -1041,24 +1351,24 @@ class kl_improver : public ImprovementScheduler {
             return RETURN_STATUS::BEST_FOUND;

         const unsigned num_threads = 1;
-
-        thread_data_vec.resize(num_threads);
+
+        thread_data_vec.resize(num_threads);
         thread_finished_vec.assign(num_threads, true);
         set_parameters(schedule.getInstance().numberOfVertices());
-        initialize_datastructures(schedule);
-        const cost_t initial_cost = active_schedule.get_cost();
+        initialize_datastructures(schedule);
+        const cost_t initial_cost = active_schedule.get_cost();
         const unsigned num_steps = schedule.numberOfSupersteps();
         set_start_step(0, thread_data_vec[0]);
-        thread_data_vec[0].end_step = (num_steps > 0) ? num_steps - 1 : 0;
+        thread_data_vec[0].end_step = (num_steps > 0) ? num_steps - 1 : 0;

-        auto & thread_data = this->thread_data_vec[0];
+        auto &thread_data = this->thread_data_vec[0];
         thread_data.active_schedule_data.initialize_cost(active_schedule.get_cost());
         thread_data.selection_strategy.setup(thread_data.start_step, thread_data.end_step);
-        run_local_search(thread_data);
-
-        synchronize_active_schedule(num_threads);
+        run_local_search(thread_data);
+
+        synchronize_active_schedule(num_threads);

         if (initial_cost > active_schedule.get_cost()) {
             active_schedule.write_schedule(schedule);
@@ -1076,81 +1386,100 @@ class kl_improver : public ImprovementScheduler {
     }

     virtual void setTimeQualityParameter(const double time_quality) { this->parameters.time_quality = time_quality; }
-    virtual void setSuperstepRemoveStrengthParameter(const double superstep_remove_strength) { this->parameters.superstep_remove_strength = superstep_remove_strength; }
-
-    virtual std::string getScheduleName() const {
-        return "kl_improver_" + comm_cost_f.name();
+    virtual void setSuperstepRemoveStrengthParameter(const double superstep_remove_strength) {
+        this->parameters.superstep_remove_strength = superstep_remove_strength;
     }
+
+    virtual std::string getScheduleName() const { return "kl_improver_" + comm_cost_f.name(); }
};

-template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
-void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::set_parameters(vertex_idx_t<Graph_t> num_nodes) {
-    const unsigned log_num_nodes = (num_nodes > 1) ? static_cast<unsigned>(std::log(num_nodes)) : 1;
+template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
+void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::set_parameters(
+    vertex_idx_t<Graph_t> num_nodes) {
+    const unsigned log_num_nodes = (num_nodes > 1) ? static_cast<unsigned>(std::log(num_nodes)) : 1;

     // Total number of outer iterations. Proportional to sqrt N.
-    parameters.max_outer_iterations = static_cast<unsigned>(std::sqrt(num_nodes) * (parameters.time_quality * 10.0) / parameters.num_parallel_loops);
+    parameters.max_outer_iterations =
+        static_cast<unsigned>(std::sqrt(num_nodes) * (parameters.time_quality * 10.0) / parameters.num_parallel_loops);

     // Number of times to reset the search for violations before giving up.
-    parameters.max_no_vioaltions_removed_backtrack_reset = parameters.time_quality < 0.75 ? 1 : parameters.time_quality < 1.0 ? 2 : 3;
+    parameters.max_no_vioaltions_removed_backtrack_reset = parameters.time_quality < 0.75 ? 1
+                                                           : parameters.time_quality < 1.0 ? 2
+                                                                                           : 3;

     // Parameters for the superstep removal heuristic.
-    parameters.max_no_vioaltions_removed_backtrack_for_remove_step_reset = 3 + static_cast<unsigned>(parameters.superstep_remove_strength * 7);
-    parameters.node_max_step_selection_epochs = parameters.superstep_remove_strength < 0.75 ? 1 : parameters.superstep_remove_strength < 1.0 ? 2 : 3;
+    parameters.max_no_vioaltions_removed_backtrack_for_remove_step_reset =
+        3 + static_cast<unsigned>(parameters.superstep_remove_strength * 7);
+    parameters.node_max_step_selection_epochs = parameters.superstep_remove_strength < 0.75 ? 1
+                                                : parameters.superstep_remove_strength < 1.0 ? 2
+                                                                                             : 3;
     parameters.remove_step_epocs = static_cast<unsigned>(parameters.superstep_remove_strength * 4.0);
-    parameters.min_inner_iter_reset = static_cast<unsigned>(log_num_nodes + log_num_nodes * (1.0 + parameters.time_quality));
-
+    parameters.min_inner_iter_reset =
+        static_cast<unsigned>(log_num_nodes + log_num_nodes * (1.0 + parameters.time_quality));
+
     if (parameters.remove_step_epocs > 0) {
-        parameters.try_remove_step_after_num_outer_iterations = parameters.max_outer_iterations / parameters.remove_step_epocs;
+        parameters.try_remove_step_after_num_outer_iterations =
+            parameters.max_outer_iterations / parameters.remove_step_epocs;
     } else {
         // Effectively disable superstep removal if remove_step_epocs is 0.
         parameters.try_remove_step_after_num_outer_iterations = parameters.max_outer_iterations + 1;
     }
-
+
     unsigned i = 0;
-    for (auto & thread : thread_data_vec) {
+    for (auto &thread : thread_data_vec) {
         thread.thread_id = i++;
         // The number of nodes to consider in each inner iteration. Proportional to log(N).
-        thread.selection_strategy.selection_threshold = static_cast<unsigned>(std::ceil(parameters.time_quality * 10 * log_num_nodes + log_num_nodes));
+        thread.selection_strategy.selection_threshold =
+            static_cast<unsigned>(std::ceil(parameters.time_quality * 10 * log_num_nodes + log_num_nodes));
    }

-    #ifdef KL_DEBUG_1
-    std::cout << "kl set parameter, number of nodes: " << num_nodes << std::endl;
-    std::cout << "max outer iterations: " << parameters.max_outer_iterations << std::endl;
-    std::cout << "max inner iterations: " << parameters.max_inner_iterations_reset << std::endl;
-    std::cout << "no improvement iterations reduce penalty: " << thread_data_vec[0].no_improvement_iterations_reduce_penalty << std::endl;
-    std::cout << "selction threshold: " << thread_data_vec[0].selection_strategy.selection_threshold << std::endl;
-    std::cout << "remove step epocs: " << parameters.remove_step_epocs << std::endl;
-    std::cout << "try remove step after num outer iterations: " << parameters.try_remove_step_after_num_outer_iterations << std::endl;
-    std::cout << "number of parallel loops: " << parameters.num_parallel_loops << std::endl;
-    #endif
+#ifdef KL_DEBUG_1
+    std::cout << "kl set parameter, number of nodes: " << num_nodes << std::endl;
+    std::cout << "max outer iterations: " << parameters.max_outer_iterations << std::endl;
+    std::cout << "max inner iterations: " << parameters.max_inner_iterations_reset << std::endl;
+    std::cout << "no improvement iterations reduce penalty: "
+              << thread_data_vec[0].no_improvement_iterations_reduce_penalty << std::endl;
+    std::cout << "selection threshold: " << thread_data_vec[0].selection_strategy.selection_threshold << std::endl;
+    std::cout << "remove step epocs: " << parameters.remove_step_epocs << std::endl;
+    std::cout << "try remove step after num outer iterations: " << parameters.try_remove_step_after_num_outer_iterations
+              << std::endl;
+    std::cout << "number of parallel loops: " << parameters.num_parallel_loops << std::endl;
+#endif
}
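For a feel of the scaling chosen above, a hand-evaluated example (rounded; assumes time_quality = 1.0 and num_parallel_loops = 1, other values exactly as set_parameters computes them):

    // N = 1'000'000, time_quality = 1.0, num_parallel_loops = 1:
    //   max_outer_iterations = sqrt(1e6) * (1.0 * 10.0) / 1 = 10'000
    //   log_num_nodes        = (unsigned)std::log(1e6)      = 13      (ln 1e6 ~ 13.8, truncated)
    //   selection_threshold  = ceil(1.0 * 10 * 13 + 13)     = 143
    // i.e. each outer iteration starts from roughly 143 selected nodes, and the local
    // search runs for at most 10'000 outer iterations.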
-template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
-void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::update_node_work_affinity(node_selection_container_t &nodes, kl_move move, const pre_move_work_data<v_workw_t<Graph_t>> & prev_work_data, std::map<VertexType, kl_gain_update_info> &recompute_max_gain) {
+template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
+void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::update_node_work_affinity(
+    node_selection_container_t &nodes, kl_move move, const pre_move_work_data<v_workw_t<Graph_t>> &prev_work_data,
+    std::map<VertexType, kl_gain_update_info> &recompute_max_gain) {
     const size_t active_count = nodes.size();
     for (size_t i = 0; i < active_count; ++i) {
         const VertexType node = nodes.get_selected_nodes()[i];
-
-        kl_gain_update_info update_info = update_node_work_affinity_after_move(node, move, prev_work_data, nodes.at(node));
+
+        kl_gain_update_info update_info =
+            update_node_work_affinity_after_move(node, move, prev_work_data, nodes.at(node));
         if (update_info.update_from_step || update_info.update_to_step) {
             recompute_max_gain[node] = update_info;
-        }
+        }
    }
}

-template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
-void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::update_max_gain(kl_move move, std::map<VertexType, kl_gain_update_info> &recompute_max_gain, ThreadSearchContext & thread_data) {
-    for (auto& pair : recompute_max_gain) {
+template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
+void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::update_max_gain(
+    kl_move move, std::map<VertexType, kl_gain_update_info> &recompute_max_gain, ThreadSearchContext &thread_data) {
+    for (auto &pair : recompute_max_gain) {
        if (pair.second.full_update) {
-            recompute_node_max_gain(pair.first, thread_data.affinity_table, thread_data);
+            recompute_node_max_gain(pair.first, thread_data.affinity_table, thread_data);
        } else {
            if (pair.second.update_entire_from_step) {
                update_best_move(pair.first, move.from_step, thread_data.affinity_table, thread_data);
            } else if (pair.second.update_from_step && is_compatible(pair.first, move.from_proc)) {
                update_best_move(pair.first, move.from_step, move.from_proc, thread_data.affinity_table, thread_data);
-            }
+            }

            if (move.from_step != move.to_step || not pair.second.update_entire_from_step) {
                if (pair.second.update_entire_to_step) {
@@ -1159,12 +1488,14 @@ void kl_improver
-void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::compute_work_affinity(VertexType node, std::vector<std::vector<cost_t>> & affinity_table_node, ThreadSearchContext & thread_data) {
+template<typename Graph_t, typename comm_cost_function_t, typename MemoryConstraint_t, unsigned window_size, typename cost_t>
+void kl_improver<Graph_t, comm_cost_function_t, MemoryConstraint_t, window_size, cost_t>::compute_work_affinity(
+    VertexType node, std::vector<std::vector<cost_t>> &affinity_table_node, ThreadSearchContext &thread_data) {
    const unsigned node_step = active_schedule.assigned_superstep(node);
    const work_weight_t vertex_weight = graph->vertex_work_weight(node);
@@ -1176,108 +1507,157 @@ void kl_improver
        const cost_t max_work_for_step = static_cast<cost_t>(active_schedule.get_step_max_work(step));
-        for (const unsigned proc : proc_range.compatible_processors_vertex(node)) {
+        for (const unsigned proc : proc_range.compatible_processors_vertex(node)) {
            const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(step, proc);
            const cost_t work_diff = static_cast<cost_t>(new_weight) - max_work_for_step;
            affinity_table_node[proc][idx] = std::max(0.0, work_diff);
        }
    }

-    const unsigned node_proc = active_schedule.assigned_processor(node);
+    const unsigned node_proc = active_schedule.assigned_processor(node);
    const work_weight_t max_work_for_step = active_schedule.get_step_max_work(node_step);
-    const bool is_sole_max_processor = (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && (max_work_for_step == active_schedule.get_step_processor_work(node_step, node_proc));
-
-    const cost_t node_proc_affinity = is_sole_max_processor ? 
std::min(vertex_weight, max_work_for_step - active_schedule.get_step_second_max_work(node_step)) : 0.0; + const bool is_sole_max_processor = + (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && + (max_work_for_step == active_schedule.get_step_processor_work(node_step, node_proc)); + + const cost_t node_proc_affinity = + is_sole_max_processor + ? std::min(vertex_weight, max_work_for_step - active_schedule.get_step_second_max_work(node_step)) + : 0.0; affinity_table_node[node_proc][window_size] = node_proc_affinity; - - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { - if(proc == node_proc) + + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + if (proc == node_proc) continue; - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, proc); - affinity_table_node[proc][window_size] = compute_same_step_affinity(max_work_for_step, new_weight, node_proc_affinity); + const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, proc); + affinity_table_node[proc][window_size] = + compute_same_step_affinity(max_work_for_step, new_weight, node_proc_affinity); } -} +} -template -void kl_improver::process_work_update_step(VertexType node, unsigned node_step, unsigned node_proc, work_weight_t vertex_weight, unsigned move_step, unsigned move_proc, work_weight_t move_correction_node_weight, const work_weight_t prev_move_step_max_work, const work_weight_t prev_move_step_second_max_work, unsigned prev_move_step_max_work_processor_count, bool & update_step, bool & update_entire_step, bool & full_update, std::vector> & affinity_table_node) { - const unsigned lower_bound = move_step > window_size ? move_step - window_size : 0; +template +void kl_improver::process_work_update_step( + VertexType node, unsigned node_step, unsigned node_proc, work_weight_t vertex_weight, unsigned move_step, + unsigned move_proc, work_weight_t move_correction_node_weight, const work_weight_t prev_move_step_max_work, + const work_weight_t prev_move_step_second_max_work, unsigned prev_move_step_max_work_processor_count, + bool &update_step, bool &update_entire_step, bool &full_update, + std::vector> &affinity_table_node) { + const unsigned lower_bound = move_step > window_size ? move_step - window_size : 0; if (lower_bound <= node_step && node_step <= move_step + window_size) { update_step = true; - if (node_step == move_step) { - const work_weight_t new_max_weight = active_schedule.get_step_max_work(move_step); + if (node_step == move_step) { + const work_weight_t new_max_weight = active_schedule.get_step_max_work(move_step); const work_weight_t new_second_max_weight = active_schedule.get_step_second_max_work(move_step); const work_weight_t new_step_proc_work = active_schedule.get_step_processor_work(node_step, node_proc); - const work_weight_t prev_step_proc_work = (node_proc == move_proc) ? new_step_proc_work + move_correction_node_weight : new_step_proc_work; - const bool prev_is_sole_max_processor = (prev_move_step_max_work_processor_count == 1) && (prev_move_step_max_work == prev_step_proc_work); - const cost_t prev_node_proc_affinity = prev_is_sole_max_processor ? std::min(vertex_weight, prev_move_step_max_work - prev_move_step_second_max_work) : 0.0; + const work_weight_t prev_step_proc_work = + (node_proc == move_proc) ? 
new_step_proc_work + move_correction_node_weight : new_step_proc_work; + const bool prev_is_sole_max_processor = + (prev_move_step_max_work_processor_count == 1) && (prev_move_step_max_work == prev_step_proc_work); + const cost_t prev_node_proc_affinity = + prev_is_sole_max_processor + ? std::min(vertex_weight, prev_move_step_max_work - prev_move_step_second_max_work) + : 0.0; + + const bool new_is_sole_max_processor = + (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && + (new_max_weight == new_step_proc_work); + const cost_t new_node_proc_affinity = + new_is_sole_max_processor ? std::min(vertex_weight, new_max_weight - new_second_max_weight) : 0.0; - const bool new_is_sole_max_processor = (active_schedule.get_step_max_work_processor_count()[node_step] == 1) && (new_max_weight == new_step_proc_work); - const cost_t new_node_proc_affinity = new_is_sole_max_processor ? std::min(vertex_weight, new_max_weight - new_second_max_weight) : 0.0; - const cost_t diff = new_node_proc_affinity - prev_node_proc_affinity; - const bool update_node_proc_affinity = std::abs(diff) > EPSILON; + const bool update_node_proc_affinity = std::abs(diff) > EPSILON; if (update_node_proc_affinity) { full_update = true; affinity_table_node[node_proc][window_size] += diff; } - + if ((prev_move_step_max_work != new_max_weight) || update_node_proc_affinity) { update_entire_step = true; - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { - if((proc == node_proc) || (proc == move_proc)) + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + if ((proc == node_proc) || (proc == move_proc)) continue; - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, proc); - const cost_t prev_other_affinity = compute_same_step_affinity(prev_move_step_max_work, new_weight, prev_node_proc_affinity); - const cost_t other_affinity = compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); - - affinity_table_node[proc][window_size] += (other_affinity - prev_other_affinity); + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(node_step, proc); + const cost_t prev_other_affinity = + compute_same_step_affinity(prev_move_step_max_work, new_weight, prev_node_proc_affinity); + const cost_t other_affinity = + compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); + + affinity_table_node[proc][window_size] += (other_affinity - prev_other_affinity); } } - + if (node_proc != move_proc && is_compatible(node, move_proc)) { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move_proc) + move_correction_node_weight; - const cost_t prev_other_affinity = compute_same_step_affinity(prev_move_step_max_work, prev_new_weight, prev_node_proc_affinity); - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(node_step, move_proc); - const cost_t other_affinity = compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); - - affinity_table_node[move_proc][window_size] += (other_affinity - prev_other_affinity); - } + const work_weight_t prev_new_weight = vertex_weight + + active_schedule.get_step_processor_work(node_step, move_proc) + + move_correction_node_weight; + const cost_t prev_other_affinity = + compute_same_step_affinity(prev_move_step_max_work, prev_new_weight, prev_node_proc_affinity); + const work_weight_t new_weight = + vertex_weight + 
active_schedule.get_step_processor_work(node_step, move_proc); + const cost_t other_affinity = + compute_same_step_affinity(new_max_weight, new_weight, new_node_proc_affinity); + + affinity_table_node[move_proc][window_size] += (other_affinity - prev_other_affinity); + } } else { const work_weight_t new_max_weight = active_schedule.get_step_max_work(move_step); const unsigned idx = rel_step_idx(node_step, move_step); - if (prev_move_step_max_work != new_max_weight) { + if (prev_move_step_max_work != new_max_weight) { update_entire_step = true; // update moving to all procs with special for move_proc - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(move_step, proc); + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(move_step, proc); if (proc != move_proc) { - const cost_t prev_affinity = prev_move_step_max_work < new_weight ? static_cast(new_weight) - static_cast(prev_move_step_max_work) : 0.0; - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; - affinity_table_node[proc][idx] += new_affinity - prev_affinity; + const cost_t prev_affinity = + prev_move_step_max_work < new_weight + ? static_cast(new_weight) - static_cast(prev_move_step_max_work) + : 0.0; + const cost_t new_affinity = + new_max_weight < new_weight + ? static_cast(new_weight) - static_cast(new_max_weight) + : 0.0; + affinity_table_node[proc][idx] += new_affinity - prev_affinity; } else { - const work_weight_t prev_new_weight = vertex_weight + active_schedule.get_step_processor_work(move_step, proc) + move_correction_node_weight; - const cost_t prev_affinity = prev_move_step_max_work < prev_new_weight ? static_cast(prev_new_weight) - static_cast(prev_move_step_max_work) : 0.0; - - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; + const work_weight_t prev_new_weight = vertex_weight + + active_schedule.get_step_processor_work(move_step, proc) + + move_correction_node_weight; + const cost_t prev_affinity = + prev_move_step_max_work < prev_new_weight + ? static_cast(prev_new_weight) - static_cast(prev_move_step_max_work) + : 0.0; + + const cost_t new_affinity = + new_max_weight < new_weight + ? static_cast(new_weight) - static_cast(new_max_weight) + : 0.0; affinity_table_node[proc][idx] += new_affinity - prev_affinity; } - } + } } else { // update only move_proc if (is_compatible(node, move_proc)) { - const work_weight_t new_weight = vertex_weight + active_schedule.get_step_processor_work(move_step, move_proc); + const work_weight_t new_weight = + vertex_weight + active_schedule.get_step_processor_work(move_step, move_proc); const work_weight_t prev_new_weight = new_weight + move_correction_node_weight; - const cost_t prev_affinity = prev_move_step_max_work < prev_new_weight ? static_cast(prev_new_weight) - static_cast(prev_move_step_max_work) : 0.0; - - const cost_t new_affinity = new_max_weight < new_weight ? static_cast(new_weight) - static_cast(new_max_weight) : 0.0; + const cost_t prev_affinity = + prev_move_step_max_work < prev_new_weight + ? static_cast(prev_new_weight) - static_cast(prev_move_step_max_work) + : 0.0; + + const cost_t new_affinity = new_max_weight < new_weight ? 
static_cast(new_weight) - + static_cast(new_max_weight) + : 0.0; affinity_table_node[move_proc][idx] += new_affinity - prev_affinity; } } @@ -1285,20 +1665,25 @@ void kl_improver -bool kl_improver::select_nodes_check_remove_superstep(unsigned & step_to_remove, ThreadSearchContext & thread_data) { - if (thread_data.step_selection_epoch_counter >= parameters.node_max_step_selection_epochs || thread_data.num_steps() < 3) { +template +bool kl_improver::select_nodes_check_remove_superstep(unsigned &step_to_remove, + ThreadSearchContext &thread_data) { + if (thread_data.step_selection_epoch_counter >= parameters.node_max_step_selection_epochs || + thread_data.num_steps() < 3) { return false; } - - for (step_to_remove = thread_data.step_selection_counter; step_to_remove <= thread_data.end_step; step_to_remove++) { - assert(step_to_remove >= thread_data.start_step && step_to_remove <= thread_data.end_step); + + for (step_to_remove = thread_data.step_selection_counter; step_to_remove <= thread_data.end_step; + step_to_remove++) { + assert(step_to_remove >= thread_data.start_step && step_to_remove <= thread_data.end_step); #ifdef KL_DEBUG - std::cout << "Checking to remove step " << step_to_remove << "/" << thread_data.end_step << std::endl; + std::cout << "Checking to remove step " << step_to_remove << "/" << thread_data.end_step << std::endl; #endif if (check_remove_superstep(step_to_remove)) { #ifdef KL_DEBUG - std::cout << "Checking to scatter step " << step_to_remove << "/" << thread_data.end_step << std::endl; + std::cout << "Checking to scatter step " << step_to_remove << "/" << thread_data.end_step << std::endl; #endif assert(step_to_remove >= thread_data.start_step && step_to_remove <= thread_data.end_step); if (scatter_nodes_superstep(step_to_remove, thread_data)) { @@ -1318,19 +1703,23 @@ bool kl_improver -bool kl_improver::check_remove_superstep(unsigned step) { - if (active_schedule.num_steps() < 2) +template +bool kl_improver::check_remove_superstep( + unsigned step) { + if (active_schedule.num_steps() < 2) return false; - + if (active_schedule.get_step_max_work(step) < instance->synchronisationCosts()) return true; return false; } -template -void kl_improver::reset_inner_search_structures(ThreadSearchContext & thread_data) const { +template +void kl_improver::reset_inner_search_structures( + ThreadSearchContext &thread_data) const { thread_data.unlock_edge_backtrack_counter = thread_data.unlock_edge_backtrack_counter_reset; thread_data.max_inner_iterations = parameters.max_inner_iterations_reset; thread_data.max_no_vioaltions_removed_backtrack = parameters.max_no_vioaltions_removed_backtrack_reset; @@ -1340,18 +1729,22 @@ void kl_improver -bool kl_improver::is_local_search_blocked(ThreadSearchContext & thread_data) { - for (const auto& pair : thread_data.active_schedule_data.new_violations) { +template +bool kl_improver::is_local_search_blocked( + ThreadSearchContext &thread_data) { + for (const auto &pair : thread_data.active_schedule_data.new_violations) { if (thread_data.lock_manager.is_locked(pair.first)) { - return true; + return true; } } return false; } -template -void kl_improver::initialize_datastructures(BspSchedule &schedule) { +template +void kl_improver::initialize_datastructures( + BspSchedule &schedule) { input_schedule = &schedule; instance = &schedule.getInstance(); graph = &instance->getComputationalDag(); @@ -1363,54 +1756,64 @@ void kl_improvernum_vertices()); - t_data.reward_penalty_strat.initialize(active_schedule, comm_cost_f.get_max_comm_weight_multiplied(), 
active_schedule.get_max_work_weight()); + t_data.lock_manager.initialize(graph->num_vertices()); + t_data.reward_penalty_strat.initialize(active_schedule, comm_cost_f.get_max_comm_weight_multiplied(), + active_schedule.get_max_work_weight()); t_data.selection_strategy.initialize(active_schedule, gen, t_data.start_step, t_data.end_step); - + t_data.local_affinity_table.resize(instance->numberOfProcessors()); for (unsigned i = 0; i < instance->numberOfProcessors(); ++i) { t_data.local_affinity_table[i].resize(window_range); } - } + } } -template -void kl_improver::update_avg_gain(const cost_t gain, const unsigned num_iter, cost_t & average_gain) { +template +void kl_improver::update_avg_gain( + const cost_t gain, const unsigned num_iter, double &average_gain) { average_gain = static_cast((average_gain * num_iter + gain)) / (num_iter + 1.0); } -template -void kl_improver::insert_gain_heap(ThreadSearchContext & thread_data) { +template +void kl_improver::insert_gain_heap( + ThreadSearchContext &thread_data) { const size_t active_count = thread_data.affinity_table.size(); for (size_t i = 0; i < active_count; ++i) { - const VertexType node = thread_data.affinity_table.get_selected_nodes()[i]; + const VertexType node = thread_data.affinity_table.get_selected_nodes()[i]; compute_node_affinities(node, thread_data.affinity_table.at(node), thread_data); const auto best_move = compute_best_move(node, thread_data.affinity_table[node], thread_data); thread_data.max_gain_heap.push(node, best_move); } } -template -void kl_improver::insert_new_nodes_gain_heap(std::vector& new_nodes, node_selection_container_t &nodes, ThreadSearchContext & thread_data) { +template +void kl_improver::insert_new_nodes_gain_heap( + std::vector &new_nodes, node_selection_container_t &nodes, ThreadSearchContext &thread_data) { for (const auto &node : new_nodes) { nodes.insert(node); compute_node_affinities(node, thread_data.affinity_table.at(node), thread_data); const auto best_move = compute_best_move(node, thread_data.affinity_table[node], thread_data); - thread_data.max_gain_heap.push(node, best_move); + thread_data.max_gain_heap.push(node, best_move); } } -template +template void kl_improver::cleanup_datastructures() { thread_data_vec.clear(); - active_schedule.clear(); + active_schedule.clear(); } -template -void kl_improver::print_heap(heap_datastructure & max_gain_heap) const { +template +void kl_improver::print_heap( + heap_datastructure &max_gain_heap) const { if (max_gain_heap.is_empty()) { std::cout << "heap is empty" << std::endl; @@ -1419,29 +1822,32 @@ void kl_improver -void kl_improver::update_best_move(VertexType node, unsigned step, unsigned proc, node_selection_container_t &affinity_table, ThreadSearchContext & thread_data) { +template +void kl_improver::update_best_move( + VertexType node, unsigned step, unsigned proc, node_selection_container_t &affinity_table, + ThreadSearchContext &thread_data) { const unsigned node_proc = active_schedule.assigned_processor(node); const unsigned node_step = active_schedule.assigned_superstep(node); - if((node_proc == proc) && (node_step == step)) + if ((node_proc == proc) && (node_step == step)) return; kl_move node_move = thread_data.max_gain_heap.get_value(node); cost_t max_gain = node_move.gain; - + unsigned max_proc = node_move.to_proc; unsigned max_step = node_move.to_step; @@ -1449,69 +1855,75 @@ void kl_improver max_gain) { max_gain = gain; max_proc = proc; - max_step = step; - } - + max_step = step; + } + const cost_t diff = max_gain - node_move.gain; if 
((std::abs(diff) > EPSILON) || (max_proc != node_move.to_proc) || (max_step != node_move.to_step)) { node_move.gain = max_gain; node_move.to_proc = max_proc; node_move.to_step = max_step; thread_data.max_gain_heap.update(node, node_move); - } + } } } - -template -void kl_improver::update_best_move(VertexType node, unsigned step, node_selection_container_t &affinity_table, ThreadSearchContext & thread_data) { - + +template +void kl_improver::update_best_move( + VertexType node, unsigned step, node_selection_container_t &affinity_table, ThreadSearchContext &thread_data) { + const unsigned node_proc = active_schedule.assigned_processor(node); const unsigned node_step = active_schedule.assigned_superstep(node); kl_move node_move = thread_data.max_gain_heap.get_value(node); cost_t max_gain = node_move.gain; - + unsigned max_proc = node_move.to_proc; unsigned max_step = node_move.to_step; if (max_step == step) { - recompute_node_max_gain(node, affinity_table, thread_data); - } else { + recompute_node_max_gain(node, affinity_table, thread_data); + } else { if (node_step != step) { const unsigned idx = rel_step_idx(node_step, step); - for (const unsigned p : proc_range.compatible_processors_vertex(node)) { + for (const unsigned p : proc_range.compatible_processors_vertex(node)) { if constexpr (active_schedule_t::use_memory_constraint) { - if( not active_schedule.memory_constraint.can_move(node, p, step)) continue; + if (not active_schedule.memory_constraint.can_move(node, p, step)) + continue; } - const cost_t gain = affinity_table[node][node_proc][window_size] - affinity_table[node][p][idx]; + const cost_t gain = affinity_table[node][node_proc][window_size] - affinity_table[node][p][idx]; if (gain > max_gain) { max_gain = gain; max_proc = p; - max_step = step; + max_step = step; } } } else { - for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { + for (const unsigned proc : proc_range.compatible_processors_vertex(node)) { if (proc == node_proc) continue; if constexpr (active_schedule_t::use_memory_constraint) { - if( not active_schedule.memory_constraint.can_move(node, proc, step)) continue; + if (not active_schedule.memory_constraint.can_move(node, proc, step)) + continue; } - const cost_t gain = affinity_table[node][node_proc][window_size] - affinity_table[node][proc][window_size]; + const cost_t gain = + affinity_table[node][node_proc][window_size] - affinity_table[node][proc][window_size]; if (gain > max_gain) { max_gain = gain; max_proc = proc; - max_step = step; + max_step = step; } } - } + } const cost_t diff = max_gain - node_move.gain; if ((std::abs(diff) > EPSILON) || (max_proc != node_move.to_proc) || (max_step != node_move.to_step)) { @@ -1519,8 +1931,8 @@ void kl_improver +template class kl_improver_test : public kl_improver { - + using VertexType = vertex_idx_t; using kl_move = kl_move_struct; using heap_datastructure = MaxPairingHeap; @@ -33,8 +34,7 @@ class kl_improver_test : public kl_improver; using node_selection_container_t = adaptive_affinity_table; - public: - + public: kl_improver_test() : kl_improver() { this->thread_data_vec.resize(1); this->thread_finished_vec.assign(1, true); @@ -42,18 +42,11 @@ class kl_improver_test : public kl_improveractive_schedule; } - active_schedule_t& get_active_schedule() { - return this->active_schedule; - } - - auto & get_affinity_table() { - return this->thread_data_vec[0].affinity_table; - } + auto &get_affinity_table() { return this->thread_data_vec[0].affinity_table; } - auto & get_comm_cost_f() { - return 
this->comm_cost_f; - } + auto &get_comm_cost_f() { return this->comm_cost_f; } void setup_schedule(BspSchedule &schedule) { this->thread_data_vec.resize(1); @@ -63,39 +56,33 @@ class kl_improver_test : public kl_improverthread_data_vec[0].active_schedule_data.initialize_cost(this->active_schedule.get_cost()); } - void apply_move_test(kl_move move) { - this->apply_move(move, this->thread_data_vec[0]); - } + void apply_move_test(kl_move move) { this->apply_move(move, this->thread_data_vec[0]); } - auto & get_max_gain_heap() { - return this->thread_data_vec[0].max_gain_heap; - } + auto &get_max_gain_heap() { return this->thread_data_vec[0].max_gain_heap; } - auto get_current_cost() { - return this->thread_data_vec[0].active_schedule_data.cost; - } + auto get_current_cost() { return this->thread_data_vec[0].active_schedule_data.cost; } - bool is_feasible() { - return this->thread_data_vec[0].active_schedule_data.feasible; - } + bool is_feasible() { return this->thread_data_vec[0].active_schedule_data.feasible; } void compute_violations_test() { this->active_schedule.compute_violations(this->thread_data_vec[0].active_schedule_data); } - node_selection_container_t& insert_gain_heap_test(const std::vector& n) { - this->thread_data_vec[0].affinity_table.initialize(this->active_schedule, n.size()); + node_selection_container_t &insert_gain_heap_test(const std::vector &n) { + this->thread_data_vec[0].reward_penalty_strat.penalty = 0.0; + this->thread_data_vec[0].reward_penalty_strat.reward = 0.0; + this->thread_data_vec[0].affinity_table.initialize(this->active_schedule, n.size()); for (const auto &node : n) { this->thread_data_vec[0].affinity_table.insert(node); } this->insert_gain_heap(this->thread_data_vec[0]); - - return this->thread_data_vec[0].affinity_table; + + return this->thread_data_vec[0].affinity_table; } - node_selection_container_t& insert_gain_heap_test_penalty(const std::vector& n) { + node_selection_container_t &insert_gain_heap_test_penalty(const std::vector &n) { this->thread_data_vec[0].affinity_table.initialize(this->active_schedule, n.size()); for (const auto &node : n) { this->thread_data_vec[0].affinity_table.insert(node); @@ -105,34 +92,35 @@ class kl_improver_test : public kl_improverinsert_gain_heap(this->thread_data_vec[0]); - return this->thread_data_vec[0].affinity_table; + return this->thread_data_vec[0].affinity_table; } - node_selection_container_t& insert_gain_heap_test_penalty_reward(const std::vector& n) { + node_selection_container_t &insert_gain_heap_test_penalty_reward(const std::vector &n) { this->thread_data_vec[0].affinity_table.initialize(this->active_schedule, n.size()); for (const auto &node : n) { this->thread_data_vec[0].affinity_table.insert(node); } - + this->thread_data_vec[0].reward_penalty_strat.init_reward_penalty(); this->thread_data_vec[0].reward_penalty_strat.reward = 15.0; this->insert_gain_heap(this->thread_data_vec[0]); - return this->thread_data_vec[0].affinity_table; + return this->thread_data_vec[0].affinity_table; } - void update_affinity_table_test(kl_move best_move, node_selection_container_t & node_selection) { + void update_affinity_table_test(kl_move best_move, node_selection_container_t &node_selection) { std::map recompute_max_gain; std::vector new_nodes; const auto prev_work_data = this->active_schedule.get_pre_move_work_data(best_move); + const auto prev_comm_data = this->comm_cost_f.get_pre_move_comm_data(best_move); this->apply_move(best_move, this->thread_data_vec[0]); - - this->update_node_work_affinity(node_selection, 
best_move, prev_work_data, recompute_max_gain); - this->comm_cost_f.update_node_comm_affinity(best_move, this->thread_data_vec[0], this->thread_data_vec[0].reward_penalty_strat.penalty, this->thread_data_vec[0].reward_penalty_strat.reward, recompute_max_gain, new_nodes); - } + this->thread_data_vec[0].affinity_table.trim(); + this->update_affinities(best_move, this->thread_data_vec[0], recompute_max_gain, new_nodes, prev_work_data, + prev_comm_data); + } auto run_inner_iteration_test() { @@ -141,25 +129,30 @@ class kl_improver_test : public kl_improverprint_heap(this->thread_data_vec[0].max_gain_heap); - kl_move best_move = this->get_best_move(this->thread_data_vec[0].affinity_table, this->thread_data_vec[0].lock_manager, this->thread_data_vec[0].max_gain_heap); // locks best_move.node and removes it from node_selection - + kl_move best_move = this->get_best_move( + this->thread_data_vec[0].affinity_table, this->thread_data_vec[0].lock_manager, + this->thread_data_vec[0].max_gain_heap); // locks best_move.node and removes it from node_selection + #ifdef KL_DEBUG - std::cout << "Best move: " << best_move.node << " gain: " << best_move.gain << ", from: " << best_move.from_step << "|" << best_move.from_proc << " to: " << best_move.to_step << "|" << best_move.to_proc << std::endl; + std::cout << "Best move: " << best_move.node << " gain: " << best_move.gain << ", from: " << best_move.from_step + << "|" << best_move.from_proc << " to: " << best_move.to_step << "|" << best_move.to_proc + << std::endl; #endif const auto prev_work_data = this->active_schedule.get_pre_move_work_data(best_move); + const auto prev_comm_data = this->comm_cost_f.get_pre_move_comm_data(best_move); this->apply_move(best_move, this->thread_data_vec[0]); this->thread_data_vec[0].affinity_table.trim(); - this->update_node_work_affinity(this->thread_data_vec[0].affinity_table, best_move, prev_work_data, recompute_max_gain); - this->comm_cost_f.update_node_comm_affinity(best_move, this->thread_data_vec[0], this->thread_data_vec[0].reward_penalty_strat.penalty, this->thread_data_vec[0].reward_penalty_strat.reward, recompute_max_gain, new_nodes); + this->update_affinities(best_move, this->thread_data_vec[0], recompute_max_gain, new_nodes, prev_work_data, + prev_comm_data); #ifdef KL_DEBUG - std::cout << "New nodes: { "; + std::cout << "New nodes: { "; for (const auto v : new_nodes) { std::cout << v << " "; - } - std::cout << "}" << std::endl; + } + std::cout << "}" << std::endl; #endif this->update_max_gain(best_move, recompute_max_gain, this->thread_data_vec[0]); @@ -168,10 +161,9 @@ class kl_improver_test : public kl_improver &schedule) { - this->active_schedule.write_schedule(schedule); - } + bool is_node_locked(VertexType node) const { return this->thread_data_vec[0].lock_manager.is_locked(node); } + void get_active_schedule_test(BspSchedule &schedule) { this->active_schedule.write_schedule(schedule); } }; } // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp index 12e0cfa6..80ed0e48 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp @@ -26,21 +26,31 @@ limitations under the License. 
#include "kl_improver.hpp" #include "comm_cost_modules/kl_total_comm_cost.hpp" #include "comm_cost_modules/kl_hyper_total_comm_cost.hpp" +#include "comm_cost_modules/kl_bsp_comm_cost.hpp" #include "osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp" namespace osp { +using double_cost_t = double; + template -using kl_total_comm_improver = kl_improver, MemoryConstraint_t, window_size, double>; +using kl_total_comm_improver = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; template, unsigned window_size = 1, bool use_node_communication_costs_arg = true> -using kl_total_comm_improver_local_mem_constr = kl_improver, MemoryConstraint_t, window_size, double>; +using kl_total_comm_improver_local_mem_constr = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; + +template +using kl_total_lambda_comm_improver = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; + +template, unsigned window_size = 1> +using kl_total_lambda_comm_improver_local_mem_constr = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; template -using kl_total_lambda_comm_improver = kl_improver, MemoryConstraint_t, window_size, double>; +using kl_bsp_comm_improver = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; template, unsigned window_size = 1> -using kl_total_lambda_comm_improver_local_mem_constr = kl_improver, MemoryConstraint_t, window_size, double>; +using kl_bsp_comm_improver_local_mem_constr = kl_improver, MemoryConstraint_t, window_size, double_cost_t>; + } // namespace osp diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp index e87d7dbb..5946c7e5 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp @@ -23,6 +23,7 @@ limitations under the License. #include "kl_improver_mt.hpp" #include "comm_cost_modules/kl_total_comm_cost.hpp" #include "comm_cost_modules/kl_hyper_total_comm_cost.hpp" +#include "comm_cost_modules/kl_bsp_comm_cost.hpp" namespace osp { @@ -32,5 +33,9 @@ using kl_total_comm_improver_mt = kl_improver_mt using kl_total_lambda_comm_improver_mt = kl_improver_mt, MemoryConstraint_t, window_size, double>; +template +using kl_bsp_comm_improver_mt = kl_improver_mt, MemoryConstraint_t, window_size, double>; + + } // namespace osp diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp index 24337f05..7f3bb29d 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp @@ -18,32 +18,32 @@ limitations under the License. 
#pragma once -#include #include "kl_active_schedule.hpp" +#include namespace osp { template struct reward_penalty_strategy { - + kl_active_schedule_t *active_schedule; cost_t max_weight; unsigned violations_threshold = 0; cost_t initial_penalty = 10.0; cost_t penalty = 0; - cost_t reward = 0; + cost_t reward = 0; - void initialize(kl_active_schedule_t & sched, const cost_t max_comm, const cost_t max_work) { + void initialize(kl_active_schedule_t &sched, const cost_t max_comm, const cost_t max_work) { max_weight = std::max(max_work, max_comm * sched.getInstance().communicationCosts()); active_schedule = &sched; - initial_penalty = std::sqrt(max_weight); + initial_penalty = static_cast(std::sqrt(max_weight)); } - + void init_reward_penalty(double multiplier = 1.0) { - multiplier = std::min(multiplier, 10.0); - penalty = initial_penalty * multiplier; - reward = max_weight * multiplier; + multiplier = std::min(multiplier, 10.0); + penalty = static_cast(initial_penalty * multiplier); + reward = static_cast(max_weight * multiplier); } }; @@ -52,23 +52,15 @@ struct set_vertex_lock_manger { std::unordered_set locked_nodes; - void initialize(size_t ) {} + void initialize(size_t) {} - void lock(VertexType node) { - locked_nodes.insert(node); - } + void lock(VertexType node) { locked_nodes.insert(node); } - void unlock(VertexType node) { - locked_nodes.erase(node); - } + void unlock(VertexType node) { locked_nodes.erase(node); } - bool is_locked(VertexType node) { - return locked_nodes.find(node) != locked_nodes.end(); - } + bool is_locked(VertexType node) { return locked_nodes.find(node) != locked_nodes.end(); } - void clear() { - locked_nodes.clear(); - } + void clear() { locked_nodes.clear(); } }; template @@ -76,25 +68,15 @@ struct vector_vertex_lock_manger { std::vector locked_nodes; - void initialize(size_t num_nodes) { - locked_nodes.resize(num_nodes); - } + void initialize(size_t num_nodes) { locked_nodes.resize(num_nodes); } - void lock(VertexType node) { - locked_nodes[node] = true; - } + void lock(VertexType node) { locked_nodes[node] = true; } - void unlock(VertexType node) { - locked_nodes[node] = false; - } + void unlock(VertexType node) { locked_nodes[node] = false; } - bool is_locked(VertexType node) { - return locked_nodes[node]; - } + bool is_locked(VertexType node) { return locked_nodes[node]; } - void clear() { - locked_nodes.assign(locked_nodes.size(), false); - } + void clear() { locked_nodes.assign(locked_nodes.size(), false); } }; template @@ -102,9 +84,9 @@ struct adaptive_affinity_table { constexpr static unsigned window_range = 2 * window_size + 1; using VertexType = vertex_idx_t; -private: + private: const kl_active_schedule_t *active_schedule; - const Graph_t * graph; + const Graph_t *graph; std::vector node_is_selected; std::vector selected_nodes_idx; @@ -115,14 +97,13 @@ struct adaptive_affinity_table { std::vector gaps; size_t last_idx; -public: - - void initialize(const kl_active_schedule_t & sche_, const std::size_t initial_table_size) { + public: + void initialize(const kl_active_schedule_t &sche_, const std::size_t initial_table_size) { active_schedule = &sche_; graph = &(sche_.getInstance().getComputationalDag()); last_idx = 0; - + node_is_selected.resize(graph->num_vertices()); selected_nodes_idx.resize(graph->num_vertices()); selected_nodes.resize(initial_table_size); @@ -136,49 +117,37 @@ struct adaptive_affinity_table { for (auto &row : table) { row.resize(window_range); } - } + } } - inline std::vector& get_selected_nodes() { - return selected_nodes; - } + 
inline std::vector &get_selected_nodes() { return selected_nodes; } - inline const std::vector& get_selected_nodes() const { - return selected_nodes; - } + inline const std::vector &get_selected_nodes() const { return selected_nodes; } - inline size_t size() const { - return last_idx - gaps.size(); - } + inline size_t size() const { return last_idx - gaps.size(); } - inline bool is_selected(VertexType node) const { - return node_is_selected[node]; - } + inline bool is_selected(VertexType node) const { return node_is_selected[node]; } - inline const std::vector & get_selected_nodes_indices() const { - return selected_nodes_idx; - } + inline const std::vector &get_selected_nodes_indices() const { return selected_nodes_idx; } - inline size_t get_selected_nodes_idx(VertexType node) const { - return selected_nodes_idx[node]; - } + inline size_t get_selected_nodes_idx(VertexType node) const { return selected_nodes_idx[node]; } - inline std::vector> & operator[](VertexType node) { + inline std::vector> &operator[](VertexType node) { assert(node_is_selected[node]); return affinity_table[selected_nodes_idx[node]]; } - inline std::vector> & at(VertexType node) { + inline std::vector> &at(VertexType node) { assert(node_is_selected[node]); return affinity_table[selected_nodes_idx[node]]; } - inline const std::vector> & at(VertexType node) const { + inline const std::vector> &at(VertexType node) const { assert(node_is_selected[node]); return affinity_table[selected_nodes_idx[node]]; } - inline std::vector> & get_affinity_table(VertexType node) { + inline std::vector> &get_affinity_table(VertexType node) { assert(node_is_selected[node]); return affinity_table[selected_nodes_idx[node]]; } @@ -193,11 +162,11 @@ struct adaptive_affinity_table { gaps.pop_back(); } else { insert_location = last_idx; - + if (insert_location >= selected_nodes.size()) { const size_t old_size = selected_nodes.size(); const size_t new_size = std::min(old_size * 2, static_cast(graph->num_vertices())); - + selected_nodes.resize(new_size); affinity_table.resize(new_size); @@ -215,7 +184,7 @@ struct adaptive_affinity_table { node_is_selected[node] = true; selected_nodes_idx[node] = insert_location; selected_nodes[insert_location] = node; - + return true; } @@ -225,13 +194,13 @@ struct adaptive_affinity_table { gaps.push_back(selected_nodes_idx[node]); } - + void reset_node_selection() { - node_is_selected.assign(node_is_selected.size(), false); + node_is_selected.assign(node_is_selected.size(), false); gaps.clear(); last_idx = 0; } - + void clear() { node_is_selected.clear(); selected_nodes_idx.clear(); @@ -242,7 +211,7 @@ struct adaptive_affinity_table { } void trim() { - while (!gaps.empty() && last_idx > 0) { + while (!gaps.empty() && last_idx > 0) { size_t last_element_idx = last_idx - 1; // The last element could be a gap itself. If so, just shrink the size. 
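The gap bookkeeping above is what trim() compacts: remove() punches holes into the dense
slot array, and trim() shrinks it again so that iterating selected_nodes stays tight. A
minimal sketch of that compaction idea, assuming `gaps` holds the indices of removed slots
and `last_idx` is one past the last occupied slot; the real trim() additionally relocates
the corresponding affinity_table row and updates selected_nodes_idx, which this sketch omits:

    // Requires <vector>, <algorithm>, <cstddef>.
    inline void trim_sketch(std::vector<std::size_t> &gaps, std::size_t &last_idx,
                            std::vector<int> &slots /* stands in for selected_nodes */) {
        std::sort(gaps.begin(), gaps.end());
        while (!gaps.empty() && last_idx > 0) {
            const std::size_t last = last_idx - 1;
            if (gaps.back() == last) {
                gaps.pop_back(); // the last slot is itself a gap: just shrink
            } else {
                slots[gaps.front()] = slots[last]; // move the last live entry into the hole
                gaps.erase(gaps.begin());
            }
            --last_idx;
        }
    }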
@@ -277,17 +246,16 @@ struct static_affinity_table { constexpr static unsigned window_range = 2 * window_size + 1; using VertexType = vertex_idx_t; -private: + private: const kl_active_schedule_t *active_schedule; - const Graph_t * graph; + const Graph_t *graph; - std::unordered_set selected_nodes; + std::unordered_set selected_nodes; std::vector>> affinity_table; -public: - - void initialize(const kl_active_schedule_t & sche_, const std::size_t ) { + public: + void initialize(const kl_active_schedule_t &sche_, const std::size_t) { active_schedule = &sche_; graph = &(sche_.getInstance().getComputationalDag()); @@ -298,50 +266,32 @@ struct static_affinity_table { for (auto &row : table) { row.resize(window_range); } - } + } } - inline std::vector get_selected_nodes() const { - return {selected_nodes.begin(), selected_nodes.end()}; - } + inline std::vector get_selected_nodes() const { return {selected_nodes.begin(), selected_nodes.end()}; } - inline size_t size() const { - return selected_nodes.size(); - } + inline size_t size() const { return selected_nodes.size(); } - inline bool is_selected(VertexType node) const { - return selected_nodes.find(node) != selected_nodes.end(); - } + inline bool is_selected(VertexType node) const { return selected_nodes.find(node) != selected_nodes.end(); } - inline std::vector> & operator[](VertexType node) { - return affinity_table[node]; - } + inline std::vector> &operator[](VertexType node) { return affinity_table[node]; } - inline std::vector> & at(VertexType node) { - return affinity_table[node]; - } + inline std::vector> &at(VertexType node) { return affinity_table[node]; } - inline const std::vector> & at(VertexType node) const { - return affinity_table[node]; - } + inline const std::vector> &at(VertexType node) const { return affinity_table[node]; } - inline std::vector> & get_affinity_table(VertexType node) { - return affinity_table[node]; - } + inline std::vector> &get_affinity_table(VertexType node) { return affinity_table[node]; } bool insert(VertexType node) { const auto pair = selected_nodes.insert(node); return pair.second; } - void remove(VertexType node) { - selected_nodes.erase(node); - } - - void reset_node_selection() { - selected_nodes.clear(); - } - + void remove(VertexType node) { selected_nodes.erase(node); } + + void reset_node_selection() { selected_nodes.clear(); } + void clear() { affinity_table.clear(); selected_nodes.clear(); @@ -356,8 +306,8 @@ struct vertex_selection_strategy { using EdgeType = edge_desc_t; const kl_active_schedule_t *active_schedule; - const Graph_t * graph; - std::mt19937 * gen; + const Graph_t *graph; + std::mt19937 *gen; std::size_t selection_threshold = 0; unsigned strategy_counter = 0; @@ -366,9 +316,10 @@ struct vertex_selection_strategy { unsigned max_work_counter = 0; - inline void initialize(const kl_active_schedule_t & sche_, std::mt19937 & gen_, const unsigned start_step, const unsigned end_step) { + inline void initialize(const kl_active_schedule_t &sche_, std::mt19937 &gen_, const unsigned start_step, + const unsigned end_step) { active_schedule = &sche_; - graph = &(sche_.getInstance().getComputationalDag()); + graph = &(sche_.getInstance().getComputationalDag()); gen = &gen_; permutation.reserve(graph->num_vertices() / active_schedule->num_steps() * (end_step - start_step)); @@ -381,7 +332,7 @@ struct vertex_selection_strategy { const unsigned num_procs = active_schedule->getInstance().numberOfProcessors(); for (unsigned step = start_step; step <= end_step; ++step) { - const auto & 
processor_vertices = active_schedule->getSetSchedule().step_processor_vertices[step]; + const auto &processor_vertices = active_schedule->getSetSchedule().step_processor_vertices[step]; for (unsigned proc = 0; proc < num_procs; ++proc) { for (const auto node : processor_vertices[proc]) { permutation.push_back(node); @@ -393,11 +344,11 @@ struct vertex_selection_strategy { std::shuffle(permutation.begin(), permutation.end(), *gen); } - - void add_neighbours_to_selection(vertex_idx_t node, container_t &nodes, const unsigned start_step, const unsigned end_step) { + void add_neighbours_to_selection(vertex_idx_t node, container_t &nodes, const unsigned start_step, + const unsigned end_step) { for (const auto parent : graph->parents(node)) { const unsigned parent_step = active_schedule->assigned_superstep(parent); - if (parent_step >= start_step && parent_step <= end_step) + if (parent_step >= start_step && parent_step <= end_step) nodes.insert(parent); } @@ -408,37 +359,38 @@ struct vertex_selection_strategy { } } - inline void select_active_nodes(container_t & node_selection, const unsigned start_step, const unsigned end_step) { + inline void select_active_nodes(container_t &node_selection, const unsigned start_step, const unsigned end_step) { if (strategy_counter < 3) { - select_nodes_permutation_threshold(selection_threshold, node_selection); + select_nodes_permutation_threshold(selection_threshold, node_selection); } else if (strategy_counter == 4) { select_nodes_max_work_proc(selection_threshold, node_selection, start_step, end_step); - } + } strategy_counter++; strategy_counter %= 5; } - void select_nodes_violations(container_t & node_selection, std::unordered_set& current_violations, const unsigned start_step, const unsigned end_step) { - for (const auto & edge : current_violations) { + void select_nodes_violations(container_t &node_selection, std::unordered_set ¤t_violations, + const unsigned start_step, const unsigned end_step) { + for (const auto &edge : current_violations) { const auto source_v = source(edge, *graph); const auto target_v = target(edge, *graph); - + const unsigned source_step = active_schedule->assigned_superstep(source_v); if (source_step >= start_step && source_step <= end_step) node_selection.insert(source_v); - + const unsigned target_step = active_schedule->assigned_superstep(target_v); if (target_step >= start_step && target_step <= end_step) node_selection.insert(target_v); } } - void select_nodes_permutation_threshold(const std::size_t & threshold, container_t & node_selection) { + void select_nodes_permutation_threshold(const std::size_t &threshold, container_t &node_selection) { const size_t bound = std::min(threshold + permutation_idx, permutation.size()); - for (std::size_t i = permutation_idx; i < bound; i++) { - node_selection.insert(permutation[i]); + for (std::size_t i = permutation_idx; i < bound; i++) { + node_selection.insert(permutation[i]); } permutation_idx = bound; @@ -448,7 +400,8 @@ struct vertex_selection_strategy { } } - void select_nodes_max_work_proc(const std::size_t & threshold, container_t & node_selection, const unsigned start_step, const unsigned end_step) { + void select_nodes_max_work_proc(const std::size_t &threshold, container_t &node_selection, + const unsigned start_step, const unsigned end_step) { while (node_selection.size() < threshold) { if (max_work_counter > end_step) { max_work_counter = start_step; // wrap around @@ -460,18 +413,17 @@ struct vertex_selection_strategy { } } - void select_nodes_max_work_proc_helper(const 
std::size_t & threshold, unsigned step, container_t & node_selection) { + void select_nodes_max_work_proc_helper(const std::size_t &threshold, unsigned step, container_t &node_selection) { const unsigned num_max_work_proc = active_schedule->work_datastructures.step_max_work_processor_count[step]; for (unsigned idx = 0; idx < num_max_work_proc; idx++) { const unsigned proc = active_schedule->work_datastructures.step_processor_work_[step][idx].proc; - const std::unordered_set> step_proc_vert = active_schedule->getSetSchedule().step_processor_vertices[step][proc]; - const size_t num_insert = std::min(threshold - node_selection.size(), step_proc_vert.size()); + const std::unordered_set> step_proc_vert = + active_schedule->getSetSchedule().step_processor_vertices[step][proc]; + const size_t num_insert = std::min(threshold - node_selection.size(), step_proc_vert.size()); auto end_it = step_proc_vert.begin(); std::advance(end_it, num_insert); - std::for_each(step_proc_vert.begin(), end_it, [&](const auto& val) { - node_selection.insert(val); - }); - } + std::for_each(step_proc_vert.begin(), end_it, [&](const auto &val) { node_selection.insert(val); }); + } } }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 74cac6c7..8a6260bd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -60,6 +60,12 @@ _add_test( kl_lambda ) _add_test( kl_util ) +_add_test( kl_bsp_cost ) + +_add_test( kl_bsp_improver_test ) + +_add_test( kl_bsp_affinity_test ) + _add_test( heaps ) _add_test( kl_mem_constr ) diff --git a/tests/kl_bsp_affinity_test.cpp b/tests/kl_bsp_affinity_test.cpp new file mode 100644 index 00000000..9d67de8e --- /dev/null +++ b/tests/kl_bsp_affinity_test.cpp @@ -0,0 +1,967 @@ + +#define BOOST_TEST_MODULE kl_bsp_affinity +#include + +#include "osp/bsp/model/BspSchedule.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp" +#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" +#include "test_graphs.hpp" + +using namespace osp; +using graph = computational_dag_edge_idx_vector_impl_def_int_t; +using kl_active_schedule_t = kl_active_schedule; + +BOOST_AUTO_TEST_CASE(simple_parent_child_test) { + using VertexType = graph::vertex_idx; + + graph dag; + const VertexType v0 = dag.add_vertex(10, 5, 2); // work=10, mem=5, comm=2 + const VertexType v1 = dag.add_vertex(8, 4, 1); // work=8, mem=4, comm=1 + dag.add_edge(v0, v1, 3); // edge weight=3 + + BspArchitecture arch; + arch.setNumberOfProcessors(2); + + BspInstance instance(dag, arch); + instance.setCommunicationCosts(10); // comm multiplier + instance.setSynchronisationCosts(5); + + BspSchedule schedule(instance); + schedule.setAssignedProcessors({0, 1}); // v0 on p0, v1 on p1 + schedule.setAssignedSupersteps({0, 1}); // v0 in step 0, v1 in step 1 + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + // Insert only v0 into gain heap to control which node moves + auto node_selection = kl.insert_gain_heap_test({0}); + + // Run one iteration - this will move v0 to its best position + auto recompute_max_gain = kl.run_inner_iteration_test(); + + // Compare costs after move + double after_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_tracked = kl.get_current_cost(); + + 
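+    // Invariant under test: the cost tracked incrementally through apply_move must
+    // match a full recomputation of the BSP cost on the post-move schedule.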
BOOST_CHECK_CLOSE(after_recomputed, after_tracked, 0.00001); +} + +/** + * Helper to validate comm datastructures by comparing with freshly computed ones + */ +template +bool validate_comm_datastructures( + const max_comm_datastructure &comm_ds_incremental, + kl_active_schedule_t &active_sched, const BspInstance &instance, const std::string &context) { + + // 1. Clone Schedule + BspSchedule current_schedule(instance); + active_sched.write_schedule(current_schedule); + + // 2. Fresh Computation + kl_active_schedule_t kl_sched_fresh; + kl_sched_fresh.initialize(current_schedule); + + max_comm_datastructure comm_ds_fresh; + comm_ds_fresh.initialize(kl_sched_fresh); + + // Compute for all steps + unsigned max_step = current_schedule.numberOfSupersteps(); + comm_ds_fresh.compute_comm_datastructures(0, max_step > 0 ? max_step - 1 : 0); + + bool all_match = true; + // std::cout << "\nValidating comm datastructures " << context << ":" << std::endl; + + // 3. Validate Comm Costs + for (unsigned step = 0; step < max_step; ++step) { + for (unsigned p = 0; p < instance.numberOfProcessors(); ++p) { + auto send_inc = comm_ds_incremental.step_proc_send(step, p); + auto send_fresh = comm_ds_fresh.step_proc_send(step, p); + auto recv_inc = comm_ds_incremental.step_proc_receive(step, p); + auto recv_fresh = comm_ds_fresh.step_proc_receive(step, p); + + if (std::abs(send_inc - send_fresh) > 1e-6 || std::abs(recv_inc - recv_fresh) > 1e-6) { + all_match = false; + std::cout << " MISMATCH at step " << step << " proc " << p << ":" << std::endl; + std::cout << " Incremental: send=" << send_inc << ", recv=" << recv_inc << std::endl; + std::cout << " Fresh: send=" << send_fresh << ", recv=" << recv_fresh << std::endl; + } + } + } + + // 4. Validate Lambda Maps + for (const auto v : instance.vertices()) { + for (unsigned p = 0; p < instance.numberOfProcessors(); ++p) { + unsigned count_inc = 0; + if (comm_ds_incremental.node_lambda_map.has_proc_entry(v, p)) { + count_inc = comm_ds_incremental.node_lambda_map.get_proc_entry(v, p); + } + + unsigned count_fresh = 0; + if (comm_ds_fresh.node_lambda_map.has_proc_entry(v, p)) { + count_fresh = comm_ds_fresh.node_lambda_map.get_proc_entry(v, p); + } + + if (count_inc != count_fresh) { + all_match = false; + std::cout << " LAMBDA MISMATCH at node " << v << " proc " << p << ":" << std::endl; + std::cout << " Incremental: " << count_inc << std::endl; + std::cout << " Fresh: " << count_fresh << std::endl; + } + } + } + + return all_match; +} + +/** + * Helper to validate affinity tables by comparing with freshly computed ones + */ +template +bool validate_affinity_tables( + kl_improver_test &kl_incremental, + const BspInstance &instance, const std::string &context) { + + // 1. Get current schedule from incremental + BspSchedule current_schedule(instance); + kl_incremental.get_active_schedule_test(current_schedule); + + // 2. 
Create fresh kl_improver and compute all affinities from scratch + kl_improver_test kl_fresh; + kl_fresh.setup_schedule(current_schedule); + + // Get selected nodes from incremental + std::vector> selected_nodes; + + const size_t active_count = kl_incremental.get_affinity_table().size(); + for (size_t i = 0; i < active_count; ++i) { + selected_nodes.push_back(kl_incremental.get_affinity_table().get_selected_nodes()[i]); + } + + + std::cout << "\n [" << context << "] Validating " << selected_nodes.size() << " selected nodes: { "; + for (const auto n : selected_nodes) { + std::cout << n << " "; + } + std::cout << "}" << std::endl; + + // Compute affinities for all selected nodes + kl_fresh.insert_gain_heap_test(selected_nodes); + + bool all_match = true; + const unsigned num_procs = instance.numberOfProcessors(); + const unsigned num_steps = kl_incremental.get_active_schedule().num_steps(); + + // 3. Compare affinity tables for each selected node + + for (const auto & node : selected_nodes) { + + const auto &affinity_inc = kl_incremental.get_affinity_table().get_affinity_table(node); + const auto &affinity_fresh = kl_fresh.get_affinity_table().get_affinity_table(node); + + unsigned node_step = kl_incremental.get_active_schedule().assigned_superstep(node); + + for (unsigned p = 0; p < num_procs; ++p) { + if (p >= affinity_inc.size() || p >= affinity_fresh.size()) + continue; + + for (unsigned idx = 0; idx < affinity_inc[p].size() && idx < affinity_fresh[p].size(); ++idx) { + int step_offset = static_cast(idx) - static_cast(window_size); + int target_step_signed = static_cast(node_step) + step_offset; + + // Skip affinities for supersteps that don't exist + if (target_step_signed < 0 || target_step_signed >= static_cast(num_steps)) { + continue; + } + + double val_inc = affinity_inc[p][idx]; + double val_fresh = affinity_fresh[p][idx]; + + if (std::abs(val_inc - val_fresh) > 1e-4) { + all_match = false; + + std::cout << " AFFINITY MISMATCH [" << context << "]: node=" << node << " to P" << p << " S" + << target_step_signed << " (offset=" << step_offset << ")" << std::endl; + std::cout << " Incremental: " << val_inc << std::endl; + std::cout << " Fresh: " << val_fresh << std::endl; + std::cout << " Difference: " << (val_inc - val_fresh) << std::endl; + } + } + } + } + + return all_match; +} + +BOOST_AUTO_TEST_CASE(test_update_datastructure_after_move) { + graph dag; + + // Create 6 vertices with specific comm weights + dag.add_vertex(1, 10, 1); // 0 + dag.add_vertex(1, 1, 1); // 1 + dag.add_vertex(1, 5, 1); // 2 + dag.add_vertex(1, 1, 1); // 3 + dag.add_vertex(1, 2, 1); // 4 + dag.add_vertex(1, 1, 1); // 5 + + // Add edges + dag.add_edge(0, 1, 1); + dag.add_edge(2, 3, 1); + dag.add_edge(4, 5, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(3); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Schedule: + // Proc 0: Node 0, 4, 5 + // Proc 1: Node 1, 2 + // Proc 2: Node 3 + schedule.setAssignedProcessors({0, 1, 1, 2, 0, 0}); + // Steps: 0, 1, 0, 1, 0, 0 + schedule.setAssignedSupersteps({0, 1, 0, 1, 0, 0}); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({0}); + kl.run_inner_iteration_test(); + + double after_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_tracked = kl.get_current_cost(); + + 
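+    // validate_comm_datastructures (defined above) rebuilds the comm datastructures
+    // from a fresh copy of the current schedule and compares the per-step
+    // send/receive volumes and the node lambda maps against the incrementally
+    // maintained ones.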
BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                         "test_update_datastructure_after_move"));
+    BOOST_CHECK_CLOSE(after_recomputed, after_tracked, 0.00001);
+}
+
+BOOST_AUTO_TEST_CASE(test_multiple_sequential_moves) {
+    graph dag;
+
+    // Create a linear chain: 0 -> 1 -> 2 -> 3
+    dag.add_vertex(1, 10, 1); // 0
+    dag.add_vertex(1, 8, 1);  // 1
+    dag.add_vertex(1, 6, 1);  // 2
+    dag.add_vertex(1, 4, 1);  // 3
+
+    dag.add_edge(0, 1, 1);
+    dag.add_edge(1, 2, 1);
+    dag.add_edge(2, 3, 1);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(4);
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    schedule.setAssignedProcessors({0, 1, 2, 3});
+    schedule.setAssignedSupersteps({0, 0, 0, 0});
+    schedule.updateNumberOfSupersteps();
+
+    using comm_cost_t = kl_bsp_comm_cost_function;
+    using kl_improver_test = kl_improver_test;
+
+    kl_improver_test kl;
+    kl.setup_schedule(schedule);
+
+    kl.insert_gain_heap_test({1});
+    kl.run_inner_iteration_test();
+
+    double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move1_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_multiple_sequential_moves_1"));
+    BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001);
+
+    kl.run_inner_iteration_test();
+
+    double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move2_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_multiple_sequential_moves_2"));
+    BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001);
+
+    kl.run_inner_iteration_test();
+
+    double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move3_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_multiple_sequential_moves_3"));
+    BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001);
+
+    // After the three moves the whole chain is colocated on one processor:
+    // no communication remains, so the cost equals the total work of 4.
+    BOOST_CHECK_CLOSE(after_move3_tracked, 4.0, 0.00001);
+}
+
+BOOST_AUTO_TEST_CASE(test_node_with_multiple_children) {
+    graph dag;
+
+    // Tree structure: Node 0 has three children (1, 2, 3)
+    dag.add_vertex(1, 1, 1); // 0
+    dag.add_vertex(1, 1, 1); // 1
+    dag.add_vertex(1, 1, 1); // 2
+    dag.add_vertex(1, 1, 1); // 3
+
+    dag.add_edge(0, 1, 1);
+    dag.add_edge(0, 2, 1);
+    dag.add_edge(0, 3, 1);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(4);
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    schedule.setAssignedProcessors({0, 1, 2, 3});
+    schedule.setAssignedSupersteps({0, 0, 0, 0});
+    schedule.updateNumberOfSupersteps();
+
+    using comm_cost_t = kl_bsp_comm_cost_function;
+    using kl_improver_test = kl_improver_test;
+
+    kl_improver_test kl;
+    kl.setup_schedule(schedule);
+
+    kl.insert_gain_heap_test({1});
+    kl.get_comm_cost_f().compute_schedule_cost();
+    kl.run_inner_iteration_test();
+
+    double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move1_tracked = kl.get_current_cost();
+
BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                         "test_node_with_multiple_children"));
+    BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001);
+
+    kl.run_inner_iteration_test();
+
+    double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move2_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_node_with_multiple_children_2"));
+    BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001);
+
+    kl.run_inner_iteration_test();
+
+    double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move3_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_node_with_multiple_children_3"));
+    BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001);
+
+    // After the three moves node 0 and all three children share one processor:
+    // send cost is 0 (every edge is local) and the cost equals the total work of 4.
+    BOOST_CHECK_CLOSE(after_move3_tracked, 4.0, 0.00001);
+}
+
+BOOST_AUTO_TEST_CASE(test_cross_step_moves) {
+    graph dag;
+
+    // 0 -> 1 -> 2
+    dag.add_vertex(1, 10, 1); // 0
+    dag.add_vertex(1, 8, 1);  // 1
+    dag.add_vertex(1, 6, 1);  // 2
+
+    dag.add_edge(0, 1, 1);
+    dag.add_edge(1, 2, 1);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(2);
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    schedule.setAssignedProcessors({0, 1, 0});
+    schedule.setAssignedSupersteps({0, 1, 2});
+    schedule.updateNumberOfSupersteps();
+
+    using comm_cost_t = kl_bsp_comm_cost_function;
+    using kl_improver_test = kl_improver_test;
+
+    kl_improver_test kl;
+    kl.setup_schedule(schedule);
+
+    kl.insert_gain_heap_test({1});
+    kl.run_inner_iteration_test();
+
+    double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test();
+    double after_move1_tracked = kl.get_current_cost();
+    BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance,
+                                             "test_cross_step_moves_1"));
+    BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001);
+}
+
+BOOST_AUTO_TEST_CASE(test_complex_scenario) {
+    std::cout << "Test case complex scenario" << std::endl;
+    graph dag;
+
+    const auto v1 = dag.add_vertex(2, 9, 2);
+    const auto v2 = dag.add_vertex(3, 8, 4);
+    const auto v3 = dag.add_vertex(4, 7, 3);
+    const auto v4 = dag.add_vertex(5, 6, 2);
+    const auto v5 = dag.add_vertex(6, 5, 6);
+    const auto v6 = dag.add_vertex(7, 4, 2);
+    dag.add_vertex(8, 3, 4);                 // v7 (index 6)
+    const auto v8 = dag.add_vertex(9, 2, 1); // v8 (index 7)
+
+    dag.add_edge(v1, v2, 2);
+    dag.add_edge(v1, v3, 2);
+    dag.add_edge(v1, v4, 2);
+    dag.add_edge(v2, v5, 12);
+    dag.add_edge(v3, v5, 6);
+    dag.add_edge(v3, v6, 7);
+    dag.add_edge(v5, v8, 9);
+    dag.add_edge(v4, v8, 9);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(2); // P0, P1
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    schedule.setAssignedProcessors({1, 1, 0, 0, 1, 0, 0, 1});
+    schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3});
+    schedule.updateNumberOfSupersteps();
+
+    using comm_cost_t = kl_bsp_comm_cost_function;
+    using kl_improver_test = kl_improver_test;
+
+    kl_improver_test kl;
+    kl.setup_schedule(schedule);
+
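+    // Seed the gain heap with v3 and v1 only (as in the earlier tests, this pins
+    // down which nodes may move first); each run_inner_iteration_test call applies
+    // the current best move and updates the affinities incrementally.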
kl.insert_gain_heap_test({v3, v1}); + kl.run_inner_iteration_test(); + + double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move1_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move1")); + BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move2_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move2")); + BOOST_CHECK(validate_affinity_tables(kl, instance, "complex_move2")); + BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move3_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move3")); + BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move4_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move4_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move4")); + BOOST_CHECK_CLOSE(after_move4_recomputed, after_move4_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move5_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move5_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move5")); + BOOST_CHECK_CLOSE(after_move5_recomputed, after_move5_tracked, 0.00001); +} + +BOOST_AUTO_TEST_CASE(test_complex_scenario_only_compute) { + graph dag; + + const auto v1 = dag.add_vertex(2, 9, 2); + const auto v2 = dag.add_vertex(3, 8, 4); + const auto v3 = dag.add_vertex(4, 7, 3); + const auto v4 = dag.add_vertex(5, 6, 2); + const auto v5 = dag.add_vertex(6, 5, 6); + const auto v6 = dag.add_vertex(7, 4, 2); + const auto v7 = dag.add_vertex(8, 3, 4); // v7 (index 6) + const auto v8 = dag.add_vertex(9, 2, 1); // v8 (index 7) + + dag.add_edge(v1, v2, 2); + dag.add_edge(v1, v3, 2); + dag.add_edge(v1, v4, 2); + dag.add_edge(v2, v5, 12); + dag.add_edge(v3, v5, 6); + dag.add_edge(v3, v6, 7); + dag.add_edge(v5, v8, 9); + dag.add_edge(v4, v8, 9); + + BspArchitecture arch; + arch.setNumberOfProcessors(2); // P0, P1 + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + schedule.setAssignedProcessors({1, 1, 0, 0, 1, 0, 0, 1}); + schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({v1}); + kl.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move1")); + BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); + + kl_improver_test kl2; + 
kl2.setup_schedule(schedule); + + kl2.insert_gain_heap_test({v2}); + kl2.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl2.get_comm_cost_f().comm_ds, kl2.get_active_schedule(), instance, + "complex_move2")); + BOOST_CHECK_CLOSE(kl2.get_comm_cost_f().compute_schedule_cost_test(), kl2.get_current_cost(), 0.00001); + + kl_improver_test kl3; + kl3.setup_schedule(schedule); + + kl3.insert_gain_heap_test({v3}); + kl3.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl3.get_comm_cost_f().comm_ds, kl3.get_active_schedule(), instance, + "complex_move3")); + BOOST_CHECK_CLOSE(kl3.get_comm_cost_f().compute_schedule_cost_test(), kl3.get_current_cost(), 0.00001); + + kl_improver_test kl4; + kl4.setup_schedule(schedule); + + kl4.insert_gain_heap_test({v4}); + kl4.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl4.get_comm_cost_f().comm_ds, kl4.get_active_schedule(), instance, + "complex_move4")); + BOOST_CHECK_CLOSE(kl4.get_comm_cost_f().compute_schedule_cost_test(), kl4.get_current_cost(), 0.00001); + + kl_improver_test kl5; + kl5.setup_schedule(schedule); + + kl5.insert_gain_heap_test({v5}); + kl5.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl5.get_comm_cost_f().comm_ds, kl5.get_active_schedule(), instance, + "complex_move5")); + BOOST_CHECK_CLOSE(kl5.get_comm_cost_f().compute_schedule_cost_test(), kl5.get_current_cost(), 0.00001); + + kl_improver_test kl6; + kl6.setup_schedule(schedule); + + kl6.insert_gain_heap_test({v6}); + kl6.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl6.get_comm_cost_f().comm_ds, kl6.get_active_schedule(), instance, + "complex_move6")); + BOOST_CHECK_CLOSE(kl6.get_comm_cost_f().compute_schedule_cost_test(), kl6.get_current_cost(), 0.00001); + + kl_improver_test kl7; + kl7.setup_schedule(schedule); + + kl7.insert_gain_heap_test({v7}); + kl7.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl7.get_comm_cost_f().comm_ds, kl7.get_active_schedule(), instance, + "complex_move7")); + BOOST_CHECK_CLOSE(kl7.get_comm_cost_f().compute_schedule_cost_test(), kl7.get_current_cost(), 0.00001); + + kl_improver_test kl8; + kl8.setup_schedule(schedule); + + kl8.insert_gain_heap_test({v8}); + kl8.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl8.get_comm_cost_f().comm_ds, kl8.get_active_schedule(), instance, + "complex_move8")); + BOOST_CHECK_CLOSE(kl8.get_comm_cost_f().compute_schedule_cost_test(), kl8.get_current_cost(), 0.00001); +} + +BOOST_AUTO_TEST_CASE(test_complex_scenario_only_compute_2) { + graph dag; + + const auto v1 = dag.add_vertex(2, 9, 2); + const auto v2 = dag.add_vertex(3, 8, 4); + const auto v3 = dag.add_vertex(4, 7, 3); + const auto v4 = dag.add_vertex(5, 6, 2); + const auto v5 = dag.add_vertex(6, 5, 6); + const auto v6 = dag.add_vertex(7, 4, 2); + const auto v7 = dag.add_vertex(8, 3, 4); // v7 (index 6) + const auto v8 = dag.add_vertex(9, 2, 1); // v8 (index 7) + + dag.add_edge(v1, v2, 2); + dag.add_edge(v1, v5, 2); + dag.add_edge(v1, v6, 2); + dag.add_edge(v1, v3, 2); + dag.add_edge(v1, v4, 2); + dag.add_edge(v2, v5, 12); + dag.add_edge(v2, v6, 2); + dag.add_edge(v2, v7, 2); + dag.add_edge(v2, v8, 2); + dag.add_edge(v3, v5, 6); + dag.add_edge(v3, v6, 7); + dag.add_edge(v3, v7, 2); + dag.add_edge(v3, v8, 2); + dag.add_edge(v5, v8, 9); + dag.add_edge(v4, v8, 9); + dag.add_edge(v5, v7, 2); + dag.add_edge(v6, v7, 2); + dag.add_edge(v7, v8, 2); + + BspArchitecture arch; + 
arch.setNumberOfProcessors(2); // P0, P1 + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + schedule.setAssignedProcessors({1, 1, 0, 0, 1, 0, 0, 1}); + schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({v1}); + kl.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "complex_move1")); + BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); + + kl_improver_test kl2; + kl2.setup_schedule(schedule); + + kl2.insert_gain_heap_test({v2}); + kl2.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl2.get_comm_cost_f().comm_ds, kl2.get_active_schedule(), instance, + "complex_move2")); + BOOST_CHECK_CLOSE(kl2.get_comm_cost_f().compute_schedule_cost_test(), kl2.get_current_cost(), 0.00001); + + kl_improver_test kl3; + kl3.setup_schedule(schedule); + + kl3.insert_gain_heap_test({v3}); + kl3.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl3.get_comm_cost_f().comm_ds, kl3.get_active_schedule(), instance, + "complex_move3")); + BOOST_CHECK_CLOSE(kl3.get_comm_cost_f().compute_schedule_cost_test(), kl3.get_current_cost(), 0.00001); + + kl_improver_test kl4; + kl4.setup_schedule(schedule); + + kl4.insert_gain_heap_test({v4}); + kl4.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl4.get_comm_cost_f().comm_ds, kl4.get_active_schedule(), instance, + "complex_move4")); + BOOST_CHECK_CLOSE(kl4.get_comm_cost_f().compute_schedule_cost_test(), kl4.get_current_cost(), 0.00001); + + kl_improver_test kl5; + kl5.setup_schedule(schedule); + + kl5.insert_gain_heap_test({v5}); + kl5.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl5.get_comm_cost_f().comm_ds, kl5.get_active_schedule(), instance, + "complex_move5")); + BOOST_CHECK_CLOSE(kl5.get_comm_cost_f().compute_schedule_cost_test(), kl5.get_current_cost(), 0.00001); + + kl_improver_test kl6; + kl6.setup_schedule(schedule); + + kl6.insert_gain_heap_test({v6}); + kl6.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl6.get_comm_cost_f().comm_ds, kl6.get_active_schedule(), instance, + "complex_move6")); + BOOST_CHECK_CLOSE(kl6.get_comm_cost_f().compute_schedule_cost_test(), kl6.get_current_cost(), 0.00001); + + kl_improver_test kl7; + kl7.setup_schedule(schedule); + + kl7.insert_gain_heap_test({v7}); + kl7.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl7.get_comm_cost_f().comm_ds, kl7.get_active_schedule(), instance, + "complex_move7")); + BOOST_CHECK_CLOSE(kl7.get_comm_cost_f().compute_schedule_cost_test(), kl7.get_current_cost(), 0.00001); + + kl_improver_test kl8; + kl8.setup_schedule(schedule); + + kl8.insert_gain_heap_test({v8}); + kl8.run_inner_iteration_test(); + + BOOST_CHECK(validate_comm_datastructures(kl8.get_comm_cost_f().comm_ds, kl8.get_active_schedule(), instance, + "complex_move8")); + BOOST_CHECK_CLOSE(kl8.get_comm_cost_f().compute_schedule_cost_test(), kl8.get_current_cost(), 0.00001); +} + +BOOST_AUTO_TEST_CASE(test_grid_graph_complex_moves) { + // Construct 5x5 Grid Graph (25 nodes, indices 0-24) + graph dag = osp::construct_grid_dag(5, 5); + + 
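+    // Assumption: construct_grid_dag(5, 5) from test_graphs.hpp numbers nodes in
+    // row-major order with edges to the right and lower neighbours, so the
+    // row-based processor assignment below makes only the edges between row
+    // blocks cross processors (plus those touched by the node-7 override).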
BspArchitecture arch; + arch.setNumberOfProcessors(4); // P0..P3 + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Assign Processors and Supersteps + std::vector procs(25); + std::vector steps(25); + + for (unsigned r = 0; r < 5; ++r) { + for (unsigned c = 0; c < 5; ++c) { + unsigned idx = r * 5 + c; + if (r < 2) { + procs[idx] = 0; + steps[idx] = (c < 3) ? 0 : 1; + } else if (r < 4) { + procs[idx] = 1; + steps[idx] = (c < 3) ? 2 : 3; + } else { + procs[idx] = 2; + steps[idx] = (c < 3) ? 4 : 5; + } + } + } + + // Override: Node 7 (1,2) to P3, S1. + procs[7] = 3; + steps[7] = 1; + + schedule.setAssignedProcessors(procs); + schedule.setAssignedSupersteps(steps); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({12, 8, 7}); + kl.run_inner_iteration_test(); + + double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move1_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "grid_move1")); + BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move2_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "grid_move2")); + BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move3_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "grid_move3")); + BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move4_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move4_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "grid_move4")); + BOOST_CHECK_CLOSE(after_move4_recomputed, after_move4_tracked, 0.00001); +} + +BOOST_AUTO_TEST_CASE(test_butterfly_graph_moves) { + // Stages=2 -> 3 levels of 4 nodes each = 12 nodes. + // Level 0: 0-3. Level 1: 4-7. Level 2: 8-11. 
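+    // Assumption (for reading the expected comm pattern): construct_butterfly_dag(2)
+    // from test_graphs.hpp builds the standard FFT butterfly, where each node at
+    // level l feeds two nodes at level l + 1; with the level-based assignment
+    // below, every edge therefore crosses between P0 and P1.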
+ graph dag = osp::construct_butterfly_dag(2); + + BspArchitecture arch; + arch.setNumberOfProcessors(2); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Assign: + // Level 0: P0, Step 0 + // Level 1: P1, Step 1 + // Level 2: P0, Step 2 + std::vector procs(12); + std::vector steps(12); + for (unsigned i = 0; i < 12; ++i) { + if (i < 4) { + procs[i] = 0; + steps[i] = 0; + } else if (i < 8) { + procs[i] = 1; + steps[i] = 1; + } else { + procs[i] = 0; + steps[i] = 2; + } + } + + schedule.setAssignedProcessors(procs); + schedule.setAssignedSupersteps(steps); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({4, 6, 0}); + kl.run_inner_iteration_test(); + + double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move1_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "butterfly_move1")); + BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move2_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "butterfly_move2")); + BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move3_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "butterfly_move3")); + BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move4_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move4_tracked = kl.get_current_cost(); + BOOST_CHECK(validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, + "butterfly_move4")); + BOOST_CHECK_CLOSE(after_move4_recomputed, after_move4_tracked, 0.00001); +} + +BOOST_AUTO_TEST_CASE(test_ladder_graph_moves) { + // Ladder with 5 rungs -> 6 pairs of nodes = 12 nodes. + // Pairs: (0,1), (2,3), ... (10,11). + graph dag = osp::construct_ladder_dag(5); + + BspArchitecture arch; + arch.setNumberOfProcessors(2); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Assign: + // Even nodes (Left rail): P0 + // Odd nodes (Right rail): P1 + // Steps: Pair i at Step i. 
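+    // Assumption: construct_ladder_dag(5) from test_graphs.hpp links consecutive
+    // pairs along the two rails and adds rung edges between them; with the rails
+    // split across P0/P1 below, the rung edges are exactly the cross-processor edges.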
+ std::vector procs(12); + std::vector steps(12); + for (unsigned i = 0; i < 6; ++i) { + procs[2 * i] = 0; + steps[2 * i] = i; + procs[2 * i + 1] = 1; + steps[2 * i + 1] = i; + } + + schedule.setAssignedProcessors(procs); + schedule.setAssignedSupersteps(steps); + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + kl.setup_schedule(schedule); + + kl.insert_gain_heap_test({1, 3, 0, 2}); + kl.run_inner_iteration_test(); + + double after_move1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move1_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "ladder_move1")); + BOOST_CHECK_CLOSE(after_move1_recomputed, after_move1_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move2_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "ladder_move2")); + BOOST_CHECK_CLOSE(after_move2_recomputed, after_move2_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move3_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "ladder_move3")); + BOOST_CHECK_CLOSE(after_move3_recomputed, after_move3_tracked, 0.00001); + + kl.run_inner_iteration_test(); + + double after_move4_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double after_move4_tracked = kl.get_current_cost(); + BOOST_CHECK( + validate_comm_datastructures(kl.get_comm_cost_f().comm_ds, kl.get_active_schedule(), instance, "ladder_move4")); + BOOST_CHECK_CLOSE(after_move4_recomputed, after_move4_tracked, 0.00001); +} \ No newline at end of file diff --git a/tests/kl_bsp_cost.cpp b/tests/kl_bsp_cost.cpp new file mode 100644 index 00000000..36e999ff --- /dev/null +++ b/tests/kl_bsp_cost.cpp @@ -0,0 +1,1086 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner +*/ + +#define BOOST_TEST_MODULE kl_bsp_cost +#include + +#include "test_graphs.hpp" +#include "osp/bsp/model/BspSchedule.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_active_schedule.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp" +#include "osp/concepts/graph_traits.hpp" +#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" + +using namespace osp; +using graph = computational_dag_edge_idx_vector_impl_def_int_t; +using kl_active_schedule_t = kl_active_schedule; + +BOOST_AUTO_TEST_CASE(test_arrange_superstep_comm_data) { + graph dag; + + dag.add_vertex(1, 1, 1); + dag.add_vertex(1, 1, 1); + dag.add_vertex(1, 1, 1); + dag.add_vertex(1, 1, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(4); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Initialize schedule with 1 step + schedule.setAssignedProcessors({0, 1, 2, 3}); + schedule.setAssignedSupersteps({0, 0, 0, 0}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + + unsigned step = 0; + + // Case 1: Unique Max + comm_ds.step_proc_send(step, 0) = 10; + comm_ds.step_proc_send(step, 1) = 5; + comm_ds.step_proc_send(step, 2) = 2; + comm_ds.step_proc_send(step, 3) = 1; + + comm_ds.step_proc_receive(step, 0) = 8; + comm_ds.step_proc_receive(step, 1) = 8; + comm_ds.step_proc_receive(step, 2) = 2; + comm_ds.step_proc_receive(step, 3) = 1; + + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 1); // Only proc 0 has 10 + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 8); // Next highest is 8 (from recv) + + // Case 2: Shared Max + comm_ds.reset_superstep(step); + comm_ds.step_proc_send(step, 0) = 10; // Need to re-set this as reset clears it + comm_ds.step_proc_send(step, 1) = 10; + comm_ds.step_proc_send(step, 2) = 2; + comm_ds.step_proc_send(step, 3) = 1; + + comm_ds.step_proc_receive(step, 0) = 5; + comm_ds.step_proc_receive(step, 1) = 5; + comm_ds.step_proc_receive(step, 2) = 2; + comm_ds.step_proc_receive(step, 3) = 1; + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 2); // Proc 0 and 1 + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 5); // Next highest is 5 (from recv) + + // Case 3: Max in Recv + comm_ds.reset_superstep(step); + + comm_ds.step_proc_send(step, 0) = 5; + comm_ds.step_proc_send(step, 1) = 5; + comm_ds.step_proc_send(step, 2) = 2; + comm_ds.step_proc_send(step, 3) = 1; + + comm_ds.step_proc_receive(step, 0) = 12; + comm_ds.step_proc_receive(step, 1) = 8; + comm_ds.step_proc_receive(step, 2) = 2; + comm_ds.step_proc_receive(step, 3) = 1; + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 12); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 1); + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 8); + + // Case 4: All same + comm_ds.reset_superstep(step); + // Send: 10, 10, 10, 10 + // Recv: 10, 10, 10, 10 + for (unsigned i = 0; i < 4; 
++i) { + comm_ds.step_proc_send(step, i) = 10; + comm_ds.step_proc_receive(step, i) = 10; + } + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 8); // 4 sends + 4 recvs + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 0); // If all removed, 0. + + // Case 5: Max removed, second max is from same type (Send) + comm_ds.reset_superstep(step); + comm_ds.step_proc_send(step, 0) = 10; + comm_ds.step_proc_send(step, 1) = 8; + comm_ds.step_proc_send(step, 2) = 2; + comm_ds.step_proc_send(step, 3) = 1; + + for (unsigned i = 0; i < 4; ++i) + comm_ds.step_proc_receive(step, i) = 5; + + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 1); + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 8); + + // Case 6: Max removed, second max is from other type (Recv) + comm_ds.reset_superstep(step); + + comm_ds.step_proc_send(step, 0) = 10; + comm_ds.step_proc_send(step, 1) = 4; + comm_ds.step_proc_send(step, 2) = 2; + comm_ds.step_proc_send(step, 3) = 1; + + comm_ds.step_proc_receive(step, 0) = 8; + comm_ds.step_proc_receive(step, 1) = 5; + comm_ds.step_proc_receive(step, 2) = 2; + comm_ds.step_proc_receive(step, 3) = 1; + + comm_ds.arrange_superstep_comm_data(step); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 1); + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 8); +} + +BOOST_AUTO_TEST_CASE(test_compute_comm_datastructures) { + graph dag; + + // Create 6 vertices with specific comm weights + // Node 0: weight 10 (sends to 1) + dag.add_vertex(1, 10, 1); + // Node 1: weight 1 + dag.add_vertex(1, 1, 1); + // Node 2: weight 5 (sends to 3) + dag.add_vertex(1, 5, 1); + // Node 3: weight 1 + dag.add_vertex(1, 1, 1); + // Node 4: weight 2 (local to 5) + dag.add_vertex(1, 2, 1); + // Node 5: weight 1 + dag.add_vertex(1, 1, 1); + + // Add edges + // 0 -> 1 + dag.add_edge(0, 1, 1); // Edge weight ignored by max_comm_datastructure + // 2 -> 3 + dag.add_edge(2, 3, 1); + // 4 -> 5 + dag.add_edge(4, 5, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(3); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Schedule: + // Proc 0: Node 0, 4, 5 + // Proc 1: Node 1, 2 + // Proc 2: Node 3 + schedule.setAssignedProcessors({0, 1, 1, 2, 0, 0}); + schedule.setAssignedSupersteps({0, 1, 0, 1, 0, 0}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + + // Compute for steps 0 and 1 + comm_ds.compute_comm_datastructures(0, 1); + + unsigned step = 0; + + // Expected Step 0: + // Proc 0 sends: 10 (Node 0 -> Node 1 on Proc 1) + // Proc 1 receives: 10 (from Proc 0) + // Proc 1 sends: 5 (Node 2 -> Node 3 on Proc 2) + // Proc 2 receives: 5 (from Proc 1) + // Proc 2 sends: 0 + // Proc 0 receives: 0 + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 1), 5); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 2), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 2), 5); + + // Max Comm Calculation Step 0 + // Send Max: 10 (P0) + // Recv Max: 10 (P1) 
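+    // (The per-step maximum ranges over all per-processor send AND receive totals,
+    //  and the count tallies send and receive entries separately, so one processor
+    //  can contribute twice -- see Case 4 above, where 4 sends + 4 recvs give count 8.)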
+ // Global Max: 10 + // Count: 2 (P0 send, P1 recv) + // Second Max: 5 (P1 send, P2 recv) + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 2); + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 5); + + // Verify Step 1 (Should be empty as Nodes 1 and 3 are leaves) + step = 1; + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 2), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 2), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 0); +} + +/** + * Helper to validate comm datastructures by comparing with freshly computed ones + */ +template +bool validate_comm_datastructures( + const max_comm_datastructure &comm_ds_incremental, + kl_active_schedule_t &active_sched, const BspInstance &instance, const std::string &context) { + + // 1. Clone Schedule + BspSchedule current_schedule(instance); + active_sched.write_schedule(current_schedule); + + // 2. Fresh Computation + kl_active_schedule_t kl_sched_fresh; + kl_sched_fresh.initialize(current_schedule); + + max_comm_datastructure comm_ds_fresh; + comm_ds_fresh.initialize(kl_sched_fresh); + + // Compute for all steps + unsigned max_step = current_schedule.numberOfSupersteps(); + comm_ds_fresh.compute_comm_datastructures(0, max_step > 0 ? max_step - 1 : 0); + + bool all_match = true; + // std::cout << "\nValidating comm datastructures " << context << ":" << std::endl; + + // 3. Validate Comm Costs + for (unsigned step = 0; step < max_step; ++step) { + for (unsigned p = 0; p < instance.numberOfProcessors(); ++p) { + auto send_inc = comm_ds_incremental.step_proc_send(step, p); + auto send_fresh = comm_ds_fresh.step_proc_send(step, p); + auto recv_inc = comm_ds_incremental.step_proc_receive(step, p); + auto recv_fresh = comm_ds_fresh.step_proc_receive(step, p); + + if (std::abs(send_inc - send_fresh) > 1e-6 || std::abs(recv_inc - recv_fresh) > 1e-6) { + all_match = false; + std::cout << " MISMATCH at step " << step << " proc " << p << ":" << std::endl; + std::cout << " Incremental: send=" << send_inc << ", recv=" << recv_inc << std::endl; + std::cout << " Fresh: send=" << send_fresh << ", recv=" << recv_fresh << std::endl; + } + } + } + + // 4. 
Validate Lambda Maps + for (const auto v : instance.vertices()) { + for (unsigned p = 0; p < instance.numberOfProcessors(); ++p) { + unsigned count_inc = 0; + if (comm_ds_incremental.node_lambda_map.has_proc_entry(v, p)) { + count_inc = comm_ds_incremental.node_lambda_map.get_proc_entry(v, p); + } + + unsigned count_fresh = 0; + if (comm_ds_fresh.node_lambda_map.has_proc_entry(v, p)) { + count_fresh = comm_ds_fresh.node_lambda_map.get_proc_entry(v, p); + } + + if (count_inc != count_fresh) { + all_match = false; + std::cout << " LAMBDA MISMATCH at node " << v << " proc " << p << ":" << std::endl; + std::cout << " Incremental: " << count_inc << std::endl; + std::cout << " Fresh: " << count_fresh << std::endl; + } + } + } + + return all_match; +} + +BOOST_AUTO_TEST_CASE(test_update_datastructure_after_move) { + graph dag; + + // Create 6 vertices with specific comm weights + dag.add_vertex(1, 10, 1); // 0 + dag.add_vertex(1, 1, 1); // 1 + dag.add_vertex(1, 5, 1); // 2 + dag.add_vertex(1, 1, 1); // 3 + dag.add_vertex(1, 2, 1); // 4 + dag.add_vertex(1, 1, 1); // 5 + + // Add edges + dag.add_edge(0, 1, 1); + dag.add_edge(2, 3, 1); + dag.add_edge(4, 5, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(3); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Schedule: + // Proc 0: Node 0, 4, 5 + // Proc 1: Node 1, 2 + // Proc 2: Node 3 + schedule.setAssignedProcessors({0, 1, 1, 2, 0, 0}); + // Steps: 0, 1, 0, 1, 0, 0 + schedule.setAssignedSupersteps({0, 1, 0, 1, 0, 0}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 1); + + // Move Node 0 from Proc 0 (Step 0) to Proc 2 (Step 0) + // kl_move_struct(node, gain, from_proc, from_step, to_proc, to_step) + using kl_move = kl_move_struct; + kl_move move(0, 0.0, 0, 0, 2, 0); + + // Apply the move to the schedule first + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + kl_sched.apply_move(move, active_schedule_data); + + // Then update the communication datastructures + comm_ds.update_datastructure_after_move(move, 0, 1); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_update_datastructure_after_move")); + + unsigned step = 0; + + // Expected Changes: + // Node 0 (was P0 -> P1) is now (P2 -> P1). 
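+    // (Node 0's only child, node 1, stays on P1 at step 1, so the edge remains off-processor.)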
+ // P0 Send: 10 -> 0 + // P2 Send: 0 -> 10 + // P1 Recv: 10 -> 10 (Source changed, but destination same) + + // Others unchanged: + // P1 Send: 5 + // P2 Recv: 5 + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 1), 5); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(step, 2), 10); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(step, 2), 5); + + // Max Comm: + // Send Max: 10 (P2) + // Recv Max: 10 (P1) + // Global Max: 10 + // Count: 2 (P2 send, P1 recv) + // Second Max: 5 (P1 send, P2 recv) + + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(step), 10); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm_count(step), 2); + BOOST_CHECK_EQUAL(comm_ds.step_second_max_comm(step), 5); +} + +BOOST_AUTO_TEST_CASE(test_multiple_sequential_moves) { + graph dag; + + // Create a linear chain: 0 -> 1 -> 2 -> 3 + dag.add_vertex(1, 10, 1); // 0 + dag.add_vertex(1, 8, 1); // 1 + dag.add_vertex(1, 6, 1); // 2 + dag.add_vertex(1, 4, 1); // 3 + + dag.add_edge(0, 1, 1); + dag.add_edge(1, 2, 1); + dag.add_edge(2, 3, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(4); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Initial: All at step 0, on different processors + // 0@P0, 1@P1, 2@P2, 3@P3 + schedule.setAssignedProcessors({0, 1, 2, 3}); + schedule.setAssignedSupersteps({0, 0, 0, 0}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 0); + + // Initial state: + // P0 sends to P1 (10), P1 sends to P2 (8), P2 sends to P3 (6) + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 8); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 2), 6); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 3), 0); + + using kl_move = kl_move_struct; + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + + // Move 1: Move node 1 from P1 to P0 (make 0->1 local) + kl_move move1(1, 0.0, 1, 0, 0, 0); + kl_sched.apply_move(move1, active_schedule_data); + comm_ds.update_datastructure_after_move(move1, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_multiple_sequential_moves_1")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 8); // Node 1 sends + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0); // Node was moved away + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 0); // No receives at P0 + + // Move 2: Move node 2 from P2 to P0 (chain more local) + kl_move move2(2, 0.0, 2, 0, 0, 0); + kl_sched.apply_move(move2, active_schedule_data); + comm_ds.update_datastructure_after_move(move2, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_multiple_sequential_moves_2")); + + // After move2: Nodes 0,1,2 all at P0, only 3 at P3 + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 6); // Only node 2 sends off-proc + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 2), 0); // Node moved away + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 3), 6); // P3 receives from node 2 + + // Move 3: Move node 3 to P0 (everything local) + kl_move move3(3, 0.0, 3, 0, 0, 0); + kl_sched.apply_move(move3, active_schedule_data); + 
comm_ds.update_datastructure_after_move(move3, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_multiple_sequential_moves_3")); + + // After move3: All nodes at P0, all communication is local + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); // All local + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(0), 0); // No communication cost +} + +BOOST_AUTO_TEST_CASE(test_node_with_multiple_children) { + graph dag; + + // Tree structure: Node 0 has three children (1, 2, 3) + dag.add_vertex(1, 10, 1); // 0 + dag.add_vertex(1, 1, 1); // 1 + dag.add_vertex(1, 1, 1); // 2 + dag.add_vertex(1, 1, 1); // 3 + + dag.add_edge(0, 1, 1); + dag.add_edge(0, 2, 1); + dag.add_edge(0, 3, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(4); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + schedule.setAssignedProcessors({0, 1, 2, 3}); + schedule.setAssignedSupersteps({0, 0, 0, 0}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 0); + + // Initial: Node 0 has 3 children on P1, P2, P3 (3 unique off-proc) + // Send cost = 10 * 3 = 30 + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 30); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 2), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 3), 10); + + using kl_move = kl_move_struct; + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + + // Move child 1 to P0 (same as parent) + kl_move move1(1, 0.0, 1, 0, 0, 0); + kl_sched.apply_move(move1, active_schedule_data); + comm_ds.update_datastructure_after_move(move1, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_node_with_multiple_children")); + + // After: Node 0 has 1 local child, 2 off-proc (P2, P3) + // Send cost = 10 * 2 = 20 + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 20); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 0); // No longer receives + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 2), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 3), 10); + + kl_move move2(2, 0.0, 2, 0, 0, 0); + kl_sched.apply_move(move2, active_schedule_data); + comm_ds.update_datastructure_after_move(move2, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_node_with_multiple_children_2")); + + // After: Node 0 has 2 local children, 1 off-proc (P3) + // Send cost = 10 * 1 = 10 + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 2), 0); // No longer receives + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 3), 10); + + // Move child 3 to P0 (all local) + kl_move move3(3, 0.0, 3, 0, 0, 0); + kl_sched.apply_move(move3, active_schedule_data); + comm_ds.update_datastructure_after_move(move3, 0, 0); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "test_node_with_multiple_children_3")); + + // After: Node 0 has 3 local children + // Send cost = 10 * 0 = 0 (all local) + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 3), 0); // No longer receives +} + +BOOST_AUTO_TEST_CASE(test_cross_step_moves) { + graph dag; + + // 0 -> 1 -> 2 + dag.add_vertex(1, 10, 1); // 0 + dag.add_vertex(1, 8, 1); // 1 + dag.add_vertex(1, 6, 
1); // 2 + + dag.add_edge(0, 1, 1); + dag.add_edge(1, 2, 1); + + BspArchitecture arch; + arch.setNumberOfProcessors(2); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + schedule.setAssignedProcessors({0, 1, 0}); + schedule.setAssignedSupersteps({0, 1, 2}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 2); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 8); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 0), 8); + + using kl_move = kl_move_struct; + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + + // Move node 1 from (P1, step1) to (P0, step1) + // This makes 0->1 edge stay cross-step but changes processor + kl_move move1(1, 0.0, 1, 1, 0, 1); + kl_sched.apply_move(move1, active_schedule_data); + comm_ds.update_datastructure_after_move(move1, 0, 2); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); // Local (same processor) + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 0); // No receive needed + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0); // Local (same processor) + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 0); // Node moved away + + kl_move move2(1, 0.0, 0, 1, 0, 0); + kl_sched.apply_move(move2, active_schedule_data); + comm_ds.update_datastructure_after_move(move2, 0, 2); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); // All local at P0 +} + +BOOST_AUTO_TEST_CASE(test_complex_scenario_user_provided) { + graph dag; + + // Vertices from user request + // v1(0): w=2, c=9, m=2 + const auto v1 = dag.add_vertex(2, 9, 2); + const auto v2 = dag.add_vertex(3, 8, 4); + const auto v3 = dag.add_vertex(4, 7, 3); + const auto v4 = dag.add_vertex(5, 6, 2); + const auto v5 = dag.add_vertex(6, 5, 6); + const auto v6 = dag.add_vertex(7, 4, 2); + dag.add_vertex(8, 3, 4); // v7 (index 6) + const auto v8 = dag.add_vertex(9, 2, 1); // v8 (index 7) + + // Edges + dag.add_edge(v1, v2, 2); + dag.add_edge(v1, v3, 2); + dag.add_edge(v1, v4, 2); + dag.add_edge(v2, v5, 12); + dag.add_edge(v3, v5, 6); + dag.add_edge(v3, v6, 7); + dag.add_edge(v5, v8, 9); + dag.add_edge(v4, v8, 9); + + BspArchitecture arch; + arch.setNumberOfProcessors(2); // P0, P1 + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(1); + + BspInstance instance(dag, arch); + BspSchedule schedule(instance); + + // Schedule: {1, 1, 0, 0, 1, 0, 0, 1} + // v1@P1, v2@P1, v3@P0, v4@P0, v5@P1, v6@P0, v7@P0, v8@P1 + schedule.setAssignedProcessors({1, 1, 0, 0, 1, 0, 0, 1}); + + // Supersteps: {0, 0, 1, 1, 2, 2, 3, 3} + schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); + schedule.updateNumberOfSupersteps(); + + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 3); + + // === Initial State Verification === + // ... (Same as before) ... 
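+    // Derivation of the expected values below (the comm_ds uses vertex comm weights):
+    // Step 0: v1@P1 sends c(v1)=9 to v3/v4 on P0 (v2 is local), so P1 sends 9 and P0 receives 9.
+    // Step 1: v3@P0 sends c(v3)=7 to v5@P1 (v6 is local) and v4@P0 sends c(v4)=6 to v8@P1,
+    //         so P0 sends 7 + 6 = 13 and P1 receives 13.
+    // Step 2: v5@P1 -> v8@P1 is local, and v6/v7 have no outgoing edges, so no communication.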
+ BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 9); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 9); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(0), 9); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 13); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 1), 13); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(1), 13); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(2), 0); + + using kl_move = kl_move_struct; + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + + // === Move 1: Move v3 from P0 to P1 (at Step 1) === + kl_move move1(v3, 0.0, 0, 1, 1, 1); + kl_sched.apply_move(move1, active_schedule_data); + comm_ds.update_datastructure_after_move(move1, 0, 3); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "complex_move1")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 9); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 6); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 7); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(1), 7); + + // === Move 2: Move v4 from P0 to P1 (at Step 1) === + kl_move move2(v4, 0.0, 0, 1, 1, 1); + kl_sched.apply_move(move2, active_schedule_data); + comm_ds.update_datastructure_after_move(move2, 0, 3); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "complex_move2")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 7); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(1), 7); + + // === Move 3: Move v5 from P1 to P0 (at Step 2) === + kl_move move3(v5, 0.0, 1, 2, 0, 2); + kl_sched.apply_move(move3, active_schedule_data); + comm_ds.update_datastructure_after_move(move3, 0, 3); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "complex_move3")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 8); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(0), 8); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 7); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 0), 5); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(2), 5); + + // === Move 4: Move v6 from P0 to P1 (at Step 2) === + // v6 is child of v3 (P1, S1). + // Before: v3(P1) -> v6(P0). Off-proc. + // After: v3(P1) -> v6(P1). Local. + // v3 also sends to v5(P0). + // So v3 targets: {P0}. Count = 1. + // Send Cost v3 = 7. Unchanged. + kl_move move4(v6, 0.0, 0, 2, 1, 2); + kl_sched.apply_move(move4, active_schedule_data); + comm_ds.update_datastructure_after_move(move4, 0, 3); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "complex_move4")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 7); + + // === Move 5: Move v5 from P0 to P1 (at Step 2) === + // v5 moves back to P1. + // v3(P1) -> v5(P1), v6(P1). All local. + // Send Cost v3 = 0. + kl_move move5(v5, 0.0, 0, 2, 1, 2); + kl_sched.apply_move(move5, active_schedule_data); + comm_ds.update_datastructure_after_move(move5, 0, 3); + BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "complex_move5")); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_max_comm(1), 0); +} + +/** + * Test: Grid Graph Complex Moves + * Uses a 5x5 Grid Graph (25 nodes) with 6 Supersteps and 4 Processors. + * Performs various moves to verify incremental updates in a dense graph. 
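+ * Layout: rows 0-1 on P0, rows 2-3 on P1, row 4 on P2; within each band, columns 0-2
+ * take the earlier superstep and columns 3-4 the later; node 7 (row 1, col 2) is
+ * overridden to P3 at step 1.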
+ */
+BOOST_AUTO_TEST_CASE(test_grid_graph_complex_moves) {
+    // Construct 5x5 Grid Graph (25 nodes, indices 0-24)
+    graph dag = osp::construct_grid_dag(5, 5);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(4); // P0..P3
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    // Assign Processors and Supersteps
+    std::vector<unsigned> procs(25);
+    std::vector<unsigned> steps(25);
+
+    for (unsigned r = 0; r < 5; ++r) {
+        for (unsigned c = 0; c < 5; ++c) {
+            unsigned idx = r * 5 + c;
+            if (r < 2) {
+                procs[idx] = 0;
+                steps[idx] = (c < 3) ? 0 : 1;
+            } else if (r < 4) {
+                procs[idx] = 1;
+                steps[idx] = (c < 3) ? 2 : 3;
+            } else {
+                procs[idx] = 2;
+                steps[idx] = (c < 3) ? 4 : 5;
+            }
+        }
+    }
+
+    // Override: Node 7 (1,2) to P3, S1.
+    procs[7] = 3;
+    steps[7] = 1;
+
+    schedule.setAssignedProcessors(procs);
+    schedule.setAssignedSupersteps(steps);
+    schedule.updateNumberOfSupersteps();
+
+    kl_active_schedule_t kl_sched;
+    kl_sched.initialize(schedule);
+
+    max_comm_datastructure comm_ds;
+    comm_ds.initialize(kl_sched);
+    comm_ds.compute_comm_datastructures(0, 5);
+
+    // Initial check: node 7 (P3, S1) sends to node 8 (P0, S1) and node 12 (P1, S2),
+    // i.e. two distinct foreign processors, so with unit comm weights P3 sends 2.
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 3), 2);
+
+    using kl_move = kl_move_struct;
+    thread_local_active_schedule_data active_schedule_data;
+    active_schedule_data.initialize_cost(0.0);
+
+    // === Move 1: Node 12 (P1->P0) ===
+    // Node 7's foreign targets collapse to {P0}, so P3's send drops to 1.
+    kl_move move1(12, 0.0, 1, 2, 0, 2);
+    kl_sched.apply_move(move1, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move1, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "grid_move1"));
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 3), 1);
+
+    // === Move 2: Node 8 (P0->P3) ===
+    // Node 8 joins node 7 on P3 and now sends off-proc itself (to P0 and P1),
+    // so P3's send at step 1 becomes 1 (node 7) + 2 (node 8) = 3.
+    kl_move move2(8, 0.0, 0, 1, 3, 1);
+    kl_sched.apply_move(move2, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move2, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "grid_move2"));
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 3), 3);
+
+    // === Move 3: Node 12 (P0->P3) ===
+    // Both of node 7's children are now local to P3; node 8 still targets P0 and P1.
+    kl_move move3(12, 0.0, 0, 2, 3, 2);
+    kl_sched.apply_move(move3, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move3, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "grid_move3"));
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 3), 2);
+
+    // === Move 4: Node 7 (P3->P0) ===
+    kl_move move4(7, 0.0, 3, 1, 0, 1);
+    kl_sched.apply_move(move4, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move4, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "grid_move4"));
+
+    // Check P0 send contribution from Node 7.
+    // With unit comm weights, node 7 (now on P0, step 1) contributes at least 1,
+    // so P0's send volume at step 1 must be >= 1.
+    BOOST_CHECK_GE(comm_ds.step_proc_send(1, 0), 1);
+}
+
+/**
+ * Test: Butterfly Graph Moves
+ * Uses a Butterfly Graph (FFT pattern) to test structured communication patterns.
+ * Stages = 2 (12 nodes). 3 Supersteps. 2 Processors.
+ */
+BOOST_AUTO_TEST_CASE(test_butterfly_graph_moves) {
+    // Stages=2 -> 3 levels of 4 nodes each = 12 nodes.
+    // Level 0: 0-3. Level 1: 4-7. Level 2: 8-11.
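+    // Each level-1 node has two parents in level 0 (e.g. node 4 is fed by nodes 0
+    // and 2), so a single move can change the send sets of several producers at once.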
+    graph dag = osp::construct_butterfly_dag(2);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(2);
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    // Assign:
+    // Level 0: P0, Step 0
+    // Level 1: P1, Step 1
+    // Level 2: P0, Step 2
+    std::vector<unsigned> procs(12);
+    std::vector<unsigned> steps(12);
+    for (unsigned i = 0; i < 12; ++i) {
+        if (i < 4) {
+            procs[i] = 0;
+            steps[i] = 0;
+        } else if (i < 8) {
+            procs[i] = 1;
+            steps[i] = 1;
+        } else {
+            procs[i] = 0;
+            steps[i] = 2;
+        }
+    }
+
+    schedule.setAssignedProcessors(procs);
+    schedule.setAssignedSupersteps(steps);
+    schedule.updateNumberOfSupersteps();
+
+    kl_active_schedule_t kl_sched;
+    kl_sched.initialize(schedule);
+
+    max_comm_datastructure comm_ds;
+    comm_ds.initialize(kl_sched);
+    comm_ds.compute_comm_datastructures(0, 2);
+
+    // Initial State (unit comm weights: each sender pays 1 per distinct foreign target proc):
+    // Step 0 (P0): Nodes 0-3 send to Level 1 (P1).
+    // Each node in the butterfly connects to 2 nodes in the next level.
+    // 0 -> 4, 6. (Both P1). Count=1. Cost=1.
+    // 1 -> 5, 7. (Both P1). Count=1. Cost=1.
+    // ... All 4 nodes send to P1. Total P0 Send = 4.
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 4);
+
+    // Step 1 (P1): Nodes 4-7 send to Level 2 (P0).
+    // All 4 nodes send to P0. Total P1 Send = 4.
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 4);
+
+    using kl_move = kl_move_struct;
+    thread_local_active_schedule_data active_schedule_data;
+    active_schedule_data.initialize_cost(0.0);
+
+    // === Move 1: Move Node 4 (Level 1) P1 -> P0 ===
+    // Node 4 moves to P0.
+    // Impact on Step 0 (Node 4's parents are Nodes 0 and 2):
+    // Node 0 -> 4(P0), 6(P1). P0 is local, so targets shrink to {P1}. Count=1.
+    // Node 2 -> 4(P0), 6(P1). Likewise targets {P1}. Count=1.
+    // Step 0 Send Cost unchanged (still 4).
+    kl_move move1(4, 0.0, 1, 1, 0, 1);
+    kl_sched.apply_move(move1, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move1, 0, 2);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "butterfly_move1"));
+
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 4);
+
+    // Impact on Step 1 (Node 4):
+    // Node 4 (P0) -> 8(P0), 10(P0). All local.
+    // Node 4 stops sending (was 1).
+    // P1 Send decreases by 1 -> 3.
+    // P0 Send increases by 0 (all local).
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 3);
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0);
+
+    // === Move 2: Move Node 6 (Level 1) P1 -> P0 ===
+    // Node 6 moves to P0.
+    // Impact on Step 0 (Node 6's parents are Nodes 0 and 2):
+    // Node 0 -> 4(P0), 6(P0). All local.
+    // Node 2 -> 4(P0), 6(P0). All local.
+    // Both stop sending (each was 1), so P0 Send decreases by 2 -> 2.
+    kl_move move2(6, 0.0, 1, 1, 0, 1);
+    kl_sched.apply_move(move2, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move2, 0, 2);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "butterfly_move2"));
+
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 2);
+
+    // Impact on Step 1 (Node 6):
+    // Node 6 (P0) -> 8(P0), 10(P0). All local.
+    // P1 Send decreases by 1 -> 2.
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 2);
+
+    // === Move 3: Move Node 0 (Level 0) P0 -> P1 ===
+    // Node 0 moves to P1.
+    // Impact on Step 0:
+    // Node 0 (P1) -> 4(P0), 6(P0). Targets {P0}. Count=1. Cost=1.
+    // Node 1 (P0) -> 5(P1), 7(P1). Targets {P1}. Count=1. Cost=1.
+    // Node 3 (P0) behaves like Node 1.
+    // P0 Send: 2 (Nodes 1 and 3).
+    // P1 Send: 1 (Node 0).
+    kl_move move3(0, 0.0, 0, 0, 1, 0);
+    kl_sched.apply_move(move3, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move3, 0, 2);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "butterfly_move3"));
+
+    // === Move 4: Move Node 8 (Level 2) P0 -> P1 ===
+    // Node 8 moves to P1.
+    // Impact on Step 1:
+    // Node 4 (P0) -> 8(P1), 10(P0). Targets {P1}. Count=1. Cost=1.
+    // Node 6 (P0) -> 8(P1), 10(P0). Targets {P1}. Count=1. Cost=1.
+    // P0 Send at Step 1: 0 -> 2.
+    kl_move move4(8, 0.0, 0, 2, 1, 2);
+    kl_sched.apply_move(move4, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move4, 0, 2);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "butterfly_move4"));
+}
+
+/**
+ * Test: Ladder Graph Moves
+ * Uses a Ladder Graph (Rungs=5 -> 12 nodes).
+ * Tests moving rungs between processors.
+ */
+BOOST_AUTO_TEST_CASE(test_ladder_graph_moves) {
+    // Ladder with 5 rungs -> 6 pairs of nodes = 12 nodes.
+    // Pairs: (0,1), (2,3), ... (10,11).
+    graph dag = osp::construct_ladder_dag(5);
+
+    BspArchitecture arch;
+    arch.setNumberOfProcessors(2);
+    arch.setCommunicationCosts(1);
+    arch.setSynchronisationCosts(1);
+
+    BspInstance instance(dag, arch);
+    BspSchedule schedule(instance);
+
+    // Assign:
+    // Even nodes (Left rail): P0
+    // Odd nodes (Right rail): P1
+    // Steps: Pair i at Step i.
+    std::vector<unsigned> procs(12);
+    std::vector<unsigned> steps(12);
+    for (unsigned i = 0; i < 6; ++i) {
+        procs[2 * i] = 0;
+        steps[2 * i] = i;
+        procs[2 * i + 1] = 1;
+        steps[2 * i + 1] = i;
+    }
+
+    schedule.setAssignedProcessors(procs);
+    schedule.setAssignedSupersteps(steps);
+    schedule.updateNumberOfSupersteps();
+
+    kl_active_schedule_t kl_sched;
+    kl_sched.initialize(schedule);
+
+    max_comm_datastructure comm_ds;
+    comm_ds.initialize(kl_sched);
+    comm_ds.compute_comm_datastructures(0, 5);
+
+    // Initial State (unit comm weights, as the checks below confirm):
+    // Rung i (u1, v1) connects to Rung i+1 (u2, v2).
+    // u1(P0) -> u2(P0), v2(P1). Targets {P1}. Count=1. Cost=1.
+    // v1(P1) -> u2(P0), v2(P1). Targets {P0}. Count=1. Cost=1.
+    // This applies for Steps 0 to 4.
+
+    for (unsigned s = 0; s < 5; ++s) {
+        BOOST_CHECK_EQUAL(comm_ds.step_proc_send(s, 0), 1);
+        BOOST_CHECK_EQUAL(comm_ds.step_proc_send(s, 1), 1);
+    }
+
+    using kl_move = kl_move_struct;
+    thread_local_active_schedule_data active_schedule_data;
+    active_schedule_data.initialize_cost(0.0);
+
+    // === Move 1: Move Node 1 (Rung 0, Right) P1 -> P0 ===
+    // Node 1 moves to P0.
+    // Rung 0 is now (0, 1) both at P0.
+    // Impact on Step 0:
+    // u1(0) -> u2(2, P0), v2(3, P1). Targets {P1}. Cost=1. (Unchanged)
+    // v1(1) -> u2(2, P0), v2(3, P1). Targets {P1}. Cost=1.
+    // P0 Send = 1 + 1 = 2.
+    // P1 Send = 0.
+    kl_move move1(1, 0.0, 1, 0, 0, 0);
+    kl_sched.apply_move(move1, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move1, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "ladder_move1"));
+
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 2);
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0);
+
+    // === Move 2: Move Node 3 (Rung 1, Right) P1 -> P0 ===
+    // Node 3 moves to P0.
+    // Rung 1 is now (2, 3) both at P0.
+    // Impact on Step 0 (Parents 0, 1):
+    // u1(0) -> u2(2, P0), v2(3, P0). All local. Cost=0.
+    // v1(1) -> u2(2, P0), v2(3, P0). All local. Cost=0.
+    // P0 Send at Step 0 = 0.
+    kl_move move2(3, 0.0, 1, 1, 0, 1);
+    kl_sched.apply_move(move2, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move2, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "ladder_move2"));
+
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0);
+
+    // Impact on Step 1 (Nodes 2, 3):
+    // u2(2, P0) -> u3(4, P0), v3(5, P1). Targets {P1}. Cost=1.
+    // v2(3, P0) -> u3(4, P0), v3(5, P1). Targets {P1}. Cost=1.
+    // P0 Send at Step 1 = 2.
+    BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 2);
+
+    // === Move 3: Move Node 0 (Rung 0, Left) P0 -> P1 ===
+    // Node 0 moves to P1.
+    // Rung 0 is now (0@P1, 1@P0). Split again.
+    // Impact on Step 0:
+    // u1(0, P1) -> u2(2, P0), v2(3, P0). Targets {P0}. Cost=1.
+    // v1(1, P0) -> u2(2, P0), v2(3, P0). All local. Cost=0.
+    // P0 Send: 0.
+    // P1 Send: 1.
+    kl_move move3(0, 0.0, 0, 0, 1, 0);
+    kl_sched.apply_move(move3, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move3, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "ladder_move3"));
+
+    // === Move 4: Move Node 2 (Rung 1, Left) P0 -> P1 ===
+    // Node 2 moves to P1.
+    // Rung 1 is now (2@P1, 3@P0). Split again.
+    // Impact on Step 0 (Parents 0, 1):
+    // u1(0, P1) -> u2(2, P1), v2(3, P0). Targets {P0}. Cost=1.
+    // v1(1, P0) -> u2(2, P1), v2(3, P0). Targets {P1}. Cost=1.
+    // P0 Send: 1.
+    // P1 Send: 1.
+    kl_move move4(2, 0.0, 0, 1, 1, 1);
+    kl_sched.apply_move(move4, active_schedule_data);
+    comm_ds.update_datastructure_after_move(move4, 0, 5);
+    BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "ladder_move4"));
+}
\ No newline at end of file
diff --git a/tests/kl_bsp_improver_test.cpp b/tests/kl_bsp_improver_test.cpp
new file mode 100644
index 00000000..250dfa18
--- /dev/null
+++ b/tests/kl_bsp_improver_test.cpp
@@ -0,0 +1,250 @@
+
+#define BOOST_TEST_MODULE kl_bsp_improver
+#include <boost/test/unit_test.hpp>
+
+#include "osp/auxiliary/io/arch_file_reader.hpp"
+#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
+#include "osp/bsp/scheduler/CoarsenRefineSchedulers/MultiLevelHillClimbing.hpp"
+#include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp"
+#include "osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp"
+#include "osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing_for_comm_schedule.hpp"
+#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp"
+#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp"
+#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp"
+#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp"
+#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp"
+#include "test_graphs.hpp"
+
+using namespace osp;
+
+template<typename Graph_t>
+void add_mem_weights(Graph_t &dag) {
+
+    // Running counters produce small deterministic weights in [2, 11].
+    int mem_weight = 1;
+    int comm_weight = 7;
+
+    for (const auto &v : dag.vertices()) {
+
+        dag.set_vertex_work_weight(v, static_cast<v_workw_t<Graph_t>>(mem_weight++ % 10 + 2));
+        dag.set_vertex_mem_weight(v, static_cast<v_memw_t<Graph_t>>(mem_weight++ % 10 + 2));
+        dag.set_vertex_comm_weight(v, static_cast<v_commw_t<Graph_t>>(comm_weight++ % 10 + 2));
+    }
+}
+
+BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) {
+
+    using graph = computational_dag_edge_idx_vector_impl_def_int_t;
+    using VertexType = graph::vertex_idx;
+
+    graph dag;
+
+    const VertexType v1 = dag.add_vertex(2, 9, 2);
+    const VertexType v2 = dag.add_vertex(3, 8, 4);
+    const VertexType v3 = dag.add_vertex(4, 7, 3);
+    const VertexType v4 = dag.add_vertex(5, 6,
2); + const VertexType v5 = dag.add_vertex(6, 5, 6); + const VertexType v6 = dag.add_vertex(7, 4, 2); + dag.add_vertex(8, 3, 4); + const VertexType v8 = dag.add_vertex(9, 2, 1); + + dag.add_edge(v1, v2, 2); + dag.add_edge(v1, v3, 2); + dag.add_edge(v1, v4, 2); + dag.add_edge(v2, v5, 12); + dag.add_edge(v3, v5, 6); + dag.add_edge(v3, v6, 7); + dag.add_edge(v5, v8, 9); + dag.add_edge(v4, v8, 9); + + BspArchitecture arch; + + BspInstance instance(dag, arch); + + BspSchedule schedule(instance); + + schedule.setAssignedProcessors({1, 1, 0, 0, 1, 0, 0, 1}); + schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); + + schedule.updateNumberOfSupersteps(); + + using comm_cost_t = kl_bsp_comm_cost_function; + using kl_improver_test = kl_improver_test; + + kl_improver_test kl; + + kl.setup_schedule(schedule); + + auto &kl_active_schedule = kl.get_active_schedule(); + + // Verify work datastructures are set up correctly + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(0), 5.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(0), 0.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(1), 9.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(1), 0.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(2), 7.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); + + BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); + BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true); + + // Check initial cost consistency + double initial_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double initial_tracked = kl.get_current_cost(); + BOOST_CHECK_CLOSE(initial_recomputed, initial_tracked, 0.00001); + + // Insert nodes into gain heap + auto node_selection = kl.insert_gain_heap_test_penalty({2, 3}); + + // Run first iteration and check cost consistency + auto recompute_max_gain = kl.run_inner_iteration_test(); + + double iter1_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double iter1_tracked = kl.get_current_cost(); + BOOST_CHECK_CLOSE(iter1_recomputed, iter1_tracked, 0.00001); + + // Run second iteration + auto &node3_affinity = kl.get_affinity_table()[3]; + + recompute_max_gain = kl.run_inner_iteration_test(); + + double iter2_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double iter2_tracked = kl.get_current_cost(); + + BOOST_CHECK_CLOSE(iter2_recomputed, iter2_tracked, 0.00001); + + // Run third iteration + recompute_max_gain = kl.run_inner_iteration_test(); + + double iter3_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double iter3_tracked = kl.get_current_cost(); + BOOST_CHECK_CLOSE(iter3_recomputed, iter3_tracked, 0.00001); + + // Run fourth iteration + recompute_max_gain = kl.run_inner_iteration_test(); + + double iter4_recomputed = kl.get_comm_cost_f().compute_schedule_cost_test(); + double iter4_tracked = kl.get_current_cost(); + BOOST_CHECK_CLOSE(iter4_recomputed, iter4_tracked, 0.00001); +} + +BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs) { + std::vector filenames_graph = large_spaa_graphs(); + using graph = computational_dag_edge_idx_vector_impl_def_int_t; + // Getting root git directory + std::filesystem::path cwd = std::filesystem::current_path(); + std::cout << 
cwd << std::endl;
+    while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) {
+        cwd = cwd.parent_path();
+        std::cout << cwd << std::endl;
+    }
+
+    for (auto &filename_graph : filenames_graph) {
+        GreedyBspScheduler test_scheduler;
+        BspInstance instance;
+        bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(),
+                                                                              instance.getComputationalDag());
+
+        instance.getArchitecture().setSynchronisationCosts(500);
+        instance.getArchitecture().setCommunicationCosts(5);
+        instance.getArchitecture().setNumberOfProcessors(4);
+
+        // NUMA-style send costs: cheap links within {P0, P1} and {P2, P3}, expensive across islands.
+        std::vector<std::vector<unsigned>> send_cost = {{0, 1, 4, 4}, {1, 0, 4, 4}, {4, 4, 0, 1}, {4, 4, 1, 0}};
+
+        instance.getArchitecture().setSendCosts(send_cost);
+
+        if (!status_graph) {
+            std::cout << "Reading files failed." << std::endl;
+            BOOST_CHECK(false);
+        }
+
+        add_mem_weights(instance.getComputationalDag());
+
+        BspSchedule schedule(instance);
+        const auto result = test_scheduler.computeSchedule(schedule);
+
+        schedule.updateNumberOfSupersteps();
+
+        std::cout << "initial schedule with costs: " << schedule.computeCosts() << " and "
+                  << schedule.numberOfSupersteps() << " number of supersteps" << std::endl;
+
+        BspSchedule schedule_2(schedule);
+
+        BOOST_CHECK_EQUAL(RETURN_STATUS::OSP_SUCCESS, result);
+        BOOST_CHECK_EQUAL(&schedule.getInstance(), &instance);
+        BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
+
+        kl_total_lambda_comm_improver kl_total_lambda;
+        auto start_time = std::chrono::high_resolution_clock::now();
+        auto status = kl_total_lambda.improveSchedule(schedule);
+        auto finish_time = std::chrono::high_resolution_clock::now();
+        auto duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
+
+        std::cout << "kl lambda new finished in " << duration << " seconds, costs: " << schedule.computeCosts()
+                  << " and lambda costs: " << schedule.computeTotalLambdaCosts() << " with "
+                  << schedule.numberOfSupersteps() << " number of supersteps" << std::endl;
+
+        BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
+        BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true);
+
+        kl_bsp_comm_improver_mt kl;
+        kl.setTimeQualityParameter(5.0);
+        start_time = std::chrono::high_resolution_clock::now();
+        status = kl.improveSchedule(schedule);
+        finish_time = std::chrono::high_resolution_clock::now();
+        duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
+
+        std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeCosts() << " with "
+                  << schedule.numberOfSupersteps() << " number of supersteps" << std::endl;
+
+        BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
+        BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true);
+
+        BspScheduleCS schedule_cs(schedule);
+
+        HillClimbingForCommSteps hc_comm_steps;
+        start_time = std::chrono::high_resolution_clock::now();
+        status = hc_comm_steps.improveSchedule(schedule_cs);
+        finish_time = std::chrono::high_resolution_clock::now();
+
+        duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
+
+        std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs.computeCosts()
+                  << " with " << schedule_cs.numberOfSupersteps() << " number of supersteps" << std::endl;
+
+        BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
+        BOOST_CHECK_EQUAL(schedule_cs.satisfiesPrecedenceConstraints(), true);
+
+        kl_total_lambda.improveSchedule(schedule_2);
+
+        // Baseline comparison on the untouched copy: plain hill climbing, then comm-step refinement.
+        
HillClimbingScheduler hc; + + start_time = std::chrono::high_resolution_clock::now(); + status = hc.improveSchedule(schedule_2); + finish_time = std::chrono::high_resolution_clock::now(); + + duration = std::chrono::duration_cast(finish_time - start_time).count(); + + std::cout << "hc finished in " << duration << " seconds, costs: " << schedule_2.computeCosts() << " with " + << schedule_2.numberOfSupersteps() << " number of supersteps" << std::endl; + + BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); + BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); + + BspScheduleCS schedule_cs_2(schedule_2); + + start_time = std::chrono::high_resolution_clock::now(); + status = hc_comm_steps.improveSchedule(schedule_cs_2); + finish_time = std::chrono::high_resolution_clock::now(); + + duration = std::chrono::duration_cast(finish_time - start_time).count(); + + std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs_2.computeCosts() + << " with " << schedule_cs_2.numberOfSupersteps() << " number of supersteps" << std::endl; + + BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); + BOOST_CHECK_EQUAL(schedule_cs_2.satisfiesPrecedenceConstraints(), true); + } +} \ No newline at end of file From e0ad973ec734745f87d2b17f298c11bfb674239d Mon Sep 17 00:00:00 2001 From: tonibohnlein Date: Tue, 25 Nov 2025 16:05:58 +0100 Subject: [PATCH 2/3] adding more new_nodes after move comments update update --- .../comm_cost_modules/comm_cost_policies.hpp | 457 ++++++++++++++++++ .../generic_lambda_container.hpp | 124 +++++ .../max_comm_datastructure.hpp | 88 ++-- .../KernighanLin_v2/kl_improver.hpp | 45 +- tests/kl_bsp_cost.cpp | 192 +++++++- tests/kl_bsp_improver_test.cpp | 170 +++---- 6 files changed, 943 insertions(+), 133 deletions(-) create mode 100644 include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/comm_cost_policies.hpp create mode 100644 include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/comm_cost_policies.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/comm_cost_policies.hpp new file mode 100644 index 00000000..8fb1ceff --- /dev/null +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/comm_cost_policies.hpp @@ -0,0 +1,457 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner +*/ + +#pragma once + +#include +#include +#include + +namespace osp { + +struct EagerCommCostPolicy { + using ValueType = unsigned; + + template + static inline void attribute_communication(DS &ds, const comm_weight_t &cost, const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + ds.step_proc_receive(u_step, v_proc) += cost; + ds.step_proc_send(u_step, u_proc) += cost; + } + + template + static inline void unattribute_communication(DS &ds, const comm_weight_t &cost, const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + ds.step_proc_receive(u_step, v_proc) -= cost; + ds.step_proc_send(u_step, u_proc) -= cost; + } + + static inline bool add_child(ValueType &val, unsigned step) { + val++; + return val == 1; + } + + static inline bool remove_child(ValueType &val, unsigned step) { + val--; + return val == 0; + } + + static inline void reset(ValueType &val) { val = 0; } + + static inline bool has_entry(const ValueType &val) { return val > 0; } + + static inline bool is_single_entry(const ValueType &val) { return val == 1; } + + template + static inline void calculate_delta_remove(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + if (val == 1) { + dt.add(true, parent_step, child_proc, -cost); + dt.add(false, parent_step, parent_proc, -cost); + } + } + + template + static inline void calculate_delta_add(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + if (val == 0) { + dt.add(true, parent_step, child_proc, cost); + dt.add(false, parent_step, parent_proc, cost); + } + } + + template + static inline void calculate_delta_outgoing(const ValueType &val, unsigned node_step, unsigned node_proc, + unsigned child_proc, comm_weight_t cost, DeltaTracker &dt) { + if (val > 0) { + comm_weight_t total_cost = cost * val; + dt.add(true, node_step, child_proc, total_cost); + dt.add(false, node_step, node_proc, total_cost); + } + } +}; + +struct LazyCommCostPolicy { + using ValueType = std::vector; + + template + static inline void attribute_communication(DS &ds, const comm_weight_t &cost, const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + // val contains v_step (already added). + // Check if v_step is the new minimum. + unsigned min_step = std::numeric_limits::max(); + for (unsigned s : val) + min_step = std::min(min_step, s); + + if (min_step == v_step) { + // Check if it was strictly smaller than previous min. + unsigned prev_min = std::numeric_limits::max(); + for (size_t i = 0; i < val.size() - 1; ++i) { + prev_min = std::min(prev_min, val[i]); + } + + if (v_step < prev_min) { + if (prev_min != std::numeric_limits::max() && prev_min > 0) { + ds.step_proc_receive(prev_min - 1, v_proc) -= cost; + ds.step_proc_send(prev_min - 1, u_proc) -= cost; + } + if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) += cost; + ds.step_proc_send(v_step - 1, u_proc) += cost; + } + } + } + } + + template + static inline void unattribute_communication(DS &ds, const comm_weight_t &cost, const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + // val is state AFTER removal. + + if (val.empty()) { + // Removed the last child. 
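+            // Both sides were attributed to the step just before this child's
+            // superstep; with no children left, drop the receive and the matching send.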
+ if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) -= cost; + ds.step_proc_send(v_step - 1, u_proc) -= cost; + } + } else { + // Check if v_step was the unique minimum. + unsigned new_min = val[0]; + for (unsigned s : val) + new_min = std::min(new_min, s); + + if (v_step < new_min) { + // v_step was the unique minimum. + if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) -= cost; + ds.step_proc_send(v_step - 1, u_proc) -= cost; + } + if (new_min > 0) { + ds.step_proc_receive(new_min - 1, v_proc) += cost; + ds.step_proc_send(new_min - 1, u_proc) += cost; + } + } + } + } + + static inline bool add_child(ValueType &val, unsigned step) { + val.push_back(step); + if (val.size() == 1) + return true; + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + return step == min_s; + } + + static inline bool remove_child(ValueType &val, unsigned step) { + auto it = std::find(val.begin(), val.end(), step); + if (it != val.end()) { + val.erase(it); + if (val.empty()) { + return true; + } + unsigned new_min = val[0]; + for (unsigned s : val) + new_min = std::min(new_min, s); + bool res = step < new_min; + return res; + } + return false; + } + + static inline void reset(ValueType &val) { val.clear(); } + + static inline bool has_entry(const ValueType &val) { return !val.empty(); } + + static inline bool is_single_entry(const ValueType &val) { return val.size() == 1; } + + template + static inline void calculate_delta_remove(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + if (val.empty()) + return; + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + + if (child_step == min_s) { + int count = 0; + for (unsigned s : val) + if (s == min_s) + count++; + + if (count == 1) { + if (min_s > 0) { + dt.add(true, min_s - 1, child_proc, -cost); + dt.add(false, min_s - 1, parent_proc, -cost); + } + if (val.size() > 1) { + unsigned next_min = std::numeric_limits::max(); + for (unsigned s : val) { + if (s != min_s) + next_min = std::min(next_min, s); + } + if (next_min != std::numeric_limits::max() && next_min > 0) { + dt.add(true, next_min - 1, child_proc, cost); + dt.add(false, next_min - 1, parent_proc, cost); + } + } + } + } + } + + template + static inline void calculate_delta_add(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + if (val.empty()) { + if (child_step > 0) { + dt.add(true, child_step - 1, child_proc, cost); + dt.add(false, child_step - 1, parent_proc, cost); + } + } else { + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + + if (child_step < min_s) { + if (min_s > 0) { + dt.add(true, min_s - 1, child_proc, -cost); + dt.add(false, min_s - 1, parent_proc, -cost); + } + if (child_step > 0) { + dt.add(true, child_step - 1, child_proc, cost); + dt.add(false, child_step - 1, parent_proc, cost); + } + } + } + } + + template + static inline void calculate_delta_outgoing(const ValueType &val, unsigned node_step, unsigned node_proc, + unsigned child_proc, comm_weight_t cost, DeltaTracker &dt) { + for (unsigned s : val) { + if (s > 0) { + dt.add(true, s - 1, child_proc, cost); + dt.add(false, s - 1, node_proc, cost); + } + } + } +}; + +struct BufferedCommCostPolicy { + using ValueType = std::vector; + + template + static inline void attribute_communication(DS &ds, const comm_weight_t &cost, 
const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + // Buffered: Send at u_step, Receive at v_step - 1. + + unsigned min_step = std::numeric_limits::max(); + for (unsigned s : val) + min_step = std::min(min_step, s); + + if (min_step == v_step) { + unsigned prev_min = std::numeric_limits::max(); + for (size_t i = 0; i < val.size() - 1; ++i) + prev_min = std::min(prev_min, val[i]); + + if (v_step < prev_min) { + if (prev_min != std::numeric_limits::max() && prev_min > 0) { + ds.step_proc_receive(prev_min - 1, v_proc) -= cost; + } + if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) += cost; + } + } + } + + // Send side logic (u_step) + // If this is the FIRST child on this proc, add send cost. + if (val.size() == 1) { + ds.step_proc_send(u_step, u_proc) += cost; + } + } + + template + static inline void unattribute_communication(DS &ds, const comm_weight_t &cost, const unsigned u_step, + const unsigned u_proc, const unsigned v_proc, const unsigned v_step, + const ValueType &val) { + // val is state AFTER removal. + + if (val.empty()) { + // Removed last child. + ds.step_proc_send(u_step, u_proc) -= cost; // Send side + if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) -= cost; // Recv side + } + } else { + // Check if v_step was unique minimum for Recv side. + unsigned new_min = val[0]; + for (unsigned s : val) + new_min = std::min(new_min, s); + + if (v_step < new_min) { + if (v_step > 0) { + ds.step_proc_receive(v_step - 1, v_proc) -= cost; + } + if (new_min > 0) { + ds.step_proc_receive(new_min - 1, v_proc) += cost; + } + } + // Send side remains (val not empty). + } + } + + static inline bool add_child(ValueType &val, unsigned step) { + val.push_back(step); + if (val.size() == 1) + return true; // Need update for send side + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + return step == min_s; // Need update for recv side + } + + static inline bool remove_child(ValueType &val, unsigned step) { + auto it = std::find(val.begin(), val.end(), step); + if (it != val.end()) { + val.erase(it); + if (val.empty()) + return true; // Need update for send side + unsigned new_min = val[0]; + for (unsigned s : val) + new_min = std::min(new_min, s); + return step < new_min; // Need update for recv side + } + return false; + } + + static inline void reset(ValueType &val) { val.clear(); } + static inline bool has_entry(const ValueType &val) { return !val.empty(); } + static inline bool is_single_entry(const ValueType &val) { return val.size() == 1; } + + template + static inline void calculate_delta_remove(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + // Lazy: Send and Recv are both at min(child_steps) - 1. + + if (val.empty()) + return; + + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + + if (child_step == min_s) { + int count = 0; + for (unsigned s : val) + if (s == min_s) + count++; + + if (count == 1) { + // Unique min being removed. 
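+                // Shift the attributed send/receive pair off the old minimum step;
+                // if other child steps remain, re-attribute it at the next-earliest one below.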
+ if (min_s > 0) { + dt.add(true, min_s - 1, child_proc, -cost); // Remove Recv + dt.add(false, min_s - 1, parent_proc, -cost); // Remove Send + } + + if (val.size() > 1) { + unsigned next_min = std::numeric_limits::max(); + for (unsigned s : val) + if (s != min_s) + next_min = std::min(next_min, s); + + if (next_min != std::numeric_limits::max() && next_min > 0) { + dt.add(true, next_min - 1, child_proc, cost); // Add Recv at new min + dt.add(false, next_min - 1, parent_proc, cost); // Add Send at new min + } + } + } + } + } + + template + static inline void calculate_delta_add(const ValueType &val, unsigned child_step, unsigned parent_step, + unsigned parent_proc, unsigned child_proc, comm_weight_t cost, + DeltaTracker &dt) { + // Lazy: Send and Recv are both at min(child_steps) - 1. + + if (val.empty()) { + // First child. + if (child_step > 0) { + dt.add(true, child_step - 1, child_proc, cost); + dt.add(false, child_step - 1, parent_proc, cost); + } + } else { + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + + if (child_step < min_s) { + // New global minimum. + if (min_s > 0) { + dt.add(true, min_s - 1, child_proc, -cost); // Remove old Recv + dt.add(false, min_s - 1, parent_proc, -cost); // Remove old Send + } + if (child_step > 0) { + dt.add(true, child_step - 1, child_proc, cost); // Add new Recv + dt.add(false, child_step - 1, parent_proc, cost); // Add new Send + } + } + } + } + + template + static inline void calculate_delta_outgoing(const ValueType &val, unsigned node_step, unsigned node_proc, + unsigned child_proc, comm_weight_t cost, DeltaTracker &dt) { + // Buffered Outgoing (Node -> Children) + // Node is parent (sender). Pays at node_step. + // Children are receivers. Pay at child_step - 1. + + // Send side: node_step. + // If val is not empty, we pay send cost ONCE. + if (!val.empty()) { + dt.add(false, node_step, node_proc, cost); + } + + // Recv side: iterate steps in val (child steps). + // But we only pay at min(val) - 1. + if (!val.empty()) { + unsigned min_s = val[0]; + for (unsigned s : val) + min_s = std::min(min_s, s); + + if (min_s > 0) { + dt.add(true, min_s - 1, child_proc, cost); + } + } + } +}; + +} // namespace osp diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp new file mode 100644 index 00000000..623d51d8 --- /dev/null +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp @@ -0,0 +1,124 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp
new file mode 100644
index 00000000..623d51d8
--- /dev/null
+++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/generic_lambda_container.hpp
@@ -0,0 +1,125 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include <iterator>
+#include <utility>
+#include <vector>
+
+namespace osp {
+
+template <typename T>
+struct DefaultHasEntry {
+    static inline bool has_entry(const T &val) { return val != 0; }
+};
+
+template <typename T>
+struct DefaultHasEntry<std::vector<T>> {
+    static inline bool has_entry(const std::vector<T> &val) { return !val.empty(); }
+};
+
+/**
+ * @brief Generic container for tracking child processor assignments in a BSP schedule using vectors.
+ *
+ * This structure tracks information about children assigned to each processor.
+ * It uses a 2D vector for dense data.
+ */
+template <typename vertex_idx_t, typename ValueType, typename HasEntry = DefaultHasEntry<ValueType>>
+struct generic_lambda_vector_container {
+
+    /**
+     * @brief Range adapter for iterating over non-zero/non-empty processor entries.
+     */
+    class lambda_vector_range {
+      private:
+        const std::vector<ValueType> &vec_;
+
+      public:
+        class lambda_vector_iterator {
+          public:
+            using iterator_category = std::input_iterator_tag;
+            using value_type = std::pair<unsigned, ValueType>;
+            using difference_type = std::ptrdiff_t;
+            using pointer = value_type *;
+            using reference = value_type &;
+
+          private:
+            const std::vector<ValueType> &vec_;
+            unsigned index_;
+
+          public:
+            lambda_vector_iterator(const std::vector<ValueType> &vec) : vec_(vec), index_(0) {
+                while (index_ < vec_.size() && !HasEntry::has_entry(vec_[index_])) {
+                    ++index_;
+                }
+            }
+
+            lambda_vector_iterator(const std::vector<ValueType> &vec, unsigned index) : vec_(vec), index_(index) {}
+
+            lambda_vector_iterator &operator++() {
+                ++index_;
+                while (index_ < vec_.size() && !HasEntry::has_entry(vec_[index_])) {
+                    ++index_;
+                }
+                return *this;
+            }
+
+            value_type operator*() const { return std::make_pair(index_, vec_[index_]); }
+
+            bool operator==(const lambda_vector_iterator &other) const { return index_ == other.index_; }
+            bool operator!=(const lambda_vector_iterator &other) const { return !(*this == other); }
+        };
+
+        lambda_vector_range(const std::vector<ValueType> &vec) : vec_(vec) {}
+
+        lambda_vector_iterator begin() { return lambda_vector_iterator(vec_); }
+        lambda_vector_iterator end() { return lambda_vector_iterator(vec_, static_cast<unsigned>(vec_.size())); }
+    };
+
+    /// 2D vector: for each node, stores processor assignment info
+    std::vector<std::vector<ValueType>> node_lambda_vec;
+
+    /// Number of processors in the system
+    unsigned num_procs_ = 0;
+
+    inline void initialize(const vertex_idx_t num_vertices, const unsigned num_procs) {
+        node_lambda_vec.assign(num_vertices, std::vector<ValueType>(num_procs));
+        num_procs_ = num_procs;
+    }
+
+    inline void reset_node(const vertex_idx_t node) { node_lambda_vec[node].assign(num_procs_, ValueType()); }
+
+    inline void clear() { node_lambda_vec.clear(); }
+
+    inline bool has_proc_entry(const vertex_idx_t node, const unsigned proc) const {
+        return HasEntry::has_entry(node_lambda_vec[node][proc]);
+    }
+
+    inline ValueType &get_proc_entry(const vertex_idx_t node, const unsigned proc) {
+        return node_lambda_vec[node][proc];
+    }
+
+    inline const ValueType &get_proc_entry(const vertex_idx_t node, const unsigned proc) const {
+        return node_lambda_vec[node][proc];
+    }
+
+    inline auto iterate_proc_entries(const vertex_idx_t node) { return lambda_vector_range(node_lambda_vec[node]); }
+};
+
+} // namespace osp
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp
index cc8d8a5a..b3820231 100644
--- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp
+++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp
@@ -18,9 +18,12
@@ limitations under the License. #pragma once +#include "comm_cost_policies.hpp" +#include "generic_lambda_container.hpp" #include "lambda_container.hpp" #include "osp/bsp/model/BspInstance.hpp" #include +#include #include #include @@ -53,7 +56,7 @@ struct pre_move_comm_data { } }; -template +template struct max_comm_datastructure { using comm_weight_t = v_commw_t; @@ -73,7 +76,13 @@ struct max_comm_datastructure { comm_weight_t max_comm_weight = 0; - lambda_vector_container node_lambda_map; + // Select the appropriate container type based on the policy's ValueType + using ContainerType = + typename std::conditional::value, + lambda_vector_container, + generic_lambda_vector_container>::type; + + ContainerType node_lambda_map; // Optimization: Scratchpad for update_datastructure_after_move to avoid allocations std::vector affected_steps_list; @@ -123,9 +132,6 @@ struct max_comm_datastructure { } inline void arrange_superstep_comm_data(const unsigned step) { - // Linear scan O(P) to find max, second_max and count - - // 1. Analyze Sends comm_weight_t max_send = 0; comm_weight_t second_max_send = 0; unsigned max_send_count = 0; @@ -143,7 +149,6 @@ struct max_comm_datastructure { } } - // 2. Analyze Receives comm_weight_t max_receive = 0; comm_weight_t second_max_receive = 0; unsigned max_receive_count = 0; @@ -161,7 +166,6 @@ struct max_comm_datastructure { } } - // 3. Aggregate Global Stats const comm_weight_t global_max = std::max(max_send, max_receive); step_max_comm_cache[step] = global_max; @@ -172,7 +176,6 @@ struct max_comm_datastructure { global_count += max_receive_count; step_max_comm_count_cache[step] = global_count; - // Determine second max comm_weight_t cand_send = (max_send == global_max) ? second_max_send : max_send; comm_weight_t cand_recv = (max_receive == global_max) ? second_max_receive : max_receive; @@ -204,7 +207,7 @@ struct max_comm_datastructure { void update_datastructure_after_move(const kl_move &move, unsigned, unsigned) { const auto &graph = instance->getComputationalDag(); - // --- 0. Prepare Scratchpad (Avoids Allocations) --- + // Prepare Scratchpad (Avoids Allocations) --- for (unsigned step : affected_steps_list) { if (step < step_is_affected.size()) step_is_affected[step] = false; @@ -225,20 +228,16 @@ struct max_comm_datastructure { const unsigned to_proc = move.to_proc; const comm_weight_t comm_w_node = graph.vertex_comm_weight(node); - // --- 1. Handle Node Movement (Outgoing Edges: Node -> Children) --- + // Handle Node Movement (Outgoing Edges: Node -> Children) if (from_step != to_step) { // Case 1: Node changes Step - // Optimization: Fuse the loop to iterate lambda map only once. - - for (const auto [proc, count] : node_lambda_map.iterate_proc_entries(node)) { + for (const auto [proc, val] : node_lambda_map.iterate_proc_entries(node)) { // A. 
Remove Old (Sender: from_proc, Receiver: proc) if (proc != from_proc) { const comm_weight_t cost = comm_w_node * instance->sendCosts(from_proc, proc); - // Optimization: check cost > 0 to avoid dirtying cache lines with +0 ops - if (cost > 0) { - step_proc_receive_[from_step][proc] -= cost; - step_proc_send_[from_step][from_proc] -= cost; + if (cost > 0) { + CommPolicy::unattribute_communication(*this, cost, from_step, from_proc, proc, 0, val); } } @@ -246,8 +245,7 @@ struct max_comm_datastructure { if (proc != to_proc) { const comm_weight_t cost = comm_w_node * instance->sendCosts(to_proc, proc); if (cost > 0) { - step_proc_receive_[to_step][proc] += cost; - step_proc_send_[to_step][to_proc] += cost; + CommPolicy::attribute_communication(*this, cost, to_step, to_proc, proc, 0, val); } } } @@ -257,13 +255,12 @@ struct max_comm_datastructure { } else if (from_proc != to_proc) { // Case 2: Node stays in same Step, but changes Processor - for (const auto [proc, count] : node_lambda_map.iterate_proc_entries(node)) { + for (const auto [proc, val] : node_lambda_map.iterate_proc_entries(node)) { // Remove Old (Sender: from_proc, Receiver: proc) if (proc != from_proc) { const comm_weight_t cost = comm_w_node * instance->sendCosts(from_proc, proc); if (cost > 0) { - step_proc_receive_[from_step][proc] -= cost; - step_proc_send_[from_step][from_proc] -= cost; + CommPolicy::unattribute_communication(*this, cost, from_step, from_proc, proc, 0, val); } } @@ -271,17 +268,16 @@ struct max_comm_datastructure { if (proc != to_proc) { const comm_weight_t cost = comm_w_node * instance->sendCosts(to_proc, proc); if (cost > 0) { - step_proc_receive_[from_step][proc] += cost; - step_proc_send_[from_step][to_proc] += cost; + CommPolicy::attribute_communication(*this, cost, from_step, to_proc, proc, 0, val); } } } mark_step(from_step); } - // --- 2. Update Parents' Outgoing Communication (Parents → Node) --- + // Update Parents' Outgoing Communication (Parents → Node) - if (from_proc != to_proc) { + if (from_proc != to_proc || from_step != to_step) { for (const auto &parent : graph.parents(node)) { const unsigned parent_step = active_schedule->assigned_superstep(parent); // Fast boundary check @@ -291,27 +287,30 @@ struct max_comm_datastructure { const unsigned parent_proc = active_schedule->assigned_processor(parent); const comm_weight_t comm_w_parent = graph.vertex_comm_weight(parent); - const bool removed_from_proc = node_lambda_map.decrease_proc_count(parent, from_proc); - const bool added_to_proc = node_lambda_map.increase_proc_count(parent, to_proc); + auto &val = node_lambda_map.get_proc_entry(parent, from_proc); + const bool removed_from_proc = CommPolicy::remove_child(val, from_step); // 1. Handle Removal from from_proc if (removed_from_proc) { if (from_proc != parent_proc) { const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, from_proc); if (cost > 0) { - step_proc_send_[parent_step][parent_proc] -= cost; - step_proc_receive_[parent_step][from_proc] -= cost; + CommPolicy::unattribute_communication(*this, cost, parent_step, parent_proc, from_proc, + from_step, val); } } } + auto &val_to = node_lambda_map.get_proc_entry(parent, to_proc); + const bool added_to_proc = CommPolicy::add_child(val_to, to_step); + // 2. 
Handle Addition to to_proc if (added_to_proc) { if (to_proc != parent_proc) { const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, to_proc); if (cost > 0) { - step_proc_send_[parent_step][parent_proc] += cost; - step_proc_receive_[parent_step][to_proc] += cost; + CommPolicy::attribute_communication(*this, cost, parent_step, parent_proc, to_proc, to_step, + val_to); } } } @@ -320,7 +319,7 @@ struct max_comm_datastructure { } } - // --- 3. Re-arrange Affected Steps --- + // Re-arrange Affected Steps for (unsigned step : affected_steps_list) { arrange_superstep_comm_data(step); } @@ -358,27 +357,28 @@ struct max_comm_datastructure { for (const auto &v : graph.children(u)) { const unsigned v_proc = vec_sched.assignedProcessor(v); - const unsigned v_step = vec_sched.assignedSuperstep(v); - const comm_weight_t comm_w_send_cost = (u_proc != v_proc) ? comm_w * instance->sendCosts(u_proc, v_proc) : 0; - - if (node_lambda_map.increase_proc_count(u, v_proc)) { + const unsigned v_step = vec_sched.assignedSuperstep(v); + + const comm_weight_t comm_w_send_cost = + (u_proc != v_proc) ? comm_w * instance->sendCosts(u_proc, v_proc) : 0; + + auto &val = node_lambda_map.get_proc_entry(u, v_proc); + if (CommPolicy::add_child(val, v_step)) { if (u_proc != v_proc && comm_w_send_cost > 0) { - attribute_communication(comm_w_send_cost, u_step, u_proc, v_proc, v_step); + CommPolicy::attribute_communication(*this, comm_w_send_cost, u_step, u_proc, v_proc, v_step, + val); } } } } for (unsigned step = start_step; step <= end_step; step++) { + if (step >= step_proc_send_.size()) { + continue; + } arrange_superstep_comm_data(step); } } - - inline void attribute_communication(const comm_weight_t &comm_w_send_cost, const unsigned u_step, const unsigned u_proc, const unsigned v_proc, - const unsigned) { - step_proc_receive_[u_step][v_proc] += comm_w_send_cost; - step_proc_send_[u_step][u_proc] += comm_w_send_cost; - } }; } // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp index a27ebe9b..97bd35a7 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp @@ -1024,6 +1024,45 @@ class kl_improver : public ImprovementScheduler { thread_data.reward_penalty_strat.reward, recompute_max_gain, new_nodes); // this only updated reward/penalty, collects new_nodes, and fills recompute_max_gain + // Add nodes from affected steps to new_nodes + // { + // std::unordered_set steps_to_check; + // const unsigned num_steps = active_schedule.num_steps(); + + // auto add_steps_range = [&](unsigned center_step) { + // unsigned start = (center_step > window_size) ? 
center_step - window_size : 0; + // unsigned end = std::min(center_step + window_size, num_steps - 1); + + // // Constrain to thread range + // if (start < thread_data.start_step) + // start = thread_data.start_step; + // if (end > thread_data.end_step) + // end = thread_data.end_step; + + // for (unsigned s = start; s <= end; ++s) { + // steps_to_check.insert(s); + // } + // }; + + // add_steps_range(best_move.from_step); + // add_steps_range(best_move.to_step); + + // for (unsigned step : steps_to_check) { + // for (unsigned proc = 0; proc < instance->numberOfProcessors(); ++proc) { + // const auto &nodes_in_step = active_schedule.getSetSchedule().step_processor_vertices[step][proc]; + // for (const auto &node : nodes_in_step) { + // if (!thread_data.affinity_table.is_selected(node) && !thread_data.lock_manager.is_locked(node)) { + // new_nodes.push_back(node); + // } + // } + // } + // } + + // // Deduplicate new_nodes + // std::sort(new_nodes.begin(), new_nodes.end()); + // new_nodes.erase(std::unique(new_nodes.begin(), new_nodes.end()), new_nodes.end()); + // } + // Determine the steps where max/second_max/max_count for work/comm changed std::unordered_set changed_steps; @@ -1150,7 +1189,8 @@ class kl_improver : public ImprovementScheduler { ThreadSearchContext &thread_data) { if (no_imp_counter >= thread_data.no_improvement_iterations_reduce_penalty && thread_data.reward_penalty_strat.initial_penalty > 1.0) { - thread_data.reward_penalty_strat.initial_penalty = static_cast(std::floor(std::sqrt(thread_data.reward_penalty_strat.initial_penalty))); + thread_data.reward_penalty_strat.initial_penalty = + static_cast(std::floor(std::sqrt(thread_data.reward_penalty_strat.initial_penalty))); thread_data.unlock_edge_backtrack_counter_reset += 1; thread_data.no_improvement_iterations_reduce_penalty += 15; #ifdef KL_DEBUG_1 @@ -1204,7 +1244,8 @@ class kl_improver : public ImprovementScheduler { if (select_nodes_check_remove_superstep(thread_data.step_to_remove, thread_data)) { active_schedule.swap_empty_step_fwd(thread_data.step_to_remove, thread_data.end_step); thread_data.end_step--; - thread_data.local_search_start_step = static_cast(thread_data.active_schedule_data.applied_moves.size()); + thread_data.local_search_start_step = + static_cast(thread_data.active_schedule_data.applied_moves.size()); thread_data.active_schedule_data.update_cost(static_cast(-1.0 * instance->synchronisationCosts())); if constexpr (enable_preresolving_violations) { diff --git a/tests/kl_bsp_cost.cpp b/tests/kl_bsp_cost.cpp index 36e999ff..05a5882c 100644 --- a/tests/kl_bsp_cost.cpp +++ b/tests/kl_bsp_cost.cpp @@ -19,7 +19,6 @@ limitations under the License. #define BOOST_TEST_MODULE kl_bsp_cost #include -#include "test_graphs.hpp" #include "osp/bsp/model/BspSchedule.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp" @@ -27,6 +26,7 @@ limitations under the License. 
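Before the test changes below, a quick numeric illustration of the penalty decay applied in the kl_improver.hpp hunk above: initial_penalty is repeatedly replaced by floor(sqrt(initial_penalty)) while it exceeds 1.0. The improver applies one such step each time the no-improvement threshold is hit; the loop below simply prints the full trajectory for an assumed starting value of 256 (a value chosen for illustration only).

    #include <cmath>
    #include <iostream>

    int main() {
        double p = 256.0;    // assumed starting value of initial_penalty
        while (p > 1.0) {
            p = std::floor(std::sqrt(p));    // 256 -> 16 -> 4 -> 2 -> 1
            std::cout << p << '\n';
        }
        return 0;
    }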
#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_util.hpp" #include "osp/concepts/graph_traits.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" +#include "test_graphs.hpp" using namespace osp; using graph = computational_dag_edge_idx_vector_impl_def_int_t; @@ -319,7 +319,7 @@ bool validate_comm_datastructures( } } } - + return all_match; } @@ -1083,4 +1083,192 @@ BOOST_AUTO_TEST_CASE(test_ladder_graph_moves) { kl_sched.apply_move(move4, active_schedule_data); comm_ds.update_datastructure_after_move(move4, 0, 5); BOOST_CHECK(validate_comm_datastructures(comm_ds, kl_sched, instance, "ladder_move4")); +} + +BOOST_AUTO_TEST_CASE(test_lazy_and_buffered_modes) { + std::cout << "Setup Graph" << std::endl; + graph instance; + instance.add_vertex(1, 10, 1); + instance.add_vertex(1, 10, 1); + instance.add_vertex(1, 10, 1); + + instance.add_edge(0, 1, 1); + instance.add_edge(0, 2, 1); + + std::cout << "Setup Arch" << std::endl; + osp::BspArchitecture arch; + arch.setNumberOfProcessors(2); + arch.setCommunicationCosts(1); + arch.setSynchronisationCosts(0); + + std::cout << "Setup BspInstance" << std::endl; + osp::BspInstance bsp_instance(instance, arch); + + std::cout << "Setup Schedule" << std::endl; + osp::BspSchedule schedule(bsp_instance); + schedule.setAssignedProcessor(0, 0); + schedule.setAssignedProcessor(1, 1); + schedule.setAssignedProcessor(2, 1); + + schedule.setAssignedSuperstep(0, 0); + schedule.setAssignedSuperstep(1, 2); + schedule.setAssignedSuperstep(2, 4); + + schedule.updateNumberOfSupersteps(); + + std::cout << "Setup KL Sched" << std::endl; + kl_active_schedule_t kl_sched; + kl_sched.initialize(schedule); + + thread_local_active_schedule_data active_schedule_data; + active_schedule_data.initialize_cost(0.0); + + std::cout << "Setup Complete" << std::endl; + std::cout << "Num Vertices: " << instance.num_vertices() << std::endl; + std::cout << "Num Procs: " << arch.numberOfProcessors() << std::endl; + + std::cout << "Start Eager Test" << std::endl; + { + using CommPolicy = osp::EagerCommCostPolicy; + osp::max_comm_datastructure comm_ds; + std::cout << "Initialize Eager Comm DS" << std::endl; + comm_ds.initialize(kl_sched); + + std::cout << "Checking node_lambda_map" << std::endl; + std::cout << "node_lambda_vec size: " << comm_ds.node_lambda_map.node_lambda_vec.size() << std::endl; + if (comm_ds.node_lambda_map.node_lambda_vec.size() > 0) { + std::cout << "node_lambda_vec[0] size: " << comm_ds.node_lambda_map.node_lambda_vec[0].size() << std::endl; + } + + std::cout << "Compute Eager Comm DS" << std::endl; + comm_ds.compute_comm_datastructures(0, 4); + std::cout << "Eager Done" << std::endl; + } + + std::cout << "Start Lazy Test" << std::endl; + // --- Test Lazy Policy --- + { + using CommPolicy = osp::LazyCommCostPolicy; + osp::max_comm_datastructure comm_ds; + std::cout << "Initialize Comm DS" << std::endl; + comm_ds.initialize(kl_sched); + std::cout << "Compute Comm DS" << std::endl; + comm_ds.compute_comm_datastructures(0, 4); + + // Expected Behavior for Lazy: + // Node 0 (P0) sends to P1. + // Children on P1 are at Step 2 and Step 4. + // Lazy policy should attribute cost to min(2, 4) - 1 = Step 1. + // Cost = 10 * 1.0 = 10. + + // Lazy: Send and Recv at min(2, 4) - 1 = Step 1. 
+ BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 1), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 1), 0); + + using kl_move = osp::kl_move_struct; + kl_move move(1, 0.0, 1, 2, 1, 3); // Node 1, Step 2->3, Proc 1->1 + kl_sched.apply_move(move, active_schedule_data); + comm_ds.update_datastructure_after_move(move, 0, 4); + + // After move: Children at {3, 4}. Min = 3. Send/Recv at Step 2. + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 1), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 1), 0); + + // Reset Node 1 to Step 2 for next test + kl_move move_back(1, 0.0, 1, 3, 1, 2); + kl_sched.apply_move(move_back, active_schedule_data); + } + + // --- Test Buffered Policy --- + { + using CommPolicy = osp::BufferedCommCostPolicy; + osp::max_comm_datastructure comm_ds; + comm_ds.initialize(kl_sched); + comm_ds.compute_comm_datastructures(0, 4); + + // Buffered: Send at Step 0. Recv at min(2, 4) - 1 = Step 1. 
+ BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 1), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 1), 0); + + using kl_move = osp::kl_move_struct; + kl_move move(1, 0.0, 1, 2, 1, 3); // Node 1, Step 2->3, Proc 1->1 + kl_sched.apply_move(move, active_schedule_data); + comm_ds.update_datastructure_after_move(move, 0, 4); + + // After move: Children at {3, 4}. Min = 3. Recv at Step 2. Send still at Step 0. + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(0, 0), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(1, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(2, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(3, 0), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_send(4, 0), 0); + + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(0, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(1, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(2, 1), 10); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(3, 1), 0); + BOOST_CHECK_EQUAL(comm_ds.step_proc_receive(4, 1), 0); + } } \ No newline at end of file diff --git a/tests/kl_bsp_improver_test.cpp b/tests/kl_bsp_improver_test.cpp index 250dfa18..df3ac3f1 100644 --- a/tests/kl_bsp_improver_test.cpp +++ b/tests/kl_bsp_improver_test.cpp @@ -129,122 +129,122 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) { BOOST_CHECK_CLOSE(iter4_recomputed, iter4_tracked, 0.00001); } -BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs) { - std::vector filenames_graph = large_spaa_graphs(); - using graph = computational_dag_edge_idx_vector_impl_def_int_t; - // Getting root git directory - std::filesystem::path cwd = std::filesystem::current_path(); - std::cout << cwd << std::endl; - while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) { - cwd = cwd.parent_path(); - std::cout << cwd << std::endl; - } +// BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs) { +// std::vector filenames_graph = large_spaa_graphs(); +// using graph = computational_dag_edge_idx_vector_impl_def_int_t; +// // Getting root git directory +// std::filesystem::path cwd = std::filesystem::current_path(); +// std::cout << cwd << std::endl; +// while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) { +// cwd = cwd.parent_path(); +// std::cout << cwd << std::endl; +// } - for (auto &filename_graph : filenames_graph) { - GreedyBspScheduler test_scheduler; - BspInstance instance; - bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(), - instance.getComputationalDag()); +// for (auto &filename_graph : filenames_graph) { +// 
GreedyBspScheduler test_scheduler; +// BspInstance instance; +// bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(), +// instance.getComputationalDag()); - instance.getArchitecture().setSynchronisationCosts(500); - instance.getArchitecture().setCommunicationCosts(5); - instance.getArchitecture().setNumberOfProcessors(4); +// instance.getArchitecture().setSynchronisationCosts(500); +// instance.getArchitecture().setCommunicationCosts(5); +// instance.getArchitecture().setNumberOfProcessors(4); - std::vector> send_cost = {{0, 1, 4, 4}, {1, 0, 4, 4}, {4, 4, 0, 1}, {4, 4, 1, 0}}; +// std::vector> send_cost = {{0, 1, 4, 4}, {1, 0, 4, 4}, {4, 4, 0, 1}, {4, 4, 1, 0}}; - instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().setSendCosts(send_cost); - if (!status_graph) { +// if (!status_graph) { - std::cout << "Reading files failed." << std::endl; - BOOST_CHECK(false); - } +// std::cout << "Reading files failed." << std::endl; +// BOOST_CHECK(false); +// } - add_mem_weights(instance.getComputationalDag()); +// add_mem_weights(instance.getComputationalDag()); - BspSchedule schedule(instance); - const auto result = test_scheduler.computeSchedule(schedule); +// BspSchedule schedule(instance); +// const auto result = test_scheduler.computeSchedule(schedule); - schedule.updateNumberOfSupersteps(); +// schedule.updateNumberOfSupersteps(); - std::cout << "initial scedule with costs: " << schedule.computeCosts() << " and " - << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "initial scedule with costs: " << schedule.computeCosts() << " and " +// << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; - BspSchedule schedule_2(schedule); +// BspSchedule schedule_2(schedule); - BOOST_CHECK_EQUAL(RETURN_STATUS::OSP_SUCCESS, result); - BOOST_CHECK_EQUAL(&schedule.getInstance(), &instance); - BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); +// BOOST_CHECK_EQUAL(RETURN_STATUS::OSP_SUCCESS, result); +// BOOST_CHECK_EQUAL(&schedule.getInstance(), &instance); +// BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); - kl_total_lambda_comm_improver kl_total_lambda; - auto start_time = std::chrono::high_resolution_clock::now(); - auto status = kl_total_lambda.improveSchedule(schedule); - auto finish_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(finish_time - start_time).count(); +// kl_total_lambda_comm_improver kl_total_lambda; +// auto start_time = std::chrono::high_resolution_clock::now(); +// auto status = kl_total_lambda.improveSchedule(schedule); +// auto finish_time = std::chrono::high_resolution_clock::now(); +// auto duration = std::chrono::duration_cast(finish_time - start_time).count(); - std::cout << "kl lambda new finished in " << duration << " seconds, costs: " << schedule.computeCosts() - << " and lambda costs: " << schedule.computeTotalLambdaCosts() << " with " - << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "kl lambda new finished in " << duration << " seconds, costs: " << schedule.computeCosts() +// << " and lambda costs: " << schedule.computeTotalLambdaCosts() << " with " +// << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; - BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); - BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); +// BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS 
|| status == RETURN_STATUS::BEST_FOUND); +// BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); - kl_bsp_comm_improver_mt kl; - kl.setTimeQualityParameter(5.0); - start_time = std::chrono::high_resolution_clock::now(); - status = kl.improveSchedule(schedule); - finish_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(finish_time - start_time).count(); +// kl_bsp_comm_improver_mt kl(42); +// kl.setTimeQualityParameter(2.0); +// start_time = std::chrono::high_resolution_clock::now(); +// status = kl.improveSchedule(schedule); +// finish_time = std::chrono::high_resolution_clock::now(); +// duration = std::chrono::duration_cast(finish_time - start_time).count(); - std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeCosts() << " with " - << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeCosts() << " with " +// << schedule.numberOfSupersteps() << " number of supersteps" << std::endl; - BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); - BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); +// BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); +// BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); - BspScheduleCS schedule_cs(schedule); +// BspScheduleCS schedule_cs(schedule); - HillClimbingForCommSteps hc_comm_steps; - start_time = std::chrono::high_resolution_clock::now(); - status = hc_comm_steps.improveSchedule(schedule_cs); - finish_time = std::chrono::high_resolution_clock::now(); +// HillClimbingForCommSteps hc_comm_steps; +// start_time = std::chrono::high_resolution_clock::now(); +// status = hc_comm_steps.improveSchedule(schedule_cs); +// finish_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(finish_time - start_time).count(); +// duration = std::chrono::duration_cast(finish_time - start_time).count(); - std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs.computeCosts() - << " with " << schedule_cs.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs.computeCosts() +// << " with " << schedule_cs.numberOfSupersteps() << " number of supersteps" << std::endl; - BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); - BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); +// BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); +// BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); - kl_total_lambda.improveSchedule(schedule_2); +// kl_total_lambda.improveSchedule(schedule_2); - HillClimbingScheduler hc; +// HillClimbingScheduler hc; - start_time = std::chrono::high_resolution_clock::now(); - status = hc.improveSchedule(schedule_2); - finish_time = std::chrono::high_resolution_clock::now(); +// start_time = std::chrono::high_resolution_clock::now(); +// status = hc.improveSchedule(schedule_2); +// finish_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(finish_time - start_time).count(); +// duration = std::chrono::duration_cast(finish_time - start_time).count(); - std::cout << "hc finished in " << duration << " seconds, costs: " << 
schedule_2.computeCosts() << " with " - << schedule_2.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "hc finished in " << duration << " seconds, costs: " << schedule_2.computeCosts() << " with " +// << schedule_2.numberOfSupersteps() << " number of supersteps" << std::endl; - BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); - BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); +// BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); +// BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); - BspScheduleCS schedule_cs_2(schedule_2); +// BspScheduleCS schedule_cs_2(schedule_2); - start_time = std::chrono::high_resolution_clock::now(); - status = hc_comm_steps.improveSchedule(schedule_cs_2); - finish_time = std::chrono::high_resolution_clock::now(); +// start_time = std::chrono::high_resolution_clock::now(); +// status = hc_comm_steps.improveSchedule(schedule_cs_2); +// finish_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(finish_time - start_time).count(); +// duration = std::chrono::duration_cast(finish_time - start_time).count(); - std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs_2.computeCosts() - << " with " << schedule_cs_2.numberOfSupersteps() << " number of supersteps" << std::endl; +// std::cout << "hc_comm_steps finished in " << duration << " seconds, costs: " << schedule_cs_2.computeCosts() +// << " with " << schedule_cs_2.numberOfSupersteps() << " number of supersteps" << std::endl; - BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); - BOOST_CHECK_EQUAL(schedule_cs_2.satisfiesPrecedenceConstraints(), true); - } -} \ No newline at end of file +// BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); +// BOOST_CHECK_EQUAL(schedule_cs_2.satisfiesPrecedenceConstraints(), true); +// } +// } \ No newline at end of file From 3264e04df55348180abe65454b4539a1bc8441c6 Mon Sep 17 00:00:00 2001 From: tonibohnlein Date: Thu, 27 Nov 2025 15:16:04 +0100 Subject: [PATCH 3/3] simplification --- .../max_comm_datastructure.hpp | 67 +++++++++---------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp index b3820231..236e11cc 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/max_comm_datastructure.hpp @@ -277,47 +277,46 @@ struct max_comm_datastructure { // Update Parents' Outgoing Communication (Parents → Node) - if (from_proc != to_proc || from_step != to_step) { - for (const auto &parent : graph.parents(node)) { - const unsigned parent_step = active_schedule->assigned_superstep(parent); - // Fast boundary check - if (parent_step >= step_proc_send_.size()) - continue; - - const unsigned parent_proc = active_schedule->assigned_processor(parent); - const comm_weight_t comm_w_parent = graph.vertex_comm_weight(parent); - - auto &val = node_lambda_map.get_proc_entry(parent, from_proc); - const bool removed_from_proc = CommPolicy::remove_child(val, from_step); - - // 1. 
Handle Removal from from_proc - if (removed_from_proc) { - if (from_proc != parent_proc) { - const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, from_proc); - if (cost > 0) { - CommPolicy::unattribute_communication(*this, cost, parent_step, parent_proc, from_proc, - from_step, val); - } + for (const auto &parent : graph.parents(node)) { + const unsigned parent_step = active_schedule->assigned_superstep(parent); + // Fast boundary check + if (parent_step >= step_proc_send_.size()) + continue; + + const unsigned parent_proc = active_schedule->assigned_processor(parent); + const comm_weight_t comm_w_parent = graph.vertex_comm_weight(parent); + + auto &val = node_lambda_map.get_proc_entry(parent, from_proc); + const bool removed_from_proc = CommPolicy::remove_child(val, from_step); + + // 1. Handle Removal from from_proc + if (removed_from_proc) { + if (from_proc != parent_proc) { + const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, from_proc); + if (cost > 0) { + CommPolicy::unattribute_communication(*this, cost, parent_step, parent_proc, from_proc, + from_step, val); } } + } + + auto &val_to = node_lambda_map.get_proc_entry(parent, to_proc); + const bool added_to_proc = CommPolicy::add_child(val_to, to_step); - auto &val_to = node_lambda_map.get_proc_entry(parent, to_proc); - const bool added_to_proc = CommPolicy::add_child(val_to, to_step); - - // 2. Handle Addition to to_proc - if (added_to_proc) { - if (to_proc != parent_proc) { - const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, to_proc); - if (cost > 0) { - CommPolicy::attribute_communication(*this, cost, parent_step, parent_proc, to_proc, to_step, - val_to); - } + // 2. Handle Addition to to_proc + if (added_to_proc) { + if (to_proc != parent_proc) { + const comm_weight_t cost = comm_w_parent * instance->sendCosts(parent_proc, to_proc); + if (cost > 0) { + CommPolicy::attribute_communication(*this, cost, parent_step, parent_proc, to_proc, to_step, + val_to); } } - - mark_step(parent_step); } + + mark_step(parent_step); } + // Re-arrange Affected Steps for (unsigned step : affected_steps_list) {
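// On the guard removal in this final hunk: dropping the former
// `if (from_proc != to_proc || from_step != to_step)` condition is presumably safe
// because every applied kl_move changes the processor or the superstep (or both),
// so the guard never fired false; and even for a degenerate move, remove_child
// followed by add_child restores the lambda entry, so any resulting
// unattribute/attribute pair applies equal and opposite updates that cancel.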