diff --git a/.clang-format b/.clang-format
index b622f779..93fcdef5 100644
--- a/.clang-format
+++ b/.clang-format
@@ -50,7 +50,7 @@ BreakConstructorInitializersBeforeComma: false
 BreakConstructorInitializers: BeforeColon
 BreakAfterJavaFieldAnnotations: false
 BreakStringLiterals: true
-ColumnLimit:     120
+ColumnLimit:     0
 CommentPragmas:  '^ IWYU pragma:'
 CompactNamespaces: true
 ConstructorInitializerAllOnOneLineOrOnePerLine: false
@@ -134,4 +134,3 @@ TabWidth:        8
 UseCRLF:         false
 UseTab:          Never
 ...
-
diff --git a/.githooks/pre-commit b/.githooks/pre-commit
index 81cf6a75..8fd073ee 100755
--- a/.githooks/pre-commit
+++ b/.githooks/pre-commit
@@ -47,28 +47,24 @@ fi
 
 
 # Format all .cpp and .hpp files
-for file in $(git diff --cached --name-only | grep -E '\.cpp$|\.hpp$'); do
-	# Check if the file exists
-	if [ ! -f $file ]; then
-		continue
+# We use 'git diff-index' to avoid issues with spaces in filenames
+git diff --cached --name-only --diff-filter=ACM $against | grep -E '\.(cpp|hpp)$' | while read -r file; do
+	if [ -f "$file" ]; then
+        # Apply clang-format in-place
+        clang-format -i "$file"
+        # Re-add the file to the commit to include the formatting changes
+        git add "$file"
 	fi
-	git add $file
 done
 
-# Remove trailing whitespace from all files (except .so)
-for file in $(git diff --cached --name-only | grep -vE '\.so$'); do
-	# Check if the file exists
-	if [ ! -f $file ]; then
-		continue
+# Remove trailing whitespace from all files (except .so and binary files)
+# Using sed to remove trailing whitespace
+git diff --cached --name-only --diff-filter=ACM $against | grep -vE '\.so$' | while read -r file; do
+	if [ -f "$file" ]; then
+        # Check if file is text to avoid corrupting binaries
+        if file "$file" | grep -q "text"; then
+            sed -i 's/[[:space:]]*$//' "$file"
+            git add "$file"
+        fi
 	fi
-	git add $file
 done
-
-for file in $(git diff --cached --name-only); do
-	# Check if the file exists
-	if [ ! -f $file ]; then
-		continue
-	fi
-	git add $file
-done
-
diff --git a/apps/test_suite_runner/StatsModules/BspCommStatsModule.hpp b/apps/test_suite_runner/StatsModules/BspCommStatsModule.hpp
index 6b9e96fd..7f1066ee 100644
--- a/apps/test_suite_runner/StatsModules/BspCommStatsModule.hpp
+++ b/apps/test_suite_runner/StatsModules/BspCommStatsModule.hpp
@@ -18,36 +18,36 @@ limitations under the License.
 
 #pragma once
 
-#include <string>
-#include <vector>
-#include <map>
 #include "IStatsModule.hpp"
 #include "osp/bsp/model/BspSchedule.hpp" // Still needed
+#include "osp/bsp/model/cost/BufferedSendingCost.hpp"
+#include "osp/bsp/model/cost/TotalCommunicationCost.hpp"
+#include "osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp"
+#include <map>
+#include <string>
+#include <vector>
 
 namespace osp {
 
 template<typename Graph_t>
-class BspCommStatsModule : public IStatisticModule<BspSchedule<Graph_t>> { 
-public:
-
-private:
+class BspCommStatsModule : public IStatisticModule<BspSchedule<Graph_t>> {
+  public:
+  private:
     const std::vector<std::string> metric_headers = {
-        "TotalCommCost", "TotalLambdaCommCost", "BufferedSendingCosts" 
-    };
-
-public:
+        "TotalCommCost", "TotalLambdaCommCost", "BufferedSendingCosts"};
 
+  public:
     std::vector<std::string> get_metric_headers() const override {
         return metric_headers;
     }
 
     std::map<std::string, std::string> record_statistics(
-                            const BspSchedule<Graph_t>& schedule, 
-                            std::ofstream& /*log_stream*/) const override {
+        const BspSchedule<Graph_t> &schedule,
+        std::ofstream & /*log_stream*/) const override {
         std::map<std::string, std::string> stats;
-        stats["TotalCommCost"] = std::to_string(schedule.computeTotalCosts());
-        stats["TotalLambdaCommCost"] = std::to_string(schedule.computeTotalLambdaCosts());
-        stats["BufferedSendingCosts"] = std::to_string(schedule.computeBufferedSendingCosts());
+        stats["TotalCommCost"] = std::to_string(TotalCommunicationCost<Graph_t>()(schedule));
+        stats["TotalLambdaCommCost"] = std::to_string(TotalLambdaCommunicationCost<Graph_t>()(schedule));
+        stats["BufferedSendingCosts"] = std::to_string(BufferedSendingCost<Graph_t>()(schedule));
         return stats;
     }
 };
diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
index 175f5335..d7f7e77f 100644
--- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
+++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp
@@ -20,6 +20,7 @@ limitations under the License.
 
 #ifdef EIGEN_FOUND
 
+#include <Eigen/Core>
 #include <algorithm>
 #include <iostream>
 #include <list>
@@ -28,16 +29,13 @@ limitations under the License.
 #include <random>
 #include <stdexcept>
 #include <vector>
-#include <Eigen/Core>
-#include <omp.h>
 
-#include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp"
 #include "osp/bsp/model/BspInstance.hpp"
 #include "osp/bsp/model/BspSchedule.hpp"
+#include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp"
 
 namespace osp {
 
-
 template<typename eigen_idx_type>
 class Sptrsv {
     using uVertType = typename SparseMatrixImp<eigen_idx_type>::vertex_idx;
@@ -51,14 +49,14 @@ class Sptrsv {
 
     std::vector<uVertType> col_idx;
     std::vector<uVertType> row_ptr;
-    
+
     std::vector<uVertType> row_idx;
     std::vector<uVertType> col_ptr;
 
     std::vector<std::vector<unsigned>> step_proc_ptr;
     std::vector<std::vector<unsigned>> step_proc_num;
 
-    double * x;
+    double *x;
     const double *b;
 
     unsigned num_supersteps;
@@ -67,12 +65,12 @@ class Sptrsv {
     std::vector<std::vector<std::vector<eigen_idx_type>>> vector_step_processor_vertices_u;
     std::vector<int> ready;
 
-    std::vector<std::vector<std::vector<eigen_idx_type>>>  bounds_array_l;
-    std::vector<std::vector<std::vector<eigen_idx_type>>>  bounds_array_u;
+    std::vector<std::vector<std::vector<eigen_idx_type>>> bounds_array_l;
+    std::vector<std::vector<std::vector<eigen_idx_type>>> bounds_array_u;
 
     Sptrsv() = default;
 
-    Sptrsv(BspInstance<SparseMatrixImp<eigen_idx_type>>  &inst) : instance(&inst) {};
+    Sptrsv(BspInstance<SparseMatrixImp<eigen_idx_type>> &inst) : instance(&inst) {};
 
     void setup_csr_no_permutation(const BspSchedule<SparseMatrixImp<eigen_idx_type>> &schedule) {
         vector_step_processor_vertices = std::vector<std::vector<std::vector<eigen_idx_type>>>(
@@ -93,73 +91,71 @@ class Sptrsv {
         num_supersteps = schedule.numberOfSupersteps();
         size_t number_of_vertices = instance->getComputationalDag().num_vertices();
 
-        #pragma omp parallel num_threads(2)
+#pragma omp parallel num_threads(2)
         {
             int id = omp_get_thread_num();
-            switch(id) {
-                case 0:
-                {
-                    for (size_t node=0; node < number_of_vertices; ++node){
-                        vector_step_processor_vertices[schedule.assignedSuperstep(node)][schedule.assignedProcessor(node)].push_back(static_cast<eigen_idx_type>(node));
-                    }  
-
-                    for (unsigned int step=0; step<schedule.numberOfSupersteps(); ++step){
-                        for (unsigned int proc=0; proc<instance->numberOfProcessors(); ++proc){
-                            if (!vector_step_processor_vertices[step][proc].empty()){
-                                eigen_idx_type start = vector_step_processor_vertices[step][proc][0];
-                                eigen_idx_type prev = vector_step_processor_vertices[step][proc][0];
-
-                                for (size_t i=1; i< vector_step_processor_vertices[step][proc].size(); ++i){
-                                    if(vector_step_processor_vertices[step][proc][i] != prev + 1){
-                                        bounds_array_l[step][proc].push_back(start);
-                                        bounds_array_l[step][proc].push_back(prev);
-                                        start = vector_step_processor_vertices[step][proc][i];
-                                    }
-                                    prev = vector_step_processor_vertices[step][proc][i];
-                                }
+            switch (id) {
+            case 0: {
+                for (size_t node = 0; node < number_of_vertices; ++node) {
+                    vector_step_processor_vertices[schedule.assignedSuperstep(node)][schedule.assignedProcessor(node)].push_back(static_cast<eigen_idx_type>(node));
+                }
 
-                                bounds_array_l[step][proc].push_back(start);
-                                bounds_array_l[step][proc].push_back(prev);
+                for (unsigned int step = 0; step < schedule.numberOfSupersteps(); ++step) {
+                    for (unsigned int proc = 0; proc < instance->numberOfProcessors(); ++proc) {
+                        if (!vector_step_processor_vertices[step][proc].empty()) {
+                            eigen_idx_type start = vector_step_processor_vertices[step][proc][0];
+                            eigen_idx_type prev = vector_step_processor_vertices[step][proc][0];
+
+                            for (size_t i = 1; i < vector_step_processor_vertices[step][proc].size(); ++i) {
+                                if (vector_step_processor_vertices[step][proc][i] != prev + 1) {
+                                    bounds_array_l[step][proc].push_back(start);
+                                    bounds_array_l[step][proc].push_back(prev);
+                                    start = vector_step_processor_vertices[step][proc][i];
+                                }
+                                prev = vector_step_processor_vertices[step][proc][i];
                             }
+
+                            bounds_array_l[step][proc].push_back(start);
+                            bounds_array_l[step][proc].push_back(prev);
                         }
                     }
-
-                    break;
                 }
-                case 1:
-                {   
-                    size_t node=number_of_vertices;
-                    do {
-                        node--;
-                        vector_step_processor_vertices_u[schedule.assignedSuperstep(node)][schedule.assignedProcessor(node)].push_back(static_cast<eigen_idx_type>(node));
-                    } while (node > 0);
-
-                    for (unsigned int step=0; step<schedule.numberOfSupersteps(); ++step){
-                        for (unsigned int proc=0; proc<instance->numberOfProcessors(); ++proc){
-                            if (!vector_step_processor_vertices_u[step][proc].empty()){
-                                eigen_idx_type start_u = static_cast<eigen_idx_type>(vector_step_processor_vertices_u[step][proc][0]);
-                                eigen_idx_type prev_u = static_cast<eigen_idx_type>(vector_step_processor_vertices_u[step][proc][0]);
-
-                                for (size_t i=1; i<vector_step_processor_vertices_u[step][proc].size(); ++i){
-                                    if( static_cast<eigen_idx_type>(vector_step_processor_vertices_u[step][proc][i]) != prev_u - 1){
-                                        bounds_array_u[step][proc].push_back(start_u);
-                                        bounds_array_u[step][proc].push_back(prev_u);
-                                        start_u = static_cast<eigen_idx_type>(vector_step_processor_vertices_u[step][proc][i]);
-                                    }
-                                    prev_u = static_cast<eigen_idx_type>(vector_step_processor_vertices_u[step][proc][i]);
-                                }
 
-                                bounds_array_u[step][proc].push_back(start_u);
-                                bounds_array_u[step][proc].push_back(prev_u);
+                break;
+            }
+            case 1: {
+                size_t node = number_of_vertices;
+                do {
+                    node--;
+                    vector_step_processor_vertices_u[schedule.assignedSuperstep(node)][schedule.assignedProcessor(node)].push_back(static_cast<eigen_idx_type>(node));
+                } while (node > 0);
+
+                for (unsigned int step = 0; step < schedule.numberOfSupersteps(); ++step) {
+                    for (unsigned int proc = 0; proc < instance->numberOfProcessors(); ++proc) {
+                        if (!vector_step_processor_vertices_u[step][proc].empty()) {
+                            eigen_idx_type start_u = static_cast<eigen_idx_type>(vector_step_processor_vertices_u[step][proc][0]);
+                            eigen_idx_type prev_u = static_cast<eigen_idx_type>(vector_step_processor_vertices_u[step][proc][0]);
+
+                            for (size_t i = 1; i < vector_step_processor_vertices_u[step][proc].size(); ++i) {
+                                if (static_cast<eigen_idx_type>(vector_step_processor_vertices_u[step][proc][i]) != prev_u - 1) {
+                                    bounds_array_u[step][proc].push_back(start_u);
+                                    bounds_array_u[step][proc].push_back(prev_u);
+                                    start_u = static_cast<eigen_idx_type>(vector_step_processor_vertices_u[step][proc][i]);
+                                }
+                                prev_u = static_cast<eigen_idx_type>(vector_step_processor_vertices_u[step][proc][i]);
                             }
+
+                            bounds_array_u[step][proc].push_back(start_u);
+                            bounds_array_u[step][proc].push_back(prev_u);
                         }
                     }
-
-                    break;
-                }
-                default:{
-                    std::cout << "Unexpected Behaviour" << std::endl;
                 }
+
+                break;
+            }
+            default: {
+                std::cout << "Unexpected Behaviour" << std::endl;
+            }
             }
         }
     }
@@ -184,7 +180,7 @@ class Sptrsv {
         step_proc_ptr =
             std::vector<std::vector<unsigned>>(num_supersteps, std::vector<unsigned>(instance->numberOfProcessors(), 0));
 
-        step_proc_num = schedule.num_assigned_nodes_per_superstep_processor();
+        step_proc_num = schedule.numAssignedNodesPerSuperstepProcessor();
 
         unsigned current_step = 0;
         unsigned current_processor = 0;
@@ -194,9 +190,9 @@ class Sptrsv {
         for (const uVertType &node : perm_inv) {
 
             if (schedule.assignedProcessor(node) != current_processor || schedule.assignedSuperstep(node) != current_step) {
-                
+
                 while (schedule.assignedProcessor(node) != current_processor ||
-                    schedule.assignedSuperstep(node) != current_step) {
+                       schedule.assignedSuperstep(node) != current_step) {
 
                     if (current_processor < instance->numberOfProcessors() - 1) {
                         current_processor++;
@@ -207,7 +203,6 @@ class Sptrsv {
                 }
 
                 step_proc_ptr[current_step][current_processor] = static_cast<unsigned>(row_ptr.size());
-
             }
 
             row_ptr.push_back(col_idx.size());
@@ -225,7 +220,7 @@ class Sptrsv {
                 const auto *outer = instance->getComputationalDag().getCSR()->outerIndexPtr();
                 for (uVertType par_ind = static_cast<uVertType>(outer[node]); par_ind < static_cast<uVertType>(outer[node + 1] - 1); ++par_ind) {
 
-                    if (static_cast<size_t>(instance->getComputationalDag().getCSR()->innerIndexPtr()[par_ind]) == perm_inv[par]){
+                    if (static_cast<size_t>(instance->getComputationalDag().getCSR()->innerIndexPtr()[par_ind]) == perm_inv[par]) {
                         val.push_back(instance->getComputationalDag().getCSR()->valuePtr()[par_ind]);
                         found++;
                     }
@@ -234,62 +229,62 @@ class Sptrsv {
             }
 
             col_idx.push_back(perm[node]);
-            val.push_back(instance->getComputationalDag().getCSR()->valuePtr()[ instance->getComputationalDag().getCSR()->outerIndexPtr()[node + 1] - 1 ]);
+            val.push_back(instance->getComputationalDag().getCSR()->valuePtr()[instance->getComputationalDag().getCSR()->outerIndexPtr()[node + 1] - 1]);
         }
 
         row_ptr.push_back(col_idx.size());
     }
 
-    void lsolve_serial(){
+    void lsolve_serial() {
         eigen_idx_type number_of_vertices = static_cast<eigen_idx_type>(instance->numberOfVertices());
-        for (eigen_idx_type i = 0; i < number_of_vertices; ++i){
+        for (eigen_idx_type i = 0; i < number_of_vertices; ++i) {
             x[i] = b[i];
-            for (eigen_idx_type j = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i]; j < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1; ++j){
+            for (eigen_idx_type j = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i]; j < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1; ++j) {
                 x[i] -= (*(instance->getComputationalDag().getCSR())).valuePtr()[j] * x[(*(instance->getComputationalDag().getCSR())).innerIndexPtr()[j]];
             }
             x[i] /= (*(instance->getComputationalDag().getCSR())).valuePtr()[(*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1];
         }
     }
 
-    void usolve_serial(){
+    void usolve_serial() {
         eigen_idx_type number_of_vertices = static_cast<eigen_idx_type>(instance->numberOfVertices());
 
         eigen_idx_type i = number_of_vertices;
         do {
             i--;
             x[i] = b[i];
-            for (eigen_idx_type j = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i] + 1; j < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i + 1]; ++j){
+            for (eigen_idx_type j = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i] + 1; j < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i + 1]; ++j) {
                 x[i] -= (*(instance->getComputationalDag().getCSC())).valuePtr()[j] * x[(*(instance->getComputationalDag().getCSC())).innerIndexPtr()[j]];
             }
             x[i] /= (*(instance->getComputationalDag().getCSC())).valuePtr()[(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i]];
         } while (i != 0);
     }
 
-    void lsolve_no_permutation_in_place(){
-    #pragma omp parallel num_threads(instance->numberOfProcessors())
+    void lsolve_no_permutation_in_place() {
+#pragma omp parallel num_threads(instance->numberOfProcessors())
         {
             const size_t proc = static_cast<size_t>(omp_get_thread_num());
-            for (unsigned step = 0; step < num_supersteps; ++step){
+            for (unsigned step = 0; step < num_supersteps; ++step) {
                 const size_t bounds_str_size = bounds_array_l[step][proc].size();
-                
-                for (size_t index = 0; index < bounds_str_size; index+=2){
+
+                for (size_t index = 0; index < bounds_str_size; index += 2) {
                     eigen_idx_type lower_b = bounds_array_l[step][proc][index];
-                    const eigen_idx_type upper_b = bounds_array_l[step][proc][index+1];
-                    
-                    for (eigen_idx_type node = lower_b; node<=upper_b; ++node){
-                        for (eigen_idx_type i = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node]; i < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1; ++i){
+                    const eigen_idx_type upper_b = bounds_array_l[step][proc][index + 1];
+
+                    for (eigen_idx_type node = lower_b; node <= upper_b; ++node) {
+                        for (eigen_idx_type i = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node]; i < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1; ++i) {
                             x[node] -= (*(instance->getComputationalDag().getCSR())).valuePtr()[i] * x[(*(instance->getComputationalDag().getCSR())).innerIndexPtr()[i]];
                         }
                         x[node] /= (*(instance->getComputationalDag().getCSR())).valuePtr()[(*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1];
                     }
                 }
-    #pragma omp barrier
-            }        
+#pragma omp barrier
+            }
         }
     }
 
-    void usolve_no_permutation_in_place(){
-    #pragma omp parallel num_threads(instance->numberOfProcessors())
+    void usolve_no_permutation_in_place() {
+#pragma omp parallel num_threads(instance->numberOfProcessors())
         {
             // Process each superstep starting from the last one (opposite of lsolve)
             const size_t proc = static_cast<size_t>(omp_get_thread_num());
@@ -297,49 +292,49 @@ class Sptrsv {
             do {
                 step--;
                 const size_t bounds_str_size = bounds_array_u[step][proc].size();
-                for (size_t index = 0; index < bounds_str_size; index+=2){
+                for (size_t index = 0; index < bounds_str_size; index += 2) {
                     eigen_idx_type node = bounds_array_u[step][proc][index] + 1;
-                    const eigen_idx_type lower_b = bounds_array_u[step][proc][index+1];
+                    const eigen_idx_type lower_b = bounds_array_u[step][proc][index + 1];
 
                     do {
                         node--;
-                        for (eigen_idx_type i=(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node] + 1; i < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node + 1]; ++i){
+                        for (eigen_idx_type i = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node] + 1; i < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node + 1]; ++i) {
                             x[node] -= (*(instance->getComputationalDag().getCSC())).valuePtr()[i] * x[(*(instance->getComputationalDag().getCSC())).innerIndexPtr()[i]];
                         }
                         x[node] /= (*(instance->getComputationalDag().getCSC())).valuePtr()[(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node]];
                     } while (node != lower_b);
                 }
-    #pragma omp barrier
-            } while (step!=0);    
+#pragma omp barrier
+            } while (step != 0);
         }
     }
 
-    void lsolve_no_permutation(){
-    #pragma omp parallel num_threads(instance->numberOfProcessors())
+    void lsolve_no_permutation() {
+#pragma omp parallel num_threads(instance->numberOfProcessors())
         {
             const size_t proc = static_cast<size_t>(omp_get_thread_num());
-            for (unsigned step = 0; step < num_supersteps; ++step){
+            for (unsigned step = 0; step < num_supersteps; ++step) {
                 const size_t bounds_str_size = bounds_array_l[step][proc].size();
-                
-                for (size_t index = 0; index < bounds_str_size; index+=2){
+
+                for (size_t index = 0; index < bounds_str_size; index += 2) {
                     eigen_idx_type lower_b = bounds_array_l[step][proc][index];
-                    const eigen_idx_type upper_b = bounds_array_l[step][proc][index+1];
-                    
-                    for (eigen_idx_type node = lower_b; node<=upper_b; ++node){
+                    const eigen_idx_type upper_b = bounds_array_l[step][proc][index + 1];
+
+                    for (eigen_idx_type node = lower_b; node <= upper_b; ++node) {
                         x[node] = b[node];
-                        for (eigen_idx_type i = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node]; i < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1; ++i){
+                        for (eigen_idx_type i = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node]; i < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1; ++i) {
                             x[node] -= (*(instance->getComputationalDag().getCSR())).valuePtr()[i] * x[(*(instance->getComputationalDag().getCSR())).innerIndexPtr()[i]];
                         }
                         x[node] /= (*(instance->getComputationalDag().getCSR())).valuePtr()[(*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1];
                     }
                 }
-    #pragma omp barrier
-            }        
+#pragma omp barrier
+            }
         }
     }
 
-    void usolve_no_permutation(){
-    #pragma omp parallel num_threads(instance->numberOfProcessors())
+    void usolve_no_permutation() {
+#pragma omp parallel num_threads(instance->numberOfProcessors())
         {
             // Process each superstep starting from the last one (opposite of lsolve)
             const size_t proc = static_cast<size_t>(omp_get_thread_num());
@@ -347,50 +342,48 @@ class Sptrsv {
             do {
                 step--;
                 const size_t bounds_str_size = bounds_array_u[step][proc].size();
-                for (size_t index = 0; index < bounds_str_size; index+=2){
+                for (size_t index = 0; index < bounds_str_size; index += 2) {
                     eigen_idx_type node = bounds_array_u[step][proc][index] + 1;
-                    const eigen_idx_type lower_b = bounds_array_u[step][proc][index+1];
+                    const eigen_idx_type lower_b = bounds_array_u[step][proc][index + 1];
 
                     do {
                         node--;
                         x[node] = b[node];
-                        for (eigen_idx_type i=(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node] + 1; i < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node + 1]; ++i){
+                        for (eigen_idx_type i = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node] + 1; i < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node + 1]; ++i) {
                             x[node] -= (*(instance->getComputationalDag().getCSC())).valuePtr()[i] * x[(*(instance->getComputationalDag().getCSC())).innerIndexPtr()[i]];
                         }
                         x[node] /= (*(instance->getComputationalDag().getCSC())).valuePtr()[(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node]];
                     } while (node != lower_b);
                 }
-    #pragma omp barrier
-            } while (step!=0);    
+#pragma omp barrier
+            } while (step != 0);
         }
     }
 
-    void lsolve_serial_in_place(){
+    void lsolve_serial_in_place() {
         eigen_idx_type number_of_vertices = static_cast<eigen_idx_type>(instance->numberOfVertices());
-        for (eigen_idx_type i = 0; i < number_of_vertices; ++i){
-            for (eigen_idx_type j = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i]; j < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1; ++j){
+        for (eigen_idx_type i = 0; i < number_of_vertices; ++i) {
+            for (eigen_idx_type j = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i]; j < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1; ++j) {
                 x[i] -= (*(instance->getComputationalDag().getCSR())).valuePtr()[j] * x[(*(instance->getComputationalDag().getCSR())).innerIndexPtr()[j]];
             }
             x[i] /= (*(instance->getComputationalDag().getCSR())).valuePtr()[(*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1];
         }
-        
     }
 
-    void usolve_serial_in_place(){
+    void usolve_serial_in_place() {
         eigen_idx_type number_of_vertices = static_cast<eigen_idx_type>(instance->numberOfVertices());
         eigen_idx_type i = number_of_vertices;
         do {
             i--;
-            for (eigen_idx_type j = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i] + 1; j < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i + 1]; ++j){
+            for (eigen_idx_type j = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i] + 1; j < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i + 1]; ++j) {
                 x[i] -= (*(instance->getComputationalDag().getCSC())).valuePtr()[j] * x[(*(instance->getComputationalDag().getCSC())).innerIndexPtr()[j]];
             }
             x[i] /= (*(instance->getComputationalDag().getCSC())).valuePtr()[(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i]];
         } while (i != 0);
-        
     }
 
     void lsolve_with_permutation_in_place() {
-        #pragma omp parallel num_threads(instance->numberOfProcessors())
+#pragma omp parallel num_threads(instance->numberOfProcessors())
         {
             for (unsigned step = 0; step < num_supersteps; step++) {
 
@@ -405,13 +398,13 @@ class Sptrsv {
                     x[_row_idx] /= val[row_ptr[_row_idx + 1] - 1];
                 }
 
-    #pragma omp barrier
+#pragma omp barrier
             }
         }
     }
 
     void lsolve_with_permutation() {
-        #pragma omp parallel num_threads(instance->numberOfProcessors())
+#pragma omp parallel num_threads(instance->numberOfProcessors())
         {
             for (unsigned step = 0; step < num_supersteps; step++) {
 
@@ -426,12 +419,11 @@ class Sptrsv {
                     x[_row_idx] /= val[row_ptr[_row_idx + 1] - 1];
                 }
 
-    #pragma omp barrier
+#pragma omp barrier
             }
         }
     }
 
-
     void reset_x() {
         eigen_idx_type number_of_vertices = static_cast<eigen_idx_type>(instance->numberOfVertices());
         for (eigen_idx_type i = 0; i < number_of_vertices; i++) {
@@ -459,13 +451,13 @@ class Sptrsv {
         }
     }
 
-    std::size_t get_number_of_vertices(){
-      return instance->numberOfVertices() ;
+    std::size_t get_number_of_vertices() {
+        return instance->numberOfVertices();
     }
 
     virtual ~Sptrsv() = default;
 };
 
-}
+} // namespace osp
 
 #endif
\ No newline at end of file
diff --git a/include/osp/bsp/model/BspSchedule.hpp b/include/osp/bsp/model/BspSchedule.hpp
index 1f214b00..eeeaeec3 100644
--- a/include/osp/bsp/model/BspSchedule.hpp
+++ b/include/osp/bsp/model/BspSchedule.hpp
@@ -19,16 +19,14 @@ limitations under the License.
 #pragma once
 
 #include <algorithm>
-#include <iostream>
-#include <list>
-#include <map>
 #include <stdexcept>
 #include <unordered_set>
 #include <vector>
 
-#include "IBspScheduleEval.hpp"
 #include "IBspSchedule.hpp"
+#include "IBspScheduleEval.hpp"
 #include "SetSchedule.hpp"
+#include "osp/bsp/model/cost/LazyCommunicationCost.hpp"
 #include "osp/concepts/computational_dag_concept.hpp"
 
 namespace osp {
@@ -37,25 +35,28 @@ namespace osp {
  * @class BspSchedule
  * @brief Represents a schedule for the Bulk Synchronous Parallel (BSP) model.
  *
- * The `BspSchedule` class is responsible for managing the assignment of nodes to processors and supersteps in the BSP
- * model. It stores information such as the number of supersteps, the assignment of nodes to processors and supersteps,
- * and the communication schedule.
+ * The `BspSchedule` class manages the assignment of nodes to processors and supersteps within the BSP model.
+ * It serves as a core component for scheduling algorithms, providing mechanisms to:
+ * - Store and retrieve node-to-processor and node-to-superstep assignments.
+ * - Validate schedules against precedence, memory, and node type constraints.
+ * - Compute costs associated with the schedule.
+ * - Manipulate the schedule, including updating assignments and merging supersteps.
  *
- * The class provides methods for setting and retrieving the assigned superstep and processor for a given node, as well
- * as methods for checking the validity of the communication schedule and computing the costs of the schedule. It also
- * provides methods for setting the assigned supersteps and processors based on external assignments, and for updating
- * the number of supersteps.
+ * This class is templated on `Graph_t`, which must satisfy the `computational_dag_concept`.
+ * Moreover, the work and communication weights of the nodes must be of the same type in order to properly compute the cost.
  *
- * The `BspSchedule` class is designed to work with a `BspInstance` object, which represents the instance of the BSP
- * problem being solved.
+ * It interacts closely with `BspInstance` to access problem-specific data and constraints. In fact, a `BspSchedule` object is tied to a `BspInstance` object.
  *
+ * @tparam Graph_t The type of the computational DAG, which must satisfy `is_computational_dag_v`.
  * @see BspInstance
+ * @see IBspSchedule
+ * @see IBspScheduleEval
  */
 template<typename Graph_t>
 class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_t> {
 
     static_assert(is_computational_dag_v<Graph_t>, "BspSchedule can only be used with computational DAGs.");
-    static_assert(std::is_same_v<v_workw_t<Graph_t>, v_commw_t<Graph_t> >, "BspSchedule requires work and comm. weights to have the same type.");
+    static_assert(std::is_same_v<v_workw_t<Graph_t>, v_commw_t<Graph_t>>, "BspSchedule requires work and comm. weights to have the same type.");
 
   protected:
     using vertex_idx = vertex_idx_t<Graph_t>;
@@ -67,83 +68,15 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
     std::vector<unsigned> node_to_processor_assignment;
     std::vector<unsigned> node_to_superstep_assignment;
 
-    void compute_lazy_communication_costs_helper(std::vector<std::vector<v_commw_t<Graph_t>>> & rec, std::vector<std::vector<v_commw_t<Graph_t>>> & send) const {
-        for (const auto &node : instance->vertices()) {
-
-            std::vector<unsigned> step_needed(instance->numberOfProcessors(), number_of_supersteps);
-            for (const auto &target : instance->getComputationalDag().children(node)) {
-
-                if (node_to_processor_assignment[node] != node_to_processor_assignment[target]) {
-                    step_needed[node_to_processor_assignment[target]] = std::min(
-                        step_needed[node_to_processor_assignment[target]], node_to_superstep_assignment[target]);
-                }
-            }
-
-            for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
-
-                if (step_needed[proc] < number_of_supersteps) {
-
-                    send[node_to_processor_assignment[node]][step_needed[proc] - getStaleness()] +=
-                        instance->sendCosts(node_to_processor_assignment[node], proc) *
-                        instance->getComputationalDag().vertex_comm_weight(node);
-
-                    rec[proc][step_needed[proc] - getStaleness()] += instance->sendCosts(node_to_processor_assignment[node], proc) *
-                                                        instance->getComputationalDag().vertex_comm_weight(node);
-                }
-            }
-        }
-    }
-
-    std::vector<v_commw_t<Graph_t>> compute_max_comm_per_step_helper(const std::vector<std::vector<v_commw_t<Graph_t>>> & rec, const std::vector<std::vector<v_commw_t<Graph_t>>> & send) const {
-        std::vector<v_commw_t<Graph_t>> max_comm_per_step(number_of_supersteps, 0);
-        for (unsigned step = 0; step < number_of_supersteps; step++) {
-            v_commw_t<Graph_t> max_send = 0;
-            v_commw_t<Graph_t> max_rec = 0;
-
-            for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
-                if (max_send < send[proc][step])
-                    max_send = send[proc][step];
-                if (max_rec < rec[proc][step])
-                    max_rec = rec[proc][step];
-            }
-            max_comm_per_step[step] = std::max(max_send, max_rec) * instance->communicationCosts();
-        }
-        return max_comm_per_step;
-    }
-
-    std::vector<v_workw_t<Graph_t>> compute_max_work_per_step_helper() const {
-        std::vector<std::vector<v_workw_t<Graph_t>>> work = std::vector<std::vector<v_workw_t<Graph_t>>>(
-            number_of_supersteps, std::vector<v_workw_t<Graph_t>>(instance->numberOfProcessors(), 0));
-        for (const auto &node : instance->vertices()) {
-            work[node_to_superstep_assignment[node]][node_to_processor_assignment[node]] +=
-                instance->getComputationalDag().vertex_work_weight(node);
-        }
-
-        std::vector<v_workw_t<Graph_t>> max_work_per_step(number_of_supersteps, 0);
-        for (unsigned step = 0; step < number_of_supersteps; step++) {
-            v_workw_t<Graph_t> max_work = 0;
-            for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
-                if (max_work < work[step][proc]) {
-                    max_work = work[step][proc];
-                }
-            }
-
-            max_work_per_step[step] = max_work;
-        }
-
-        return max_work_per_step;
-    }
-
   public:
-  
     BspSchedule() = delete;
 
     /**
-     * @brief Constructs a BspSchedule object with the specified Bspinstance->
+     * @brief Constructs a BspSchedule object with the specified BspInstance.
      *
      * @param inst The BspInstance for the schedule.
      */
-    BspSchedule(const BspInstance<Graph_t> &inst)
+    explicit BspSchedule(const BspInstance<Graph_t> &inst)
         : instance(&inst), number_of_supersteps(1),
           node_to_processor_assignment(std::vector<unsigned>(inst.numberOfVertices(), 0)),
           node_to_superstep_assignment(std::vector<unsigned>(inst.numberOfVertices(), 0)) {}
@@ -163,7 +96,12 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
         updateNumberOfSupersteps();
     }
 
-    BspSchedule(const IBspSchedule<Graph_t> &schedule)
+    /**
+     * @brief Copy constructor from an IBspSchedule.
+     *
+     * @param schedule The schedule to copy.
+     */
+    explicit BspSchedule(const IBspSchedule<Graph_t> &schedule)
         : instance(&schedule.getInstance()), number_of_supersteps(schedule.numberOfSupersteps()),
           node_to_processor_assignment(schedule.getInstance().numberOfVertices()),
           node_to_superstep_assignment(schedule.getInstance().numberOfVertices()) {
@@ -175,12 +113,23 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
         }
     }
 
+    /**
+     * @brief Copy constructor.
+     *
+     * @param schedule The schedule to copy.
+     */
     BspSchedule(const BspSchedule<Graph_t> &schedule)
         : instance(schedule.instance), number_of_supersteps(schedule.number_of_supersteps),
           node_to_processor_assignment(schedule.node_to_processor_assignment),
           node_to_superstep_assignment(schedule.node_to_superstep_assignment) {}
 
-    BspSchedule<Graph_t> operator=(const BspSchedule<Graph_t> &schedule) {
+    /**
+     * @brief Copy assignment operator.
+     *
+     * @param schedule The schedule to copy.
+     * @return A reference to this schedule.
+     */
+    BspSchedule<Graph_t> &operator=(const BspSchedule<Graph_t> &schedule) {
         if (this != &schedule) {
             instance = schedule.instance;
             number_of_supersteps = schedule.number_of_supersteps;
@@ -190,12 +139,23 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
         return *this;
     }
 
-    BspSchedule(BspSchedule<Graph_t> &&schedule)
+    /**
+     * @brief Move constructor.
+     *
+     * @param schedule The schedule to move.
+     */
+    BspSchedule(BspSchedule<Graph_t> &&schedule) noexcept
         : instance(schedule.instance), number_of_supersteps(schedule.number_of_supersteps),
           node_to_processor_assignment(std::move(schedule.node_to_processor_assignment)),
           node_to_superstep_assignment(std::move(schedule.node_to_superstep_assignment)) {}
 
-    BspSchedule<Graph_t> &operator=(BspSchedule<Graph_t> &&schedule) {
+    /**
+     * @brief Move assignment operator.
+     *
+     * @param schedule The schedule to move.
+     * @return A reference to this schedule.
+     */
+    BspSchedule<Graph_t> &operator=(BspSchedule<Graph_t> &&schedule) noexcept {
         if (this != &schedule) {
             instance = schedule.instance;
             number_of_supersteps = schedule.number_of_supersteps;
@@ -205,6 +165,13 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
         return *this;
     }
 
+    /**
+     * @brief Constructs a BspSchedule object from another schedule with a different graph type.
+     *
+     * @tparam Graph_t_other The graph type of the other schedule.
+     * @param instance_ The BspInstance for the new schedule.
+     * @param schedule The other schedule to copy from.
+     */
     template<typename Graph_t_other>
     BspSchedule(const BspInstance<Graph_t> &instance_, const BspSchedule<Graph_t_other> &schedule)
         : instance(&instance_), number_of_supersteps(schedule.numberOfSupersteps()),
@@ -221,19 +188,17 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
      *
      * @return A reference to the BspInstance for the schedule.
      */
-    inline const BspInstance<Graph_t> &getInstance() const override { return *instance; }
+    [[nodiscard]] const BspInstance<Graph_t> &getInstance() const override { return *instance; }
 
     /**
      * @brief Returns the number of supersteps in the schedule.
      *
      * @return The number of supersteps in the schedule.
      */
-    inline unsigned numberOfSupersteps() const override { return number_of_supersteps; }
+    [[nodiscard]] unsigned numberOfSupersteps() const override { return number_of_supersteps; }
 
     /**
-     * @brief Returns the number of processors in the schedule.
-     *
-     * @return The number of processors in the schedule.
+     * @brief Updates the number of supersteps based on the current assignment.
      */
     void updateNumberOfSupersteps() {
         number_of_supersteps = 0;
@@ -250,7 +215,7 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
      * @param node The node for which to return the assigned superstep.
      * @return The superstep assigned to the specified node.
      */
-    inline unsigned assignedSuperstep(vertex_idx node) const override { return node_to_superstep_assignment[node]; }
+    [[nodiscard]] unsigned assignedSuperstep(vertex_idx node) const override { return node_to_superstep_assignment[node]; }
 
     /**
      * @brief Returns the processor assigned to the specified node.
@@ -258,23 +223,32 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
      * @param node The node for which to return the assigned processor.
      * @return The processor assigned to the specified node.
      */
-    inline unsigned assignedProcessor(vertex_idx node) const override { return node_to_processor_assignment[node]; }
+    [[nodiscard]] unsigned assignedProcessor(vertex_idx node) const override { return node_to_processor_assignment[node]; }
 
     /**
      * @brief Returns the superstep assignment for the schedule.
      *
      * @return The superstep assignment for the schedule.
      */
-    inline const std::vector<unsigned> &assignedSupersteps() const { return node_to_superstep_assignment; }
-    inline std::vector<unsigned> &assignedSupersteps() { return node_to_superstep_assignment; }
+    [[nodiscard]] const std::vector<unsigned> &assignedSupersteps() const { return node_to_superstep_assignment; }
+    [[nodiscard]] std::vector<unsigned> &assignedSupersteps() { return node_to_superstep_assignment; }
 
     /**
      * @brief Returns the processor assignment for the schedule.
      *
      * @return The processor assignment for the schedule.
      */
-    inline const std::vector<unsigned> &assignedProcessors() const { return node_to_processor_assignment; }
-    inline std::vector<unsigned> &assignedProcessors() { return node_to_processor_assignment; }
+    [[nodiscard]] const std::vector<unsigned> &assignedProcessors() const { return node_to_processor_assignment; }
+    [[nodiscard]] std::vector<unsigned> &assignedProcessors() { return node_to_processor_assignment; }
+
+    /**
+     * @brief Returns the staleness of the schedule.
+     * The staleness determines the minimum number of supersteps that must elapse between the assignment of a node to a processor and the assignment of one of its neighbors to a different processor.
+     * The staleness for the BspSchedule is always 1.
+     *
+     * @return The staleness of the schedule.
+     */
+    [[nodiscard]] virtual unsigned getStaleness() const { return 1; }
 
     /**
      * @brief Sets the superstep assigned to the specified node.
@@ -283,7 +257,6 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
      * @param superstep The superstep to assign to the node.
      */
     void setAssignedSuperstep(vertex_idx node, unsigned superstep) {
-
         if (node < instance->numberOfVertices()) {
             node_to_superstep_assignment[node] = superstep;
 
@@ -297,12 +270,12 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
     }
 
     /**
-     * @brief Sets the superstep assigned to the specified node.
+     * @brief Sets the superstep assigned to the specified node without updating the number of supersteps.
      *
      * @param node The node for which to set the assigned superstep.
      * @param superstep The superstep to assign to the node.
      */
-    inline void setAssignedSuperstep_noUpdateNumSuperstep(vertex_idx node, unsigned superstep) {
+    void setAssignedSuperstepNoUpdateNumSuperstep(vertex_idx node, unsigned superstep) {
         node_to_superstep_assignment.at(node) = superstep;
     }
 
@@ -312,7 +285,7 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
      * @param node The node for which to set the assigned processor.
      * @param processor The processor to assign to the node.
      */
-    inline void setAssignedProcessor(vertex_idx node, unsigned processor) {
+    void setAssignedProcessor(vertex_idx node, unsigned processor) {
         node_to_processor_assignment.at(node) = processor;
     }
 
@@ -322,13 +295,10 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
      * @param vec The superstep assignment to set.
      */
     void setAssignedSupersteps(const std::vector<unsigned> &vec) {
-
-        if (vec.size() == static_cast<std::size_t>( instance->numberOfVertices() )) {
-
+        if (vec.size() == static_cast<std::size_t>(instance->numberOfVertices())) {
             number_of_supersteps = 0;
 
             for (vertex_idx_t<Graph_t> i = 0; i < instance->numberOfVertices(); ++i) {
-
                 if (vec[i] >= number_of_supersteps) {
                     number_of_supersteps = vec[i] + 1;
                 }
@@ -347,14 +317,13 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
      * @param vec The superstep assignment to set.
      */
     void setAssignedSupersteps(std::vector<unsigned> &&vec) {
-
-        if (vec.size() == static_cast<std::size_t>( instance->numberOfVertices() )) {
+        if (vec.size() == static_cast<std::size_t>(instance->numberOfVertices())) {
             node_to_superstep_assignment = std::move(vec);
         } else {
             throw std::invalid_argument(
                 "Invalid Argument while assigning supersteps: size does not match number of nodes.");
         }
-        
+
         updateNumberOfSupersteps();
     }
 
@@ -364,8 +333,7 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
      * @param vec The processor assignment to set.
      */
     void setAssignedProcessors(const std::vector<unsigned> &vec) {
-
-        if (vec.size() == static_cast<std::size_t>( instance->numberOfVertices() )) {
+        if (vec.size() == static_cast<std::size_t>(instance->numberOfVertices())) {
             node_to_processor_assignment = vec;
         } else {
             throw std::invalid_argument(
@@ -379,8 +347,7 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
      * @param vec The processor assignment to set.
      */
     void setAssignedProcessors(std::vector<unsigned> &&vec) {
-
-        if (vec.size() == static_cast<std::size_t>( instance->numberOfVertices() )) {
+        if (vec.size() == static_cast<std::size_t>(instance->numberOfVertices())) {
             node_to_processor_assignment = std::move(vec);
         } else {
             throw std::invalid_argument(
@@ -388,188 +355,57 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
         }
     }
 
-    virtual v_workw_t<Graph_t> computeWorkCosts() const override {
-        const std::vector<v_workw_t<Graph_t>> work_per_step = compute_max_work_per_step_helper();
-        return std::accumulate(work_per_step.begin(), work_per_step.end(), static_cast<v_workw_t<Graph_t>>(0));
-    }
-
-    double compute_total_communication_costs() const {
-
-        assert(satisfiesPrecedenceConstraints());
-
-        double total_communication = 0;
-
-        for (const auto &v : instance->vertices()) {
-            for (const auto &target : instance->getComputationalDag().children(v)) {
-
-                if (node_to_processor_assignment[v] != node_to_processor_assignment[target]) {
-                    total_communication +=
-                        instance->sendCosts(node_to_processor_assignment[v], node_to_processor_assignment[target]) *
-                        instance->getComputationalDag().vertex_comm_weight(v);
-                }
-            }
-        }
-
-        return total_communication * static_cast<double>(instance->communicationCosts()) / static_cast<double>(instance->numberOfProcessors());
-    }
-
-    double computeTotalCosts() const {
-
-        assert(satisfiesPrecedenceConstraints());
-
-        const v_commw_t<Graph_t> sync_cost =
-            number_of_supersteps >= 1
-                ? instance->synchronisationCosts() * static_cast<v_commw_t<Graph_t>>(number_of_supersteps - 1)
-                : 0;
-
-        return static_cast<double>(computeWorkCosts()) + compute_total_communication_costs() + sync_cost;
-    }
-
-    double compute_total_lambda_communication_cost() const {
-
-        assert(satisfiesPrecedenceConstraints());
-
-        double comm_costs = 0;
-        const double comm_multiplier = 1.0 / instance->numberOfProcessors();
-
-        for (const auto &v : instance->vertices()) {
-            if (instance->getComputationalDag().out_degree(v) == 0)
-                continue;
-
-            std::unordered_set<unsigned> target_procs;
-            for (const auto &target : instance->getComputationalDag().children(v)) {
-                target_procs.insert(node_to_processor_assignment[target]);
-            }
-
-            const unsigned source_proc = node_to_processor_assignment[v];
-            const auto v_comm_cost = instance->getComputationalDag().vertex_comm_weight(v);
-
-            for (const auto& target_proc : target_procs) {
-                comm_costs += v_comm_cost * instance->sendCosts(source_proc, target_proc);
-            }
-        }
-
-        return comm_costs * comm_multiplier * static_cast<double>(instance->communicationCosts());
-    }
-    
-    double computeTotalLambdaCosts() const {
-        assert(satisfiesPrecedenceConstraints());
-
-        const v_commw_t<Graph_t> sync_cost =
-            number_of_supersteps >= 1
-                ? instance->synchronisationCosts() * static_cast<v_commw_t<Graph_t>>(number_of_supersteps - 1)
-                : 0;
-
-        return static_cast<double>(computeWorkCosts()) + compute_total_lambda_communication_cost() + sync_cost;
-    }
-
-    v_commw_t<Graph_t> compute_buffered_sending_communication_costs() const {
-
-        std::vector<std::vector<v_commw_t<Graph_t>>> rec(instance->numberOfProcessors(),
-                                                         std::vector<v_commw_t<Graph_t>>(number_of_supersteps, 0));
-        std::vector<std::vector<v_commw_t<Graph_t>>> send(instance->numberOfProcessors(),
-                                                          std::vector<v_commw_t<Graph_t>>(number_of_supersteps, 0));
-
-        for (vertex_idx node = 0; node < instance->numberOfVertices(); node++) {
-
-            std::vector<unsigned> step_needed(instance->numberOfProcessors(), number_of_supersteps);
-            for (const auto &target : instance->getComputationalDag().children(node)) {
-
-                if (node_to_processor_assignment[node] != node_to_processor_assignment[target]) {
-                    step_needed[node_to_processor_assignment[target]] = std::min(
-                        step_needed[node_to_processor_assignment[target]], node_to_superstep_assignment[target]);
-                }
-            }
-
-            for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
-
-                if (step_needed[proc] < number_of_supersteps) {
-                    send[node_to_processor_assignment[node]][node_to_superstep_assignment[node]] +=
-                        instance->sendCosts(node_to_processor_assignment[node], proc) *
-                        instance->getComputationalDag().vertex_comm_weight(node);
-
-                    rec[proc][step_needed[proc] - 1] += instance->sendCosts(node_to_processor_assignment[node], proc) *
-                                                        instance->getComputationalDag().vertex_comm_weight(node);
-                }
-            }
-        }
-
-        const std::vector<v_commw_t<Graph_t>> max_comm_per_step = compute_max_comm_per_step_helper(rec, send);
-
-        v_commw_t<Graph_t> costs = 0;
-        for (unsigned step = 0; step < number_of_supersteps; step++) {
-            const auto step_comm_cost = max_comm_per_step[step];
-            costs += step_comm_cost;
-
-            if (step_comm_cost > 0) {
-                costs += instance->synchronisationCosts();
-            }
-        }
-        return costs;
-    }
-
-    v_workw_t<Graph_t> computeBufferedSendingCosts() const {
-
-        return compute_buffered_sending_communication_costs() + computeWorkCosts();
-    }
-
-    v_commw_t<Graph_t> compute_lazy_communication_costs() const {
-
-        std::vector<std::vector<v_commw_t<Graph_t>>> rec(instance->numberOfProcessors(),
-                                                         std::vector<v_commw_t<Graph_t>>(number_of_supersteps, 0));
-
-        std::vector<std::vector<v_commw_t<Graph_t>>> send(instance->numberOfProcessors(),
-                                                          std::vector<v_commw_t<Graph_t>>(number_of_supersteps, 0));
-
-        compute_lazy_communication_costs_helper(rec, send);
-        const std::vector<v_commw_t<Graph_t>> max_comm_per_step = compute_max_comm_per_step_helper(rec, send);
-
-        v_commw_t<Graph_t> costs = 0;
-        for (unsigned step = 0; step < number_of_supersteps; step++) {
-            const auto step_comm_cost = max_comm_per_step[step];
-            costs += step_comm_cost;
-
-            if (step_comm_cost > 0) {
-                costs += instance->synchronisationCosts();
-            }
-        }
+    /**
+     * @brief Computes the work costs of the schedule.
+     * The workload of a processor in a superstep is the sum of the workloads of all nodes assigned to that processor in that superstep.
+     * The workload in a superstep is the maximum workload of any processor in that superstep.
+     * The work cost of the schedule is the sum of the workloads of all supersteps.
+     *
+     * @return The work costs of the schedule.
+     */
+    virtual v_workw_t<Graph_t> computeWorkCosts() const override { return cost_helpers::compute_work_costs(*this); }
 
-        return costs;
-    }
+    /**
+     * @brief Computes the costs of the schedule accoring to lazy communication cost evaluation.
+     *
+     * @return The costs of the schedule.
+     */
+    virtual v_workw_t<Graph_t> computeCosts() const override { return LazyCommunicationCost<Graph_t>()(*this); }
 
-    virtual v_workw_t<Graph_t> computeCosts() const override { return compute_lazy_communication_costs() + computeWorkCosts(); }
+    /**
+     * @brief Checks if the schedule is valid.
+     *
+     * A schedule is valid if it satisfies all precedence, memory, and node type constraints.
+     *
+     * @return True if the schedule is valid, false otherwise.
+     */
+    [[nodiscard]] bool isValid() const { return satisfiesPrecedenceConstraints() && satisfiesMemoryConstraints() && satisfiesNodeTypeConstraints(); }
 
     /**
      * @brief Returns true if the schedule satisfies the precedence constraints of the computational DAG.
      *
      * The precedence constraints of the computational DAG are satisfied if, for each directed edge (u, v) such that u
-     * and v are assigned to different processors, the superstep assigned to node u is less than the superstep assigned
-     * to node v.
+     * and v are assigned to different processors, the difference between the superstep assigned to node u and the
+     * superstep assigned to node v is less than the staleness of the schedule. For the BspSchedule staleness is 1.
      *
      * @return True if the schedule satisfies the precedence constraints of the computational DAG, false otherwise.
      */
-    inline bool satisfiesPrecedenceConstraints() const {
-
+    [[nodiscard]] bool satisfiesPrecedenceConstraints() const {
         if (static_cast<vertex_idx_t<Graph_t>>(node_to_processor_assignment.size()) != instance->numberOfVertices() ||
             static_cast<vertex_idx_t<Graph_t>>(node_to_superstep_assignment.size()) != instance->numberOfVertices()) {
             return false;
         }
 
         for (const auto &v : instance->vertices()) {
-
             if (node_to_superstep_assignment[v] >= number_of_supersteps) {
                 return false;
             }
-
             if (node_to_processor_assignment[v] >= instance->numberOfProcessors()) {
                 return false;
             }
 
             for (const auto &target : instance->getComputationalDag().children(v)) {
-
-                const unsigned different_processors =
-                    (node_to_processor_assignment[v] == node_to_processor_assignment[target]) ? 0u : getStaleness();
-
+                const unsigned different_processors = (node_to_processor_assignment[v] == node_to_processor_assignment[target]) ? 0u : getStaleness();
                 if (node_to_superstep_assignment[v] + different_processors > node_to_superstep_assignment[target]) {
                     return false;
                 }
@@ -579,281 +415,359 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
         return true;
     }
 
-    bool satisfiesNodeTypeConstraints() const {
-
-        if (node_to_processor_assignment.size() != instance->numberOfVertices())
+    /**
+     * @brief Checks if the schedule satisfies node type constraints.
+     *
+     * Node type constraints are checked based on the compatibility of nodes with their assigned processors.
+     *
+     * @return True if node type constraints are satisfied, false otherwise.
+     */
+    [[nodiscard]] bool satisfiesNodeTypeConstraints() const {
+        if (node_to_processor_assignment.size() != instance->numberOfVertices()) {
             return false;
+        }
 
         for (const auto &node : instance->vertices()) {
-            if (!instance->isCompatible(node, node_to_processor_assignment[node]))
+            if (!instance->isCompatible(node, node_to_processor_assignment[node])) {
                 return false;
+            }
         }
 
         return true;
-    };
+    }
 
-    bool satisfiesMemoryConstraints() const {
+    /**
+     * @brief Checks if the schedule satisfies memory constraints.
+     *
+     * Memory constraints are checked based on the type of memory constraint specified in the architecture.
+     *
+     * @return True if memory constraints are satisfied, false otherwise.
+     */
+    [[nodiscard]] bool satisfiesMemoryConstraints() const {
 
         switch (instance->getArchitecture().getMemoryConstraintType()) {
 
-        case MEMORY_CONSTRAINT_TYPE::LOCAL: {
+        case MEMORY_CONSTRAINT_TYPE::LOCAL:
+            return satisfiesLocalMemoryConstraints();
 
-            SetSchedule set_schedule = SetSchedule(*this);
+        case MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT:
+            return satisfiesPersistentAndTransientMemoryConstraints();
 
-            for (unsigned step = 0; step < number_of_supersteps; step++) {
-                for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
+        case MEMORY_CONSTRAINT_TYPE::GLOBAL:
+            return satisfiesGlobalMemoryConstraints();
 
-                    v_memw_t<Graph_t> memory = 0;
-                    for (const auto &node : set_schedule.step_processor_vertices[step][proc]) {
-                        memory += instance->getComputationalDag().vertex_mem_weight(node);
-                    }
+        case MEMORY_CONSTRAINT_TYPE::LOCAL_IN_OUT:
+            return satisfiesLocalInOutMemoryConstraints();
 
-                    if (memory > instance->getArchitecture().memoryBound(proc)) {
-                        return false;
-                    }
-                }
-            }
+        case MEMORY_CONSTRAINT_TYPE::LOCAL_INC_EDGES:
+            return satisfiesLocalIncEdgesMemoryConstraints();
 
-            break;
-        }
+        case MEMORY_CONSTRAINT_TYPE::LOCAL_SOURCES_INC_EDGES:
+            return satisfiesLocalSourcesIncEdgesMemoryConstraints();
 
-        case MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT: {
-            std::vector<v_memw_t<Graph_t>> current_proc_persistent_memory(instance->numberOfProcessors(), 0);
-            std::vector<v_memw_t<Graph_t>> current_proc_transient_memory(instance->numberOfProcessors(), 0);
+        case MEMORY_CONSTRAINT_TYPE::NONE:
+            return true;
 
-            for (const auto &node : instance->vertices()) {
+        default:
+            throw std::invalid_argument("Unknown memory constraint type.");
+        }
+    }
 
-                const unsigned proc = node_to_processor_assignment[node];
-                current_proc_persistent_memory[proc] += instance->getComputationalDag().vertex_mem_weight(node);
-                current_proc_transient_memory[proc] = std::max(
-                    current_proc_transient_memory[proc], instance->getComputationalDag().vertex_comm_weight(node));
+    /**
+     * @brief Returns a vector of nodes assigned to the specified processor.
+     *
+     * @param processor The processor index.
+     * @return A vector of nodes assigned to the specified processor.
+     */
+    [[nodiscard]] std::vector<vertex_idx_t<Graph_t>> getAssignedNodeVector(unsigned int processor) const {
+        std::vector<vertex_idx_t<Graph_t>> vec;
 
-                if (current_proc_persistent_memory[proc] + current_proc_transient_memory[proc] >
-                    instance->getArchitecture().memoryBound(proc)) {
-                    return false;
-                }
+        for (const auto &node : instance->vertices()) {
+            if (node_to_processor_assignment[node] == processor) {
+                vec.push_back(node);
             }
-            break;
         }
 
-        case MEMORY_CONSTRAINT_TYPE::GLOBAL: {
-            std::vector<v_memw_t<Graph_t>> current_proc_memory(instance->numberOfProcessors(), 0);
-
-            for (const auto &node : instance->vertices()) {
+        return vec;
+    }
 
-                const unsigned proc = node_to_processor_assignment[node];
-                current_proc_memory[proc] += instance->getComputationalDag().vertex_mem_weight(node);
+    /**
+     * @brief Returns a vector of nodes assigned to the specified processor and superstep.
+     *
+     * @param processor The processor index.
+     * @param superstep The superstep index.
+     * @return A vector of nodes assigned to the specified processor and superstep.
+     */
+    [[nodiscard]] std::vector<vertex_idx_t<Graph_t>> getAssignedNodeVector(unsigned int processor, unsigned int superstep) const {
+        std::vector<vertex_idx_t<Graph_t>> vec;
 
-                if (current_proc_memory[proc] > instance->getArchitecture().memoryBound(proc)) {
-                    return false;
-                }
+        for (const auto &node : instance->vertices()) {
+            if (node_to_processor_assignment[node] == processor && node_to_superstep_assignment[node] == superstep) {
+                vec.push_back(node);
             }
-            break;
         }
 
-        case MEMORY_CONSTRAINT_TYPE::LOCAL_IN_OUT: {
-
-            SetSchedule set_schedule = SetSchedule(*this);
+        return vec;
+    }
 
-            for (unsigned step = 0; step < number_of_supersteps; step++) {
-                for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
+    /**
+     * @brief Sets the number of supersteps in the schedule.
+     *
+     * @param number_of_supersteps_ The number of supersteps.
+     */
+    void setNumberOfSupersteps(unsigned int number_of_supersteps_) {
+        number_of_supersteps = number_of_supersteps_;
+    }
 
-                    v_memw_t<Graph_t> memory = 0;
-                    for (const auto &node : set_schedule.step_processor_vertices[step][proc]) {
-                        memory += instance->getComputationalDag().vertex_mem_weight(node) +
-                                  instance->getComputationalDag().vertex_comm_weight(node);
+    /**
+     * @brief Returns the number of nodes assigned to the specified processor.
+     *
+     * @param processor The processor index.
+     * @return The number of nodes assigned to the specified processor.
+     */
+    [[nodiscard]] unsigned numAssignedNodes(unsigned processor) const {
+        unsigned num = 0;
 
-                        for (const auto &parent : instance->getComputationalDag().parents(node)) {
+        for (const auto &node : instance->vertices()) {
+            if (node_to_processor_assignment[node] == processor) {
+                num++;
+            }
+        }
 
-                            if (node_to_processor_assignment[parent] == proc &&
-                                node_to_superstep_assignment[parent] == step) {
-                                memory -= instance->getComputationalDag().vertex_comm_weight(parent);
-                            }
-                        }
-                    }
+        return num;
+    }
 
-                    if (memory > instance->getArchitecture().memoryBound(proc)) {
-                        return false;
-                    }
-                }
-            }
+    /**
+     * @brief Returns a vector containing the number of nodes assigned to each processor.
+     *
+     * @return A vector containing the number of nodes assigned to each processor.
+     */
+    [[nodiscard]] std::vector<unsigned> numAssignedNodesPerProcessor() const {
+        std::vector<unsigned> num(instance->numberOfProcessors(), 0);
 
-            break;
+        for (const auto &node : instance->vertices()) {
+            num[node_to_processor_assignment[node]]++;
         }
 
-        case MEMORY_CONSTRAINT_TYPE::LOCAL_INC_EDGES: {
+        return num;
+    }
+
+    /**
+     * @brief Returns a 2D vector containing the number of nodes assigned to each processor in each superstep.
+     *
+     * @return A 2D vector containing the number of nodes assigned to each processor in each superstep.
+     */
+    [[nodiscard]] std::vector<std::vector<unsigned>> numAssignedNodesPerSuperstepProcessor() const {
+        std::vector<std::vector<unsigned>> num(number_of_supersteps, std::vector<unsigned>(instance->numberOfProcessors(), 0));
+
+        for (const auto &v : instance->vertices()) {
+            num[node_to_superstep_assignment[v]][node_to_processor_assignment[v]] += 1;
+        }
 
-            SetSchedule set_schedule = SetSchedule(*this);
+        return num;
+    }
 
-            for (unsigned step = 0; step < number_of_supersteps; step++) {
-                for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
+    /**
+     * @brief Shrinks the schedule by merging supersteps where no communication occurs.
+     */
+    virtual void shrinkByMergingSupersteps() {
+        std::vector<bool> comm_phase_empty(number_of_supersteps, true);
+        for (const auto &node : instance->vertices())
+            for (const auto &child : instance->getComputationalDag().children(node))
+                if (node_to_processor_assignment[node] != node_to_processor_assignment[child])
+                    for (unsigned offset = 1; offset <= getStaleness(); ++offset)
+                        comm_phase_empty[node_to_superstep_assignment[child] - offset] = false;
 
-                    std::unordered_set<vertex_idx_t<Graph_t>> nodes_with_incoming_edges;
+        std::vector<unsigned> new_step_index(number_of_supersteps);
+        unsigned current_index = 0;
+        for (unsigned step = 0; step < number_of_supersteps; ++step) {
+            new_step_index[step] = current_index;
+            if (!comm_phase_empty[step])
+                current_index++;
+        }
+        for (const auto &node : instance->vertices())
+            node_to_superstep_assignment[node] = new_step_index[node_to_superstep_assignment[node]];
 
-                    v_memw_t<Graph_t> memory = 0;
-                    for (const auto &node : set_schedule.step_processor_vertices[step][proc]) {
-                        memory += instance->getComputationalDag().vertex_comm_weight(node);
+        setNumberOfSupersteps(current_index);
+    }
 
-                        for (const auto &parent : instance->getComputationalDag().parents(node)) {
+  private:
+    /**
+     * @brief Checks if the schedule satisfies local memory constraints.
+     *
+     * In this model, the memory usage of a processor in a superstep is the sum of the memory weights of all nodes
+     * assigned to it in that superstep.
+     *
+     * @return True if local memory constraints are satisfied, false otherwise.
+     */
+    bool satisfiesLocalMemoryConstraints() const {
+        SetSchedule set_schedule = SetSchedule(*this);
 
-                            if (node_to_superstep_assignment[parent] != step) {
-                                nodes_with_incoming_edges.insert(parent);
-                            }
-                        }
-                    }
+        for (unsigned step = 0; step < number_of_supersteps; step++) {
+            for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
 
-                    for (const auto &node : nodes_with_incoming_edges) {
-                        memory += instance->getComputationalDag().vertex_comm_weight(node);
-                    }
+                v_memw_t<Graph_t> memory = 0;
+                for (const auto &node : set_schedule.step_processor_vertices[step][proc]) {
+                    memory += instance->getComputationalDag().vertex_mem_weight(node);
+                }
 
-                    if (memory > instance->getArchitecture().memoryBound(proc)) {
-                        return false;
-                    }
+                if (memory > instance->getArchitecture().memoryBound(proc)) {
+                    return false;
                 }
             }
-            break;
         }
+        return true;
+    }
 
-        case MEMORY_CONSTRAINT_TYPE::LOCAL_SOURCES_INC_EDGES: {
-
-            SetSchedule set_schedule = SetSchedule(*this);
-
-            for (unsigned step = 0; step < number_of_supersteps; step++) {
-                for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
+    /**
+     * @brief Checks if the schedule satisfies persistent and transient memory constraints.
+     *
+     * This model distinguishes between persistent memory (node memory weight) and transient memory (max communication
+     * weight). The total memory usage on a processor is the sum of persistent memory of all assigned nodes plus the
+     * maximum transient memory required by any single node assigned to it.
+     *
+     * @return True if persistent and transient memory constraints are satisfied, false otherwise.
+     */
+    bool satisfiesPersistentAndTransientMemoryConstraints() const {
+        std::vector<v_memw_t<Graph_t>> current_proc_persistent_memory(instance->numberOfProcessors(), 0);
+        std::vector<v_memw_t<Graph_t>> current_proc_transient_memory(instance->numberOfProcessors(), 0);
 
-                    std::unordered_set<vertex_idx_t<Graph_t>> nodes_with_incoming_edges;
+        for (const auto &node : instance->vertices()) {
 
-                    v_memw_t<Graph_t> memory = 0;
-                    for (const auto &node : set_schedule.step_processor_vertices[step][proc]) {
+            const unsigned proc = node_to_processor_assignment[node];
+            current_proc_persistent_memory[proc] += instance->getComputationalDag().vertex_mem_weight(node);
+            current_proc_transient_memory[proc] = std::max(
+                current_proc_transient_memory[proc], instance->getComputationalDag().vertex_comm_weight(node));
 
-                        if (is_source(node, instance->getComputationalDag())) {
-                            memory += instance->getComputationalDag().vertex_mem_weight(node);
-                        }
+            if (current_proc_persistent_memory[proc] + current_proc_transient_memory[proc] >
+                instance->getArchitecture().memoryBound(proc)) {
+                return false;
+            }
+        }
+        return true;
+    }
 
-                        for (const auto &parent : instance->getComputationalDag().parents(node)) {
+    /**
+     * @brief Checks if the schedule satisfies global memory constraints.
+     *
+     * In this model, the memory usage of a processor is the sum of the memory weights of all nodes assigned to it,
+     * regardless of the superstep.
+     *
+     * @return True if global memory constraints are satisfied, false otherwise.
+     */
+    bool satisfiesGlobalMemoryConstraints() const {
+        std::vector<v_memw_t<Graph_t>> current_proc_memory(instance->numberOfProcessors(), 0);
 
-                            if (node_to_superstep_assignment[parent] != step) {
-                                nodes_with_incoming_edges.insert(parent);
-                            }
-                        }
-                    }
+        for (const auto &node : instance->vertices()) {
 
-                    for (const auto &node : nodes_with_incoming_edges) {
-                        memory += instance->getComputationalDag().vertex_comm_weight(node);
-                    }
+            const unsigned proc = node_to_processor_assignment[node];
+            current_proc_memory[proc] += instance->getComputationalDag().vertex_mem_weight(node);
 
-                    if (memory > instance->getArchitecture().memoryBound(proc)) {
-                        return false;
-                    }
-                }
+            if (current_proc_memory[proc] > instance->getArchitecture().memoryBound(proc)) {
+                return false;
             }
-            break;
         }
+        return true;
+    }
 
-        case MEMORY_CONSTRAINT_TYPE::NONE: {
-            break;
-        }
+    bool satisfiesLocalInOutMemoryConstraints() const {
 
-        default: {
-            throw std::invalid_argument("Unknown memory constraint type.");
-            break;
-        }
-        }
+        SetSchedule set_schedule = SetSchedule(*this);
 
-        return true;
-    };
+        for (unsigned step = 0; step < number_of_supersteps; step++) {
+            for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
 
-    std::vector<vertex_idx_t<Graph_t>> getAssignedNodeVector(unsigned int processor) const {
+                v_memw_t<Graph_t> memory = 0;
+                for (const auto &node : set_schedule.step_processor_vertices[step][proc]) {
+                    memory += instance->getComputationalDag().vertex_mem_weight(node) +
+                              instance->getComputationalDag().vertex_comm_weight(node);
 
-        std::vector<vertex_idx_t<Graph_t>> vec;
+                    for (const auto &parent : instance->getComputationalDag().parents(node)) {
 
-        for (const auto &node : instance->vertices()) {
+                        if (node_to_processor_assignment[parent] == proc &&
+                            node_to_superstep_assignment[parent] == step) {
+                            memory -= instance->getComputationalDag().vertex_comm_weight(parent);
+                        }
+                    }
+                }
 
-            if (node_to_processor_assignment[node] == processor) {
-                vec.push_back(node);
+                if (memory > instance->getArchitecture().memoryBound(proc)) {
+                    return false;
+                }
             }
         }
 
-        return vec;
+        return true;
     }
 
-    std::vector<vertex_idx_t<Graph_t>> getAssignedNodeVector(unsigned int processor, unsigned int superstep) const {
-        std::vector<vertex_idx_t<Graph_t>> vec;
+    bool satisfiesLocalIncEdgesMemoryConstraints() const {
 
-        for (const auto &node : instance->vertices()) {
+        SetSchedule set_schedule = SetSchedule(*this);
 
-            if (node_to_processor_assignment[node] == processor && node_to_superstep_assignment[node] == superstep) {
-                vec.push_back(node);
-            }
-        }
+        for (unsigned step = 0; step < number_of_supersteps; step++) {
+            for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
 
-        return vec;
-    }
+                std::unordered_set<vertex_idx_t<Graph_t>> nodes_with_incoming_edges;
 
-    inline void setNumberOfSupersteps(unsigned int number_of_supersteps_) {
-        number_of_supersteps = number_of_supersteps_;
-    }
+                v_memw_t<Graph_t> memory = 0;
+                for (const auto &node : set_schedule.step_processor_vertices[step][proc]) {
+                    memory += instance->getComputationalDag().vertex_comm_weight(node);
 
-    unsigned num_assigned_nodes(unsigned processor) const {
+                    for (const auto &parent : instance->getComputationalDag().parents(node)) {
 
-        unsigned num = 0;
+                        if (node_to_superstep_assignment[parent] != step) {
+                            nodes_with_incoming_edges.insert(parent);
+                        }
+                    }
+                }
 
-        for (const auto &node : instance->vertices()) {
-            if (node_to_processor_assignment[node] == processor) {
-                num++;
+                for (const auto &node : nodes_with_incoming_edges) {
+                    memory += instance->getComputationalDag().vertex_comm_weight(node);
+                }
+
+                if (memory > instance->getArchitecture().memoryBound(proc)) {
+                    return false;
+                }
             }
         }
-
-        return num;
+        return true;
     }
 
-    std::vector<unsigned> num_assigned_nodes_per_processor() const {
+    bool satisfiesLocalSourcesIncEdgesMemoryConstraints() const {
 
-        std::vector<unsigned> num(instance->numberOfProcessors(), 0);
+        SetSchedule set_schedule = SetSchedule(*this);
 
-        for (const auto &node : instance->vertices()) {
-            num[node_to_processor_assignment[node]]++;
-        }
-
-        return num;
-    }
+        for (unsigned step = 0; step < number_of_supersteps; step++) {
+            for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) {
 
-    std::vector<std::vector<unsigned>> num_assigned_nodes_per_superstep_processor() const {
+                std::unordered_set<vertex_idx_t<Graph_t>> nodes_with_incoming_edges;
 
-        std::vector<std::vector<unsigned>> num(number_of_supersteps,
-                                               std::vector<unsigned>(instance->numberOfProcessors(), 0));
+                v_memw_t<Graph_t> memory = 0;
+                for (const auto &node : set_schedule.step_processor_vertices[step][proc]) {
 
-        for (const auto &v : instance->vertices()) {
-            num[node_to_superstep_assignment[v]][node_to_processor_assignment[v]] += 1;
-        }
+                    if (is_source(node, instance->getComputationalDag())) {
+                        memory += instance->getComputationalDag().vertex_mem_weight(node);
+                    }
 
-        return num;
-    }
+                    for (const auto &parent : instance->getComputationalDag().parents(node)) {
 
-    virtual void shrinkByMergingSupersteps() {
+                        if (node_to_superstep_assignment[parent] != step) {
+                            nodes_with_incoming_edges.insert(parent);
+                        }
+                    }
+                }
 
-        std::vector<bool> comm_phase_empty(number_of_supersteps, true);
-        for (const auto& node : instance->vertices())
-            for (const auto &child : instance->getComputationalDag().children(node))
-                if(node_to_processor_assignment[node] != node_to_processor_assignment[child])
-                    for(unsigned offset = 1; offset <= getStaleness(); ++offset)
-                        comm_phase_empty[node_to_superstep_assignment[child] - offset] = false;
+                for (const auto &node : nodes_with_incoming_edges) {
+                    memory += instance->getComputationalDag().vertex_comm_weight(node);
+                }
 
-        std::vector<unsigned> new_step_index(number_of_supersteps);
-        unsigned current_index = 0;
-        for(unsigned step = 0; step < number_of_supersteps; ++step)
-        {
-            new_step_index[step] = current_index;
-            if(!comm_phase_empty[step])
-                current_index++;
+                if (memory > instance->getArchitecture().memoryBound(proc)) {
+                    return false;
+                }
+            }
         }
-        for (const auto& node : instance->vertices())
-            node_to_superstep_assignment[node] = new_step_index[node_to_superstep_assignment[node]];
-
-        setNumberOfSupersteps(current_index);
+        return true;
     }
-    
-    unsigned virtual getStaleness() const { return 1; }
 };
 
 } // namespace osp
\ No newline at end of file
diff --git a/include/osp/bsp/model/BspScheduleCS.hpp b/include/osp/bsp/model/BspScheduleCS.hpp
index 89d16757..63d94798 100644
--- a/include/osp/bsp/model/BspScheduleCS.hpp
+++ b/include/osp/bsp/model/BspScheduleCS.hpp
@@ -25,9 +25,8 @@ limitations under the License.
 #include <stdexcept>
 #include <vector>
 
-
-#include "IBspScheduleEval.hpp"
 #include "BspSchedule.hpp"
+#include "IBspScheduleEval.hpp"
 
 namespace osp {
 
@@ -64,8 +63,7 @@ class BspScheduleCS : public BspSchedule<Graph_t> {
     std::map<KeyTriple, unsigned> commSchedule;
 
   protected:
-
-    void compute_cs_communication_costs_helper(std::vector<std::vector<v_commw_t<Graph_t>>> & rec, std::vector<std::vector<v_commw_t<Graph_t>>> & send) const {        
+    void compute_cs_communication_costs_helper(std::vector<std::vector<v_commw_t<Graph_t>>> &rec, std::vector<std::vector<v_commw_t<Graph_t>>> &send) const {
         for (auto const &[key, val] : commSchedule) {
             send[std::get<1>(key)][val] +=
                 BspSchedule<Graph_t>::instance->sendCosts(std::get<1>(key), std::get<2>(key)) *
@@ -73,7 +71,7 @@ class BspScheduleCS : public BspSchedule<Graph_t> {
             rec[std::get<2>(key)][val] +=
                 BspSchedule<Graph_t>::instance->sendCosts(std::get<1>(key), std::get<2>(key)) *
                 BspSchedule<Graph_t>::instance->getComputationalDag().vertex_comm_weight(std::get<0>(key));
-        }       
+        }
     }
 
   public:
@@ -244,15 +242,11 @@ class BspScheduleCS : public BspSchedule<Graph_t> {
 
     v_commw_t<Graph_t> compute_cs_communication_costs() const {
 
-        std::vector<std::vector<v_commw_t<Graph_t>>> rec(
-            BspSchedule<Graph_t>::instance->numberOfProcessors(),
-            std::vector<v_commw_t<Graph_t>>(BspSchedule<Graph_t>::number_of_supersteps, 0));
-        std::vector<std::vector<v_commw_t<Graph_t>>> send(
-            BspSchedule<Graph_t>::instance->numberOfProcessors(),
-            std::vector<v_commw_t<Graph_t>>(BspSchedule<Graph_t>::number_of_supersteps, 0));
+        std::vector<std::vector<v_commw_t<Graph_t>>> rec(this->instance->numberOfProcessors(), std::vector<v_commw_t<Graph_t>>(this->number_of_supersteps, 0));
+        std::vector<std::vector<v_commw_t<Graph_t>>> send(this->instance->numberOfProcessors(), std::vector<v_commw_t<Graph_t>>(this->number_of_supersteps, 0));
 
         compute_cs_communication_costs_helper(rec, send);
-        const std::vector<v_commw_t<Graph_t>> max_comm_per_step = this->compute_max_comm_per_step_helper(rec, send);
+        const std::vector<v_commw_t<Graph_t>> max_comm_per_step = cost_helpers::compute_max_comm_per_step(*this, rec, send);
 
         v_commw_t<Graph_t> costs = 0;
         for (unsigned step = 0; step < this->number_of_supersteps; step++) {
@@ -342,7 +336,7 @@ class BspScheduleCS : public BspSchedule<Graph_t> {
                     if (proc != BspSchedule<Graph_t>::assignedProcessor(target)) {
                         require_sending[proc].insert(
                             {BspSchedule<Graph_t>::instance->getComputationalDag().vertex_comm_weight(node) * BspSchedule<Graph_t>::instance->getArchitecture().sendCosts(proc, BspSchedule<Graph_t>::node_to_processor_assignment[target]),
-                             node, 
+                             node,
                              BspSchedule<Graph_t>::node_to_processor_assignment[target]});
                     }
                 }
@@ -394,7 +388,7 @@ class BspScheduleCS : public BspSchedule<Graph_t> {
                     continue;
                 auto iter = require_sending[proc].begin();
                 while (iter != require_sending[proc].end()) {
-                    const auto& [comm_cost, node_to_send, dest_proc] = *iter;
+                    const auto &[comm_cost, node_to_send, dest_proc] = *iter;
                     if (comm_cost + send_cost[proc] > max_comm_cost ||
                         comm_cost + receive_cost[dest_proc] > max_comm_cost) {
                         iter++;
@@ -471,27 +465,23 @@ class BspScheduleCS : public BspSchedule<Graph_t> {
     virtual void shrinkByMergingSupersteps() override {
 
         std::vector<unsigned> superstep_latest_dependency(this->number_of_supersteps, 0);
-        std::vector<std::vector<unsigned> > first_at = getFirstPresence();
+        std::vector<std::vector<unsigned>> first_at = getFirstPresence();
 
         for (auto const &[key, val] : commSchedule)
-            if(this->assignedProcessor(std::get<0>(key)) != std::get<1>(key))
+            if (this->assignedProcessor(std::get<0>(key)) != std::get<1>(key))
                 superstep_latest_dependency[val] = std::max(superstep_latest_dependency[val], first_at[std::get<0>(key)][std::get<1>(key)]);
-        
 
         for (const auto &node : BspSchedule<Graph_t>::instance->getComputationalDag().vertices())
             for (const auto &child : BspSchedule<Graph_t>::instance->getComputationalDag().children(node))
-                if(this->assignedProcessor(node) != this->assignedProcessor(child))
+                if (this->assignedProcessor(node) != this->assignedProcessor(child))
                     superstep_latest_dependency[this->assignedSuperstep(child)] = std::max(superstep_latest_dependency[this->assignedSuperstep(child)], first_at[node][this->assignedProcessor(child)]);
 
         std::vector<bool> merge_with_previous(this->number_of_supersteps, false);
-        for(unsigned step = this->number_of_supersteps-1; step < this->number_of_supersteps; --step)
-        {
+        for (unsigned step = this->number_of_supersteps - 1; step < this->number_of_supersteps; --step) {
             unsigned limit = 0;
-            while(step > limit)
-            {
+            while (step > limit) {
                 limit = std::max(limit, superstep_latest_dependency[step]);
-                if(step > limit)
-                {
+                if (step > limit) {
                     merge_with_previous[step] = true;
                     --step;
                 }
@@ -500,26 +490,25 @@ class BspScheduleCS : public BspSchedule<Graph_t> {
 
         std::vector<unsigned> new_step_index(this->number_of_supersteps);
         unsigned current_index = std::numeric_limits<unsigned>::max();
-        for(unsigned step = 0; step < this->number_of_supersteps; ++step)
-        {
-            if(!merge_with_previous[step])
+        for (unsigned step = 0; step < this->number_of_supersteps; ++step) {
+            if (!merge_with_previous[step])
                 current_index++;
 
             new_step_index[step] = current_index;
         }
-        for (const auto& node : this->instance->vertices())
+        for (const auto &node : this->instance->vertices())
             this->node_to_superstep_assignment[node] = new_step_index[this->node_to_superstep_assignment[node]];
         for (auto &[key, val] : commSchedule)
             val = new_step_index[val];
 
-        this->setNumberOfSupersteps(current_index+1);
+        this->setNumberOfSupersteps(current_index + 1);
     }
 
     // for each vertex v and processor p, find the first superstep where v is present on p by the end of the compute phase
-    std::vector<std::vector<unsigned> > getFirstPresence() const {
+    std::vector<std::vector<unsigned>> getFirstPresence() const {
 
-        std::vector<std::vector<unsigned> > first_at(BspSchedule<Graph_t>::instance->numberOfVertices(),
-            std::vector<unsigned>(BspSchedule<Graph_t>::instance->numberOfProcessors(), std::numeric_limits<unsigned>::max()));
+        std::vector<std::vector<unsigned>> first_at(BspSchedule<Graph_t>::instance->numberOfVertices(),
+                                                    std::vector<unsigned>(BspSchedule<Graph_t>::instance->numberOfProcessors(), std::numeric_limits<unsigned>::max()));
 
         for (const auto &node : BspSchedule<Graph_t>::instance->getComputationalDag().vertices())
             first_at[node][this->assignedProcessor(node)] = this->assignedSuperstep(node);
@@ -532,11 +521,11 @@ class BspScheduleCS : public BspSchedule<Graph_t> {
     }
 
     // remove unneeded comm. schedule entries - these can happen in ILPs, partial ILPs, etc.
-    void cleanCommSchedule(){
+    void cleanCommSchedule() {
 
         // data that is already present before it arrives
-        std::vector<std::vector<std::multiset<unsigned> > > arrives_at(BspSchedule<Graph_t>::instance->numberOfVertices(),
-            std::vector<std::multiset<unsigned> >(BspSchedule<Graph_t>::instance->numberOfProcessors()));
+        std::vector<std::vector<std::multiset<unsigned>>> arrives_at(BspSchedule<Graph_t>::instance->numberOfVertices(),
+                                                                     std::vector<std::multiset<unsigned>>(BspSchedule<Graph_t>::instance->numberOfProcessors()));
         for (const auto &node : BspSchedule<Graph_t>::instance->getComputationalDag().vertices())
             arrives_at[node][this->assignedProcessor(node)].insert(this->assignedSuperstep(node));
 
@@ -544,48 +533,45 @@ class BspScheduleCS : public BspSchedule<Graph_t> {
             arrives_at[std::get<0>(key)][std::get<2>(key)].insert(val);
 
         std::vector<KeyTriple> toErase;
-        for (auto const &[key, val] : commSchedule)
-        {
+        for (auto const &[key, val] : commSchedule) {
             auto itr = arrives_at[std::get<0>(key)][std::get<2>(key)].begin();
-            if(*itr < val)
+            if (*itr < val)
                 toErase.push_back(key);
-            else if(*itr == val && ++itr != arrives_at[std::get<0>(key)][std::get<2>(key)].end() && *itr == val)
-            {
+            else if (*itr == val && ++itr != arrives_at[std::get<0>(key)][std::get<2>(key)].end() && *itr == val) {
                 toErase.push_back(key);
                 arrives_at[std::get<0>(key)][std::get<2>(key)].erase(itr);
             }
         }
 
-        for(const KeyTriple& key : toErase)
+        for (const KeyTriple &key : toErase)
             commSchedule.erase(key);
 
         // data that is not used after being sent
-        std::vector<std::vector<std::multiset<unsigned> > > used_at(BspSchedule<Graph_t>::instance->numberOfVertices(),
-            std::vector<std::multiset<unsigned> >(BspSchedule<Graph_t>::instance->numberOfProcessors()));
+        std::vector<std::vector<std::multiset<unsigned>>> used_at(BspSchedule<Graph_t>::instance->numberOfVertices(),
+                                                                  std::vector<std::multiset<unsigned>>(BspSchedule<Graph_t>::instance->numberOfProcessors()));
         for (const auto &node : BspSchedule<Graph_t>::instance->getComputationalDag().vertices())
             for (const auto &child : BspSchedule<Graph_t>::instance->getComputationalDag().children(node))
                 used_at[node][this->assignedProcessor(child)].insert(this->assignedSuperstep(child));
 
         for (auto const &[key, val] : commSchedule)
             used_at[std::get<0>(key)][std::get<1>(key)].insert(val);
-        
+
         // (need to visit cs entries in reverse superstep order here)
-        std::vector<std::vector<KeyTriple> > entries(this->number_of_supersteps);
+        std::vector<std::vector<KeyTriple>> entries(this->number_of_supersteps);
         for (auto const &[key, val] : commSchedule)
             entries[val].push_back(key);
 
         toErase.clear();
-        for(unsigned step = this->number_of_supersteps-1; step < this->number_of_supersteps; --step)
-            for(const KeyTriple& key : entries[step])
-                if(used_at[std::get<0>(key)][std::get<2>(key)].empty() ||
-                    *used_at[std::get<0>(key)][std::get<2>(key)].rbegin() <= step)
-                {
+        for (unsigned step = this->number_of_supersteps - 1; step < this->number_of_supersteps; --step)
+            for (const KeyTriple &key : entries[step])
+                if (used_at[std::get<0>(key)][std::get<2>(key)].empty() ||
+                    *used_at[std::get<0>(key)][std::get<2>(key)].rbegin() <= step) {
                     toErase.push_back(key);
-                    auto itr =  used_at[std::get<0>(key)][std::get<1>(key)].find(step);
+                    auto itr = used_at[std::get<0>(key)][std::get<1>(key)].find(step);
                     used_at[std::get<0>(key)][std::get<1>(key)].erase(itr);
                 }
-        
-        for(const KeyTriple& key : toErase)
+
+        for (const KeyTriple &key : toErase)
             commSchedule.erase(key);
     }
 };
diff --git a/include/osp/bsp/model/BspScheduleCostEvaluator.hpp b/include/osp/bsp/model/BspScheduleCostEvaluator.hpp
deleted file mode 100644
index c97d8b74..00000000
--- a/include/osp/bsp/model/BspScheduleCostEvaluator.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
-Copyright 2024 Huawei Technologies Co., Ltd.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
-*/
-
-#pragma once
-
-#include "BspSchedule.hpp"
-
-namespace osp {
-
-/**
- * @class BspScheduleCostEvaluator
- * @brief A class to compute various cost functions for a BspSchedule.
- *
- * This class wraps a BspSchedule by reference to avoid unnecessary copies
- * while providing an interface to compute different cost models.
- */
-template<typename Graph_t>
-class BspScheduleCostEvaluator {
-
-    static_assert(is_computational_dag_v<Graph_t>, "BspScheduleCostEvaluator can only be used with computational DAGs.");
-    static_assert(std::is_same_v<v_workw_t<Graph_t>, v_commw_t<Graph_t>>,
-                  "BspScheduleCostEvaluator requires work and comm. weights to have the same type.");
-
-  protected:
-    const BspSchedule<Graph_t>& schedule;
-    const BspInstance<Graph_t>& instance;
-
-    void compute_lazy_communication_costs_helper(std::vector<std::vector<v_commw_t<Graph_t>>> & rec, std::vector<std::vector<v_commw_t<Graph_t>>> & send) const {
-        const unsigned number_of_supersteps = schedule.numberOfSupersteps();
-        for (const auto &node : instance.vertices()) {
-
-            std::vector<unsigned> step_needed(instance.numberOfProcessors(), number_of_supersteps);
-            for (const auto &target : instance.getComputationalDag().children(node)) {
-
-                if (schedule.assignedProcessor(node) != schedule.assignedProcessor(target)) {
-                    step_needed[schedule.assignedProcessor(target)] = std::min(
-                        step_needed[schedule.assignedProcessor(target)], schedule.assignedSuperstep(target));
-                }
-            }
-
-            for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) {
-
-                if (step_needed[proc] < number_of_supersteps) {
-
-                    send[schedule.assignedProcessor(node)][step_needed[proc] - 1] +=
-                        instance.sendCosts(schedule.assignedProcessor(node), proc) *
-                        instance.getComputationalDag().vertex_comm_weight(node);
-
-                    rec[proc][step_needed[proc] - 1] += instance.sendCosts(schedule.assignedProcessor(node), proc) *
-                                                        instance.getComputationalDag().vertex_comm_weight(node);
-                }
-            }
-        }
-    }
-
-    std::vector<v_commw_t<Graph_t>> compute_max_comm_per_step_helper(const std::vector<std::vector<v_commw_t<Graph_t>>> & rec, const std::vector<std::vector<v_commw_t<Graph_t>>> & send) const {
-        const unsigned number_of_supersteps = schedule.numberOfSupersteps();
-        std::vector<v_commw_t<Graph_t>> max_comm_per_step(number_of_supersteps, 0);
-        for (unsigned step = 0; step < number_of_supersteps; step++) {
-            v_commw_t<Graph_t> max_send = 0;
-            v_commw_t<Graph_t> max_rec = 0;
-
-            for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) {
-                if (max_send < send[proc][step])
-                    max_send = send[proc][step];
-                if (max_rec < rec[proc][step])
-                    max_rec = rec[proc][step];
-            }
-            max_comm_per_step[step] = std::max(max_send, max_rec) * instance.communicationCosts();
-        }
-        return max_comm_per_step;
-    }
-
-  public:
-    /**
-     * @brief Construct a new Bsp Schedule Cost Evaluator object.
-     *
-     * @param sched The BspSchedule to evaluate.
-     */
-    BspScheduleCostEvaluator(const BspSchedule<Graph_t>& sched) : schedule(sched), instance(sched.getInstance()) {}
-
-    /**
-     * @brief Computes the communication costs using the lazy sending model.
-     *
-     * In the lazy sending model, data is sent in the superstep immediately
-     * preceding the superstep where it is first needed.
-     *
-     * @return The lazy communication costs.
-     */
-    v_commw_t<Graph_t> compute_lazy_communication_costs() const {
-
-        const unsigned number_of_supersteps = schedule.numberOfSupersteps();
-
-        std::vector<std::vector<v_commw_t<Graph_t>>> rec(instance.numberOfProcessors(),
-                                                         std::vector<v_commw_t<Graph_t>>(number_of_supersteps, 0));
-        std::vector<std::vector<v_commw_t<Graph_t>>> send(instance.numberOfProcessors(),
-                                                          std::vector<v_commw_t<Graph_t>>(number_of_supersteps, 0));
-
-        compute_lazy_communication_costs_helper(rec, send);
-        const std::vector<v_commw_t<Graph_t>> max_comm_per_step = compute_max_comm_per_step_helper(rec, send);
-
-        v_commw_t<Graph_t> costs = 0;
-        for (unsigned step = 0; step < number_of_supersteps; step++) {
-            const auto step_comm_cost = max_comm_per_step[step];
-            costs += step_comm_cost;
-            
-            costs += instance.synchronisationCosts();
-            
-        }
-
-        return costs;
-    }
-
-    /**
-     * @brief Computes the work costs for each superstep.
-     *
-     * @return The work cost per superstep.
-     */
-    std::vector<v_workw_t<Graph_t>> compute_max_work_per_step_helper() const {
-        const unsigned number_of_supersteps = schedule.numberOfSupersteps();
-        std::vector<std::vector<v_workw_t<Graph_t>>> work = std::vector<std::vector<v_workw_t<Graph_t>>>(
-            number_of_supersteps, std::vector<v_workw_t<Graph_t>>(instance.numberOfProcessors(), 0));
-        for (const auto &node : instance.vertices()) {
-            work[schedule.assignedSuperstep(node)][schedule.assignedProcessor(node)] +=
-                instance.getComputationalDag().vertex_work_weight(node);
-        }
-
-        std::vector<v_workw_t<Graph_t>> max_work_per_step(number_of_supersteps, 0);
-        for (unsigned step = 0; step < number_of_supersteps; step++) {
-            v_workw_t<Graph_t> max_work = 0;
-            for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) {
-                if (max_work < work[step][proc]) {
-                    max_work = work[step][proc];
-                }
-            }
-
-            max_work_per_step[step] = max_work;
-        }
-
-        return max_work_per_step;
-    }
-
-    /**
-     * @brief Computes the total work costs of the schedule.
-     *
-     * The work cost is the sum of the maximum work done in each superstep
-     * across all processors.
-     *
-     * @return The total work costs.
-     */
-    v_workw_t<Graph_t> computeWorkCosts() const {
-        const std::vector<v_workw_t<Graph_t>> work_per_step = compute_max_work_per_step_helper();
-        return std::accumulate(work_per_step.begin(), work_per_step.end(), static_cast<v_workw_t<Graph_t>>(0));
-    }
-
-    /**
-     * @brief Computes the total costs of the schedule using the lazy communication model.
-     *
-     * @return The total costs.
-     */
-    v_workw_t<Graph_t> computeCosts() const { return compute_lazy_communication_costs() + computeWorkCosts(); }
-};
-
-} // namespace osp
diff --git a/include/osp/bsp/model/MaxBspSchedule.hpp b/include/osp/bsp/model/MaxBspSchedule.hpp
index 22b9c4b8..e56c99d6 100644
--- a/include/osp/bsp/model/MaxBspSchedule.hpp
+++ b/include/osp/bsp/model/MaxBspSchedule.hpp
@@ -27,6 +27,7 @@ limitations under the License.
 #include <vector>
 
 #include "BspSchedule.hpp"
+#include "osp/bsp/model/cost/LazyCommunicationCost.hpp"
 #include "osp/concepts/computational_dag_concept.hpp"
 
 namespace osp {
@@ -41,14 +42,12 @@ template<typename Graph_t>
 class MaxBspSchedule : public BspSchedule<Graph_t> {
 
     static_assert(is_computational_dag_v<Graph_t>, "BspSchedule can only be used with computational DAGs.");
-    static_assert(std::is_same_v<v_workw_t<Graph_t>, v_commw_t<Graph_t> >, "BspSchedule requires work and comm. weights to have the same type.");
+    static_assert(std::is_same_v<v_workw_t<Graph_t>, v_commw_t<Graph_t>>, "BspSchedule requires work and comm. weights to have the same type.");
 
   protected:
     using vertex_idx = vertex_idx_t<Graph_t>;
-   
 
   public:
-  
     MaxBspSchedule() = delete;
 
     /**
@@ -67,7 +66,7 @@ class MaxBspSchedule : public BspSchedule<Graph_t> {
      * @param superstep_assignment_ The superstep assignment for the nodes.
      */
     MaxBspSchedule(const BspInstance<Graph_t> &inst, const std::vector<unsigned> &processor_assignment_,
-                const std::vector<unsigned> &superstep_assignment_) : BspSchedule<Graph_t>(inst, processor_assignment_, superstep_assignment_) {}
+                   const std::vector<unsigned> &superstep_assignment_) : BspSchedule<Graph_t>(inst, processor_assignment_, superstep_assignment_) {}
 
     MaxBspSchedule(const IBspSchedule<Graph_t> &schedule) : BspSchedule<Graph_t>(schedule) {}
 
@@ -89,26 +88,23 @@ class MaxBspSchedule : public BspSchedule<Graph_t> {
      */
     virtual ~MaxBspSchedule() = default;
 
-    virtual v_workw_t<Graph_t> computeCosts() const override { 
-        
-        std::vector<std::vector<v_commw_t<Graph_t>>> rec(this->instance->numberOfProcessors(),
-                                                         std::vector<v_commw_t<Graph_t>>(this->number_of_supersteps, 0));
+    virtual v_workw_t<Graph_t> computeCosts() const override {
 
-        std::vector<std::vector<v_commw_t<Graph_t>>> send(this->instance->numberOfProcessors(),
-                                                          std::vector<v_commw_t<Graph_t>>(this->number_of_supersteps, 0));
+        std::vector<std::vector<v_commw_t<Graph_t>>> rec(this->instance->numberOfProcessors(), std::vector<v_commw_t<Graph_t>>(this->number_of_supersteps, 0));
+        std::vector<std::vector<v_commw_t<Graph_t>>> send(this->instance->numberOfProcessors(), std::vector<v_commw_t<Graph_t>>(this->number_of_supersteps, 0));
 
-        this->compute_lazy_communication_costs_helper(rec, send);
-        const std::vector<v_commw_t<Graph_t>> max_comm_per_step = this->compute_max_comm_per_step_helper(rec, send);
-        const std::vector<v_workw_t<Graph_t>> max_work_per_step = this->compute_max_work_per_step_helper();
+        compute_lazy_communication_costs(*this, rec, send);
+        const std::vector<v_commw_t<Graph_t>> max_comm_per_step = cost_helpers::compute_max_comm_per_step(*this, rec, send);
+        const std::vector<v_workw_t<Graph_t>> max_work_per_step = cost_helpers::compute_max_work_per_step(*this);
 
         v_workw_t<Graph_t> costs = 0U;
         for (unsigned step = 0U; step < this->number_of_supersteps; step++) {
-            v_commw_t<Graph_t> step_comm_cost = (step == 0U) ? static_cast<v_commw_t<Graph_t>>(0) : max_comm_per_step[step - 1U];
+            const v_commw_t<Graph_t> step_comm_cost = (step == 0U) ? static_cast<v_commw_t<Graph_t>>(0) : max_comm_per_step[step - 1U];
+            costs += std::max(step_comm_cost, max_work_per_step[step]);
+
             if (step_comm_cost > static_cast<v_commw_t<Graph_t>>(0)) {
-                step_comm_cost += this->instance->synchronisationCosts();
+                costs += this->instance->synchronisationCosts();
             }
-            costs += std::max(step_comm_cost, max_work_per_step[step]);
-            
         }
         return costs;
     }
diff --git a/include/osp/bsp/model/MaxBspScheduleCS.hpp b/include/osp/bsp/model/MaxBspScheduleCS.hpp
index 5309cddd..79b49b33 100644
--- a/include/osp/bsp/model/MaxBspScheduleCS.hpp
+++ b/include/osp/bsp/model/MaxBspScheduleCS.hpp
@@ -35,14 +35,12 @@ template<typename Graph_t>
 class MaxBspScheduleCS : public BspScheduleCS<Graph_t> {
 
     static_assert(is_computational_dag_v<Graph_t>, "BspSchedule can only be used with computational DAGs.");
-    static_assert(std::is_same_v<v_workw_t<Graph_t>, v_commw_t<Graph_t> >, "BspSchedule requires work and comm. weights to have the same type.");
+    static_assert(std::is_same_v<v_workw_t<Graph_t>, v_commw_t<Graph_t>>, "BspSchedule requires work and comm. weights to have the same type.");
 
   protected:
     using vertex_idx = vertex_idx_t<Graph_t>;
-   
 
   public:
-  
     MaxBspScheduleCS() = delete;
 
     /**
@@ -89,8 +87,8 @@ class MaxBspScheduleCS : public BspScheduleCS<Graph_t> {
      */
     virtual ~MaxBspScheduleCS() = default;
 
-    virtual v_workw_t<Graph_t> computeCosts() const override { 
-        
+    virtual v_workw_t<Graph_t> computeCosts() const override {
+
         std::vector<std::vector<v_commw_t<Graph_t>>> rec(this->getInstance().numberOfProcessors(),
                                                          std::vector<v_commw_t<Graph_t>>(this->number_of_supersteps, 0));
 
@@ -98,22 +96,21 @@ class MaxBspScheduleCS : public BspScheduleCS<Graph_t> {
                                                           std::vector<v_commw_t<Graph_t>>(this->number_of_supersteps, 0));
 
         this->compute_cs_communication_costs_helper(rec, send);
-        const std::vector<v_commw_t<Graph_t>> max_comm_per_step = this->compute_max_comm_per_step_helper(rec, send);
-        const std::vector<v_workw_t<Graph_t>> max_work_per_step = this->compute_max_work_per_step_helper();
+        const std::vector<v_commw_t<Graph_t>> max_comm_per_step = cost_helpers::compute_max_comm_per_step(*this, rec, send);
+        const std::vector<v_workw_t<Graph_t>> max_work_per_step = cost_helpers::compute_max_work_per_step(*this);
 
         v_workw_t<Graph_t> costs = 0U;
         for (unsigned step = 0U; step < this->number_of_supersteps; step++) {
-            v_commw_t<Graph_t> step_comm_cost = (step == 0U) ? static_cast<v_commw_t<Graph_t>>(0) : max_comm_per_step[step - 1U];
-            if (step_comm_cost > static_cast<v_commw_t<Graph_t>>(0)) {
-                step_comm_cost += this->instance->synchronisationCosts();
-            }
+            const auto step_comm_cost = (step == 0U) ? static_cast<v_commw_t<Graph_t>>(0) : max_comm_per_step[step - 1U];
             costs += std::max(step_comm_cost, max_work_per_step[step]);
 
+            if (step_comm_cost > static_cast<v_commw_t<Graph_t>>(0)) {
+                costs += this->instance->synchronisationCosts();
+            }
         }
         return costs;
     }
 
     unsigned virtual getStaleness() const override { return 2; }
 };
-
 } // namespace osp
\ No newline at end of file
diff --git a/include/osp/bsp/model/cost/BufferedSendingCost.hpp b/include/osp/bsp/model/cost/BufferedSendingCost.hpp
new file mode 100644
index 00000000..f8b61f91
--- /dev/null
+++ b/include/osp/bsp/model/cost/BufferedSendingCost.hpp
@@ -0,0 +1,84 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include "osp/bsp/model/cost/CostModelHelpers.hpp"
+#include "osp/concepts/computational_dag_concept.hpp"
+#include <algorithm>
+#include <vector>
+
+namespace osp {
+
+/**
+ * @struct BufferedSendingCost
+ * @brief Implements the buffered sending cost model.
+ */
+template<typename Graph_t>
+struct BufferedSendingCost {
+
+    using cost_type = v_commw_t<Graph_t>;
+
+    cost_type operator()(const BspSchedule<Graph_t> &schedule) const {
+        const auto &instance = schedule.getInstance();
+        unsigned number_of_supersteps = schedule.numberOfSupersteps();
+        const auto &node_to_processor_assignment = schedule.assignedProcessors();
+        const auto &node_to_superstep_assignment = schedule.assignedSupersteps();
+        const auto staleness = schedule.getStaleness();
+
+        std::vector<std::vector<v_commw_t<Graph_t>>> rec(instance.numberOfProcessors(), std::vector<v_commw_t<Graph_t>>(number_of_supersteps, 0));
+        std::vector<std::vector<v_commw_t<Graph_t>>> send(instance.numberOfProcessors(), std::vector<v_commw_t<Graph_t>>(number_of_supersteps, 0));
+
+        for (vertex_idx_t<Graph_t> node = 0; node < instance.numberOfVertices(); node++) {
+
+            std::vector<unsigned> step_needed(instance.numberOfProcessors(), number_of_supersteps);
+            for (const auto &target : instance.getComputationalDag().children(node)) {
+
+                if (node_to_processor_assignment[node] != node_to_processor_assignment[target]) {
+                    step_needed[node_to_processor_assignment[target]] = std::min(step_needed[node_to_processor_assignment[target]], node_to_superstep_assignment[target]);
+                }
+            }
+
+            for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) {
+
+                if (step_needed[proc] < number_of_supersteps) {
+                    send[node_to_processor_assignment[node]][node_to_superstep_assignment[node]] += instance.sendCosts(node_to_processor_assignment[node], proc) * instance.getComputationalDag().vertex_comm_weight(node);
+
+                    if (step_needed[proc] >= staleness) {
+                        rec[proc][step_needed[proc] - staleness] += instance.sendCosts(node_to_processor_assignment[node], proc) * instance.getComputationalDag().vertex_comm_weight(node);
+                    }
+                }
+            }
+        }
+
+        const auto max_comm_per_step = cost_helpers::compute_max_comm_per_step(schedule, rec, send);
+        v_commw_t<Graph_t> comm_costs = 0;
+        for (unsigned step = 0; step < number_of_supersteps; step++) {
+            const auto step_comm_cost = max_comm_per_step[step];
+            comm_costs += step_comm_cost;
+
+            if (step_comm_cost > 0) {
+                comm_costs += instance.synchronisationCosts();
+            }
+        }
+
+        return comm_costs + cost_helpers::compute_work_costs(schedule);
+    }
+};
+
+} // namespace osp
diff --git a/include/osp/bsp/model/cost/CostModelHelpers.hpp b/include/osp/bsp/model/cost/CostModelHelpers.hpp
new file mode 100644
index 00000000..b1d449b4
--- /dev/null
+++ b/include/osp/bsp/model/cost/CostModelHelpers.hpp
@@ -0,0 +1,117 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include "osp/bsp/model/BspInstance.hpp"
+#include <algorithm>
+#include <vector>
+
+namespace osp {
+
+template<typename Graph_t>
+class BspSchedule;
+
+namespace cost_helpers {
+
+template<typename Graph_t>
+std::vector<v_commw_t<Graph_t>> compute_max_comm_per_step(
+    const BspInstance<Graph_t> &instance,
+    unsigned number_of_supersteps,
+    const std::vector<std::vector<v_commw_t<Graph_t>>> &rec,
+    const std::vector<std::vector<v_commw_t<Graph_t>>> &send) {
+
+    std::vector<v_commw_t<Graph_t>> max_comm_per_step(number_of_supersteps, 0);
+    for (unsigned step = 0; step < number_of_supersteps; step++) {
+        v_commw_t<Graph_t> max_send = 0;
+        v_commw_t<Graph_t> max_rec = 0;
+
+        for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) {
+            if (max_send < send[proc][step])
+                max_send = send[proc][step];
+            if (max_rec < rec[proc][step])
+                max_rec = rec[proc][step];
+        }
+        max_comm_per_step[step] = std::max(max_send, max_rec) * instance.communicationCosts();
+    }
+    return max_comm_per_step;
+}
+
+template<typename Graph_t>
+std::vector<v_commw_t<Graph_t>> compute_max_comm_per_step(
+    const BspSchedule<Graph_t> &schedule,
+    const std::vector<std::vector<v_commw_t<Graph_t>>> &rec,
+    const std::vector<std::vector<v_commw_t<Graph_t>>> &send) {
+    return compute_max_comm_per_step(schedule.getInstance(), schedule.numberOfSupersteps(), rec, send);
+}
+
+template<typename Graph_t>
+std::vector<v_workw_t<Graph_t>> compute_max_work_per_step(
+    const BspInstance<Graph_t> &instance,
+    unsigned number_of_supersteps,
+    const std::vector<unsigned> &node_to_processor_assignment,
+    const std::vector<unsigned> &node_to_superstep_assignment) {
+    std::vector<std::vector<v_workw_t<Graph_t>>> work = std::vector<std::vector<v_workw_t<Graph_t>>>(
+        number_of_supersteps, std::vector<v_workw_t<Graph_t>>(instance.numberOfProcessors(), 0));
+    for (const auto &node : instance.vertices()) {
+        work[node_to_superstep_assignment[node]][node_to_processor_assignment[node]] +=
+            instance.getComputationalDag().vertex_work_weight(node);
+    }
+
+    std::vector<v_workw_t<Graph_t>> max_work_per_step(number_of_supersteps, 0);
+    for (unsigned step = 0; step < number_of_supersteps; step++) {
+        v_workw_t<Graph_t> max_work = 0;
+        for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) {
+            if (max_work < work[step][proc]) {
+                max_work = work[step][proc];
+            }
+        }
+
+        max_work_per_step[step] = max_work;
+    }
+
+    return max_work_per_step;
+}
+
+template<typename Graph_t>
+std::vector<v_workw_t<Graph_t>> compute_max_work_per_step(
+    const BspSchedule<Graph_t> &schedule) {
+    return compute_max_work_per_step(schedule.getInstance(), schedule.numberOfSupersteps(), schedule.assignedProcessors(), schedule.assignedSupersteps());
+}
+
+template<typename Graph_t>
+v_workw_t<Graph_t> compute_work_costs(
+    const BspInstance<Graph_t> &instance,
+    unsigned number_of_supersteps,
+    const std::vector<unsigned> &node_to_processor_assignment,
+    const std::vector<unsigned> &node_to_superstep_assignment) {
+
+    std::vector<v_workw_t<Graph_t>> max_work_per_step = compute_max_work_per_step(instance, number_of_supersteps, node_to_processor_assignment, node_to_superstep_assignment);
+
+    return std::accumulate(max_work_per_step.begin(), max_work_per_step.end(), static_cast<v_workw_t<Graph_t>>(0));
+}
+
+template<typename Graph_t>
+v_workw_t<Graph_t> compute_work_costs(
+    const BspSchedule<Graph_t> &schedule) {
+
+    return compute_work_costs(schedule.getInstance(), schedule.numberOfSupersteps(), schedule.assignedProcessors(), schedule.assignedSupersteps());
+}
+
+} // namespace cost_helpers
+} // namespace osp
diff --git a/include/osp/bsp/model/cost/LazyCommunicationCost.hpp b/include/osp/bsp/model/cost/LazyCommunicationCost.hpp
new file mode 100644
index 00000000..64338481
--- /dev/null
+++ b/include/osp/bsp/model/cost/LazyCommunicationCost.hpp
@@ -0,0 +1,98 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include "osp/bsp/model/cost/CostModelHelpers.hpp"
+#include "osp/concepts/computational_dag_concept.hpp"
+#include <algorithm>
+#include <vector>
+
+namespace osp {
+
+template<typename Graph_t>
+void compute_lazy_communication_costs(
+    const BspInstance<Graph_t> &instance,
+    unsigned number_of_supersteps,
+    const std::vector<unsigned> &node_to_processor_assignment,
+    const std::vector<unsigned> &node_to_superstep_assignment,
+    const unsigned staleness,
+    std::vector<std::vector<v_commw_t<Graph_t>>> &rec,
+    std::vector<std::vector<v_commw_t<Graph_t>>> &send) {
+    for (const auto &node : instance.vertices()) {
+
+        std::vector<unsigned> step_needed(instance.numberOfProcessors(), number_of_supersteps);
+        for (const auto &target : instance.getComputationalDag().children(node)) {
+
+            if (node_to_processor_assignment[node] != node_to_processor_assignment[target]) {
+                step_needed[node_to_processor_assignment[target]] = std::min(step_needed[node_to_processor_assignment[target]], node_to_superstep_assignment[target]);
+            }
+        }
+
+        for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) {
+
+            if (step_needed[proc] < number_of_supersteps) {
+                send[node_to_processor_assignment[node]][step_needed[proc] - staleness] += instance.sendCosts(node_to_processor_assignment[node], proc) * instance.getComputationalDag().vertex_comm_weight(node);
+                rec[proc][step_needed[proc] - staleness] += instance.sendCosts(node_to_processor_assignment[node], proc) * instance.getComputationalDag().vertex_comm_weight(node);
+            }
+        }
+    }
+}
+
+template<typename Graph_t>
+void compute_lazy_communication_costs(
+    const BspSchedule<Graph_t> &schedule,
+    std::vector<std::vector<v_commw_t<Graph_t>>> &rec,
+    std::vector<std::vector<v_commw_t<Graph_t>>> &send) {
+    compute_lazy_communication_costs(schedule.getInstance(), schedule.numberOfSupersteps(), schedule.assignedProcessors(), schedule.assignedSupersteps(), schedule.getStaleness(), rec, send);
+}
+
+/**
+ * @struct LazyCommunicationCost
+ * @brief Implements the lazy communication cost model.
+ */
+template<typename Graph_t>
+struct LazyCommunicationCost {
+
+    using cost_type = v_workw_t<Graph_t>;
+
+    cost_type operator()(const BspSchedule<Graph_t> &schedule) const {
+        const auto &number_of_processors = schedule.getInstance().numberOfProcessors();
+        const auto &number_of_supersteps = schedule.numberOfSupersteps();
+
+        std::vector<std::vector<v_commw_t<Graph_t>>> rec(number_of_processors, std::vector<v_commw_t<Graph_t>>(number_of_supersteps, 0));
+        std::vector<std::vector<v_commw_t<Graph_t>>> send(number_of_processors, std::vector<v_commw_t<Graph_t>>(number_of_supersteps, 0));
+
+        compute_lazy_communication_costs(schedule, rec, send);
+        const auto max_comm_per_step = cost_helpers::compute_max_comm_per_step(schedule, rec, send);
+
+        v_commw_t<Graph_t> comm_costs = 0;
+        for (unsigned step = 0; step < number_of_supersteps; step++) {
+            const auto step_comm_cost = max_comm_per_step[step];
+            comm_costs += step_comm_cost;
+
+            if (step_comm_cost > 0) {
+                comm_costs += schedule.getInstance().synchronisationCosts();
+            }
+        }
+
+        return comm_costs + cost_helpers::compute_work_costs(schedule);
+    }
+};
+
+} // namespace osp
diff --git a/include/osp/bsp/model/cost/TotalCommunicationCost.hpp b/include/osp/bsp/model/cost/TotalCommunicationCost.hpp
new file mode 100644
index 00000000..3182f3c5
--- /dev/null
+++ b/include/osp/bsp/model/cost/TotalCommunicationCost.hpp
@@ -0,0 +1,62 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include "osp/bsp/model/cost/CostModelHelpers.hpp"
+#include "osp/concepts/computational_dag_concept.hpp"
+
+namespace osp {
+
+/**
+ * @struct TotalCommunicationCost
+ * @brief Implements the total communication cost model.
+ */
+template<typename Graph_t>
+struct TotalCommunicationCost {
+
+    using cost_type = double;
+
+    cost_type operator()(const BspSchedule<Graph_t> &schedule) const {
+
+        const auto &instance = schedule.getInstance();
+        const auto &node_to_processor_assignment = schedule.assignedProcessors();
+
+        v_commw_t<Graph_t> total_communication = 0;
+
+        for (const auto &v : instance.vertices()) {
+            for (const auto &target : instance.getComputationalDag().children(v)) {
+
+                if (node_to_processor_assignment[v] != node_to_processor_assignment[target]) {
+                    total_communication += instance.sendCosts(node_to_processor_assignment[v], node_to_processor_assignment[target]) * instance.getComputationalDag().vertex_comm_weight(v);
+                }
+            }
+        }
+
+        auto comm_cost = total_communication * static_cast<double>(instance.communicationCosts()) / static_cast<double>(instance.numberOfProcessors());
+
+        const unsigned number_of_supersteps = schedule.numberOfSupersteps();
+
+        auto work_cost = cost_helpers::compute_work_costs(schedule);
+        auto sync_cost = static_cast<v_commw_t<Graph_t>>(number_of_supersteps > 1 ? number_of_supersteps - 1 : 0) * instance.synchronisationCosts();
+
+        return comm_cost + work_cost + sync_cost;
+    }
+};
+
+} // namespace osp
diff --git a/include/osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp b/include/osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp
new file mode 100644
index 00000000..acab210f
--- /dev/null
+++ b/include/osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp
@@ -0,0 +1,70 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include "osp/bsp/model/cost/CostModelHelpers.hpp"
+#include "osp/concepts/computational_dag_concept.hpp"
+#include <unordered_set>
+
+namespace osp {
+
+/**
+ * @struct TotalLambdaCommunicationCost
+ * @brief Implements the total lambda communication cost model.
+ */
+template<typename Graph_t>
+struct TotalLambdaCommunicationCost {
+
+    using cost_type = double;
+
+    cost_type operator()(const BspSchedule<Graph_t> &schedule) const {
+        const auto &instance = schedule.getInstance();
+        const auto &node_to_processor_assignment = schedule.assignedProcessors();
+
+        v_commw_t<Graph_t> comm_costs = 0;
+        const double comm_multiplier = 1.0 / instance.numberOfProcessors();
+
+        for (const auto &v : instance.vertices()) {
+            if (instance.getComputationalDag().out_degree(v) == 0)
+                continue;
+
+            std::unordered_set<unsigned> target_procs;
+            for (const auto &target : instance.getComputationalDag().children(v)) {
+                target_procs.insert(node_to_processor_assignment[target]);
+            }
+
+            const unsigned source_proc = node_to_processor_assignment[v];
+            const auto v_comm_cost = instance.getComputationalDag().vertex_comm_weight(v);
+
+            for (const auto &target_proc : target_procs) {
+                comm_costs += v_comm_cost * instance.sendCosts(source_proc, target_proc);
+            }
+        }
+
+        const unsigned number_of_supersteps = schedule.numberOfSupersteps();
+
+        auto comm_cost = comm_costs * comm_multiplier * static_cast<double>(instance.communicationCosts());
+        auto work_cost = cost_helpers::compute_work_costs(schedule);
+        auto sync_cost = static_cast<v_commw_t<Graph_t>>(number_of_supersteps > 1 ? number_of_supersteps - 1 : 0) * instance.synchronisationCosts();
+
+        return comm_cost + static_cast<double>(work_cost) + static_cast<double>(sync_cost);
+    }
+};
+
+} // namespace osp
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/BspToMaxBspConverter.hpp b/include/osp/bsp/scheduler/GreedySchedulers/BspToMaxBspConverter.hpp
index 98c3ec2c..363f5c85 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/BspToMaxBspConverter.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/BspToMaxBspConverter.hpp
@@ -42,8 +42,8 @@ class GreedyBspToMaxBspConverter {
     std::vector<std::vector<std::deque<vertex_idx_t<Graph_t>>>> createSuperstepLists(const BspScheduleCS<Graph_t>& schedule, std::vector<double>& priorities) const;
 
   public:
-  
-    MaxBspSchedule<Graph_t> Convert(const BspSchedule<Graph_t>& schedule) const;  
+
+    MaxBspSchedule<Graph_t> Convert(const BspSchedule<Graph_t>& schedule) const;
     MaxBspScheduleCS<Graph_t> Convert(const BspScheduleCS<Graph_t>& schedule) const;
 
 };
@@ -106,7 +106,7 @@ MaxBspScheduleCS<Graph_t> GreedyBspToMaxBspConverter<Graph_t>::Convert(const Bsp
         std::vector<std::pair<KeyTriple, unsigned>> newly_freed_comm_steps;
         std::vector<cost_type> send_sum_of_newly_free_on_proc(schedule.getInstance().numberOfProcessors(), 0),
                                 rec_sum_of_newly_free_on_proc(schedule.getInstance().numberOfProcessors(), 0);
-        
+
         std::vector<std::pair<KeyTriple, unsigned>> comm_in_current_step;
 
         std::vector<cost_type> send_on_proc(schedule.getInstance().numberOfProcessors(), 0),
@@ -118,10 +118,10 @@ MaxBspScheduleCS<Graph_t> GreedyBspToMaxBspConverter<Graph_t>::Convert(const Bsp
             // I. Select the next node (from any proc) with highest priority
             unsigned chosen_proc = schedule.getInstance().numberOfProcessors();
             double best_prio = std::numeric_limits<double>::max();
-            
+
             for(unsigned proc = 0; proc < schedule.getInstance().numberOfProcessors(); ++proc)
             {
-                if(!proc_list[proc][step].empty() && (chosen_proc == schedule.getInstance().numberOfProcessors() || 
+                if(!proc_list[proc][step].empty() && (chosen_proc == schedule.getInstance().numberOfProcessors() ||
                     priorities[proc_list[proc][step].front()] < best_prio))
                 {
                     chosen_proc = proc;
@@ -241,7 +241,7 @@ MaxBspScheduleCS<Graph_t> GreedyBspToMaxBspConverter<Graph_t>::Convert(const Bsp
 
         for(const std::pair<KeyTriple, unsigned>& entry : newly_freed_comm_steps)
             free_comm_steps_for_superstep[step].insert(entry);
-        
+
         if(free_comm_steps_for_superstep[step].empty())
             continue;
 
@@ -256,8 +256,8 @@ MaxBspScheduleCS<Graph_t> GreedyBspToMaxBspConverter<Graph_t>::Convert(const Bsp
         send_on_proc.resize(schedule.getInstance().numberOfProcessors(), 0);
         rec_on_proc.clear();
         rec_on_proc.resize(schedule.getInstance().numberOfProcessors(), 0);
-        
-        std::set<std::pair<vertex_idx, unsigned>> late_arriving_nodes; 
+
+        std::set<std::pair<vertex_idx, unsigned>> late_arriving_nodes;
         for(const std::pair<KeyTriple, unsigned>& entry : free_comm_steps_for_superstep[step])
         {
             schedule_max.addCommunicationScheduleEntry(entry.first, current_step - 1);
@@ -293,8 +293,8 @@ MaxBspScheduleCS<Graph_t> GreedyBspToMaxBspConverter<Graph_t>::Convert(const Bsp
             max_comm_together = std::max(max_comm_together, rec_on_proc[proc]);
         }
 
-        cost_type work_limit = max_comm_after;        
-        if(max_comm_together + max_work_done <= max_comm_after + std::max(max_work_done, max_comm_current + schedule.getInstance().getArchitecture().synchronisationCosts()))
+        cost_type work_limit = max_comm_after;
+        if(max_comm_together + max_work_done <= max_comm_after + std::max(max_work_done, max_comm_current) + schedule.getInstance().getArchitecture().synchronisationCosts())
         {
             work_limit = max_comm_together;
             for(const std::pair<KeyTriple, unsigned>& entry : comm_in_current_step)
@@ -320,13 +320,13 @@ MaxBspScheduleCS<Graph_t> GreedyBspToMaxBspConverter<Graph_t>::Convert(const Bsp
                     continue;
 
                 bool has_dependency = false;
-                
+
                 for (const vertex_idx &parent : dag.parents(node))
                 {
                     if(schedule.assignedProcessor(node) != schedule.assignedProcessor(parent) &&
                         late_arriving_nodes.find(std::make_pair(parent, proc)) != late_arriving_nodes.end())
                             has_dependency = true;
-                    
+
                     if(schedule.assignedProcessor(node) == schedule.assignedProcessor(parent) &&
                         schedule.assignedSuperstep(parent) == step + 1 &&
                         brought_forward.find(parent) == brought_forward.end())
@@ -341,7 +341,7 @@ MaxBspScheduleCS<Graph_t> GreedyBspToMaxBspConverter<Graph_t>::Convert(const Bsp
                 schedule_max.setAssignedSuperstep(node, current_step);
                 work_remaining_proc_superstep[proc][step+1] -= dag.vertex_work_weight(node);
                 --nodes_remaining_superstep[step+1];
-                
+
                 for(const std::pair<KeyTriple, unsigned>& entry : dependent_comm_steps_for_node[node])
                     free_comm_steps_for_superstep[step+1].insert(entry);
             }
@@ -350,7 +350,7 @@ MaxBspScheduleCS<Graph_t> GreedyBspToMaxBspConverter<Graph_t>::Convert(const Bsp
             for(vertex_idx node : proc_list[proc][step+1])
                 if(brought_forward.find(node) == brought_forward.end())
                     remaining.push_back(node);
-            
+
             proc_list[proc][step+1] = remaining;
         }
 
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp
index 7aff3997..890f779c 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp
@@ -18,15 +18,14 @@ limitations under the License.
 
 #pragma once
 
-#include "osp/bsp/model/BspScheduleCostEvaluator.hpp"
+#include "osp/bsp/model/cost/LazyCommunicationCost.hpp"
 #include "osp/bsp/scheduler/Scheduler.hpp"
 #include "osp/bsp/scheduler/Serial.hpp"
-#include <vector>
 #include <string>
+#include <vector>
 
 namespace osp {
 
-
 /**
  * @class GreedyMetaScheduler
  * @brief The GreedyMetaScheduler class represents a meta-scheduler that selects the best schedule produced from a list of
@@ -35,18 +34,21 @@ namespace osp {
  * This class inherits from the Scheduler class and implements the computeSchedule() and getScheduleName() methods.
  * The computeSchedule() method iterates through a list of schedulers, computes a schedule using each one,
  * and returns the schedule with the minimum cost.
+ *
+ * @tparam Graph_t The graph type representing the computational DAG.
+ * @tparam CostModel The cost model functor to evaluate schedules. Defaults to LazyCommunicationCost.
  */
-template<typename Graph_t>
+template<typename Graph_t, typename CostModel = LazyCommunicationCost<Graph_t>>
 class GreedyMetaScheduler : public Scheduler<Graph_t> {
 
     Serial<Graph_t> serial_scheduler_;
-    std::vector<Scheduler<Graph_t>*> schedulers_;
+    std::vector<Scheduler<Graph_t> *> schedulers_;
 
     static constexpr bool verbose = false;
 
   public:
     /**
-     * @brief Default constructor for MetaScheduler.
+     * @brief Default constructor for GreedyMetaScheduler.
      */
     GreedyMetaScheduler() : Scheduler<Graph_t>() {}
 
@@ -56,32 +58,33 @@ class GreedyMetaScheduler : public Scheduler<Graph_t> {
     ~GreedyMetaScheduler() override = default;
 
     void addSerialScheduler() { schedulers_.push_back(&serial_scheduler_); }
-    void addScheduler(Scheduler<Graph_t> & s) { schedulers_.push_back(&s); }
+    void addScheduler(Scheduler<Graph_t> &s) { schedulers_.push_back(&s); }
     void resetScheduler() { schedulers_.clear(); }
 
     RETURN_STATUS computeSchedule(BspSchedule<Graph_t> &schedule) override {
         if (schedule.getInstance().getArchitecture().numberOfProcessors() == 1) {
-            if constexpr (verbose) std::cout << "Using serial scheduler for P=1." << std::endl;
+            if constexpr (verbose)
+                std::cout << "Using serial scheduler for P=1." << std::endl;
             serial_scheduler_.computeSchedule(schedule);
             return RETURN_STATUS::OSP_SUCCESS;
         }
 
-        v_workw_t<Graph_t> best_schedule_cost = std::numeric_limits<v_workw_t<Graph_t>>::max(); 
+        v_workw_t<Graph_t> best_schedule_cost = std::numeric_limits<v_workw_t<Graph_t>>::max();
         BspSchedule<Graph_t> current_schedule(schedule.getInstance());
 
-        for (Scheduler<Graph_t>* scheduler : schedulers_) {
+        for (Scheduler<Graph_t> *scheduler : schedulers_) {
             scheduler->computeSchedule(current_schedule);
-            BspScheduleCostEvaluator<Graph_t> evaluator(current_schedule);
-            const v_workw_t<Graph_t> schedule_cost = evaluator.computeCosts();
+            const v_workw_t<Graph_t> schedule_cost = CostModel()(current_schedule);
 
-            if constexpr (verbose) std::cout << "Executed scheduler " << scheduler->getScheduleName() << ", costs: " << schedule_cost << ", nr. supersteps: " << current_schedule.numberOfSupersteps() << std::endl;
+            if constexpr (verbose)
+                std::cout << "Executed scheduler " << scheduler->getScheduleName() << ", costs: " << schedule_cost << ", nr. supersteps: " << current_schedule.numberOfSupersteps() << std::endl;
 
             if (schedule_cost < best_schedule_cost) {
                 best_schedule_cost = schedule_cost;
                 schedule = current_schedule;
-                if constexpr (verbose) std::cout << "New best schedule!" << std::endl;     
+                if constexpr (verbose)
+                    std::cout << "New best schedule!" << std::endl;
             }
-
         }
 
         return RETURN_STATUS::OSP_SUCCESS;
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp
index 56d66316..38fae9ff 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp
@@ -18,7 +18,6 @@ limitations under the License.
 
 #pragma once
 
-
 #include <climits>
 #include <list>
 #include <map>
@@ -30,8 +29,8 @@ limitations under the License.
 
 // #define TIME_THREADS_GROW_LOCAL_PARALLEL
 #ifdef TIME_THREADS_GROW_LOCAL_PARALLEL
-    #include <chrono>
-    #include <iostream>
+#include <chrono>
+#include <iostream>
 #endif
 
 #include "osp/auxiliary/misc.hpp"
@@ -46,7 +45,7 @@ struct GrowLocalAutoCoresParallel_Params {
     weight_t syncCostMultiplierMinSuperstepWeight = 1;
     weight_t syncCostMultiplierParallelCheck = 4;
 
-    unsigned numThreads = 0; // 0 for auto
+    unsigned numThreads = 0;           // 0 for auto
     unsigned maxNumThreads = UINT_MAX; // used when auto num threads
 };
 
@@ -82,7 +81,7 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
      */
     virtual ~GrowLocalAutoCoresParallel() = default;
 
-    void computePartialSchedule(BspSchedule<Graph_t> &schedule, const std::vector<VertexType> &topOrder, const std::vector<VertexType> &posInTopOrder,  const VertexType startNode, const VertexType endNode, unsigned &supstep) const {
+    void computePartialSchedule(BspSchedule<Graph_t> &schedule, const std::vector<VertexType> &topOrder, const std::vector<VertexType> &posInTopOrder, const VertexType startNode, const VertexType endNode, unsigned &supstep) const {
 
 #ifdef TIME_THREADS_GROW_LOCAL_PARALLEL
         double startTime = omp_get_wtime();
@@ -107,7 +106,7 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
             if constexpr (has_children_in_vertex_order_v<Graph_t>) {
                 for (VertexType vert = startNode; vert < endNode; ++vert) {
                     for (const VertexType &chld : graph.children(vert)) {
-                        if ( chld >= endNode ) {
+                        if (chld >= endNode) {
                             break;
                         }
                         ++predec[chld - startNode];
@@ -116,7 +115,7 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
             } else {
                 for (VertexType vert = startNode; vert < endNode; ++vert) {
                     for (const VertexType &chld : graph.children(vert)) {
-                        if ( chld < endNode ) {
+                        if (chld < endNode) {
                             ++predec[chld - startNode];
                         }
                     }
@@ -127,39 +126,36 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                 VertexType vert = topOrder[index];
                 for (const VertexType &par : graph.parents(vert)) {
                     VertexType posPar = posInTopOrder[par];
-                    if ( posPar >= startNode ) {
+                    if (posPar >= startNode) {
                         ++predec[index - startNode];
                     }
                 }
             }
         }
 
-
-
-        for(VertexType nodePos = startNode; nodePos < endNode; nodePos++) {
+        for (VertexType nodePos = startNode; nodePos < endNode; nodePos++) {
             VertexType index = nodePos - startNode;
             if (predec[index] == 0) {
                 if constexpr (has_vertices_in_top_order_v<Graph_t>) {
-                    ready.insert( nodePos );
+                    ready.insert(nodePos);
                 } else {
-                    ready.insert( topOrder[nodePos] );
+                    ready.insert(topOrder[nodePos]);
                 }
             }
         }
 
-        
         std::vector<std::vector<VertexType>> new_assignments(P);
         std::vector<std::vector<VertexType>> best_new_assignments(P);
-        
+
         const v_workw_t<Graph_t> minWeightParallelCheck = params.syncCostMultiplierParallelCheck * instance.synchronisationCosts();
         const v_workw_t<Graph_t> minSuperstepWeight = params.syncCostMultiplierMinSuperstepWeight * instance.synchronisationCosts();
-        
+
         double desiredParallelism = static_cast<double>(P);
-        
+
         VertexType total_assigned = 0;
         supstep = 0;
 
-        while(total_assigned < N) {
+        while (total_assigned < N) {
             VertexType limit = params.minSuperstepSize;
             double best_score = 0;
             double best_parallelism = 0;
@@ -169,7 +165,7 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
 
             bool continueSuperstepAttempts = true;
 
-            while(continueSuperstepAttempts) {
+            while (continueSuperstepAttempts) {
                 for (unsigned p = 0; p < P; p++) {
                     new_assignments[p].clear();
                 }
@@ -178,21 +174,20 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                 for (unsigned p = 0; p < P; p++) {
                     procReady[p].clear();
                 }
-                
+
                 readyIter = ready.begin();
 
                 VertexType new_total_assigned = 0;
                 v_workw_t<Graph_t> weight_limit = 0;
                 v_workw_t<Graph_t> total_weight_assigned = 0;
 
-
                 // Processor 0
-                while(new_assignments[0].size() < limit) {
+                while (new_assignments[0].size() < limit) {
                     VertexType chosen_node = std::numeric_limits<VertexType>::max();
-                    if(!procReady[0].empty()) {
+                    if (!procReady[0].empty()) {
                         chosen_node = *procReady[0].begin();
                         procReady[0].erase(procReady[0].begin());
-                    } else if( readyIter != ready.end() ) {
+                    } else if (readyIter != ready.end()) {
                         chosen_node = *readyIter;
                         readyIter++;
                     } else {
@@ -221,9 +216,9 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                             }
                         }
 
-                        if ( schedule.assignedProcessor(succ) == UINT_MAX ) {
+                        if (schedule.assignedProcessor(succ) == UINT_MAX) {
                             schedule.setAssignedProcessor(succ, 0);
-                        } else if ( schedule.assignedProcessor(succ) != 0 ) {
+                        } else if (schedule.assignedProcessor(succ) != 0) {
                             schedule.setAssignedProcessor(succ, P);
                         }
 
@@ -235,8 +230,8 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                         }
 
                         --predec[succIndex];
-                        if(predec[succIndex] == 0) {
-                            if( schedule.assignedProcessor(succ) == 0 ) {
+                        if (predec[succIndex] == 0) {
+                            if (schedule.assignedProcessor(succ) == 0) {
                                 procReady[0].insert(succ);
                             } else {
                                 futureReady.push_back(succ);
@@ -244,23 +239,22 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                         }
                     }
                 }
-                
-                total_weight_assigned += weight_limit;
-
 
+                total_weight_assigned += weight_limit;
 
                 // Processors 1 through P-1
-                for(unsigned proc = 1; proc < P; ++proc) {
+                for (unsigned proc = 1; proc < P; ++proc) {
                     v_workw_t<Graph_t> current_weight_assigned = 0;
-                    while(current_weight_assigned < weight_limit) {
+                    while (current_weight_assigned < weight_limit) {
                         VertexType chosen_node = std::numeric_limits<VertexType>::max();
-                        if(!procReady[proc].empty()) {
+                        if (!procReady[proc].empty()) {
                             chosen_node = *procReady[proc].begin();
                             procReady[proc].erase(procReady[proc].begin());
-                        } else if( readyIter != ready.end() ) {
+                        } else if (readyIter != ready.end()) {
                             chosen_node = *readyIter;
                             readyIter++;
-                        } else break;
+                        } else
+                            break;
 
                         new_assignments[proc].push_back(chosen_node);
                         schedule.setAssignedProcessor(chosen_node, proc);
@@ -284,9 +278,9 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                                 }
                             }
 
-                            if ( schedule.assignedProcessor(succ) == UINT_MAX ) {
+                            if (schedule.assignedProcessor(succ) == UINT_MAX) {
                                 schedule.setAssignedProcessor(succ, proc);
-                            } else if ( schedule.assignedProcessor(succ) != proc ) {
+                            } else if (schedule.assignedProcessor(succ) != proc) {
                                 schedule.setAssignedProcessor(succ, P);
                             }
 
@@ -298,8 +292,8 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                             }
 
                             --predec[succIndex];
-                            if(predec[succIndex] == 0) {
-                                if( schedule.assignedProcessor(succ) == proc ) {
+                            if (predec[succIndex] == 0) {
+                                if (schedule.assignedProcessor(succ) == proc) {
                                     procReady[proc].insert(succ);
                                 } else {
                                     futureReady.push_back(succ);
@@ -308,20 +302,18 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                         }
                     }
 
-                    
                     weight_limit = std::max(weight_limit, current_weight_assigned);
                     total_weight_assigned += current_weight_assigned;
                 }
 
                 bool accept_step = false;
 
-                double score = static_cast<double>(total_weight_assigned) / static_cast<double>( weight_limit + instance.synchronisationCosts() );
+                double score = static_cast<double>(total_weight_assigned) / static_cast<double>(weight_limit + instance.synchronisationCosts());
                 double parallelism = 0;
                 if (weight_limit > 0) {
                     parallelism = static_cast<double>(total_weight_assigned) / static_cast<double>(weight_limit);
                 }
 
-
                 if (score > 0.97 * best_score) { // It is possible to make this less strict, i.e. score > 0.98 * best_score. The purpose of this would be to encourage larger supersteps.
                     best_score = std::max(best_score, score);
                     best_parallelism = parallelism;
@@ -338,25 +330,25 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
 
                 if (weight_limit <= minSuperstepWeight) {
                     continueSuperstepAttempts = true;
-                    if(total_assigned + new_total_assigned == N) {
+                    if (total_assigned + new_total_assigned == N) {
                         accept_step = true;
                         continueSuperstepAttempts = false;
                     }
                 }
 
-                if(total_assigned + new_total_assigned == N) {
+                if (total_assigned + new_total_assigned == N) {
                     continueSuperstepAttempts = false;
                 }
 
                 // undo proc assingments and predec increases in any case
-                for(unsigned proc = 0; proc < P; ++proc) {
-                    for(const VertexType &node : new_assignments[proc]) {
+                for (unsigned proc = 0; proc < P; ++proc) {
+                    for (const VertexType &node : new_assignments[proc]) {
                         schedule.setAssignedProcessor(node, UINT_MAX);
                     }
                 }
 
-                for(unsigned proc = 0; proc < P; ++proc) {
-                    for(const VertexType &node : new_assignments[proc]) {
+                for (unsigned proc = 0; proc < P; ++proc) {
+                    for (const VertexType &node : new_assignments[proc]) {
                         for (const VertexType &succ : graph.children(node)) {
                             if constexpr (has_vertices_in_top_order_v<Graph_t>) {
                                 if constexpr (has_children_in_vertex_order_v<Graph_t>) {
@@ -386,8 +378,8 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                     }
                 }
 
-                for(unsigned proc = 0; proc < P; ++proc) {
-                    for(const VertexType &node : new_assignments[proc]) {
+                for (unsigned proc = 0; proc < P; ++proc) {
+                    for (const VertexType &node : new_assignments[proc]) {
                         for (const VertexType &succ : graph.children(node)) {
                             if constexpr (has_vertices_in_top_order_v<Graph_t>) {
                                 if constexpr (has_children_in_vertex_order_v<Graph_t>) {
@@ -410,7 +402,7 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                     }
                 }
 
-                if(accept_step) {
+                if (accept_step) {
                     best_new_assignments.swap(new_assignments);
                     best_futureReady.swap(futureReady);
                     best_procReady.swap(procReady);
@@ -418,20 +410,20 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
                 }
 
                 limit++;
-                limit += ( limit / 2 );
+                limit += (limit / 2);
             }
 
             // apply best iteration
             ready.erase(ready.begin(), bestReadyIter);
             ready.insert(best_futureReady.begin(), best_futureReady.end());
             for (unsigned proc = 0; proc < P; proc++) {
-                ready.merge( best_procReady[proc] );
+                ready.merge(best_procReady[proc]);
             }
 
-            for(unsigned proc = 0; proc < P; ++proc) {
-                for(const VertexType &node : best_new_assignments[proc]) {
+            for (unsigned proc = 0; proc < P; ++proc) {
+                for (const VertexType &node : best_new_assignments[proc]) {
                     schedule.setAssignedProcessor(node, proc);
-                    schedule.setAssignedSuperstep_noUpdateNumSuperstep(node, supstep);
+                    schedule.setAssignedSuperstepNoUpdateNumSuperstep(node, supstep);
                     ++total_assigned;
 
                     for (const VertexType &succ : graph.children(node)) {
@@ -478,16 +470,16 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
         std::cout << outputString;
 #endif
     }
-    
+
     void incrementScheduleSupersteps(BspSchedule<Graph_t> &schedule, const VertexType startNode, const VertexType endNode, const unsigned incr) const {
         for (VertexType node = startNode; node < endNode; node++) {
-            schedule.setAssignedSuperstep_noUpdateNumSuperstep(node, schedule.assignedSuperstep(node) + incr);
+            schedule.setAssignedSuperstepNoUpdateNumSuperstep(node, schedule.assignedSuperstep(node) + incr);
         }
     }
     void incrementScheduleSupersteps_TopOrder(BspSchedule<Graph_t> &schedule, const std::vector<VertexType> &topOrder, const VertexType startIndex, const VertexType endIndex, const unsigned incr) const {
         for (VertexType index = startIndex; index < endIndex; index++) {
             const VertexType node = topOrder[index];
-            schedule.setAssignedSuperstep_noUpdateNumSuperstep(node, schedule.assignedSuperstep(node) + incr);
+            schedule.setAssignedSuperstepNoUpdateNumSuperstep(node, schedule.assignedSuperstep(node) + incr);
         }
     }
 
@@ -495,10 +487,10 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
 
         const BspInstance<Graph_t> &instance = schedule.getInstance();
         const Graph_t &graph = instance.getComputationalDag();
-        
+
         const VertexType N = instance.numberOfVertices();
 
-        for (VertexType vert = 0; vert < N; ++vert ) {
+        for (VertexType vert = 0; vert < N; ++vert) {
             schedule.setAssignedProcessor(vert, UINT_MAX);
         }
 
@@ -525,30 +517,30 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
         std::vector<VertexType> posInTopOrder;
         if constexpr (not has_vertices_in_top_order_v<Graph_t>) {
             posInTopOrder = std::vector<VertexType>(graph.num_vertices());
-            for (VertexType ind = 0; ind < static_cast<VertexType>( topOrder.size() ); ++ind) {
-                posInTopOrder[ topOrder[ind] ] = ind;
+            for (VertexType ind = 0; ind < static_cast<VertexType>(topOrder.size()); ++ind) {
+                posInTopOrder[topOrder[ind]] = ind;
             }
         }
 
-        #pragma omp parallel num_threads(numThreads) default(none) shared(schedule, topOrder, posInTopOrder, superstepsThread, supstepIncr, numThreads, startNodes, incr)
+#pragma omp parallel num_threads(numThreads) default(none) shared(schedule, topOrder, posInTopOrder, superstepsThread, supstepIncr, numThreads, startNodes, incr)
         {
-        #pragma omp for schedule(static, 1)
+#pragma omp for schedule(static, 1)
             for (unsigned thr = 0; thr < numThreads; thr++) {
                 computePartialSchedule(schedule, topOrder, posInTopOrder, startNodes[thr], startNodes[thr + 1], superstepsThread[thr * UnsignedPadding]);
             }
 
-        #pragma omp master
-        {
-            for (unsigned thr = 0; thr < numThreads; thr++) {
-                supstepIncr[thr] = incr;
-                incr += superstepsThread[thr * UnsignedPadding];
+#pragma omp master
+            {
+                for (unsigned thr = 0; thr < numThreads; thr++) {
+                    supstepIncr[thr] = incr;
+                    incr += superstepsThread[thr * UnsignedPadding];
+                }
+                // the value of incr is now the number of supersteps
             }
-            // the value of incr is now the number of supersteps
-        }
 
-        #pragma omp barrier
+#pragma omp barrier
 
-        #pragma omp for schedule(static, 1)
+#pragma omp for schedule(static, 1)
             for (unsigned thr = 0; thr < numThreads; thr++) {
                 if constexpr (has_vertices_in_top_order_v<Graph_t>) {
                     incrementScheduleSupersteps(schedule, startNodes[thr], startNodes[thr + 1], supstepIncr[thr]);
@@ -576,7 +568,7 @@ class GrowLocalAutoCoresParallel : public Scheduler<Graph_t> {
         unsigned numThreads = params.numThreads;
         if (numThreads == 0) {
             // numThreads = static_cast<unsigned>(std::sqrt( static_cast<double>((schedule.getInstance().numberOfVertices() / 1000000)))) + 1;
-            numThreads = static_cast<unsigned>(std::log2( static_cast<double>((schedule.getInstance().numberOfVertices() / 1000)))) + 1;
+            numThreads = static_cast<unsigned>(std::log2(static_cast<double>((schedule.getInstance().numberOfVertices() / 1000)))) + 1;
         }
         numThreads = std::min(numThreads, params.maxNumThreads);
         if (numThreads == 0) {
diff --git a/include/osp/bsp/scheduler/IlpSchedulers/CoptCommScheduleOptimizer.hpp b/include/osp/bsp/scheduler/IlpSchedulers/CoptCommScheduleOptimizer.hpp
index f3a66b70..1b8d72b9 100644
--- a/include/osp/bsp/scheduler/IlpSchedulers/CoptCommScheduleOptimizer.hpp
+++ b/include/osp/bsp/scheduler/IlpSchedulers/CoptCommScheduleOptimizer.hpp
@@ -38,13 +38,13 @@ class CoptCommScheduleOptimizer {
 
     static_assert(is_computational_dag_v<Graph_t>, "CoptFullScheduler can only be used with computational DAGs.");
 
-    bool num_supersteps_can_change = true;
+    bool ignore_latency = false;
 
     unsigned int timeLimitSeconds = 600;
 
   protected:
 
-    VarArray superstep_used_var;
+    VarArray superstep_has_comm;
     VarArray max_comm_superstep_var;
     std::vector<std::vector<std::vector<VarArray>>> comm_processor_to_processor_superstep_node_var;
 
@@ -67,7 +67,7 @@ class CoptCommScheduleOptimizer {
 
     virtual void setTimeLimitSeconds(unsigned int limit) { timeLimitSeconds = limit; }
     inline unsigned int getTimeLimitSeconds() const { return timeLimitSeconds; }
-    virtual void setNumSuperstepsCanChange(bool can_change_) { num_supersteps_can_change = can_change_; }
+    virtual void setIgnoreLatency(bool ignore_latency_) { ignore_latency = ignore_latency_; }
 };
 
 
@@ -110,7 +110,7 @@ bool CoptCommScheduleOptimizer<Graph_t>::canShrinkResultingSchedule(unsigned num
 
     for (unsigned step = 0; step < number_of_supersteps - 1; step++) {
 
-        if (superstep_used_var[static_cast<int>(step)].Get(COPT_DBLINFO_VALUE) <= 0.01)
+        if (superstep_has_comm[static_cast<int>(step)].Get(COPT_DBLINFO_VALUE) <= 0.01)
             return true;
     }
     return false;
@@ -187,13 +187,13 @@ void CoptCommScheduleOptimizer<Graph_t>::setInitialSolution(BspScheduleCS<Graph_
                                                                                         [static_cast<int>(node)], 0);
                 }
 
-    if(num_supersteps_can_change)
+    if(!ignore_latency)
     {
         std::vector<unsigned> comm_phase_used(num_supersteps, 0);
         for (auto const &[key, val] : cs)
             comm_phase_used[val] = 1;
         for (unsigned step = 0; step < num_supersteps; step++)
-            model.SetMipStart(superstep_used_var[static_cast<int>(step)], comm_phase_used[step]);
+            model.SetMipStart(superstep_has_comm[static_cast<int>(step)], comm_phase_used[step]);
     }
 
     std::vector<std::vector<v_commw_t<Graph_t>>> send(num_supersteps, std::vector<v_commw_t<Graph_t>>(num_processors, 0));
@@ -227,8 +227,8 @@ void CoptCommScheduleOptimizer<Graph_t>::setupVariablesConstraintsObjective(cons
     const unsigned num_vertices = static_cast<unsigned>(schedule.getInstance().numberOfVertices());
 
     // variables indicating if superstep is used at all
-    if (num_supersteps_can_change) {
-        superstep_used_var = model.AddVars(static_cast<int>(max_number_supersteps), COPT_BINARY, "superstep_used");
+    if (!ignore_latency) {
+        superstep_has_comm = model.AddVars(static_cast<int>(max_number_supersteps), COPT_BINARY, "superstep_has_comm");
     }
 
     max_comm_superstep_var = model.AddVars(static_cast<int>(max_number_supersteps), COPT_INTEGER, "max_comm_superstep");
@@ -250,7 +250,7 @@ void CoptCommScheduleOptimizer<Graph_t>::setupVariablesConstraintsObjective(cons
         }
     }
 
-    if (num_supersteps_can_change) {
+    if (!ignore_latency) {
         unsigned M = num_processors * num_processors * num_vertices;
         for (unsigned int step = 0; step < schedule.numberOfSupersteps(); step++) {
 
@@ -269,7 +269,7 @@ void CoptCommScheduleOptimizer<Graph_t>::setupVariablesConstraintsObjective(cons
                 }
             }
 
-            model.AddConstr(expr <= M * superstep_used_var[static_cast<int>(step)]);
+            model.AddConstr(expr <= M * superstep_has_comm[static_cast<int>(step)]);
         }
     }
     // precedence constraint: if task is computed then all of its predecessors must have been present
@@ -356,11 +356,11 @@ void CoptCommScheduleOptimizer<Graph_t>::setupVariablesConstraintsObjective(cons
       */
     Expr expr;
 
-    if (num_supersteps_can_change) {
+    if (!ignore_latency) {
 
         for (unsigned int step = 0; step < max_number_supersteps; step++) {
             expr += schedule.getInstance().communicationCosts() * max_comm_superstep_var[static_cast<int>(step)] +
-                    schedule.getInstance().synchronisationCosts() * superstep_used_var[static_cast<int>(step)];
+                    schedule.getInstance().synchronisationCosts() * superstep_has_comm[static_cast<int>(step)];
         }
     } else {
 
diff --git a/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp b/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp
index 79e22c54..aa199c45 100644
--- a/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp
+++ b/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp
@@ -62,17 +62,17 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
 
   private:
     bool allow_recomputation;
-    bool is_max_bsp = false;
     bool use_memory_constraint;
+    bool use_initial_schedule_recomp = false;
     bool use_initial_schedule = false;
     bool write_solutions_found;
-    bool use_initial_schedule_recomp = false;
+    bool is_max_bsp = false;
 
     unsigned timeLimitSeconds = 0;
 
     const BspScheduleCS<Graph_t> *initial_schedule;
     const BspScheduleRecomp<Graph_t> *initial_schedule_recomp;
-    
+
     std::string write_solutions_path;
     std::string solution_file_prefix;
 
@@ -172,7 +172,7 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
 
             return schedule;
         }
-        
+
         BspScheduleRecomp<Graph_t> constructBspScheduleRecompFromCallback() {
 
             unsigned number_of_supersteps = 0;
@@ -262,6 +262,7 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
         if(is_max_bsp && number_of_supersteps>0) // can ignore last 2 comm phases in this case
             --number_of_supersteps;
 
+        schedule.getCommunicationSchedule().clear();
         for (const auto &node : instance.vertices()) {
 
             for (unsigned int p_from = 0; p_from < instance.numberOfProcessors(); p_from++) {
@@ -310,6 +311,7 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
             }
         }
 
+        schedule.getCommunicationSchedule().clear();
         for (unsigned int node = 0; node < schedule.getInstance().numberOfVertices(); node++) {
 
             for (unsigned int p_from = 0; p_from < schedule.getInstance().numberOfProcessors(); p_from++) {
@@ -405,11 +407,11 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
             }
             else
             {
-                first_at[node][initial_schedule->assignedProcessor(node)] = std::min(first_at[node][initial_schedule->assignedProcessor(node)], 
+                first_at[node][initial_schedule->assignedProcessor(node)] = std::min(first_at[node][initial_schedule->assignedProcessor(node)],
                                                                                     initial_schedule->assignedSuperstep(node) );
             }
         }
-    
+
         unsigned staleness = is_max_bsp ? 2 : 1;
         for (const auto &node : DAG.vertices()) {
 
@@ -452,10 +454,10 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
                                                                                         [static_cast<int>(node)], 1);
                     else
                         model.SetMipStart(comm_processor_to_processor_superstep_node_var[proc][proc][step]
-                                                                                        [static_cast<int>(node)], 0); 
+                                                                                        [static_cast<int>(node)], 0);
                 }
 
-        for (const auto &node : DAG.vertices()) {            
+        for (const auto &node : DAG.vertices()) {
 
             for (unsigned proc = 0; proc < num_processors; proc++) {
 
@@ -548,10 +550,19 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
         // variables indicating if superstep is used at all
         superstep_used_var = model.AddVars(static_cast<int>(max_number_supersteps), COPT_BINARY, "superstep_used");
 
+        VarArray superstep_has_comm, mergeable_superstep_penalty;
+        if(is_max_bsp)
+        {
+            // variables indicating if there is any communication in superstep
+            superstep_has_comm = model.AddVars(static_cast<int>(max_number_supersteps), COPT_BINARY, "superstep_has_comm");
+            // variables that incentivize the schedule to be continuous - needs to be done differently for maxBsp
+            mergeable_superstep_penalty = model.AddVars(static_cast<int>(max_number_supersteps), COPT_BINARY, "mergeable_superstep_penalty");
+        }
+
+        // variables for assigments of nodes to processor and superstep
         node_to_processor_superstep_var = std::vector<std::vector<VarArray>>(
             instance.numberOfVertices(), std::vector<VarArray>(instance.numberOfProcessors()));
 
-        // variables for assigments of nodes to processor and superstep
         for (const auto &node : instance.vertices()) {
 
             for (unsigned int processor = 0; processor < instance.numberOfProcessors(); processor++) {
@@ -598,7 +609,7 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
                 }
             }
             model.AddConstr(expr <= static_cast<double>(instance.numberOfVertices() * instance.numberOfProcessors()) *
-                                        superstep_used_var.GetVar(static_cast<int>(step)));
+                                        superstep_used_var[static_cast<int>(step)]);
         }
 
         // nodes are assigend depending on whether recomputation is allowed or not
@@ -688,6 +699,29 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
             }
         }
 
+        // synchronization cost calculation & forcing continuous schedule in maxBsp
+        if(is_max_bsp)
+        {
+            for (unsigned int step = 0; step < max_number_supersteps; step++) {
+                Expr expr;
+                for (const auto &node : instance.vertices()) {
+                    for (unsigned int p_from = 0; p_from < instance.numberOfProcessors(); p_from++) {
+                        for (unsigned int p_to = 0; p_to < instance.numberOfProcessors(); p_to++) {
+                            if(p_from != p_to)
+                                expr += comm_processor_to_processor_superstep_node_var[p_from][p_to][step][static_cast<int>(node)];
+                        }
+                    }
+                }
+                model.AddConstr(static_cast<unsigned>(instance.numberOfProcessors() * instance.numberOfProcessors() * instance.numberOfVertices()) *
+                                superstep_has_comm[static_cast<int>(step)] >= expr);
+            }
+
+            // if step i and (i+1) has no comm, and (i+2) has work, then (i+1) and (i+2) are mergeable -> penalize
+            for (unsigned int step = 0; step < max_number_supersteps - 2; step++)
+                model.AddConstr(superstep_used_var[static_cast<int>(step + 2)] - superstep_has_comm[static_cast<int>(step)]
+                                - superstep_has_comm[static_cast<int>(step + 1)] <= mergeable_superstep_penalty[static_cast<int>(step)]);
+        }
+
         max_comm_superstep_var =
             model.AddVars(static_cast<int>(max_number_supersteps), COPT_INTEGER, "max_comm_superstep");
         // coptModel.AddVars(max_number_supersteps, 0, COPT_INFINITY, 0, COPT_INTEGER, "max_comm_superstep");
@@ -770,10 +804,10 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
                 model.AddConstr(max_superstep_var[static_cast<int>(step)] >= max_work_superstep_var[static_cast<int>(step)]);
                 if(step > 0)
                     model.AddConstr(max_superstep_var[static_cast<int>(step)] >= instance.communicationCosts() * max_comm_superstep_var[static_cast<int>(step-1)]);
-                expr += max_superstep_var[static_cast<int>(step)]; +
-                        instance.synchronisationCosts() * superstep_used_var[static_cast<int>(step)];
+                expr += max_superstep_var[static_cast<int>(step)];
+                expr += instance.synchronisationCosts() * superstep_has_comm[static_cast<int>(step)];
+                expr += instance.synchronisationCosts() * mergeable_superstep_penalty[static_cast<int>(step)];
             }
-
         }
         else
         {
@@ -782,9 +816,10 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
                         instance.communicationCosts() * max_comm_superstep_var[static_cast<int>(step)] +
                         instance.synchronisationCosts() * superstep_used_var[static_cast<int>(step)];
             }
+            expr -= instance.synchronisationCosts();
         }
 
-        model.SetObjective(expr - instance.synchronisationCosts(), COPT_MINIMIZE);
+        model.SetObjective(expr, COPT_MINIMIZE);
     }
 
     RETURN_STATUS run_scheduler(BspScheduleCS<Graph_t> &schedule) {
@@ -824,7 +859,7 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
 
   public:
     CoptFullScheduler(unsigned steps = 5)
-        : allow_recomputation(false), use_memory_constraint(false), use_initial_schedule(false), 
+        : allow_recomputation(false), use_memory_constraint(false), use_initial_schedule(false),
           write_solutions_found(false), initial_schedule(0), max_number_supersteps(steps) {
 
         // solution_callback.comm_processor_to_processor_superstep_node_var_ptr =
@@ -889,7 +924,7 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
             return status;
         }
     }
-    
+
     virtual RETURN_STATUS computeMaxBspScheduleCS(MaxBspScheduleCS<Graph_t> &schedule) {
         allow_recomputation = false;
         is_max_bsp = true;
@@ -897,7 +932,7 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
     }
 
 
-    virtual RETURN_STATUS computeScheduleCS(BspScheduleCS<Graph_t> &schedule) override { 
+    virtual RETURN_STATUS computeScheduleCS(BspScheduleCS<Graph_t> &schedule) override {
         allow_recomputation = false;
         is_max_bsp = false;
         return run_scheduler(schedule);
@@ -942,7 +977,7 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
     };
 
     virtual void computeScheduleBase(const BspScheduleRecomp<Graph_t> &schedule, Model &model) {
-    
+
         if (timeLimitSeconds > 0) {
             model.SetDblParam(COPT_DBLPARAM_TIMELIMIT, timeLimitSeconds);
         }
@@ -1064,6 +1099,13 @@ class CoptFullScheduler : public Scheduler<Graph_t> {
      */
     inline unsigned getMaxNumberOfSupersteps() const { return max_number_supersteps; }
 
+    /**
+     * @brief Sets the time limit for the ILP solving.
+     *
+     * @param time_limit_seconds_ The time limit in seconds.
+     */
+    inline void setTimeLimitSeconds(unsigned time_limit_seconds_) { timeLimitSeconds = time_limit_seconds_; }
+
     /**
      * @brief Get the name of the schedule.
      *
diff --git a/include/osp/bsp/scheduler/IlpSchedulers/CoptPartialScheduler.hpp b/include/osp/bsp/scheduler/IlpSchedulers/CoptPartialScheduler.hpp
index 10fa2243..db9a01f3 100644
--- a/include/osp/bsp/scheduler/IlpSchedulers/CoptPartialScheduler.hpp
+++ b/include/osp/bsp/scheduler/IlpSchedulers/CoptPartialScheduler.hpp
@@ -64,6 +64,8 @@ class CoptPartialScheduler {
     std::vector<std::vector<std::vector<VarArray>>> comm_processor_to_processor_superstep_node_var;
     std::vector<std::vector<VarArray>> comm_to_processor_superstep_source_var;
 
+    bool has_fixed_comm_in_preceding_step;
+
     void setupVariablesConstraintsObjective(const BspScheduleCS<Graph_t>& schedule, Model& model);
 
     void setInitialSolution(const BspScheduleCS<Graph_t>& schedule, Model &model);
@@ -156,9 +158,9 @@ void CoptPartialScheduler<Graph_t>::setInitialSolution(const BspScheduleCS<Graph
                 for (unsigned step = 0; step < max_number_supersteps && step <= end_superstep - start_superstep; step++) {
 
                     const auto &key = std::make_tuple(node, p1, p2);
-                    if (cs.find(key) != cs.end() && cs.at(key) == start_superstep + step) 
+                    if (cs.find(key) != cs.end() && cs.at(key) == start_superstep + step)
                         model.SetMipStart(comm_processor_to_processor_superstep_node_var[p1][p2][step][static_cast<int>(node_local_ID[node])], 1);
-                    else 
+                    else
                         model.SetMipStart(comm_processor_to_processor_superstep_node_var[p1][p2][step][static_cast<int>(node_local_ID[node])], 0);
                 }
             }
@@ -178,7 +180,7 @@ void CoptPartialScheduler<Graph_t>::setInitialSolution(const BspScheduleCS<Graph
             for (unsigned step = 0; step < max_number_supersteps + 1 && step <= end_superstep - start_superstep + 1; step++) {
 
                 const auto &key = std::make_tuple(source, schedule.assignedProcessor(source), proc);
-                if (cs.find(key) != cs.end() && cs.at(key) == start_superstep + step - 1) 
+                if (cs.find(key) != cs.end() && cs.at(key) == start_superstep + step - 1)
                     model.SetMipStart(comm_to_processor_superstep_source_var[proc][step][static_cast<int>(source_local_ID[source])], 1);
                 else if(step > 0)
                     model.SetMipStart(comm_to_processor_superstep_source_var[proc][step][static_cast<int>(source_local_ID[source])], 0);
@@ -296,6 +298,8 @@ void CoptPartialScheduler<Graph_t>::setupVariablesConstraintsObjective(const Bsp
     */
     // variables indicating if superstep is used at all
     superstep_used_var = model.AddVars(static_cast<int>(max_number_supersteps), COPT_BINARY, "superstep_used");
+    VarArray superstep_has_comm = model.AddVars(static_cast<int>(max_number_supersteps+1), COPT_BINARY, "superstep_has_comm");
+    VarArray has_comm_at_end = model.AddVars(1, COPT_BINARY, "has_comm_at_end");
 
     // variables for assigments of nodes to processor and superstep
     node_to_processor_superstep_var = std::vector<std::vector<VarArray>>(num_vertices, std::vector<VarArray>(num_processors));
@@ -333,7 +337,7 @@ void CoptPartialScheduler<Graph_t>::setupVariablesConstraintsObjective(const Bsp
 
             comm_to_processor_superstep_source_var[proc][step] =
                 model.AddVars(static_cast<int>(num_sources), COPT_BINARY, "comm_to_processor_superstep_source");
-            
+
             if(step < max_number_supersteps)
                 present_on_processor_superstep_source_var[proc][step] =
                     model.AddVars(static_cast<int>(num_sources), COPT_BINARY, "present_on_processor_superstep_source");
@@ -357,29 +361,59 @@ void CoptPartialScheduler<Graph_t>::setupVariablesConstraintsObjective(const Bsp
         model.AddConstr(superstep_used_var[static_cast<int>(step)] >= superstep_used_var[static_cast<int>(step + 1)]);
     }
 
-    // superstep is used at all
-    unsigned large_constant = static_cast<unsigned>(num_vertices+num_sources) * num_processors * num_processors * 2;
+    // check whether superstep is used at all (work or comm), and whether superstep has any communication at all
+    unsigned large_constant_work = static_cast<unsigned>(num_vertices) * num_processors;
+    unsigned large_constant_comm = static_cast<unsigned>(num_vertices+num_sources) * num_processors * num_processors + static_cast<unsigned>(fixed_comm_steps.size());
     for (unsigned int step = 0; step < max_number_supersteps; step++) {
 
-        Expr expr;
+        Expr expr_work, expr_comm;
         for (vertex_idx_t<Graph_t> node = 0; node < num_vertices; node++) {
 
             for (unsigned int processor = 0; processor < num_processors; processor++) {
-                expr += node_to_processor_superstep_var[node][processor][static_cast<int>(step)];
-                
+                expr_work += node_to_processor_superstep_var[node][processor][static_cast<int>(step)];
+
                 for (unsigned int p_other = 0; p_other < num_processors; p_other++)
                     if(processor != p_other)
-                        expr += comm_processor_to_processor_superstep_node_var[processor][p_other][step][static_cast<int>(node)];
+                        expr_comm += comm_processor_to_processor_superstep_node_var[processor][p_other][step][static_cast<int>(node)];
             }
         }
         for (vertex_idx_t<Graph_t> source = 0; source < num_sources; source++)
             for (unsigned int processor = 0; processor < num_processors; processor++)
                 if(source_present_before.find(std::make_pair(source, processor)) == source_present_before.end())
-                    expr += comm_to_processor_superstep_source_var[processor][step+1][static_cast<int>(source)]; 
+                    expr_comm += comm_to_processor_superstep_source_var[processor][step+1][static_cast<int>(source)];
+
+        for (unsigned index = 0; index < fixed_comm_steps.size(); ++index)
+            if(std::get<3>(fixed_comm_steps[index]) == start_superstep + step)
+                expr_comm += keep_fixed_comm_step[static_cast<int>(index)];
 
-        model.AddConstr(expr <= large_constant * superstep_used_var[static_cast<int>(step)]);
+        model.AddConstr(expr_comm <= large_constant_comm * superstep_has_comm[static_cast<int>(step+1)]);
+        model.AddConstr(expr_work <= large_constant_work * superstep_used_var[static_cast<int>(step)]);
+        model.AddConstr(superstep_has_comm[static_cast<int>(step+1)] <= superstep_used_var[static_cast<int>(step)]);
     }
 
+    // check communication usage in edge case: comm phase before the segment
+    if(has_fixed_comm_in_preceding_step)
+        model.AddConstr(superstep_has_comm[0] == 1);
+    else {
+        Expr expr_comm_0;
+        for (vertex_idx_t<Graph_t> source = 0; source < num_sources; source++)
+            for (unsigned int processor = 0; processor < num_processors; processor++)
+                if(source_present_before.find(std::make_pair(source, processor)) == source_present_before.end())
+                    expr_comm_0 += comm_to_processor_superstep_source_var[processor][0][static_cast<int>(source)];
+        for (unsigned index = 0; index < fixed_comm_steps.size(); ++index)
+            expr_comm_0 += 1 - keep_fixed_comm_step[static_cast<int>(index)];
+        model.AddConstr(expr_comm_0 <= (static_cast<unsigned>(num_sources) * num_processors + static_cast<unsigned>(fixed_comm_steps.size())) * superstep_has_comm[0]);
+    }
+
+    // check if there is any communication at the end of the subschedule
+    for (unsigned int step = 0; step < max_number_supersteps - 1; step++)
+    {
+        model.AddConstr(superstep_used_var[static_cast<int>(step)] - superstep_used_var[static_cast<int>(step + 1)] +
+                        superstep_has_comm[static_cast<int>(step+1)] - 1 <= has_comm_at_end[0]);
+    }
+    model.AddConstr(superstep_used_var[static_cast<int>(max_number_supersteps - 1)] +
+                        superstep_has_comm[static_cast<int>(max_number_supersteps)] - 1 <= has_comm_at_end[0]);
+
     // nodes are assigend
     for (vertex_idx_t<Graph_t> node = 0; node < num_vertices; node++) {
 
@@ -421,7 +455,7 @@ void CoptPartialScheduler<Graph_t>::setupVariablesConstraintsObjective(const Bsp
             }
         }
     }
-    
+
     // combines two constraints: node can only be communicated if it is present; and node is present if it was computed
     // or communicated
     for (unsigned int step = 0; step < max_number_supersteps; step++) {
@@ -602,8 +636,10 @@ void CoptPartialScheduler<Graph_t>::setupVariablesConstraintsObjective(const Bsp
     }
 
     expr += schedule.getInstance().communicationCosts() * max_comm_superstep_var[0];
+    expr += schedule.getInstance().synchronisationCosts() * superstep_has_comm[0];
+    expr += schedule.getInstance().synchronisationCosts() * has_comm_at_end[0];
 
-    model.SetObjective(expr, COPT_MINIMIZE);
+    model.SetObjective(expr - schedule.getInstance().synchronisationCosts(), COPT_MINIMIZE);
 };
 
 template<typename Graph_t>
@@ -671,12 +707,12 @@ void CoptPartialScheduler<Graph_t>::setupVertexMaps(const BspScheduleCS<Graph_t>
             for(unsigned proc2 = 0; proc2 < schedule.getInstance().numberOfProcessors(); ++proc2)
             {
                 if(proc1 == proc2)
-                    continue;          
+                    continue;
                 auto itr = schedule.getCommunicationSchedule().find(std::make_tuple(source, proc1, proc2));
                 if (itr != schedule.getCommunicationSchedule().end() && itr->second > end_superstep)
                     procs_needing_this.insert(schedule.assignedProcessor(proc1));
             }
-        
+
         for(unsigned proc : procs_needing_this)
             if(first_at[source][proc] >= start_superstep && first_at[source][proc] <= end_superstep + 1)
                 source_needed_after_on_proc.emplace_back(source_and_ID.second, proc);
@@ -692,12 +728,12 @@ void CoptPartialScheduler<Graph_t>::setupVertexMaps(const BspScheduleCS<Graph_t>
 
         for(unsigned proc1 = 0; proc1 < schedule.getInstance().numberOfProcessors(); ++proc1)
             for(unsigned proc2 = 0; proc2 < schedule.getInstance().numberOfProcessors(); ++proc2)
-            {                
+            {
                 auto itr = schedule.getCommunicationSchedule().find(std::make_tuple(node, proc1, proc2));
                 if (itr != schedule.getCommunicationSchedule().end() && proc1 != proc2 && itr->second > end_superstep)
                     procs_needing_this.insert(schedule.assignedProcessor(proc1));
             }
-        
+
         for(unsigned proc : procs_needing_this)
             if(first_at[node][proc] <= end_superstep + 1)
                 node_needed_after_on_proc.emplace_back(node_and_ID.second, proc);
@@ -705,13 +741,18 @@ void CoptPartialScheduler<Graph_t>::setupVertexMaps(const BspScheduleCS<Graph_t>
 
 
     // comm steps that just happen to be in this interval, but not connected to the nodes within
+    has_fixed_comm_in_preceding_step = false;
     for (const auto &[key, val] : schedule.getCommunicationSchedule())
     {
         vertex_idx_t<Graph_t> source = std::get<0>(key);
-        if(source_local_ID.find(source) == source_local_ID.end() && 
+        if(source_local_ID.find(source) == source_local_ID.end() &&
             schedule.assignedSuperstep(source) < start_superstep &&
             val >= start_superstep - 1 && val <= end_superstep)
+            {
                 fixed_comm_steps.emplace_back(std::get<0>(key), std::get<1>(key), std::get<2>(key), val);
+                if(val == start_superstep - 1)
+                    has_fixed_comm_in_preceding_step = true;
+            }
     }
 
 };
diff --git a/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp b/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp
index 362b0744..5d759687 100644
--- a/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp
+++ b/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp
@@ -289,6 +289,8 @@ class TotalCommunicationScheduler : public Scheduler<Graph_t> {
   protected:
     unsigned int max_number_supersteps;
 
+    unsigned time_limit_seconds;
+
     VarArray superstep_used_var;
     std::vector<std::vector<VarArray>> node_to_processor_superstep_var;
     std::vector<std::vector<VarArray>> edge_vars;
@@ -633,7 +635,7 @@ class TotalCommunicationScheduler : public Scheduler<Graph_t> {
 
     virtual ~TotalCommunicationScheduler() = default;
 
-    virtual RETURN_STATUS computeScheduleWithTimeLimit(BspSchedule<Graph_t> &schedule, unsigned timeout) override {
+    virtual RETURN_STATUS computeScheduleWithTimeLimit(BspSchedule<Graph_t> &schedule, unsigned timeout) {
         model.SetDblParam(COPT_DBLPARAM_TIMELIMIT, timeout);
         return computeSchedule(schedule);
     }
@@ -668,7 +670,7 @@ class TotalCommunicationScheduler : public Scheduler<Graph_t> {
             loadInitialSchedule();
         }
 
-        
+
         model.SetIntParam(COPT_INTPARAM_THREADS, 128);
         model.SetIntParam(COPT_INTPARAM_STRONGBRANCHING, 1);
         model.SetIntParam(COPT_INTPARAM_LPMETHOD, 1);
@@ -846,6 +848,13 @@ class TotalCommunicationScheduler : public Scheduler<Graph_t> {
      */
     inline double bestBound() { return model.GetDblAttr(COPT_DBLATTR_BESTBND); }
 
+    /**
+     * @brief Sets the time limit for the ILP solving.
+     *
+     * @param time_limit_seconds_ The time limit in seconds.
+     */
+    inline void setTimeLimitSeconds(unsigned time_limit_seconds_) { time_limit_seconds = time_limit_seconds_; }
+
     /**
      * @brief Get the name of the schedule.
      *
diff --git a/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp b/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp
index f876229b..48a983a6 100644
--- a/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp
+++ b/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp
@@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 
-@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner   
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
 */
 
 #pragma once
@@ -165,7 +165,7 @@ template<typename Graph_t>
 RETURN_STATUS HillClimbingScheduler<Graph_t>::improveScheduleWithStepLimit(BspSchedule<Graph_t> &input_schedule, const unsigned stepLimit) {
 
     schedule = &input_schedule;
-    
+
     CreateSupstepLists();
     Init();
     for (unsigned step = 0; step < stepLimit; ++step)
@@ -208,7 +208,7 @@ void HillClimbingScheduler<Graph_t>::Init() {
             else
                 succSteps[node][schedule->assignedProcessor(succ)].at(schedule->assignedSuperstep(succ)) += 1;
         }
-    
+
     // Cost data
     workCost.clear();
     workCost.resize(M, std::vector<cost_type>(P, 0));
@@ -278,13 +278,12 @@ void HillClimbingScheduler<Graph_t>::Init() {
             commCostPointer[step][proc] = commCostList[step].insert(entry).first;
         }
         cost_type comm_cost = schedule->getInstance().getArchitecture().communicationCosts() * commCostList[step].rbegin()->first;
-        if(comm_cost > 0)
-                comm_cost += schedule->getInstance().getArchitecture().synchronisationCosts();
-        
+        cost_type sync_cost = (comm_cost > 0) ? schedule->getInstance().getArchitecture().synchronisationCosts() : 0;
+
         if(schedule->getStaleness() == 1)
-            cost += comm_cost + work_cost[step+1];
+            cost += comm_cost + work_cost[step+1] + sync_cost;
         else
-            cost += std::max(comm_cost, work_cost[step+1]);
+            cost += std::max(comm_cost, work_cost[step+1]) + sync_cost;
     }
 
     updatePromisingMoves();
@@ -320,7 +319,7 @@ void HillClimbingScheduler<Graph_t>::updatePromisingMoves()
         for(unsigned proc=0; proc<P; ++proc)
             if(schedule->assignedProcessor(node)!=proc && nrPredOnProc[proc]>0)
                 ++otherProcUsed;
-                
+
         if(otherProcUsed==1)
             for(unsigned proc=0; proc<P; ++proc)
                 if(schedule->assignedProcessor(node)!=proc && nrPredOnProc[proc]>0 && schedule->getInstance().isCompatible(node,proc))
@@ -381,7 +380,7 @@ void HillClimbingScheduler<Graph_t>::updateNodeMovesEarlier(const vertex_idx nod
     if (schedule->assignedSuperstep(node) == 0)
         return;
 
-    std::set<unsigned> predProc; 
+    std::set<unsigned> predProc;
     for (const vertex_idx &pred : schedule->getInstance().getComputationalDag().parents(node)) {
         if (schedule->assignedSuperstep(pred) == schedule->assignedSuperstep(node))
             return;
@@ -461,7 +460,7 @@ template<typename Graph_t>
 void HillClimbingScheduler<Graph_t>::updateMoveOptions(vertex_idx node, int where)
 {
     const Graph_t &G = schedule->getInstance().getComputationalDag();
-    
+
     updateNodeMoves(node);
     if(where==0)
     {
@@ -632,7 +631,7 @@ int HillClimbingScheduler<Graph_t>::moveCostChange(const vertex_idx node, unsign
 
             unsigned affectedStep = succSteps[node][j].begin()->first - schedule->getStaleness();
             if (j == p) {
-                sentInc.emplace_back(affectedStep, oldProc, 
+                sentInc.emplace_back(affectedStep, oldProc,
                                      -static_cast<int>(schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(oldProc, j)));
                 recInc.emplace_back(affectedStep, p, -static_cast<int>(schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(oldProc, j)));
             } else if (j == oldProc) {
@@ -738,8 +737,7 @@ int HillClimbingScheduler<Graph_t>::moveCostChange(const vertex_idx node, unsign
     bool last_affected_empty = false;
     for (const unsigned sstep : affectedSteps) {
         cost_type oldMax = schedule->getInstance().getArchitecture().communicationCosts() * commCostList[sstep].rbegin()->first;
-        if(HCwithLatency && oldMax > 0)
-            oldMax += schedule->getInstance().getArchitecture().synchronisationCosts();
+        cost_type oldSync = (HCwithLatency && oldMax > 0) ? schedule->getInstance().getArchitecture().synchronisationCosts() : 0;
 
         cost_type newMax = 0;
         for (unsigned j = 0; j < schedule->getInstance().getArchitecture().numberOfProcessors(); ++j) {
@@ -755,9 +753,8 @@ int HillClimbingScheduler<Graph_t>::moveCostChange(const vertex_idx node, unsign
                 newMax = static_cast<cost_type>(static_cast<int>(received[sstep][j]) + diff);
         }
         newMax *= schedule->getInstance().getArchitecture().communicationCosts();
-        if(HCwithLatency && newMax > 0)
-            newMax += schedule->getInstance().getArchitecture().synchronisationCosts();
-        
+        cost_type newSync = (HCwithLatency && newMax > 0) ? schedule->getInstance().getArchitecture().synchronisationCosts() : 0;
+
         if(newMax == 0)
         {
             if(schedule->getStaleness() == 1)
@@ -780,7 +777,7 @@ int HillClimbingScheduler<Graph_t>::moveCostChange(const vertex_idx node, unsign
             oldMax = std::max(oldMax, workCostList[sstep+1].rbegin()->first);
             newMax = std::max(newMax, itrWork != newWorkCost.end() ? itrWork->second : workCostList[sstep+1].rbegin()->first);
         }
-        change += static_cast<int>(newMax) - static_cast<int>(oldMax);
+        change += static_cast<int>(newMax + newSync) - static_cast<int>(oldMax + oldSync);
     }
 
     changing.newCost = static_cast<cost_type>(static_cast<int>(cost) + change);
@@ -881,7 +878,7 @@ bool HillClimbingScheduler<Graph_t>::Improve() {
 
         if(!canMove[static_cast<Direction>(where)][node][proc])
             continue;
-        
+
         if(use_memory_constraint && violatesMemConstraint(node, proc, where-1))
             continue;
 
@@ -893,7 +890,7 @@ bool HillClimbingScheduler<Graph_t>::Improve() {
             executeMove(node, proc, where-1, moveData);
             if(shrink && moveData.canShrink)
                 Init();
-            
+
             return true;
         }
 
@@ -966,13 +963,13 @@ bool HillClimbingScheduler<Graph_t>::violatesMemConstraint(vertex_idx node, unsi
     if(memory_used[processor][static_cast<unsigned>(static_cast<int>(schedule->assignedSuperstep(node))+where)]
         + schedule->getInstance().getComputationalDag().vertex_mem_weight(node) > schedule->getInstance().memoryBound(processor)) // TODO ANDRAS double check change
         return true;
-    
+
     return false;
 }
 
 template<typename Graph_t>
 void HillClimbingScheduler<Graph_t>::CreateSupstepLists() {
-    
+
     const unsigned P = schedule->getInstance().getArchitecture().numberOfProcessors();
     const Graph_t &G = schedule->getInstance().getComputationalDag();
 
diff --git a/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing_for_comm_schedule.hpp b/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing_for_comm_schedule.hpp
index fd9aa352..ba895b70 100644
--- a/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing_for_comm_schedule.hpp
+++ b/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing_for_comm_schedule.hpp
@@ -13,17 +13,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 
-@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner   
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
 */
 
 #pragma once
 
 #include "osp/bsp/model/BspScheduleCS.hpp"
-#include "osp/bsp/model/BspScheduleCostEvaluator.hpp"
+#include "osp/bsp/model/cost/CostModelHelpers.hpp"
 #include "osp/bsp/scheduler/Scheduler.hpp"
 #include "osp/graph_algorithms/directed_graph_top_sort.hpp"
 
-namespace osp{
+namespace osp {
 
 template<typename Graph_t>
 class HillClimbingForCommSteps {
@@ -44,8 +44,8 @@ class HillClimbingForCommSteps {
     std::vector<std::vector<unsigned>> commSchedule;
     std::vector<std::vector<std::list<vertex_idx>>> supsteplists;
     std::vector<std::set<std::pair<cost_type, unsigned>>> commCostList;
-    std::vector<std::vector<typename std::set<std::pair<cost_type, unsigned> >::iterator>> commCostPointer;
-    std::vector<std::vector<cost_type> > sent, received, commCost;
+    std::vector<std::vector<typename std::set<std::pair<cost_type, unsigned>>::iterator>> commCostPointer;
+    std::vector<std::vector<cost_type>> sent, received, commCost;
     std::vector<std::vector<std::pair<unsigned, unsigned>>> commBounds;
     std::vector<std::vector<std::list<std::pair<vertex_idx, unsigned>>>> commSchedSendLists;
     std::vector<std::vector<typename std::list<std::pair<vertex_idx, unsigned>>::iterator>> commSchedSendListPointer;
@@ -60,7 +60,7 @@ class HillClimbingForCommSteps {
     // Initialize data structures (based on current schedule)
     void Init();
 
-        // compute cost change incurred by a potential move
+    // compute cost change incurred by a potential move
     int moveCostChange(vertex_idx node, unsigned p, unsigned step);
 
     // execute a move, updating the comm. schedule and the data structures
@@ -79,11 +79,11 @@ class HillClimbingForCommSteps {
 
     virtual RETURN_STATUS improveSchedule(BspScheduleCS<Graph_t> &input_schedule);
 
-    //call with time limit
+    // call with time limit
     virtual RETURN_STATUS improveScheduleWithTimeLimit(BspScheduleCS<Graph_t> &input_schedule, const unsigned timeLimit);
 
-    //setting parameters
-    void setSteepestAscend(bool steepestAscent_) {steepestAscent = steepestAscent_;}
+    // setting parameters
+    void setSteepestAscend(bool steepestAscent_) { steepestAscent = steepestAscent_; }
 
     virtual std::string getScheduleName() const { return "HillClimbingForCommSchedule"; }
 };
@@ -100,11 +100,11 @@ RETURN_STATUS HillClimbingForCommSteps<Graph_t>::improveScheduleWithTimeLimit(Bs
 
     schedule = &input_schedule;
 
-    if(schedule->numberOfSupersteps() <= 2)
+    if (schedule->numberOfSupersteps() <= 2)
         return RETURN_STATUS::OSP_SUCCESS;
 
     Init();
-    //ConvertCommSchedule();
+    // ConvertCommSchedule();
     const std::chrono::steady_clock::time_point startTime = std::chrono::steady_clock::now();
 
     unsigned counter = 0;
@@ -119,14 +119,11 @@ RETURN_STATUS HillClimbingForCommSteps<Graph_t>::improveScheduleWithTimeLimit(Bs
             }
         }
 
-
     ConvertCommSchedule();
 
     return RETURN_STATUS::OSP_SUCCESS;
-
 }
 
-
 // Initialization for comm. schedule hill climbing
 template<typename Graph_t>
 void HillClimbingForCommSteps<Graph_t>::Init() {
@@ -136,7 +133,7 @@ void HillClimbingForCommSteps<Graph_t>::Init() {
     const Graph_t &G = schedule->getInstance().getComputationalDag();
 
     CreateSupstepLists();
-    cost = schedule->computeCosts()-schedule->computeWorkCosts();
+    cost = schedule->computeCosts();
 
     nextSupstep = 0;
     commSchedule.clear();
@@ -154,13 +151,13 @@ void HillClimbingForCommSteps<Graph_t>::Init() {
     commBounds.clear();
     commBounds.resize(N, std::vector<std::pair<unsigned, unsigned>>(P));
     commSchedSendLists.clear();
-    commSchedSendLists.resize(M - 1, std::vector<std::list<std::pair<vertex_idx, unsigned> >>(P));
+    commSchedSendLists.resize(M - 1, std::vector<std::list<std::pair<vertex_idx, unsigned>>>(P));
     commSchedRecLists.clear();
-    commSchedRecLists.resize(M - 1, std::vector<std::list<std::pair<vertex_idx, unsigned> >>(P));
+    commSchedRecLists.resize(M - 1, std::vector<std::list<std::pair<vertex_idx, unsigned>>>(P));
     commSchedSendListPointer.clear();
-    commSchedSendListPointer.resize(N, std::vector<typename std::list<std::pair<vertex_idx, unsigned> >::iterator>(P));
+    commSchedSendListPointer.resize(N, std::vector<typename std::list<std::pair<vertex_idx, unsigned>>::iterator>(P));
     commSchedRecListPointer.clear();
-    commSchedRecListPointer.resize(N, std::vector<typename std::list<std::pair<vertex_idx, unsigned> >::iterator>(P));
+    commSchedRecListPointer.resize(N, std::vector<typename std::list<std::pair<vertex_idx, unsigned>>::iterator>(P));
 
     // initialize to lazy comm schedule first - to make sure it's correct even if e.g. com scehdule has indirect sending
     for (unsigned step = 1; step < M; ++step)
@@ -169,24 +166,22 @@ void HillClimbingForCommSteps<Graph_t>::Init() {
                 for (const vertex_idx &pred : G.parents(node))
                     if (schedule->assignedProcessor(pred) != schedule->assignedProcessor(node) &&
                         commSchedule[pred][schedule->assignedProcessor(node)] == UINT_MAX) {
-                            commSchedule[pred][schedule->assignedProcessor(node)] = step - schedule->getStaleness();
-                            commBounds[pred][schedule->assignedProcessor(node)] = std::make_pair(schedule->assignedSuperstep(pred), step - schedule->getStaleness());
+                        commSchedule[pred][schedule->assignedProcessor(node)] = step - schedule->getStaleness();
+                        commBounds[pred][schedule->assignedProcessor(node)] = std::make_pair(schedule->assignedSuperstep(pred), step - schedule->getStaleness());
                     }
 
     // overwrite with original comm schedule, wherever possible
     const std::map<std::tuple<vertex_idx, unsigned, unsigned>, unsigned int> originalCommSchedule = schedule->getCommunicationSchedule();
-    for(vertex_idx node = 0; node < N; ++node)
-        for (unsigned proc = 0; proc < P; ++proc)
-        {
-            if(commSchedule[node][proc] == UINT_MAX )
+    for (vertex_idx node = 0; node < N; ++node)
+        for (unsigned proc = 0; proc < P; ++proc) {
+            if (commSchedule[node][proc] == UINT_MAX)
                 continue;
-            
+
             const auto comm_schedule_key = std::make_tuple(node, schedule->assignedProcessor(node), proc);
             auto mapIterator = originalCommSchedule.find(comm_schedule_key);
-            if (mapIterator != originalCommSchedule.end())
-            {
+            if (mapIterator != originalCommSchedule.end()) {
                 unsigned originalStep = mapIterator->second;
-                if(originalStep >= commBounds[node][proc].first && originalStep <= commBounds[node][proc].second)
+                if (originalStep >= commBounds[node][proc].first && originalStep <= commBounds[node][proc].second)
                     commSchedule[node][proc] = originalStep;
             }
 
@@ -197,30 +192,25 @@ void HillClimbingForCommSteps<Graph_t>::Init() {
             commSchedRecLists[step][proc].emplace_front(node, proc);
             commSchedRecListPointer[node][proc] =
                 commSchedRecLists[step][proc].begin();
-            
+
             sent[step][schedule->assignedProcessor(node)] +=
-                            schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(schedule->assignedProcessor(node), proc);
+                schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(schedule->assignedProcessor(node), proc);
             received[step][proc] +=
-                            schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(schedule->assignedProcessor(node), proc);
-                    
-
+                schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(schedule->assignedProcessor(node), proc);
         }
-    
+
     for (unsigned step = 0; step < M - 1; ++step)
-        for (unsigned proc = 0; proc < P; ++proc)
-        {
+        for (unsigned proc = 0; proc < P; ++proc) {
             commCost[step][proc] = std::max(sent[step][proc], received[step][proc]);
             commCostPointer[step][proc] = commCostList[step].emplace(commCost[step][proc], proc).first;
         }
 
     // set minimum cost - differs for BSP and MaxBSP
     minimum_cost_per_superstep.clear();
-    if(schedule->getStaleness() == 1)
-        minimum_cost_per_superstep.resize(M-1, 0);
-    else
-    {
-        BspScheduleCostEvaluator<Graph_t> evaluator(*schedule);
-        minimum_cost_per_superstep = evaluator.compute_max_work_per_step_helper();
+    if (schedule->getStaleness() == 1)
+        minimum_cost_per_superstep.resize(M - 1, 0);
+    else {
+        minimum_cost_per_superstep = cost_helpers::compute_max_work_per_step(*schedule);
         minimum_cost_per_superstep.erase(minimum_cost_per_superstep.begin());
     }
 }
@@ -234,13 +224,12 @@ int HillClimbingForCommSteps<Graph_t>::moveCostChange(const vertex_idx node, con
 
     // Change at old place
     auto itr = commCostList[oldStep].rbegin();
-    cost_type oldMax = std::max(itr->first * schedule->getInstance().getArchitecture().communicationCosts()
-                                + schedule->getInstance().getArchitecture().synchronisationCosts(), minimum_cost_per_superstep[oldStep]);
+    cost_type oldMax = std::max(itr->first * schedule->getInstance().getArchitecture().communicationCosts(), minimum_cost_per_superstep[oldStep]) + schedule->getInstance().getArchitecture().synchronisationCosts();
     cost_type maxSource =
         std::max(sent[oldStep][sourceProc] - schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p),
                  received[oldStep][sourceProc]);
     cost_type maxTarget = std::max(sent[oldStep][p],
-                                received[oldStep][p] - schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p));
+                                   received[oldStep][p] - schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p));
     cost_type maxOther = 0;
     for (; itr != commCostList[oldStep].rend(); ++itr)
         if (itr->second != sourceProc && itr->second != p) {
@@ -249,23 +238,21 @@ int HillClimbingForCommSteps<Graph_t>::moveCostChange(const vertex_idx node, con
         }
 
     cost_type newMax = std::max(std::max(maxSource, maxTarget), maxOther) * schedule->getInstance().getArchitecture().communicationCosts();
-    if(newMax > 0)
-        newMax += schedule->getInstance().getArchitecture().synchronisationCosts();
-    newMax = std::max(newMax, minimum_cost_per_superstep[oldStep]); 
+    cost_type newSync = (newMax > 0) ? schedule->getInstance().getArchitecture().synchronisationCosts() : 0;
+    newMax = std::max(newMax, minimum_cost_per_superstep[oldStep]) + newSync;
     change += static_cast<int>(newMax) - static_cast<int>(oldMax);
 
     // Change at new place
     oldMax = commCostList[step].rbegin()->first * schedule->getInstance().getArchitecture().communicationCosts();
-    if(oldMax > 0)
-        oldMax += schedule->getInstance().getArchitecture().synchronisationCosts();
+    cost_type oldSync = (oldMax > 0) ? schedule->getInstance().getArchitecture().synchronisationCosts() : 0;
     oldMax = std::max(oldMax, minimum_cost_per_superstep[step]);
-    maxSource = schedule->getInstance().getArchitecture().synchronisationCosts() + schedule->getInstance().getArchitecture().communicationCosts() *
-                (sent[step][sourceProc] + schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p));
-    maxTarget = schedule->getInstance().getArchitecture().synchronisationCosts() + schedule->getInstance().getArchitecture().communicationCosts() *
-                (received[step][p] + schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p));
+    maxSource = schedule->getInstance().getArchitecture().communicationCosts() *
+                    (sent[step][sourceProc] + schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p));
+    maxTarget = schedule->getInstance().getArchitecture().communicationCosts() *
+                    (received[step][p] + schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p));
 
     newMax = std::max(std::max(oldMax, maxSource), maxTarget);
-    change += static_cast<int>(newMax) - static_cast<int>(oldMax);
+    change += static_cast<int>(newMax + schedule->getInstance().getArchitecture().synchronisationCosts()) - static_cast<int>(oldMax + oldSync);
 
     return change;
 }
@@ -335,7 +322,7 @@ bool HillClimbingForCommSteps<Graph_t>::Improve() {
     unsigned startingSupstep = nextSupstep;
 
     // iterate over supersteps
-    while(true) {
+    while (true) {
         auto itr = commCostList[nextSupstep].rbegin();
 
         if (itr == commCostList[nextSupstep].crend())
@@ -343,10 +330,9 @@ bool HillClimbingForCommSteps<Graph_t>::Improve() {
 
         // find maximal comm cost that dominates the h-relation
         const cost_type commMax = itr->first;
-        if (commMax == 0)
-        {
-            nextSupstep = (nextSupstep+1)%(M-1);
-            if(nextSupstep == startingSupstep)
+        if (commMax == 0) {
+            nextSupstep = (nextSupstep + 1) % (M - 1);
+            if (nextSupstep == startingSupstep)
                 break;
             else
                 continue;
@@ -357,7 +343,7 @@ bool HillClimbingForCommSteps<Graph_t>::Improve() {
             const unsigned maxProc = itr->second;
 
             if (sent[nextSupstep][maxProc] == commMax)
-                for (const std::pair<vertex_idx, unsigned>& entry : commSchedSendLists[nextSupstep][maxProc]) {
+                for (const std::pair<vertex_idx, unsigned> &entry : commSchedSendLists[nextSupstep][maxProc]) {
                     const vertex_idx node = entry.first;
                     const unsigned p = entry.second;
                     // iterate over alternative supsteps to place this communication step
@@ -380,7 +366,7 @@ bool HillClimbingForCommSteps<Graph_t>::Improve() {
                 }
 
             if (received[nextSupstep][maxProc] == commMax)
-                for (const std::pair<vertex_idx, unsigned>& entry : commSchedRecLists[nextSupstep][maxProc]) {
+                for (const std::pair<vertex_idx, unsigned> &entry : commSchedRecLists[nextSupstep][maxProc]) {
                     const vertex_idx node = entry.first;
                     const unsigned p = entry.second;
                     // iterate over alternative supsteps to place this communication step
@@ -404,8 +390,8 @@ bool HillClimbingForCommSteps<Graph_t>::Improve() {
                 }
         }
 
-        nextSupstep = (nextSupstep+1)%(M-1);
-        if(nextSupstep == startingSupstep)
+        nextSupstep = (nextSupstep + 1) % (M - 1);
+        if (nextSupstep == startingSupstep)
             break;
     }
 
@@ -419,7 +405,7 @@ bool HillClimbingForCommSteps<Graph_t>::Improve() {
 
 template<typename Graph_t>
 void HillClimbingForCommSteps<Graph_t>::CreateSupstepLists() {
-    
+
     const unsigned P = schedule->getInstance().getArchitecture().numberOfProcessors();
     const Graph_t &G = schedule->getInstance().getComputationalDag();
 
@@ -432,21 +418,18 @@ void HillClimbingForCommSteps<Graph_t>::CreateSupstepLists() {
     const std::vector<vertex_idx> topOrder = GetTopOrder(G);
     for (vertex_idx node : topOrder)
         supsteplists[schedule->assignedSuperstep(node)][schedule->assignedProcessor(node)].push_back(node);
-
 }
 
 template<typename Graph_t>
-void HillClimbingForCommSteps<Graph_t>::ConvertCommSchedule()
-{
+void HillClimbingForCommSteps<Graph_t>::ConvertCommSchedule() {
     const vertex_idx N = static_cast<vertex_idx>(schedule->getInstance().getComputationalDag().num_vertices());
     const unsigned P = schedule->getInstance().getArchitecture().numberOfProcessors();
 
     std::map<std::tuple<vertex_idx, unsigned, unsigned>, unsigned> newCommSchedule;
 
-    for(vertex_idx node=0; node < N; ++node)
-        for(unsigned proc=0; proc < P; ++proc)
-            if(commSchedule[node][proc] != UINT_MAX)
-            {
+    for (vertex_idx node = 0; node < N; ++node)
+        for (unsigned proc = 0; proc < P; ++proc)
+            if (commSchedule[node][proc] != UINT_MAX) {
                 const auto comm_schedule_key = std::make_tuple(node, schedule->assignedProcessor(node), proc);
                 newCommSchedule[comm_schedule_key] = commSchedule[node][proc];
             }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 8a6260bd..21fc5509 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -133,7 +133,7 @@ _add_test( bit_mask )
 _add_test( filereader DATA )
 
 
-## scheduler 
+## scheduler
 if (COPT_FOUND)
 
 #_add_test( ilp_bsp_scheduler )
@@ -160,6 +160,8 @@ _add_test( cuthill_mckee )
 
 _add_test( maxbsp_converter_and_hc )
 
+_add_test( cost_evaluation )
+
 ## pebbling ILPs
 
 if (COPT_FOUND)
diff --git a/tests/bsp_schedule.cpp b/tests/bsp_schedule.cpp
index 8d0a611d..0b587266 100644
--- a/tests/bsp_schedule.cpp
+++ b/tests/bsp_schedule.cpp
@@ -19,21 +19,25 @@ limitations under the License.
 #define BOOST_TEST_MODULE Bsp_Architecture
 #include <boost/test/unit_test.hpp>
 
+#include "osp/auxiliary/io/DotFileWriter.hpp"
+#include "osp/auxiliary/io/arch_file_reader.hpp"
+#include "osp/auxiliary/io/general_file_reader.hpp"
+#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
 #include "osp/bsp/model/BspInstance.hpp"
 #include "osp/bsp/model/BspSchedule.hpp"
 #include "osp/bsp/model/BspScheduleCS.hpp"
+#include "osp/bsp/model/BspScheduleRecomp.hpp"
 #include "osp/bsp/model/MaxBspSchedule.hpp"
 #include "osp/bsp/model/MaxBspScheduleCS.hpp"
-#include "osp/bsp/model/BspScheduleRecomp.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp"
-#include "osp/auxiliary/io/DotFileWriter.hpp"
-#include "osp/auxiliary/io/arch_file_reader.hpp"
-#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
-#include "osp/auxiliary/io/general_file_reader.hpp"
 #include <filesystem>
 #include <iostream>
 
+#include "osp/bsp/model/cost/BufferedSendingCost.hpp"
+#include "osp/bsp/model/cost/LazyCommunicationCost.hpp"
+#include "osp/bsp/model/cost/TotalCommunicationCost.hpp"
+#include "osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/BspLocking.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/CilkScheduler.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/EtfScheduler.hpp"
@@ -70,7 +74,7 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) {
     BOOST_CHECK_EQUAL(instance.getComputationalDag().num_vertices(), 54);
     BOOST_CHECK_EQUAL(instance.getComputationalDag().num_vertex_types(), 1);
 
-    std::vector<Scheduler<graph> *> schedulers = {new BspLocking<graph>(),         new EtfScheduler<graph>(),
+    std::vector<Scheduler<graph> *> schedulers = {new BspLocking<graph>(), new EtfScheduler<graph>(),
                                                   new GreedyBspScheduler<graph>(), new GreedyChildren<graph>(),
                                                   new GrowLocalAutoCores<graph>(), new VarianceFillup<graph>()};
 
@@ -93,8 +97,8 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) {
         BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
 
         BOOST_CHECK_EQUAL(schedule.computeCosts(), expected_bsp_costs[i]);
-        BOOST_CHECK_EQUAL(schedule.computeTotalCosts(), expected_total_costs[i]);
-        BOOST_CHECK_EQUAL(schedule.computeBufferedSendingCosts(), expected_buffered_sending_costs[i]);
+        BOOST_CHECK_EQUAL(TotalCommunicationCost<graph>()(schedule), expected_total_costs[i]);
+        BOOST_CHECK_EQUAL(BufferedSendingCost<graph>()(schedule), expected_buffered_sending_costs[i]);
         BOOST_CHECK_EQUAL(schedule.numberOfSupersteps(), expected_supersteps[i]);
 
         BspScheduleCS<graph> schedule_cs(instance);
@@ -118,7 +122,6 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) {
     BOOST_CHECK_EQUAL(RETURN_STATUS::OSP_SUCCESS, result);
     BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
     BOOST_CHECK_EQUAL(schedule.numberOfSupersteps(), 1);
-
 }
 
 BOOST_AUTO_TEST_CASE(test_schedule_writer) {
@@ -230,7 +233,7 @@ BOOST_AUTO_TEST_CASE(test_bsp_schedule_cs) {
     }
 
     file_reader::readGraph((cwd / "data/spaa/tiny/instance_bicgstab.hdag").string(),
-                                                    instance.getComputationalDag());
+                           instance.getComputationalDag());
 
     BspSchedule<graph> schedule(instance);
     BspLocking<graph> scheduler;
@@ -337,7 +340,7 @@ BOOST_AUTO_TEST_CASE(test_max_bsp_schedule) {
 
     BspInstance<graph> instance;
     instance.setNumberOfProcessors(2);
-    instance.setCommunicationCosts(10); // g=10
+    instance.setCommunicationCosts(10);    // g=10
     instance.setSynchronisationCosts(100); // l=100 (not used in MaxBspSchedule cost model)
 
     auto &dag = instance.getComputationalDag();
@@ -419,7 +422,7 @@ BOOST_AUTO_TEST_CASE(test_max_bsp_schedule_cs) {
 
     BspInstance<graph> instance;
     instance.setNumberOfProcessors(2);
-    instance.setCommunicationCosts(10); // g=10
+    instance.setCommunicationCosts(10);    // g=10
     instance.setSynchronisationCosts(100); // l=100
 
     auto &dag = instance.getComputationalDag();
diff --git a/tests/cost_evaluation.cpp b/tests/cost_evaluation.cpp
new file mode 100644
index 00000000..27f7660c
--- /dev/null
+++ b/tests/cost_evaluation.cpp
@@ -0,0 +1,121 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#define BOOST_TEST_MODULE CostEvaluation
+#include <boost/test/unit_test.hpp>
+
+#include "osp/bsp/model/BspInstance.hpp"
+#include "osp/bsp/model/BspSchedule.hpp"
+#include "osp/bsp/model/cost/BufferedSendingCost.hpp"
+#include "osp/bsp/model/cost/LazyCommunicationCost.hpp"
+#include "osp/bsp/model/cost/TotalCommunicationCost.hpp"
+#include "osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp"
+#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp"
+
+using namespace osp;
+
+BOOST_AUTO_TEST_CASE(test_cost_models_simple_dag) {
+
+    using graph = computational_dag_edge_idx_vector_impl_def_int_t;
+
+    BspInstance<graph> instance;
+    instance.setNumberOfProcessors(2);
+    instance.setCommunicationCosts(10);
+    instance.setSynchronisationCosts(5);
+
+    auto &dag = instance.getComputationalDag();
+    dag.add_vertex(10, 1, 0);
+    dag.add_vertex(20, 2, 0);
+    dag.add_vertex(30, 3, 0);
+    dag.add_vertex(40, 4, 0);
+    dag.add_vertex(50, 5, 0);
+    dag.add_edge(0, 1);
+    dag.add_edge(0, 2);
+    dag.add_edge(1, 4);
+    dag.add_edge(2, 3);
+    dag.add_edge(3, 4);
+
+    BspSchedule<graph> schedule(instance);
+
+    schedule.setAssignedProcessor(0, 0);
+    schedule.setAssignedSuperstep(0, 0);
+    schedule.setAssignedProcessor(1, 0);
+    schedule.setAssignedSuperstep(1, 1);
+    schedule.setAssignedProcessor(2, 1);
+    schedule.setAssignedSuperstep(2, 1);
+    schedule.setAssignedProcessor(3, 1);
+    schedule.setAssignedSuperstep(3, 2);
+    schedule.setAssignedProcessor(4, 1);
+    schedule.setAssignedSuperstep(4, 3);
+    schedule.updateNumberOfSupersteps();
+
+    BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
+    BOOST_CHECK_EQUAL(schedule.numberOfSupersteps(), 4);
+
+    // Work cost (BSP model) = sum of max work per superstep across processors
+    // SS0: max(P0=10, P1=0) = 10
+    // SS1: max(P0=20, P1=30) = 30
+    // SS2: max(P0=0, P1=40) = 40
+    // SS3: max(P0=0, P1=50) = 50
+    // Total work = 10 + 30 + 40 + 50 = 130
+    BOOST_CHECK_EQUAL(schedule.computeWorkCosts(), 130);
+
+    // LazyCommunicationCost
+    // Sends/receives at step_needed - staleness (staleness=1)
+    // Node 0→{P1}: step_needed=1, send/rec at SS0, vol=1*1*g=10
+    // Node 1→{P1}: step_needed=3, send/rec at SS2, vol=2*1*g=20
+    // Max comm per step: SS0=10, SS1=0, SS2=20, SS3=0
+    // Comm = 10 + 20 = 30
+    // Syncs = 2 * L = 2 * 5 = 10 (only steps with comm)
+    // Total = 30 + 10 + 130 = 170
+    BOOST_CHECK_EQUAL(LazyCommunicationCost<graph>()(schedule), 170);
+
+    // BufferedSendingCost
+    // Send at producer step, receive at step_needed - staleness
+    // Node 0 (SS0): send to P1, vol=1*1*g=10 at SS0, rec at SS0
+    // Node 1 (SS1): send to P1, vol=2*1*g=20 at SS1, rec at SS2
+    // Send volumes: SS0[P0]=10, SS1[P0]=20, SS2[P0]=0, SS3[P0]=0
+    // Recv volumes: SS0[P1]=10, SS1[P1]=0, SS2[P1]=20, SS3[P1]=0
+    // Max comm per step: SS0=10, SS1=20, SS2=20, SS3=0
+    // Comm = 10 + 20 + 20 = 50
+    // Syncs = 3 * L = 3 * 5 = 15 (all steps with comm)
+    // Total = 50 + 15 + 130 = 195
+    BOOST_CHECK_EQUAL(BufferedSendingCost<graph>()(schedule), 195);
+
+    // TotalCommunicationCost
+    // Sum of cross-processor edge comm weights * g / P
+    // Cross edges: 0→2 (cw=1), 1→4 (cw=2)
+    // Total cross comm weight = (1 + 2) * 1 = 3
+    // Comm cost = 3 * 10 / 2 = 15
+    // Work = 130
+    // Sync = 3 * 5 = 15 (number_of_supersteps - 1)
+    // Total = 15 + 130 + 15 = 160
+    BOOST_CHECK_EQUAL(TotalCommunicationCost<graph>()(schedule), 160);
+
+    // TotalLambdaCommunicationCost
+    // For each node, sum comm_weight * sendCosts over unique target processors
+    // Then multiply total by (1/P) * g
+    // Node 0 (P0, cw=1): target_procs={P0,P1} → 1*(0+1) = 1
+    // Node 1 (P0, cw=2): target_procs={P1} → 2*1 = 2
+    // Node 2 (P1, cw=3): target_procs={P1} → 3*0 = 0
+    // Node 3 (P1, cw=4): target_procs={P1} → 4*0 = 0
+    // comm_costs = 1+2+0+0 = 3, comm_cost = 3 * (1/2) * 10 = 15
+    // Work = 130, Sync = 3 * 5 = 15
+    // Total = 15 + 130 + 15 = 160
+    BOOST_CHECK_EQUAL(TotalLambdaCommunicationCost<graph>()(schedule), 160);
+}
diff --git a/tests/ilp_bsp_scheduler.cpp b/tests/ilp_bsp_scheduler.cpp
index 151fd0b7..fc6934b4 100644
--- a/tests/ilp_bsp_scheduler.cpp
+++ b/tests/ilp_bsp_scheduler.cpp
@@ -64,7 +64,7 @@ BOOST_AUTO_TEST_CASE(test_total) {
     scheduler_to.setTimeLimitSeconds(10);
 
     const auto result_to = scheduler_to.computeSchedule(schedule_to);
-    BOOST_CHECK_EQUAL(RETURN_STATUS::BEST_FOUND, result_to);
+    BOOST_CHECK(result_to == RETURN_STATUS::OSP_SUCCESS || result_to == RETURN_STATUS::BEST_FOUND);
     BOOST_CHECK(schedule_to.satisfiesPrecedenceConstraints());
 
     BspSchedule<graph> schedule(instance);