diff --git a/.clang-format b/.clang-format index b622f779..93fcdef5 100644 --- a/.clang-format +++ b/.clang-format @@ -50,7 +50,7 @@ BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeColon BreakAfterJavaFieldAnnotations: false BreakStringLiterals: true -ColumnLimit: 120 +ColumnLimit: 0 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: true ConstructorInitializerAllOnOneLineOrOnePerLine: false @@ -134,4 +134,3 @@ TabWidth: 8 UseCRLF: false UseTab: Never ... - diff --git a/.githooks/pre-commit b/.githooks/pre-commit index 81cf6a75..8fd073ee 100755 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -47,28 +47,24 @@ fi # Format all .cpp and .hpp files -for file in $(git diff --cached --name-only | grep -E '\.cpp$|\.hpp$'); do - # Check if the file exists - if [ ! -f $file ]; then - continue +# Stream the staged file names through 'while read -r' (instead of word-splitting a 'for' loop) to avoid issues with spaces in filenames +git diff --cached --name-only --diff-filter=ACM $against | grep -E '\.(cpp|hpp)$' | while read -r file; do + if [ -f "$file" ]; then + # Apply clang-format in-place + clang-format -i "$file" + # Re-add the file to the commit to include the formatting changes + git add "$file" fi - git add $file done -# Remove trailing whitespace from all files (except .so) -for file in $(git diff --cached --name-only | grep -vE '\.so$'); do - # Check if the file exists - if [ ! -f $file ]; then - continue +# Remove trailing whitespace from all files (except .so and binary files) +# Using sed to remove trailing whitespace +git diff --cached --name-only --diff-filter=ACM $against | grep -vE '\.so$' | while read -r file; do + if [ -f "$file" ]; then + # Check if file is text to avoid corrupting binaries + if file "$file" | grep -q "text"; then + sed -i 's/[[:space:]]*$//' "$file" + git add "$file" + fi fi - git add $file done - -for file in $(git diff --cached --name-only); do - # Check if the file exists - if [ ! 
-f $file ]; then - continue - fi - git add $file -done - diff --git a/apps/test_suite_runner/StatsModules/BspCommStatsModule.hpp b/apps/test_suite_runner/StatsModules/BspCommStatsModule.hpp index 6b9e96fd..7f1066ee 100644 --- a/apps/test_suite_runner/StatsModules/BspCommStatsModule.hpp +++ b/apps/test_suite_runner/StatsModules/BspCommStatsModule.hpp @@ -18,36 +18,36 @@ limitations under the License. #pragma once -#include -#include -#include #include "IStatsModule.hpp" #include "osp/bsp/model/BspSchedule.hpp" // Still needed +#include "osp/bsp/model/cost/BufferedSendingCost.hpp" +#include "osp/bsp/model/cost/TotalCommunicationCost.hpp" +#include "osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp" +#include +#include +#include namespace osp { template -class BspCommStatsModule : public IStatisticModule> { -public: - -private: +class BspCommStatsModule : public IStatisticModule> { + public: + private: const std::vector metric_headers = { - "TotalCommCost", "TotalLambdaCommCost", "BufferedSendingCosts" - }; - -public: + "TotalCommCost", "TotalLambdaCommCost", "BufferedSendingCosts"}; + public: std::vector get_metric_headers() const override { return metric_headers; } std::map record_statistics( - const BspSchedule& schedule, - std::ofstream& /*log_stream*/) const override { + const BspSchedule &schedule, + std::ofstream & /*log_stream*/) const override { std::map stats; - stats["TotalCommCost"] = std::to_string(schedule.computeTotalCosts()); - stats["TotalLambdaCommCost"] = std::to_string(schedule.computeTotalLambdaCosts()); - stats["BufferedSendingCosts"] = std::to_string(schedule.computeBufferedSendingCosts()); + stats["TotalCommCost"] = std::to_string(TotalCommunicationCost()(schedule)); + stats["TotalLambdaCommCost"] = std::to_string(TotalLambdaCommunicationCost()(schedule)); + stats["BufferedSendingCosts"] = std::to_string(BufferedSendingCost()(schedule)); return stats; } }; diff --git a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp 
b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp index 175f5335..d7f7e77f 100644 --- a/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp +++ b/include/osp/auxiliary/sptrsv_simulator/sptrsv.hpp @@ -20,6 +20,7 @@ limitations under the License. #ifdef EIGEN_FOUND +#include #include #include #include @@ -28,16 +29,13 @@ limitations under the License. #include #include #include -#include -#include -#include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp" #include "osp/bsp/model/BspInstance.hpp" #include "osp/bsp/model/BspSchedule.hpp" +#include "osp/graph_implementations/eigen_matrix_adapter/sparse_matrix.hpp" namespace osp { - template class Sptrsv { using uVertType = typename SparseMatrixImp::vertex_idx; @@ -51,14 +49,14 @@ class Sptrsv { std::vector col_idx; std::vector row_ptr; - + std::vector row_idx; std::vector col_ptr; std::vector> step_proc_ptr; std::vector> step_proc_num; - double * x; + double *x; const double *b; unsigned num_supersteps; @@ -67,12 +65,12 @@ class Sptrsv { std::vector>> vector_step_processor_vertices_u; std::vector ready; - std::vector>> bounds_array_l; - std::vector>> bounds_array_u; + std::vector>> bounds_array_l; + std::vector>> bounds_array_u; Sptrsv() = default; - Sptrsv(BspInstance> &inst) : instance(&inst) {}; + Sptrsv(BspInstance> &inst) : instance(&inst) {}; void setup_csr_no_permutation(const BspSchedule> &schedule) { vector_step_processor_vertices = std::vector>>( @@ -93,73 +91,71 @@ class Sptrsv { num_supersteps = schedule.numberOfSupersteps(); size_t number_of_vertices = instance->getComputationalDag().num_vertices(); - #pragma omp parallel num_threads(2) +#pragma omp parallel num_threads(2) { int id = omp_get_thread_num(); - switch(id) { - case 0: - { - for (size_t node=0; node < number_of_vertices; ++node){ - vector_step_processor_vertices[schedule.assignedSuperstep(node)][schedule.assignedProcessor(node)].push_back(static_cast(node)); - } - - for (unsigned int step=0; stepnumberOfProcessors(); ++proc){ - 
if (!vector_step_processor_vertices[step][proc].empty()){ - eigen_idx_type start = vector_step_processor_vertices[step][proc][0]; - eigen_idx_type prev = vector_step_processor_vertices[step][proc][0]; - - for (size_t i=1; i< vector_step_processor_vertices[step][proc].size(); ++i){ - if(vector_step_processor_vertices[step][proc][i] != prev + 1){ - bounds_array_l[step][proc].push_back(start); - bounds_array_l[step][proc].push_back(prev); - start = vector_step_processor_vertices[step][proc][i]; - } - prev = vector_step_processor_vertices[step][proc][i]; - } + switch (id) { + case 0: { + for (size_t node = 0; node < number_of_vertices; ++node) { + vector_step_processor_vertices[schedule.assignedSuperstep(node)][schedule.assignedProcessor(node)].push_back(static_cast(node)); + } - bounds_array_l[step][proc].push_back(start); - bounds_array_l[step][proc].push_back(prev); + for (unsigned int step = 0; step < schedule.numberOfSupersteps(); ++step) { + for (unsigned int proc = 0; proc < instance->numberOfProcessors(); ++proc) { + if (!vector_step_processor_vertices[step][proc].empty()) { + eigen_idx_type start = vector_step_processor_vertices[step][proc][0]; + eigen_idx_type prev = vector_step_processor_vertices[step][proc][0]; + + for (size_t i = 1; i < vector_step_processor_vertices[step][proc].size(); ++i) { + if (vector_step_processor_vertices[step][proc][i] != prev + 1) { + bounds_array_l[step][proc].push_back(start); + bounds_array_l[step][proc].push_back(prev); + start = vector_step_processor_vertices[step][proc][i]; + } + prev = vector_step_processor_vertices[step][proc][i]; } + + bounds_array_l[step][proc].push_back(start); + bounds_array_l[step][proc].push_back(prev); } } - - break; } - case 1: - { - size_t node=number_of_vertices; - do { - node--; - vector_step_processor_vertices_u[schedule.assignedSuperstep(node)][schedule.assignedProcessor(node)].push_back(static_cast(node)); - } while (node > 0); - - for (unsigned int step=0; stepnumberOfProcessors(); ++proc){ 
- if (!vector_step_processor_vertices_u[step][proc].empty()){ - eigen_idx_type start_u = static_cast(vector_step_processor_vertices_u[step][proc][0]); - eigen_idx_type prev_u = static_cast(vector_step_processor_vertices_u[step][proc][0]); - - for (size_t i=1; i(vector_step_processor_vertices_u[step][proc][i]) != prev_u - 1){ - bounds_array_u[step][proc].push_back(start_u); - bounds_array_u[step][proc].push_back(prev_u); - start_u = static_cast(vector_step_processor_vertices_u[step][proc][i]); - } - prev_u = static_cast(vector_step_processor_vertices_u[step][proc][i]); - } - bounds_array_u[step][proc].push_back(start_u); - bounds_array_u[step][proc].push_back(prev_u); + break; + } + case 1: { + size_t node = number_of_vertices; + do { + node--; + vector_step_processor_vertices_u[schedule.assignedSuperstep(node)][schedule.assignedProcessor(node)].push_back(static_cast(node)); + } while (node > 0); + + for (unsigned int step = 0; step < schedule.numberOfSupersteps(); ++step) { + for (unsigned int proc = 0; proc < instance->numberOfProcessors(); ++proc) { + if (!vector_step_processor_vertices_u[step][proc].empty()) { + eigen_idx_type start_u = static_cast(vector_step_processor_vertices_u[step][proc][0]); + eigen_idx_type prev_u = static_cast(vector_step_processor_vertices_u[step][proc][0]); + + for (size_t i = 1; i < vector_step_processor_vertices_u[step][proc].size(); ++i) { + if (static_cast(vector_step_processor_vertices_u[step][proc][i]) != prev_u - 1) { + bounds_array_u[step][proc].push_back(start_u); + bounds_array_u[step][proc].push_back(prev_u); + start_u = static_cast(vector_step_processor_vertices_u[step][proc][i]); + } + prev_u = static_cast(vector_step_processor_vertices_u[step][proc][i]); } + + bounds_array_u[step][proc].push_back(start_u); + bounds_array_u[step][proc].push_back(prev_u); } } - - break; - } - default:{ - std::cout << "Unexpected Behaviour" << std::endl; } + + break; + } + default: { + std::cout << "Unexpected Behaviour" << std::endl; + } } 
} } @@ -184,7 +180,7 @@ class Sptrsv { step_proc_ptr = std::vector>(num_supersteps, std::vector(instance->numberOfProcessors(), 0)); - step_proc_num = schedule.num_assigned_nodes_per_superstep_processor(); + step_proc_num = schedule.numAssignedNodesPerSuperstepProcessor(); unsigned current_step = 0; unsigned current_processor = 0; @@ -194,9 +190,9 @@ class Sptrsv { for (const uVertType &node : perm_inv) { if (schedule.assignedProcessor(node) != current_processor || schedule.assignedSuperstep(node) != current_step) { - + while (schedule.assignedProcessor(node) != current_processor || - schedule.assignedSuperstep(node) != current_step) { + schedule.assignedSuperstep(node) != current_step) { if (current_processor < instance->numberOfProcessors() - 1) { current_processor++; @@ -207,7 +203,6 @@ class Sptrsv { } step_proc_ptr[current_step][current_processor] = static_cast(row_ptr.size()); - } row_ptr.push_back(col_idx.size()); @@ -225,7 +220,7 @@ class Sptrsv { const auto *outer = instance->getComputationalDag().getCSR()->outerIndexPtr(); for (uVertType par_ind = static_cast(outer[node]); par_ind < static_cast(outer[node + 1] - 1); ++par_ind) { - if (static_cast(instance->getComputationalDag().getCSR()->innerIndexPtr()[par_ind]) == perm_inv[par]){ + if (static_cast(instance->getComputationalDag().getCSR()->innerIndexPtr()[par_ind]) == perm_inv[par]) { val.push_back(instance->getComputationalDag().getCSR()->valuePtr()[par_ind]); found++; } @@ -234,62 +229,62 @@ class Sptrsv { } col_idx.push_back(perm[node]); - val.push_back(instance->getComputationalDag().getCSR()->valuePtr()[ instance->getComputationalDag().getCSR()->outerIndexPtr()[node + 1] - 1 ]); + val.push_back(instance->getComputationalDag().getCSR()->valuePtr()[instance->getComputationalDag().getCSR()->outerIndexPtr()[node + 1] - 1]); } row_ptr.push_back(col_idx.size()); } - void lsolve_serial(){ + void lsolve_serial() { eigen_idx_type number_of_vertices = static_cast(instance->numberOfVertices()); - for 
(eigen_idx_type i = 0; i < number_of_vertices; ++i){ + for (eigen_idx_type i = 0; i < number_of_vertices; ++i) { x[i] = b[i]; - for (eigen_idx_type j = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i]; j < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1; ++j){ + for (eigen_idx_type j = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i]; j < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1; ++j) { x[i] -= (*(instance->getComputationalDag().getCSR())).valuePtr()[j] * x[(*(instance->getComputationalDag().getCSR())).innerIndexPtr()[j]]; } x[i] /= (*(instance->getComputationalDag().getCSR())).valuePtr()[(*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1]; } } - void usolve_serial(){ + void usolve_serial() { eigen_idx_type number_of_vertices = static_cast(instance->numberOfVertices()); eigen_idx_type i = number_of_vertices; do { i--; x[i] = b[i]; - for (eigen_idx_type j = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i] + 1; j < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i + 1]; ++j){ + for (eigen_idx_type j = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i] + 1; j < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i + 1]; ++j) { x[i] -= (*(instance->getComputationalDag().getCSC())).valuePtr()[j] * x[(*(instance->getComputationalDag().getCSC())).innerIndexPtr()[j]]; } x[i] /= (*(instance->getComputationalDag().getCSC())).valuePtr()[(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i]]; } while (i != 0); } - void lsolve_no_permutation_in_place(){ - #pragma omp parallel num_threads(instance->numberOfProcessors()) + void lsolve_no_permutation_in_place() { +#pragma omp parallel num_threads(instance->numberOfProcessors()) { const size_t proc = static_cast(omp_get_thread_num()); - for (unsigned step = 0; step < num_supersteps; ++step){ + for (unsigned step = 0; step < num_supersteps; ++step) { const 
size_t bounds_str_size = bounds_array_l[step][proc].size(); - - for (size_t index = 0; index < bounds_str_size; index+=2){ + + for (size_t index = 0; index < bounds_str_size; index += 2) { eigen_idx_type lower_b = bounds_array_l[step][proc][index]; - const eigen_idx_type upper_b = bounds_array_l[step][proc][index+1]; - - for (eigen_idx_type node = lower_b; node<=upper_b; ++node){ - for (eigen_idx_type i = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node]; i < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1; ++i){ + const eigen_idx_type upper_b = bounds_array_l[step][proc][index + 1]; + + for (eigen_idx_type node = lower_b; node <= upper_b; ++node) { + for (eigen_idx_type i = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node]; i < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1; ++i) { x[node] -= (*(instance->getComputationalDag().getCSR())).valuePtr()[i] * x[(*(instance->getComputationalDag().getCSR())).innerIndexPtr()[i]]; } x[node] /= (*(instance->getComputationalDag().getCSR())).valuePtr()[(*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1]; } } - #pragma omp barrier - } +#pragma omp barrier + } } } - void usolve_no_permutation_in_place(){ - #pragma omp parallel num_threads(instance->numberOfProcessors()) + void usolve_no_permutation_in_place() { +#pragma omp parallel num_threads(instance->numberOfProcessors()) { // Process each superstep starting from the last one (opposite of lsolve) const size_t proc = static_cast(omp_get_thread_num()); @@ -297,49 +292,49 @@ class Sptrsv { do { step--; const size_t bounds_str_size = bounds_array_u[step][proc].size(); - for (size_t index = 0; index < bounds_str_size; index+=2){ + for (size_t index = 0; index < bounds_str_size; index += 2) { eigen_idx_type node = bounds_array_u[step][proc][index] + 1; - const eigen_idx_type lower_b = bounds_array_u[step][proc][index+1]; + const eigen_idx_type lower_b = 
bounds_array_u[step][proc][index + 1]; do { node--; - for (eigen_idx_type i=(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node] + 1; i < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node + 1]; ++i){ + for (eigen_idx_type i = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node] + 1; i < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node + 1]; ++i) { x[node] -= (*(instance->getComputationalDag().getCSC())).valuePtr()[i] * x[(*(instance->getComputationalDag().getCSC())).innerIndexPtr()[i]]; } x[node] /= (*(instance->getComputationalDag().getCSC())).valuePtr()[(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node]]; } while (node != lower_b); } - #pragma omp barrier - } while (step!=0); +#pragma omp barrier + } while (step != 0); } } - void lsolve_no_permutation(){ - #pragma omp parallel num_threads(instance->numberOfProcessors()) + void lsolve_no_permutation() { +#pragma omp parallel num_threads(instance->numberOfProcessors()) { const size_t proc = static_cast(omp_get_thread_num()); - for (unsigned step = 0; step < num_supersteps; ++step){ + for (unsigned step = 0; step < num_supersteps; ++step) { const size_t bounds_str_size = bounds_array_l[step][proc].size(); - - for (size_t index = 0; index < bounds_str_size; index+=2){ + + for (size_t index = 0; index < bounds_str_size; index += 2) { eigen_idx_type lower_b = bounds_array_l[step][proc][index]; - const eigen_idx_type upper_b = bounds_array_l[step][proc][index+1]; - - for (eigen_idx_type node = lower_b; node<=upper_b; ++node){ + const eigen_idx_type upper_b = bounds_array_l[step][proc][index + 1]; + + for (eigen_idx_type node = lower_b; node <= upper_b; ++node) { x[node] = b[node]; - for (eigen_idx_type i = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node]; i < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1; ++i){ + for (eigen_idx_type i = 
(*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node]; i < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1; ++i) { x[node] -= (*(instance->getComputationalDag().getCSR())).valuePtr()[i] * x[(*(instance->getComputationalDag().getCSR())).innerIndexPtr()[i]]; } x[node] /= (*(instance->getComputationalDag().getCSR())).valuePtr()[(*(instance->getComputationalDag().getCSR())).outerIndexPtr()[node + 1] - 1]; } } - #pragma omp barrier - } +#pragma omp barrier + } } } - void usolve_no_permutation(){ - #pragma omp parallel num_threads(instance->numberOfProcessors()) + void usolve_no_permutation() { +#pragma omp parallel num_threads(instance->numberOfProcessors()) { // Process each superstep starting from the last one (opposite of lsolve) const size_t proc = static_cast(omp_get_thread_num()); @@ -347,50 +342,48 @@ class Sptrsv { do { step--; const size_t bounds_str_size = bounds_array_u[step][proc].size(); - for (size_t index = 0; index < bounds_str_size; index+=2){ + for (size_t index = 0; index < bounds_str_size; index += 2) { eigen_idx_type node = bounds_array_u[step][proc][index] + 1; - const eigen_idx_type lower_b = bounds_array_u[step][proc][index+1]; + const eigen_idx_type lower_b = bounds_array_u[step][proc][index + 1]; do { node--; x[node] = b[node]; - for (eigen_idx_type i=(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node] + 1; i < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node + 1]; ++i){ + for (eigen_idx_type i = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node] + 1; i < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node + 1]; ++i) { x[node] -= (*(instance->getComputationalDag().getCSC())).valuePtr()[i] * x[(*(instance->getComputationalDag().getCSC())).innerIndexPtr()[i]]; } x[node] /= (*(instance->getComputationalDag().getCSC())).valuePtr()[(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[node]]; } while (node != lower_b); } - #pragma omp 
barrier - } while (step!=0); +#pragma omp barrier + } while (step != 0); } } - void lsolve_serial_in_place(){ + void lsolve_serial_in_place() { eigen_idx_type number_of_vertices = static_cast(instance->numberOfVertices()); - for (eigen_idx_type i = 0; i < number_of_vertices; ++i){ - for (eigen_idx_type j = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i]; j < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1; ++j){ + for (eigen_idx_type i = 0; i < number_of_vertices; ++i) { + for (eigen_idx_type j = (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i]; j < (*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1; ++j) { x[i] -= (*(instance->getComputationalDag().getCSR())).valuePtr()[j] * x[(*(instance->getComputationalDag().getCSR())).innerIndexPtr()[j]]; } x[i] /= (*(instance->getComputationalDag().getCSR())).valuePtr()[(*(instance->getComputationalDag().getCSR())).outerIndexPtr()[i + 1] - 1]; } - } - void usolve_serial_in_place(){ + void usolve_serial_in_place() { eigen_idx_type number_of_vertices = static_cast(instance->numberOfVertices()); eigen_idx_type i = number_of_vertices; do { i--; - for (eigen_idx_type j = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i] + 1; j < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i + 1]; ++j){ + for (eigen_idx_type j = (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i] + 1; j < (*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i + 1]; ++j) { x[i] -= (*(instance->getComputationalDag().getCSC())).valuePtr()[j] * x[(*(instance->getComputationalDag().getCSC())).innerIndexPtr()[j]]; } x[i] /= (*(instance->getComputationalDag().getCSC())).valuePtr()[(*(instance->getComputationalDag().getCSC())).outerIndexPtr()[i]]; } while (i != 0); - } void lsolve_with_permutation_in_place() { - #pragma omp parallel num_threads(instance->numberOfProcessors()) +#pragma omp parallel num_threads(instance->numberOfProcessors()) 
{ for (unsigned step = 0; step < num_supersteps; step++) { @@ -405,13 +398,13 @@ class Sptrsv { x[_row_idx] /= val[row_ptr[_row_idx + 1] - 1]; } - #pragma omp barrier +#pragma omp barrier } } } void lsolve_with_permutation() { - #pragma omp parallel num_threads(instance->numberOfProcessors()) +#pragma omp parallel num_threads(instance->numberOfProcessors()) { for (unsigned step = 0; step < num_supersteps; step++) { @@ -426,12 +419,11 @@ class Sptrsv { x[_row_idx] /= val[row_ptr[_row_idx + 1] - 1]; } - #pragma omp barrier +#pragma omp barrier } } } - void reset_x() { eigen_idx_type number_of_vertices = static_cast(instance->numberOfVertices()); for (eigen_idx_type i = 0; i < number_of_vertices; i++) { @@ -459,13 +451,13 @@ class Sptrsv { } } - std::size_t get_number_of_vertices(){ - return instance->numberOfVertices() ; + std::size_t get_number_of_vertices() { + return instance->numberOfVertices(); } virtual ~Sptrsv() = default; }; -} +} // namespace osp #endif \ No newline at end of file diff --git a/include/osp/bsp/model/BspSchedule.hpp b/include/osp/bsp/model/BspSchedule.hpp index 1f214b00..eeeaeec3 100644 --- a/include/osp/bsp/model/BspSchedule.hpp +++ b/include/osp/bsp/model/BspSchedule.hpp @@ -19,16 +19,14 @@ limitations under the License. #pragma once #include -#include -#include -#include #include #include #include -#include "IBspScheduleEval.hpp" #include "IBspSchedule.hpp" +#include "IBspScheduleEval.hpp" #include "SetSchedule.hpp" +#include "osp/bsp/model/cost/LazyCommunicationCost.hpp" #include "osp/concepts/computational_dag_concept.hpp" namespace osp { @@ -37,25 +35,28 @@ namespace osp { * @class BspSchedule * @brief Represents a schedule for the Bulk Synchronous Parallel (BSP) model. * - * The `BspSchedule` class is responsible for managing the assignment of nodes to processors and supersteps in the BSP - * model. 
It stores information such as the number of supersteps, the assignment of nodes to processors and supersteps, - * and the communication schedule. + * The `BspSchedule` class manages the assignment of nodes to processors and supersteps within the BSP model. + * It serves as a core component for scheduling algorithms, providing mechanisms to: + * - Store and retrieve node-to-processor and node-to-superstep assignments. + * - Validate schedules against precedence, memory, and node type constraints. + * - Compute costs associated with the schedule. + * - Manipulate the schedule, including updating assignments and merging supersteps. * - * The class provides methods for setting and retrieving the assigned superstep and processor for a given node, as well - * as methods for checking the validity of the communication schedule and computing the costs of the schedule. It also - * provides methods for setting the assigned supersteps and processors based on external assignments, and for updating - * the number of supersteps. + * This class is templated on `Graph_t`, which must satisfy the `computational_dag_concept`. + * Moreover, the work and communication weights of the nodes must be of the same type in order to properly compute the cost. * - * The `BspSchedule` class is designed to work with a `BspInstance` object, which represents the instance of the BSP - * problem being solved. + * It interacts closely with `BspInstance` to access problem-specific data and constraints. In fact, a `BspSchedule` object is tied to a `BspInstance` object. * + * @tparam Graph_t The type of the computational DAG, which must satisfy `is_computational_dag_v`. * @see BspInstance + * @see IBspSchedule + * @see IBspScheduleEval */ template class BspSchedule : public IBspSchedule, public IBspScheduleEval { static_assert(is_computational_dag_v, "BspSchedule can only be used with computational DAGs."); - static_assert(std::is_same_v, v_commw_t >, "BspSchedule requires work and comm. 
weights to have the same type."); + static_assert(std::is_same_v, v_commw_t>, "BspSchedule requires work and comm. weights to have the same type."); protected: using vertex_idx = vertex_idx_t; @@ -67,83 +68,15 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval node_to_processor_assignment; std::vector node_to_superstep_assignment; - void compute_lazy_communication_costs_helper(std::vector>> & rec, std::vector>> & send) const { - for (const auto &node : instance->vertices()) { - - std::vector step_needed(instance->numberOfProcessors(), number_of_supersteps); - for (const auto &target : instance->getComputationalDag().children(node)) { - - if (node_to_processor_assignment[node] != node_to_processor_assignment[target]) { - step_needed[node_to_processor_assignment[target]] = std::min( - step_needed[node_to_processor_assignment[target]], node_to_superstep_assignment[target]); - } - } - - for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - - if (step_needed[proc] < number_of_supersteps) { - - send[node_to_processor_assignment[node]][step_needed[proc] - getStaleness()] += - instance->sendCosts(node_to_processor_assignment[node], proc) * - instance->getComputationalDag().vertex_comm_weight(node); - - rec[proc][step_needed[proc] - getStaleness()] += instance->sendCosts(node_to_processor_assignment[node], proc) * - instance->getComputationalDag().vertex_comm_weight(node); - } - } - } - } - - std::vector> compute_max_comm_per_step_helper(const std::vector>> & rec, const std::vector>> & send) const { - std::vector> max_comm_per_step(number_of_supersteps, 0); - for (unsigned step = 0; step < number_of_supersteps; step++) { - v_commw_t max_send = 0; - v_commw_t max_rec = 0; - - for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - if (max_send < send[proc][step]) - max_send = send[proc][step]; - if (max_rec < rec[proc][step]) - max_rec = rec[proc][step]; - } - max_comm_per_step[step] = std::max(max_send, max_rec) * 
instance->communicationCosts(); - } - return max_comm_per_step; - } - - std::vector> compute_max_work_per_step_helper() const { - std::vector>> work = std::vector>>( - number_of_supersteps, std::vector>(instance->numberOfProcessors(), 0)); - for (const auto &node : instance->vertices()) { - work[node_to_superstep_assignment[node]][node_to_processor_assignment[node]] += - instance->getComputationalDag().vertex_work_weight(node); - } - - std::vector> max_work_per_step(number_of_supersteps, 0); - for (unsigned step = 0; step < number_of_supersteps; step++) { - v_workw_t max_work = 0; - for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - if (max_work < work[step][proc]) { - max_work = work[step][proc]; - } - } - - max_work_per_step[step] = max_work; - } - - return max_work_per_step; - } - public: - BspSchedule() = delete; /** - * @brief Constructs a BspSchedule object with the specified Bspinstance-> + * @brief Constructs a BspSchedule object with the specified BspInstance. * * @param inst The BspInstance for the schedule. */ - BspSchedule(const BspInstance &inst) + explicit BspSchedule(const BspInstance &inst) : instance(&inst), number_of_supersteps(1), node_to_processor_assignment(std::vector(inst.numberOfVertices(), 0)), node_to_superstep_assignment(std::vector(inst.numberOfVertices(), 0)) {} @@ -163,7 +96,12 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval &schedule) + /** + * @brief Copy constructor from an IBspSchedule. + * + * @param schedule The schedule to copy. 
+ */ + explicit BspSchedule(const IBspSchedule &schedule) : instance(&schedule.getInstance()), number_of_supersteps(schedule.numberOfSupersteps()), node_to_processor_assignment(schedule.getInstance().numberOfVertices()), node_to_superstep_assignment(schedule.getInstance().numberOfVertices()) { @@ -175,12 +113,23 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval &schedule) : instance(schedule.instance), number_of_supersteps(schedule.number_of_supersteps), node_to_processor_assignment(schedule.node_to_processor_assignment), node_to_superstep_assignment(schedule.node_to_superstep_assignment) {} - BspSchedule operator=(const BspSchedule &schedule) { + /** + * @brief Copy assignment operator. + * + * @param schedule The schedule to copy. + * @return A reference to this schedule. + */ + BspSchedule &operator=(const BspSchedule &schedule) { if (this != &schedule) { instance = schedule.instance; number_of_supersteps = schedule.number_of_supersteps; @@ -190,12 +139,23 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval &&schedule) + /** + * @brief Move constructor. + * + * @param schedule The schedule to move. + */ + BspSchedule(BspSchedule &&schedule) noexcept : instance(schedule.instance), number_of_supersteps(schedule.number_of_supersteps), node_to_processor_assignment(std::move(schedule.node_to_processor_assignment)), node_to_superstep_assignment(std::move(schedule.node_to_superstep_assignment)) {} - BspSchedule &operator=(BspSchedule &&schedule) { + /** + * @brief Move assignment operator. + * + * @param schedule The schedule to move. + * @return A reference to this schedule. 
+ */ + BspSchedule &operator=(BspSchedule &&schedule) noexcept { if (this != &schedule) { instance = schedule.instance; number_of_supersteps = schedule.number_of_supersteps; @@ -205,6 +165,13 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval BspSchedule(const BspInstance &instance_, const BspSchedule &schedule) : instance(&instance_), number_of_supersteps(schedule.numberOfSupersteps()), @@ -221,19 +188,17 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval &getInstance() const override { return *instance; } + [[nodiscard]] const BspInstance &getInstance() const override { return *instance; } /** * @brief Returns the number of supersteps in the schedule. * * @return The number of supersteps in the schedule. */ - inline unsigned numberOfSupersteps() const override { return number_of_supersteps; } + [[nodiscard]] unsigned numberOfSupersteps() const override { return number_of_supersteps; } /** - * @brief Returns the number of processors in the schedule. - * - * @return The number of processors in the schedule. + * @brief Updates the number of supersteps based on the current assignment. */ void updateNumberOfSupersteps() { number_of_supersteps = 0; @@ -250,7 +215,7 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval, public IBspScheduleEval &assignedSupersteps() const { return node_to_superstep_assignment; } - inline std::vector &assignedSupersteps() { return node_to_superstep_assignment; } + [[nodiscard]] const std::vector &assignedSupersteps() const { return node_to_superstep_assignment; } + [[nodiscard]] std::vector &assignedSupersteps() { return node_to_superstep_assignment; } /** * @brief Returns the processor assignment for the schedule. * * @return The processor assignment for the schedule. 
*/ - inline const std::vector &assignedProcessors() const { return node_to_processor_assignment; } - inline std::vector &assignedProcessors() { return node_to_processor_assignment; } + [[nodiscard]] const std::vector &assignedProcessors() const { return node_to_processor_assignment; } + [[nodiscard]] std::vector &assignedProcessors() { return node_to_processor_assignment; } + + /** + * @brief Returns the staleness of the schedule. + * The staleness determines the minimum number of supersteps that must elapse between the assignment of a node to a processor and the assignment of one of its neighbors to a different processor. + * The staleness for the BspSchedule is always 1. + * + * @return The staleness of the schedule. + */ + [[nodiscard]] virtual unsigned getStaleness() const { return 1; } /** * @brief Sets the superstep assigned to the specified node. @@ -283,7 +257,6 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEvalnumberOfVertices()) { node_to_superstep_assignment[node] = superstep; @@ -297,12 +270,12 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval, public IBspScheduleEval, public IBspScheduleEval &vec) { - - if (vec.size() == static_cast( instance->numberOfVertices() )) { - + if (vec.size() == static_cast(instance->numberOfVertices())) { number_of_supersteps = 0; for (vertex_idx_t i = 0; i < instance->numberOfVertices(); ++i) { - if (vec[i] >= number_of_supersteps) { number_of_supersteps = vec[i] + 1; } @@ -347,14 +317,13 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval &&vec) { - - if (vec.size() == static_cast( instance->numberOfVertices() )) { + if (vec.size() == static_cast(instance->numberOfVertices())) { node_to_superstep_assignment = std::move(vec); } else { throw std::invalid_argument( "Invalid Argument while assigning supersteps: size does not match number of nodes."); } - + updateNumberOfSupersteps(); } @@ -364,8 +333,7 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval &vec) { 
- - if (vec.size() == static_cast( instance->numberOfVertices() )) { + if (vec.size() == static_cast(instance->numberOfVertices())) { node_to_processor_assignment = vec; } else { throw std::invalid_argument( @@ -379,8 +347,7 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval &&vec) { - - if (vec.size() == static_cast( instance->numberOfVertices() )) { + if (vec.size() == static_cast(instance->numberOfVertices())) { node_to_processor_assignment = std::move(vec); } else { throw std::invalid_argument( @@ -388,188 +355,57 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval computeWorkCosts() const override { - const std::vector> work_per_step = compute_max_work_per_step_helper(); - return std::accumulate(work_per_step.begin(), work_per_step.end(), static_cast>(0)); - } - - double compute_total_communication_costs() const { - - assert(satisfiesPrecedenceConstraints()); - - double total_communication = 0; - - for (const auto &v : instance->vertices()) { - for (const auto &target : instance->getComputationalDag().children(v)) { - - if (node_to_processor_assignment[v] != node_to_processor_assignment[target]) { - total_communication += - instance->sendCosts(node_to_processor_assignment[v], node_to_processor_assignment[target]) * - instance->getComputationalDag().vertex_comm_weight(v); - } - } - } - - return total_communication * static_cast(instance->communicationCosts()) / static_cast(instance->numberOfProcessors()); - } - - double computeTotalCosts() const { - - assert(satisfiesPrecedenceConstraints()); - - const v_commw_t sync_cost = - number_of_supersteps >= 1 - ? 
instance->synchronisationCosts() * static_cast>(number_of_supersteps - 1) - : 0; - - return static_cast(computeWorkCosts()) + compute_total_communication_costs() + sync_cost; - } - - double compute_total_lambda_communication_cost() const { - - assert(satisfiesPrecedenceConstraints()); - - double comm_costs = 0; - const double comm_multiplier = 1.0 / instance->numberOfProcessors(); - - for (const auto &v : instance->vertices()) { - if (instance->getComputationalDag().out_degree(v) == 0) - continue; - - std::unordered_set target_procs; - for (const auto &target : instance->getComputationalDag().children(v)) { - target_procs.insert(node_to_processor_assignment[target]); - } - - const unsigned source_proc = node_to_processor_assignment[v]; - const auto v_comm_cost = instance->getComputationalDag().vertex_comm_weight(v); - - for (const auto& target_proc : target_procs) { - comm_costs += v_comm_cost * instance->sendCosts(source_proc, target_proc); - } - } - - return comm_costs * comm_multiplier * static_cast(instance->communicationCosts()); - } - - double computeTotalLambdaCosts() const { - assert(satisfiesPrecedenceConstraints()); - - const v_commw_t sync_cost = - number_of_supersteps >= 1 - ? 
instance->synchronisationCosts() * static_cast>(number_of_supersteps - 1) - : 0; - - return static_cast(computeWorkCosts()) + compute_total_lambda_communication_cost() + sync_cost; - } - - v_commw_t compute_buffered_sending_communication_costs() const { - - std::vector>> rec(instance->numberOfProcessors(), - std::vector>(number_of_supersteps, 0)); - std::vector>> send(instance->numberOfProcessors(), - std::vector>(number_of_supersteps, 0)); - - for (vertex_idx node = 0; node < instance->numberOfVertices(); node++) { - - std::vector step_needed(instance->numberOfProcessors(), number_of_supersteps); - for (const auto &target : instance->getComputationalDag().children(node)) { - - if (node_to_processor_assignment[node] != node_to_processor_assignment[target]) { - step_needed[node_to_processor_assignment[target]] = std::min( - step_needed[node_to_processor_assignment[target]], node_to_superstep_assignment[target]); - } - } - - for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - - if (step_needed[proc] < number_of_supersteps) { - send[node_to_processor_assignment[node]][node_to_superstep_assignment[node]] += - instance->sendCosts(node_to_processor_assignment[node], proc) * - instance->getComputationalDag().vertex_comm_weight(node); - - rec[proc][step_needed[proc] - 1] += instance->sendCosts(node_to_processor_assignment[node], proc) * - instance->getComputationalDag().vertex_comm_weight(node); - } - } - } - - const std::vector> max_comm_per_step = compute_max_comm_per_step_helper(rec, send); - - v_commw_t costs = 0; - for (unsigned step = 0; step < number_of_supersteps; step++) { - const auto step_comm_cost = max_comm_per_step[step]; - costs += step_comm_cost; - - if (step_comm_cost > 0) { - costs += instance->synchronisationCosts(); - } - } - return costs; - } - - v_workw_t computeBufferedSendingCosts() const { - - return compute_buffered_sending_communication_costs() + computeWorkCosts(); - } - - v_commw_t compute_lazy_communication_costs() const 
{ - - std::vector>> rec(instance->numberOfProcessors(), - std::vector>(number_of_supersteps, 0)); - - std::vector>> send(instance->numberOfProcessors(), - std::vector>(number_of_supersteps, 0)); - - compute_lazy_communication_costs_helper(rec, send); - const std::vector> max_comm_per_step = compute_max_comm_per_step_helper(rec, send); - - v_commw_t costs = 0; - for (unsigned step = 0; step < number_of_supersteps; step++) { - const auto step_comm_cost = max_comm_per_step[step]; - costs += step_comm_cost; - - if (step_comm_cost > 0) { - costs += instance->synchronisationCosts(); - } - } + /** + * @brief Computes the work costs of the schedule. + * The workload of a processor in a superstep is the sum of the workloads of all nodes assigned to that processor in that superstep. + * The workload in a superstep is the maximum workload of any processor in that superstep. + * The work cost of the schedule is the sum of the workloads of all supersteps. + * + * @return The work costs of the schedule. + */ + virtual v_workw_t computeWorkCosts() const override { return cost_helpers::compute_work_costs(*this); } - return costs; - } + /** + * @brief Computes the costs of the schedule according to lazy communication cost evaluation. + * + * @return The costs of the schedule. + */ + virtual v_workw_t computeCosts() const override { return LazyCommunicationCost()(*this); } - virtual v_workw_t computeCosts() const override { return compute_lazy_communication_costs() + computeWorkCosts(); } + /** + * @brief Checks if the schedule is valid. + * + * A schedule is valid if it satisfies all precedence, memory, and node type constraints. + * + * @return True if the schedule is valid, false otherwise. + */ + [[nodiscard]] bool isValid() const { return satisfiesPrecedenceConstraints() && satisfiesMemoryConstraints() && satisfiesNodeTypeConstraints(); } /** * @brief Returns true if the schedule satisfies the precedence constraints of the computational DAG.
* * The precedence constraints of the computational DAG are satisfied if, for each directed edge (u, v) such that u - * and v are assigned to different processors, the superstep assigned to node u is less than the superstep assigned - * to node v. + * and v are assigned to different processors, the superstep assigned to node v exceeds the superstep assigned + * to node u by at least the staleness of the schedule. For the BspSchedule staleness is 1. * * @return True if the schedule satisfies the precedence constraints of the computational DAG, false otherwise. */ - inline bool satisfiesPrecedenceConstraints() const { - + [[nodiscard]] bool satisfiesPrecedenceConstraints() const { if (static_cast>(node_to_processor_assignment.size()) != instance->numberOfVertices() || static_cast>(node_to_superstep_assignment.size()) != instance->numberOfVertices()) { return false; } for (const auto &v : instance->vertices()) { - if (node_to_superstep_assignment[v] >= number_of_supersteps) { return false; } - if (node_to_processor_assignment[v] >= instance->numberOfProcessors()) { return false; } for (const auto &target : instance->getComputationalDag().children(v)) { - - const unsigned different_processors = - (node_to_processor_assignment[v] == node_to_processor_assignment[target]) ? 0u : getStaleness(); - + const unsigned different_processors = (node_to_processor_assignment[v] == node_to_processor_assignment[target]) ? 0u : getStaleness(); if (node_to_superstep_assignment[v] + different_processors > node_to_superstep_assignment[target]) { return false; } @@ -579,281 +415,359 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEvalnumberOfVertices()) + /** + * @brief Checks if the schedule satisfies node type constraints. + * + * Node type constraints are checked based on the compatibility of nodes with their assigned processors. + * + * @return True if node type constraints are satisfied, false otherwise.
+ */ + [[nodiscard]] bool satisfiesNodeTypeConstraints() const { + if (node_to_processor_assignment.size() != instance->numberOfVertices()) { return false; + } for (const auto &node : instance->vertices()) { - if (!instance->isCompatible(node, node_to_processor_assignment[node])) + if (!instance->isCompatible(node, node_to_processor_assignment[node])) { return false; + } } return true; - }; + } - bool satisfiesMemoryConstraints() const { + /** + * @brief Checks if the schedule satisfies memory constraints. + * + * Memory constraints are checked based on the type of memory constraint specified in the architecture. + * + * @return True if memory constraints are satisfied, false otherwise. + */ + [[nodiscard]] bool satisfiesMemoryConstraints() const { switch (instance->getArchitecture().getMemoryConstraintType()) { - case MEMORY_CONSTRAINT_TYPE::LOCAL: { + case MEMORY_CONSTRAINT_TYPE::LOCAL: + return satisfiesLocalMemoryConstraints(); - SetSchedule set_schedule = SetSchedule(*this); + case MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT: + return satisfiesPersistentAndTransientMemoryConstraints(); - for (unsigned step = 0; step < number_of_supersteps; step++) { - for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { + case MEMORY_CONSTRAINT_TYPE::GLOBAL: + return satisfiesGlobalMemoryConstraints(); - v_memw_t memory = 0; - for (const auto &node : set_schedule.step_processor_vertices[step][proc]) { - memory += instance->getComputationalDag().vertex_mem_weight(node); - } + case MEMORY_CONSTRAINT_TYPE::LOCAL_IN_OUT: + return satisfiesLocalInOutMemoryConstraints(); - if (memory > instance->getArchitecture().memoryBound(proc)) { - return false; - } - } - } + case MEMORY_CONSTRAINT_TYPE::LOCAL_INC_EDGES: + return satisfiesLocalIncEdgesMemoryConstraints(); - break; - } + case MEMORY_CONSTRAINT_TYPE::LOCAL_SOURCES_INC_EDGES: + return satisfiesLocalSourcesIncEdgesMemoryConstraints(); - case MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT: { - std::vector> 
current_proc_persistent_memory(instance->numberOfProcessors(), 0); - std::vector> current_proc_transient_memory(instance->numberOfProcessors(), 0); + case MEMORY_CONSTRAINT_TYPE::NONE: + return true; - for (const auto &node : instance->vertices()) { + default: + throw std::invalid_argument("Unknown memory constraint type."); + } + } - const unsigned proc = node_to_processor_assignment[node]; - current_proc_persistent_memory[proc] += instance->getComputationalDag().vertex_mem_weight(node); - current_proc_transient_memory[proc] = std::max( - current_proc_transient_memory[proc], instance->getComputationalDag().vertex_comm_weight(node)); + /** + * @brief Returns a vector of nodes assigned to the specified processor. + * + * @param processor The processor index. + * @return A vector of nodes assigned to the specified processor. + */ + [[nodiscard]] std::vector> getAssignedNodeVector(unsigned int processor) const { + std::vector> vec; - if (current_proc_persistent_memory[proc] + current_proc_transient_memory[proc] > - instance->getArchitecture().memoryBound(proc)) { - return false; - } + for (const auto &node : instance->vertices()) { + if (node_to_processor_assignment[node] == processor) { + vec.push_back(node); } - break; } - case MEMORY_CONSTRAINT_TYPE::GLOBAL: { - std::vector> current_proc_memory(instance->numberOfProcessors(), 0); - - for (const auto &node : instance->vertices()) { + return vec; + } - const unsigned proc = node_to_processor_assignment[node]; - current_proc_memory[proc] += instance->getComputationalDag().vertex_mem_weight(node); + /** + * @brief Returns a vector of nodes assigned to the specified processor and superstep. + * + * @param processor The processor index. + * @param superstep The superstep index. + * @return A vector of nodes assigned to the specified processor and superstep. 
+ */ + [[nodiscard]] std::vector> getAssignedNodeVector(unsigned int processor, unsigned int superstep) const { + std::vector> vec; - if (current_proc_memory[proc] > instance->getArchitecture().memoryBound(proc)) { - return false; - } + for (const auto &node : instance->vertices()) { + if (node_to_processor_assignment[node] == processor && node_to_superstep_assignment[node] == superstep) { + vec.push_back(node); } - break; } - case MEMORY_CONSTRAINT_TYPE::LOCAL_IN_OUT: { - - SetSchedule set_schedule = SetSchedule(*this); + return vec; + } - for (unsigned step = 0; step < number_of_supersteps; step++) { - for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { + /** + * @brief Sets the number of supersteps in the schedule. + * + * @param number_of_supersteps_ The number of supersteps. + */ + void setNumberOfSupersteps(unsigned int number_of_supersteps_) { + number_of_supersteps = number_of_supersteps_; + } - v_memw_t memory = 0; - for (const auto &node : set_schedule.step_processor_vertices[step][proc]) { - memory += instance->getComputationalDag().vertex_mem_weight(node) + - instance->getComputationalDag().vertex_comm_weight(node); + /** + * @brief Returns the number of nodes assigned to the specified processor. + * + * @param processor The processor index. + * @return The number of nodes assigned to the specified processor. 
+ */ + [[nodiscard]] unsigned numAssignedNodes(unsigned processor) const { + unsigned num = 0; - for (const auto &parent : instance->getComputationalDag().parents(node)) { + for (const auto &node : instance->vertices()) { + if (node_to_processor_assignment[node] == processor) { + num++; + } + } - if (node_to_processor_assignment[parent] == proc && - node_to_superstep_assignment[parent] == step) { - memory -= instance->getComputationalDag().vertex_comm_weight(parent); - } - } - } + return num; + } - if (memory > instance->getArchitecture().memoryBound(proc)) { - return false; - } - } - } + /** + * @brief Returns a vector containing the number of nodes assigned to each processor. + * + * @return A vector containing the number of nodes assigned to each processor. + */ + [[nodiscard]] std::vector numAssignedNodesPerProcessor() const { + std::vector num(instance->numberOfProcessors(), 0); - break; + for (const auto &node : instance->vertices()) { + num[node_to_processor_assignment[node]]++; } - case MEMORY_CONSTRAINT_TYPE::LOCAL_INC_EDGES: { + return num; + } + + /** + * @brief Returns a 2D vector containing the number of nodes assigned to each processor in each superstep. + * + * @return A 2D vector containing the number of nodes assigned to each processor in each superstep. + */ + [[nodiscard]] std::vector> numAssignedNodesPerSuperstepProcessor() const { + std::vector> num(number_of_supersteps, std::vector(instance->numberOfProcessors(), 0)); + + for (const auto &v : instance->vertices()) { + num[node_to_superstep_assignment[v]][node_to_processor_assignment[v]] += 1; + } - SetSchedule set_schedule = SetSchedule(*this); + return num; + } - for (unsigned step = 0; step < number_of_supersteps; step++) { - for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { + /** + * @brief Shrinks the schedule by merging supersteps where no communication occurs. 
+ */ + virtual void shrinkByMergingSupersteps() { + std::vector comm_phase_empty(number_of_supersteps, true); + for (const auto &node : instance->vertices()) + for (const auto &child : instance->getComputationalDag().children(node)) + if (node_to_processor_assignment[node] != node_to_processor_assignment[child]) + for (unsigned offset = 1; offset <= getStaleness(); ++offset) + comm_phase_empty[node_to_superstep_assignment[child] - offset] = false; - std::unordered_set> nodes_with_incoming_edges; + std::vector new_step_index(number_of_supersteps); + unsigned current_index = 0; + for (unsigned step = 0; step < number_of_supersteps; ++step) { + new_step_index[step] = current_index; + if (!comm_phase_empty[step]) + current_index++; + } + for (const auto &node : instance->vertices()) + node_to_superstep_assignment[node] = new_step_index[node_to_superstep_assignment[node]]; - v_memw_t memory = 0; - for (const auto &node : set_schedule.step_processor_vertices[step][proc]) { - memory += instance->getComputationalDag().vertex_comm_weight(node); + setNumberOfSupersteps(current_index); + } - for (const auto &parent : instance->getComputationalDag().parents(node)) { + private: + /** + * @brief Checks if the schedule satisfies local memory constraints. + * + * In this model, the memory usage of a processor in a superstep is the sum of the memory weights of all nodes + * assigned to it in that superstep. + * + * @return True if local memory constraints are satisfied, false otherwise. 
+ */ + bool satisfiesLocalMemoryConstraints() const { + SetSchedule set_schedule = SetSchedule(*this); - if (node_to_superstep_assignment[parent] != step) { - nodes_with_incoming_edges.insert(parent); - } - } - } + for (unsigned step = 0; step < number_of_supersteps; step++) { + for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - for (const auto &node : nodes_with_incoming_edges) { - memory += instance->getComputationalDag().vertex_comm_weight(node); - } + v_memw_t memory = 0; + for (const auto &node : set_schedule.step_processor_vertices[step][proc]) { + memory += instance->getComputationalDag().vertex_mem_weight(node); + } - if (memory > instance->getArchitecture().memoryBound(proc)) { - return false; - } + if (memory > instance->getArchitecture().memoryBound(proc)) { + return false; } } - break; } + return true; + } - case MEMORY_CONSTRAINT_TYPE::LOCAL_SOURCES_INC_EDGES: { - - SetSchedule set_schedule = SetSchedule(*this); - - for (unsigned step = 0; step < number_of_supersteps; step++) { - for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { + /** + * @brief Checks if the schedule satisfies persistent and transient memory constraints. + * + * This model distinguishes between persistent memory (node memory weight) and transient memory (max communication + * weight). The total memory usage on a processor is the sum of persistent memory of all assigned nodes plus the + * maximum transient memory required by any single node assigned to it. + * + * @return True if persistent and transient memory constraints are satisfied, false otherwise. 
+ */ + bool satisfiesPersistentAndTransientMemoryConstraints() const { + std::vector> current_proc_persistent_memory(instance->numberOfProcessors(), 0); + std::vector> current_proc_transient_memory(instance->numberOfProcessors(), 0); - std::unordered_set> nodes_with_incoming_edges; + for (const auto &node : instance->vertices()) { - v_memw_t memory = 0; - for (const auto &node : set_schedule.step_processor_vertices[step][proc]) { + const unsigned proc = node_to_processor_assignment[node]; + current_proc_persistent_memory[proc] += instance->getComputationalDag().vertex_mem_weight(node); + current_proc_transient_memory[proc] = std::max( + current_proc_transient_memory[proc], instance->getComputationalDag().vertex_comm_weight(node)); - if (is_source(node, instance->getComputationalDag())) { - memory += instance->getComputationalDag().vertex_mem_weight(node); - } + if (current_proc_persistent_memory[proc] + current_proc_transient_memory[proc] > + instance->getArchitecture().memoryBound(proc)) { + return false; + } + } + return true; + } - for (const auto &parent : instance->getComputationalDag().parents(node)) { + /** + * @brief Checks if the schedule satisfies global memory constraints. + * + * In this model, the memory usage of a processor is the sum of the memory weights of all nodes assigned to it, + * regardless of the superstep. + * + * @return True if global memory constraints are satisfied, false otherwise. 
+ */ + bool satisfiesGlobalMemoryConstraints() const { + std::vector> current_proc_memory(instance->numberOfProcessors(), 0); - if (node_to_superstep_assignment[parent] != step) { - nodes_with_incoming_edges.insert(parent); - } - } - } + for (const auto &node : instance->vertices()) { - for (const auto &node : nodes_with_incoming_edges) { - memory += instance->getComputationalDag().vertex_comm_weight(node); - } + const unsigned proc = node_to_processor_assignment[node]; + current_proc_memory[proc] += instance->getComputationalDag().vertex_mem_weight(node); - if (memory > instance->getArchitecture().memoryBound(proc)) { - return false; - } - } + if (current_proc_memory[proc] > instance->getArchitecture().memoryBound(proc)) { + return false; } - break; } + return true; + } - case MEMORY_CONSTRAINT_TYPE::NONE: { - break; - } + bool satisfiesLocalInOutMemoryConstraints() const { - default: { - throw std::invalid_argument("Unknown memory constraint type."); - break; - } - } + SetSchedule set_schedule = SetSchedule(*this); - return true; - }; + for (unsigned step = 0; step < number_of_supersteps; step++) { + for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - std::vector> getAssignedNodeVector(unsigned int processor) const { + v_memw_t memory = 0; + for (const auto &node : set_schedule.step_processor_vertices[step][proc]) { + memory += instance->getComputationalDag().vertex_mem_weight(node) + + instance->getComputationalDag().vertex_comm_weight(node); - std::vector> vec; + for (const auto &parent : instance->getComputationalDag().parents(node)) { - for (const auto &node : instance->vertices()) { + if (node_to_processor_assignment[parent] == proc && + node_to_superstep_assignment[parent] == step) { + memory -= instance->getComputationalDag().vertex_comm_weight(parent); + } + } + } - if (node_to_processor_assignment[node] == processor) { - vec.push_back(node); + if (memory > instance->getArchitecture().memoryBound(proc)) { + return false; + } } } - 
return vec; + return true; } - std::vector> getAssignedNodeVector(unsigned int processor, unsigned int superstep) const { - std::vector> vec; + bool satisfiesLocalIncEdgesMemoryConstraints() const { - for (const auto &node : instance->vertices()) { + SetSchedule set_schedule = SetSchedule(*this); - if (node_to_processor_assignment[node] == processor && node_to_superstep_assignment[node] == superstep) { - vec.push_back(node); - } - } + for (unsigned step = 0; step < number_of_supersteps; step++) { + for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - return vec; - } + std::unordered_set> nodes_with_incoming_edges; - inline void setNumberOfSupersteps(unsigned int number_of_supersteps_) { - number_of_supersteps = number_of_supersteps_; - } + v_memw_t memory = 0; + for (const auto &node : set_schedule.step_processor_vertices[step][proc]) { + memory += instance->getComputationalDag().vertex_comm_weight(node); - unsigned num_assigned_nodes(unsigned processor) const { + for (const auto &parent : instance->getComputationalDag().parents(node)) { - unsigned num = 0; + if (node_to_superstep_assignment[parent] != step) { + nodes_with_incoming_edges.insert(parent); + } + } + } - for (const auto &node : instance->vertices()) { - if (node_to_processor_assignment[node] == processor) { - num++; + for (const auto &node : nodes_with_incoming_edges) { + memory += instance->getComputationalDag().vertex_comm_weight(node); + } + + if (memory > instance->getArchitecture().memoryBound(proc)) { + return false; + } } } - - return num; + return true; } - std::vector num_assigned_nodes_per_processor() const { + bool satisfiesLocalSourcesIncEdgesMemoryConstraints() const { - std::vector num(instance->numberOfProcessors(), 0); + SetSchedule set_schedule = SetSchedule(*this); - for (const auto &node : instance->vertices()) { - num[node_to_processor_assignment[node]]++; - } - - return num; - } + for (unsigned step = 0; step < number_of_supersteps; step++) { + for (unsigned 
proc = 0; proc < instance->numberOfProcessors(); proc++) { - std::vector> num_assigned_nodes_per_superstep_processor() const { + std::unordered_set> nodes_with_incoming_edges; - std::vector> num(number_of_supersteps, - std::vector(instance->numberOfProcessors(), 0)); + v_memw_t memory = 0; + for (const auto &node : set_schedule.step_processor_vertices[step][proc]) { - for (const auto &v : instance->vertices()) { - num[node_to_superstep_assignment[v]][node_to_processor_assignment[v]] += 1; - } + if (is_source(node, instance->getComputationalDag())) { + memory += instance->getComputationalDag().vertex_mem_weight(node); + } - return num; - } + for (const auto &parent : instance->getComputationalDag().parents(node)) { - virtual void shrinkByMergingSupersteps() { + if (node_to_superstep_assignment[parent] != step) { + nodes_with_incoming_edges.insert(parent); + } + } + } - std::vector comm_phase_empty(number_of_supersteps, true); - for (const auto& node : instance->vertices()) - for (const auto &child : instance->getComputationalDag().children(node)) - if(node_to_processor_assignment[node] != node_to_processor_assignment[child]) - for(unsigned offset = 1; offset <= getStaleness(); ++offset) - comm_phase_empty[node_to_superstep_assignment[child] - offset] = false; + for (const auto &node : nodes_with_incoming_edges) { + memory += instance->getComputationalDag().vertex_comm_weight(node); + } - std::vector new_step_index(number_of_supersteps); - unsigned current_index = 0; - for(unsigned step = 0; step < number_of_supersteps; ++step) - { - new_step_index[step] = current_index; - if(!comm_phase_empty[step]) - current_index++; + if (memory > instance->getArchitecture().memoryBound(proc)) { + return false; + } + } } - for (const auto& node : instance->vertices()) - node_to_superstep_assignment[node] = new_step_index[node_to_superstep_assignment[node]]; - - setNumberOfSupersteps(current_index); + return true; } - - unsigned virtual getStaleness() const { return 1; } }; } // 
namespace osp \ No newline at end of file diff --git a/include/osp/bsp/model/BspScheduleCS.hpp b/include/osp/bsp/model/BspScheduleCS.hpp index 89d16757..63d94798 100644 --- a/include/osp/bsp/model/BspScheduleCS.hpp +++ b/include/osp/bsp/model/BspScheduleCS.hpp @@ -25,9 +25,8 @@ limitations under the License. #include #include - -#include "IBspScheduleEval.hpp" #include "BspSchedule.hpp" +#include "IBspScheduleEval.hpp" namespace osp { @@ -64,8 +63,7 @@ class BspScheduleCS : public BspSchedule { std::map commSchedule; protected: - - void compute_cs_communication_costs_helper(std::vector>> & rec, std::vector>> & send) const { + void compute_cs_communication_costs_helper(std::vector>> &rec, std::vector>> &send) const { for (auto const &[key, val] : commSchedule) { send[std::get<1>(key)][val] += BspSchedule::instance->sendCosts(std::get<1>(key), std::get<2>(key)) * @@ -73,7 +71,7 @@ class BspScheduleCS : public BspSchedule { rec[std::get<2>(key)][val] += BspSchedule::instance->sendCosts(std::get<1>(key), std::get<2>(key)) * BspSchedule::instance->getComputationalDag().vertex_comm_weight(std::get<0>(key)); - } + } } public: @@ -244,15 +242,11 @@ class BspScheduleCS : public BspSchedule { v_commw_t compute_cs_communication_costs() const { - std::vector>> rec( - BspSchedule::instance->numberOfProcessors(), - std::vector>(BspSchedule::number_of_supersteps, 0)); - std::vector>> send( - BspSchedule::instance->numberOfProcessors(), - std::vector>(BspSchedule::number_of_supersteps, 0)); + std::vector>> rec(this->instance->numberOfProcessors(), std::vector>(this->number_of_supersteps, 0)); + std::vector>> send(this->instance->numberOfProcessors(), std::vector>(this->number_of_supersteps, 0)); compute_cs_communication_costs_helper(rec, send); - const std::vector> max_comm_per_step = this->compute_max_comm_per_step_helper(rec, send); + const std::vector> max_comm_per_step = cost_helpers::compute_max_comm_per_step(*this, rec, send); v_commw_t costs = 0; for (unsigned step = 0; 
step < this->number_of_supersteps; step++) { @@ -342,7 +336,7 @@ class BspScheduleCS : public BspSchedule { if (proc != BspSchedule::assignedProcessor(target)) { require_sending[proc].insert( {BspSchedule::instance->getComputationalDag().vertex_comm_weight(node) * BspSchedule::instance->getArchitecture().sendCosts(proc, BspSchedule::node_to_processor_assignment[target]), - node, + node, BspSchedule::node_to_processor_assignment[target]}); } } @@ -394,7 +388,7 @@ class BspScheduleCS : public BspSchedule { continue; auto iter = require_sending[proc].begin(); while (iter != require_sending[proc].end()) { - const auto& [comm_cost, node_to_send, dest_proc] = *iter; + const auto &[comm_cost, node_to_send, dest_proc] = *iter; if (comm_cost + send_cost[proc] > max_comm_cost || comm_cost + receive_cost[dest_proc] > max_comm_cost) { iter++; @@ -471,27 +465,23 @@ class BspScheduleCS : public BspSchedule { virtual void shrinkByMergingSupersteps() override { std::vector superstep_latest_dependency(this->number_of_supersteps, 0); - std::vector > first_at = getFirstPresence(); + std::vector> first_at = getFirstPresence(); for (auto const &[key, val] : commSchedule) - if(this->assignedProcessor(std::get<0>(key)) != std::get<1>(key)) + if (this->assignedProcessor(std::get<0>(key)) != std::get<1>(key)) superstep_latest_dependency[val] = std::max(superstep_latest_dependency[val], first_at[std::get<0>(key)][std::get<1>(key)]); - for (const auto &node : BspSchedule::instance->getComputationalDag().vertices()) for (const auto &child : BspSchedule::instance->getComputationalDag().children(node)) - if(this->assignedProcessor(node) != this->assignedProcessor(child)) + if (this->assignedProcessor(node) != this->assignedProcessor(child)) superstep_latest_dependency[this->assignedSuperstep(child)] = std::max(superstep_latest_dependency[this->assignedSuperstep(child)], first_at[node][this->assignedProcessor(child)]); std::vector merge_with_previous(this->number_of_supersteps, false); - 
for(unsigned step = this->number_of_supersteps-1; step < this->number_of_supersteps; --step) - { + for (unsigned step = this->number_of_supersteps - 1; step < this->number_of_supersteps; --step) { unsigned limit = 0; - while(step > limit) - { + while (step > limit) { limit = std::max(limit, superstep_latest_dependency[step]); - if(step > limit) - { + if (step > limit) { merge_with_previous[step] = true; --step; } @@ -500,26 +490,25 @@ class BspScheduleCS : public BspSchedule { std::vector new_step_index(this->number_of_supersteps); unsigned current_index = std::numeric_limits::max(); - for(unsigned step = 0; step < this->number_of_supersteps; ++step) - { - if(!merge_with_previous[step]) + for (unsigned step = 0; step < this->number_of_supersteps; ++step) { + if (!merge_with_previous[step]) current_index++; new_step_index[step] = current_index; } - for (const auto& node : this->instance->vertices()) + for (const auto &node : this->instance->vertices()) this->node_to_superstep_assignment[node] = new_step_index[this->node_to_superstep_assignment[node]]; for (auto &[key, val] : commSchedule) val = new_step_index[val]; - this->setNumberOfSupersteps(current_index+1); + this->setNumberOfSupersteps(current_index + 1); } // for each vertex v and processor p, find the first superstep where v is present on p by the end of the compute phase - std::vector > getFirstPresence() const { + std::vector> getFirstPresence() const { - std::vector > first_at(BspSchedule::instance->numberOfVertices(), - std::vector(BspSchedule::instance->numberOfProcessors(), std::numeric_limits::max())); + std::vector> first_at(BspSchedule::instance->numberOfVertices(), + std::vector(BspSchedule::instance->numberOfProcessors(), std::numeric_limits::max())); for (const auto &node : BspSchedule::instance->getComputationalDag().vertices()) first_at[node][this->assignedProcessor(node)] = this->assignedSuperstep(node); @@ -532,11 +521,11 @@ class BspScheduleCS : public BspSchedule { } // remove unneeded 
comm. schedule entries - these can happen in ILPs, partial ILPs, etc. - void cleanCommSchedule(){ + void cleanCommSchedule() { // data that is already present before it arrives - std::vector > > arrives_at(BspSchedule::instance->numberOfVertices(), - std::vector >(BspSchedule::instance->numberOfProcessors())); + std::vector>> arrives_at(BspSchedule::instance->numberOfVertices(), + std::vector>(BspSchedule::instance->numberOfProcessors())); for (const auto &node : BspSchedule::instance->getComputationalDag().vertices()) arrives_at[node][this->assignedProcessor(node)].insert(this->assignedSuperstep(node)); @@ -544,48 +533,45 @@ class BspScheduleCS : public BspSchedule { arrives_at[std::get<0>(key)][std::get<2>(key)].insert(val); std::vector toErase; - for (auto const &[key, val] : commSchedule) - { + for (auto const &[key, val] : commSchedule) { auto itr = arrives_at[std::get<0>(key)][std::get<2>(key)].begin(); - if(*itr < val) + if (*itr < val) toErase.push_back(key); - else if(*itr == val && ++itr != arrives_at[std::get<0>(key)][std::get<2>(key)].end() && *itr == val) - { + else if (*itr == val && ++itr != arrives_at[std::get<0>(key)][std::get<2>(key)].end() && *itr == val) { toErase.push_back(key); arrives_at[std::get<0>(key)][std::get<2>(key)].erase(itr); } } - for(const KeyTriple& key : toErase) + for (const KeyTriple &key : toErase) commSchedule.erase(key); // data that is not used after being sent - std::vector > > used_at(BspSchedule::instance->numberOfVertices(), - std::vector >(BspSchedule::instance->numberOfProcessors())); + std::vector>> used_at(BspSchedule::instance->numberOfVertices(), + std::vector>(BspSchedule::instance->numberOfProcessors())); for (const auto &node : BspSchedule::instance->getComputationalDag().vertices()) for (const auto &child : BspSchedule::instance->getComputationalDag().children(node)) used_at[node][this->assignedProcessor(child)].insert(this->assignedSuperstep(child)); for (auto const &[key, val] : commSchedule) 
used_at[std::get<0>(key)][std::get<1>(key)].insert(val); - + // (need to visit cs entries in reverse superstep order here) - std::vector > entries(this->number_of_supersteps); + std::vector> entries(this->number_of_supersteps); for (auto const &[key, val] : commSchedule) entries[val].push_back(key); toErase.clear(); - for(unsigned step = this->number_of_supersteps-1; step < this->number_of_supersteps; --step) - for(const KeyTriple& key : entries[step]) - if(used_at[std::get<0>(key)][std::get<2>(key)].empty() || - *used_at[std::get<0>(key)][std::get<2>(key)].rbegin() <= step) - { + for (unsigned step = this->number_of_supersteps - 1; step < this->number_of_supersteps; --step) + for (const KeyTriple &key : entries[step]) + if (used_at[std::get<0>(key)][std::get<2>(key)].empty() || + *used_at[std::get<0>(key)][std::get<2>(key)].rbegin() <= step) { toErase.push_back(key); - auto itr = used_at[std::get<0>(key)][std::get<1>(key)].find(step); + auto itr = used_at[std::get<0>(key)][std::get<1>(key)].find(step); used_at[std::get<0>(key)][std::get<1>(key)].erase(itr); } - - for(const KeyTriple& key : toErase) + + for (const KeyTriple &key : toErase) commSchedule.erase(key); } }; diff --git a/include/osp/bsp/model/BspScheduleCostEvaluator.hpp b/include/osp/bsp/model/BspScheduleCostEvaluator.hpp deleted file mode 100644 index c97d8b74..00000000 --- a/include/osp/bsp/model/BspScheduleCostEvaluator.hpp +++ /dev/null @@ -1,179 +0,0 @@ -/* -Copyright 2024 Huawei Technologies Co., Ltd. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. - -@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner -*/ - -#pragma once - -#include "BspSchedule.hpp" - -namespace osp { - -/** - * @class BspScheduleCostEvaluator - * @brief A class to compute various cost functions for a BspSchedule. - * - * This class wraps a BspSchedule by reference to avoid unnecessary copies - * while providing an interface to compute different cost models. - */ -template -class BspScheduleCostEvaluator { - - static_assert(is_computational_dag_v, "BspScheduleCostEvaluator can only be used with computational DAGs."); - static_assert(std::is_same_v, v_commw_t>, - "BspScheduleCostEvaluator requires work and comm. weights to have the same type."); - - protected: - const BspSchedule& schedule; - const BspInstance& instance; - - void compute_lazy_communication_costs_helper(std::vector>> & rec, std::vector>> & send) const { - const unsigned number_of_supersteps = schedule.numberOfSupersteps(); - for (const auto &node : instance.vertices()) { - - std::vector step_needed(instance.numberOfProcessors(), number_of_supersteps); - for (const auto &target : instance.getComputationalDag().children(node)) { - - if (schedule.assignedProcessor(node) != schedule.assignedProcessor(target)) { - step_needed[schedule.assignedProcessor(target)] = std::min( - step_needed[schedule.assignedProcessor(target)], schedule.assignedSuperstep(target)); - } - } - - for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) { - - if (step_needed[proc] < number_of_supersteps) { - - send[schedule.assignedProcessor(node)][step_needed[proc] - 1] += - instance.sendCosts(schedule.assignedProcessor(node), proc) * - instance.getComputationalDag().vertex_comm_weight(node); - - rec[proc][step_needed[proc] - 1] += instance.sendCosts(schedule.assignedProcessor(node), proc) * - instance.getComputationalDag().vertex_comm_weight(node); - } - } - } - } - - 
std::vector> compute_max_comm_per_step_helper(const std::vector>> & rec, const std::vector>> & send) const { - const unsigned number_of_supersteps = schedule.numberOfSupersteps(); - std::vector> max_comm_per_step(number_of_supersteps, 0); - for (unsigned step = 0; step < number_of_supersteps; step++) { - v_commw_t max_send = 0; - v_commw_t max_rec = 0; - - for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) { - if (max_send < send[proc][step]) - max_send = send[proc][step]; - if (max_rec < rec[proc][step]) - max_rec = rec[proc][step]; - } - max_comm_per_step[step] = std::max(max_send, max_rec) * instance.communicationCosts(); - } - return max_comm_per_step; - } - - public: - /** - * @brief Construct a new Bsp Schedule Cost Evaluator object. - * - * @param sched The BspSchedule to evaluate. - */ - BspScheduleCostEvaluator(const BspSchedule& sched) : schedule(sched), instance(sched.getInstance()) {} - - /** - * @brief Computes the communication costs using the lazy sending model. - * - * In the lazy sending model, data is sent in the superstep immediately - * preceding the superstep where it is first needed. - * - * @return The lazy communication costs. - */ - v_commw_t compute_lazy_communication_costs() const { - - const unsigned number_of_supersteps = schedule.numberOfSupersteps(); - - std::vector>> rec(instance.numberOfProcessors(), - std::vector>(number_of_supersteps, 0)); - std::vector>> send(instance.numberOfProcessors(), - std::vector>(number_of_supersteps, 0)); - - compute_lazy_communication_costs_helper(rec, send); - const std::vector> max_comm_per_step = compute_max_comm_per_step_helper(rec, send); - - v_commw_t costs = 0; - for (unsigned step = 0; step < number_of_supersteps; step++) { - const auto step_comm_cost = max_comm_per_step[step]; - costs += step_comm_cost; - - costs += instance.synchronisationCosts(); - - } - - return costs; - } - - /** - * @brief Computes the work costs for each superstep. 
- * - * @return The work cost per superstep. - */ - std::vector> compute_max_work_per_step_helper() const { - const unsigned number_of_supersteps = schedule.numberOfSupersteps(); - std::vector>> work = std::vector>>( - number_of_supersteps, std::vector>(instance.numberOfProcessors(), 0)); - for (const auto &node : instance.vertices()) { - work[schedule.assignedSuperstep(node)][schedule.assignedProcessor(node)] += - instance.getComputationalDag().vertex_work_weight(node); - } - - std::vector> max_work_per_step(number_of_supersteps, 0); - for (unsigned step = 0; step < number_of_supersteps; step++) { - v_workw_t max_work = 0; - for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) { - if (max_work < work[step][proc]) { - max_work = work[step][proc]; - } - } - - max_work_per_step[step] = max_work; - } - - return max_work_per_step; - } - - /** - * @brief Computes the total work costs of the schedule. - * - * The work cost is the sum of the maximum work done in each superstep - * across all processors. - * - * @return The total work costs. - */ - v_workw_t computeWorkCosts() const { - const std::vector> work_per_step = compute_max_work_per_step_helper(); - return std::accumulate(work_per_step.begin(), work_per_step.end(), static_cast>(0)); - } - - /** - * @brief Computes the total costs of the schedule using the lazy communication model. - * - * @return The total costs. - */ - v_workw_t computeCosts() const { return compute_lazy_communication_costs() + computeWorkCosts(); } -}; - -} // namespace osp diff --git a/include/osp/bsp/model/MaxBspSchedule.hpp b/include/osp/bsp/model/MaxBspSchedule.hpp index 22b9c4b8..e56c99d6 100644 --- a/include/osp/bsp/model/MaxBspSchedule.hpp +++ b/include/osp/bsp/model/MaxBspSchedule.hpp @@ -27,6 +27,7 @@ limitations under the License. 
#include #include "BspSchedule.hpp" +#include "osp/bsp/model/cost/LazyCommunicationCost.hpp" #include "osp/concepts/computational_dag_concept.hpp" namespace osp { @@ -41,14 +42,12 @@ template class MaxBspSchedule : public BspSchedule { static_assert(is_computational_dag_v, "BspSchedule can only be used with computational DAGs."); - static_assert(std::is_same_v, v_commw_t >, "BspSchedule requires work and comm. weights to have the same type."); + static_assert(std::is_same_v, v_commw_t>, "BspSchedule requires work and comm. weights to have the same type."); protected: using vertex_idx = vertex_idx_t; - public: - MaxBspSchedule() = delete; /** @@ -67,7 +66,7 @@ class MaxBspSchedule : public BspSchedule { * @param superstep_assignment_ The superstep assignment for the nodes. */ MaxBspSchedule(const BspInstance &inst, const std::vector &processor_assignment_, - const std::vector &superstep_assignment_) : BspSchedule(inst, processor_assignment_, superstep_assignment_) {} + const std::vector &superstep_assignment_) : BspSchedule(inst, processor_assignment_, superstep_assignment_) {} MaxBspSchedule(const IBspSchedule &schedule) : BspSchedule(schedule) {} @@ -89,26 +88,23 @@ class MaxBspSchedule : public BspSchedule { */ virtual ~MaxBspSchedule() = default; - virtual v_workw_t computeCosts() const override { - - std::vector>> rec(this->instance->numberOfProcessors(), - std::vector>(this->number_of_supersteps, 0)); + virtual v_workw_t computeCosts() const override { - std::vector>> send(this->instance->numberOfProcessors(), - std::vector>(this->number_of_supersteps, 0)); + std::vector>> rec(this->instance->numberOfProcessors(), std::vector>(this->number_of_supersteps, 0)); + std::vector>> send(this->instance->numberOfProcessors(), std::vector>(this->number_of_supersteps, 0)); - this->compute_lazy_communication_costs_helper(rec, send); - const std::vector> max_comm_per_step = this->compute_max_comm_per_step_helper(rec, send); - const std::vector> max_work_per_step = 
this->compute_max_work_per_step_helper(); + compute_lazy_communication_costs(*this, rec, send); + const std::vector> max_comm_per_step = cost_helpers::compute_max_comm_per_step(*this, rec, send); + const std::vector> max_work_per_step = cost_helpers::compute_max_work_per_step(*this); v_workw_t costs = 0U; for (unsigned step = 0U; step < this->number_of_supersteps; step++) { - v_commw_t step_comm_cost = (step == 0U) ? static_cast>(0) : max_comm_per_step[step - 1U]; + const v_commw_t step_comm_cost = (step == 0U) ? static_cast>(0) : max_comm_per_step[step - 1U]; + costs += std::max(step_comm_cost, max_work_per_step[step]); + if (step_comm_cost > static_cast>(0)) { - step_comm_cost += this->instance->synchronisationCosts(); + costs += this->instance->synchronisationCosts(); } - costs += std::max(step_comm_cost, max_work_per_step[step]); - } return costs; } diff --git a/include/osp/bsp/model/MaxBspScheduleCS.hpp b/include/osp/bsp/model/MaxBspScheduleCS.hpp index 5309cddd..79b49b33 100644 --- a/include/osp/bsp/model/MaxBspScheduleCS.hpp +++ b/include/osp/bsp/model/MaxBspScheduleCS.hpp @@ -35,14 +35,12 @@ template class MaxBspScheduleCS : public BspScheduleCS { static_assert(is_computational_dag_v, "BspSchedule can only be used with computational DAGs."); - static_assert(std::is_same_v, v_commw_t >, "BspSchedule requires work and comm. weights to have the same type."); + static_assert(std::is_same_v, v_commw_t>, "BspSchedule requires work and comm. 
weights to have the same type."); protected: using vertex_idx = vertex_idx_t; - public: - MaxBspScheduleCS() = delete; /** @@ -89,8 +87,8 @@ class MaxBspScheduleCS : public BspScheduleCS { */ virtual ~MaxBspScheduleCS() = default; - virtual v_workw_t computeCosts() const override { - + virtual v_workw_t computeCosts() const override { + std::vector>> rec(this->getInstance().numberOfProcessors(), std::vector>(this->number_of_supersteps, 0)); @@ -98,22 +96,21 @@ class MaxBspScheduleCS : public BspScheduleCS { std::vector>(this->number_of_supersteps, 0)); this->compute_cs_communication_costs_helper(rec, send); - const std::vector> max_comm_per_step = this->compute_max_comm_per_step_helper(rec, send); - const std::vector> max_work_per_step = this->compute_max_work_per_step_helper(); + const std::vector> max_comm_per_step = cost_helpers::compute_max_comm_per_step(*this, rec, send); + const std::vector> max_work_per_step = cost_helpers::compute_max_work_per_step(*this); v_workw_t costs = 0U; for (unsigned step = 0U; step < this->number_of_supersteps; step++) { - v_commw_t step_comm_cost = (step == 0U) ? static_cast>(0) : max_comm_per_step[step - 1U]; - if (step_comm_cost > static_cast>(0)) { - step_comm_cost += this->instance->synchronisationCosts(); - } + const auto step_comm_cost = (step == 0U) ? static_cast>(0) : max_comm_per_step[step - 1U]; costs += std::max(step_comm_cost, max_work_per_step[step]); + if (step_comm_cost > static_cast>(0)) { + costs += this->instance->synchronisationCosts(); + } } return costs; } unsigned virtual getStaleness() const override { return 2; } }; - } // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/model/cost/BufferedSendingCost.hpp b/include/osp/bsp/model/cost/BufferedSendingCost.hpp new file mode 100644 index 00000000..f8b61f91 --- /dev/null +++ b/include/osp/bsp/model/cost/BufferedSendingCost.hpp @@ -0,0 +1,84 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +*/ + +#pragma once + +#include "osp/bsp/model/cost/CostModelHelpers.hpp" +#include "osp/concepts/computational_dag_concept.hpp" +#include +#include + +namespace osp { + +/** + * @struct BufferedSendingCost + * @brief Implements the buffered sending cost model. + */ +template +struct BufferedSendingCost { + + using cost_type = v_commw_t; + + cost_type operator()(const BspSchedule &schedule) const { + const auto &instance = schedule.getInstance(); + unsigned number_of_supersteps = schedule.numberOfSupersteps(); + const auto &node_to_processor_assignment = schedule.assignedProcessors(); + const auto &node_to_superstep_assignment = schedule.assignedSupersteps(); + const auto staleness = schedule.getStaleness(); + + std::vector>> rec(instance.numberOfProcessors(), std::vector>(number_of_supersteps, 0)); + std::vector>> send(instance.numberOfProcessors(), std::vector>(number_of_supersteps, 0)); + + for (vertex_idx_t node = 0; node < instance.numberOfVertices(); node++) { + + std::vector step_needed(instance.numberOfProcessors(), number_of_supersteps); + for (const auto &target : instance.getComputationalDag().children(node)) { + + if (node_to_processor_assignment[node] != node_to_processor_assignment[target]) { + step_needed[node_to_processor_assignment[target]] = std::min(step_needed[node_to_processor_assignment[target]], node_to_superstep_assignment[target]); + } + } 
+ + for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) { + + if (step_needed[proc] < number_of_supersteps) { + send[node_to_processor_assignment[node]][node_to_superstep_assignment[node]] += instance.sendCosts(node_to_processor_assignment[node], proc) * instance.getComputationalDag().vertex_comm_weight(node); + + if (step_needed[proc] >= staleness) { + rec[proc][step_needed[proc] - staleness] += instance.sendCosts(node_to_processor_assignment[node], proc) * instance.getComputationalDag().vertex_comm_weight(node); + } + } + } + } + + const auto max_comm_per_step = cost_helpers::compute_max_comm_per_step(schedule, rec, send); + v_commw_t comm_costs = 0; + for (unsigned step = 0; step < number_of_supersteps; step++) { + const auto step_comm_cost = max_comm_per_step[step]; + comm_costs += step_comm_cost; + + if (step_comm_cost > 0) { + comm_costs += instance.synchronisationCosts(); + } + } + + return comm_costs + cost_helpers::compute_work_costs(schedule); + } +}; + +} // namespace osp diff --git a/include/osp/bsp/model/cost/CostModelHelpers.hpp b/include/osp/bsp/model/cost/CostModelHelpers.hpp new file mode 100644 index 00000000..b1d449b4 --- /dev/null +++ b/include/osp/bsp/model/cost/CostModelHelpers.hpp @@ -0,0 +1,117 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner +*/ + +#pragma once + +#include "osp/bsp/model/BspInstance.hpp" +#include +#include + +namespace osp { + +template +class BspSchedule; + +namespace cost_helpers { + +template +std::vector> compute_max_comm_per_step( + const BspInstance &instance, + unsigned number_of_supersteps, + const std::vector>> &rec, + const std::vector>> &send) { + + std::vector> max_comm_per_step(number_of_supersteps, 0); + for (unsigned step = 0; step < number_of_supersteps; step++) { + v_commw_t max_send = 0; + v_commw_t max_rec = 0; + + for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) { + if (max_send < send[proc][step]) + max_send = send[proc][step]; + if (max_rec < rec[proc][step]) + max_rec = rec[proc][step]; + } + max_comm_per_step[step] = std::max(max_send, max_rec) * instance.communicationCosts(); + } + return max_comm_per_step; +} + +template +std::vector> compute_max_comm_per_step( + const BspSchedule &schedule, + const std::vector>> &rec, + const std::vector>> &send) { + return compute_max_comm_per_step(schedule.getInstance(), schedule.numberOfSupersteps(), rec, send); +} + +template +std::vector> compute_max_work_per_step( + const BspInstance &instance, + unsigned number_of_supersteps, + const std::vector &node_to_processor_assignment, + const std::vector &node_to_superstep_assignment) { + std::vector>> work = std::vector>>( + number_of_supersteps, std::vector>(instance.numberOfProcessors(), 0)); + for (const auto &node : instance.vertices()) { + work[node_to_superstep_assignment[node]][node_to_processor_assignment[node]] += + instance.getComputationalDag().vertex_work_weight(node); + } + + std::vector> max_work_per_step(number_of_supersteps, 0); + for (unsigned step = 0; step < number_of_supersteps; step++) { + v_workw_t max_work = 0; + for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) { + if (max_work < work[step][proc]) { + max_work = work[step][proc]; + } + } + + max_work_per_step[step] = max_work; + } + + return 
max_work_per_step; +} + +template +std::vector> compute_max_work_per_step( + const BspSchedule &schedule) { + return compute_max_work_per_step(schedule.getInstance(), schedule.numberOfSupersteps(), schedule.assignedProcessors(), schedule.assignedSupersteps()); +} + +template +v_workw_t compute_work_costs( + const BspInstance &instance, + unsigned number_of_supersteps, + const std::vector &node_to_processor_assignment, + const std::vector &node_to_superstep_assignment) { + + std::vector> max_work_per_step = compute_max_work_per_step(instance, number_of_supersteps, node_to_processor_assignment, node_to_superstep_assignment); + + return std::accumulate(max_work_per_step.begin(), max_work_per_step.end(), static_cast>(0)); +} + +template +v_workw_t compute_work_costs( + const BspSchedule &schedule) { + + return compute_work_costs(schedule.getInstance(), schedule.numberOfSupersteps(), schedule.assignedProcessors(), schedule.assignedSupersteps()); +} + +} // namespace cost_helpers +} // namespace osp diff --git a/include/osp/bsp/model/cost/LazyCommunicationCost.hpp b/include/osp/bsp/model/cost/LazyCommunicationCost.hpp new file mode 100644 index 00000000..64338481 --- /dev/null +++ b/include/osp/bsp/model/cost/LazyCommunicationCost.hpp @@ -0,0 +1,98 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner +*/ + +#pragma once + +#include "osp/bsp/model/cost/CostModelHelpers.hpp" +#include "osp/concepts/computational_dag_concept.hpp" +#include +#include + +namespace osp { + +template +void compute_lazy_communication_costs( + const BspInstance &instance, + unsigned number_of_supersteps, + const std::vector &node_to_processor_assignment, + const std::vector &node_to_superstep_assignment, + const unsigned staleness, + std::vector>> &rec, + std::vector>> &send) { + for (const auto &node : instance.vertices()) { + + std::vector step_needed(instance.numberOfProcessors(), number_of_supersteps); + for (const auto &target : instance.getComputationalDag().children(node)) { + + if (node_to_processor_assignment[node] != node_to_processor_assignment[target]) { + step_needed[node_to_processor_assignment[target]] = std::min(step_needed[node_to_processor_assignment[target]], node_to_superstep_assignment[target]); + } + } + + for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) { + + if (step_needed[proc] < number_of_supersteps) { + send[node_to_processor_assignment[node]][step_needed[proc] - staleness] += instance.sendCosts(node_to_processor_assignment[node], proc) * instance.getComputationalDag().vertex_comm_weight(node); + rec[proc][step_needed[proc] - staleness] += instance.sendCosts(node_to_processor_assignment[node], proc) * instance.getComputationalDag().vertex_comm_weight(node); + } + } + } +} + +template +void compute_lazy_communication_costs( + const BspSchedule &schedule, + std::vector>> &rec, + std::vector>> &send) { + compute_lazy_communication_costs(schedule.getInstance(), schedule.numberOfSupersteps(), schedule.assignedProcessors(), schedule.assignedSupersteps(), schedule.getStaleness(), rec, send); +} + +/** + * @struct LazyCommunicationCost + * @brief Implements the lazy communication cost model. 
+ */ +template +struct LazyCommunicationCost { + + using cost_type = v_workw_t; + + cost_type operator()(const BspSchedule &schedule) const { + const auto &number_of_processors = schedule.getInstance().numberOfProcessors(); + const auto &number_of_supersteps = schedule.numberOfSupersteps(); + + std::vector>> rec(number_of_processors, std::vector>(number_of_supersteps, 0)); + std::vector>> send(number_of_processors, std::vector>(number_of_supersteps, 0)); + + compute_lazy_communication_costs(schedule, rec, send); + const auto max_comm_per_step = cost_helpers::compute_max_comm_per_step(schedule, rec, send); + + v_commw_t comm_costs = 0; + for (unsigned step = 0; step < number_of_supersteps; step++) { + const auto step_comm_cost = max_comm_per_step[step]; + comm_costs += step_comm_cost; + + if (step_comm_cost > 0) { + comm_costs += schedule.getInstance().synchronisationCosts(); + } + } + + return comm_costs + cost_helpers::compute_work_costs(schedule); + } +}; + +} // namespace osp diff --git a/include/osp/bsp/model/cost/TotalCommunicationCost.hpp b/include/osp/bsp/model/cost/TotalCommunicationCost.hpp new file mode 100644 index 00000000..3182f3c5 --- /dev/null +++ b/include/osp/bsp/model/cost/TotalCommunicationCost.hpp @@ -0,0 +1,62 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner +*/ + +#pragma once + +#include "osp/bsp/model/cost/CostModelHelpers.hpp" +#include "osp/concepts/computational_dag_concept.hpp" + +namespace osp { + +/** + * @struct TotalCommunicationCost + * @brief Implements the total communication cost model. + */ +template +struct TotalCommunicationCost { + + using cost_type = double; + + cost_type operator()(const BspSchedule &schedule) const { + + const auto &instance = schedule.getInstance(); + const auto &node_to_processor_assignment = schedule.assignedProcessors(); + + v_commw_t total_communication = 0; + + for (const auto &v : instance.vertices()) { + for (const auto &target : instance.getComputationalDag().children(v)) { + + if (node_to_processor_assignment[v] != node_to_processor_assignment[target]) { + total_communication += instance.sendCosts(node_to_processor_assignment[v], node_to_processor_assignment[target]) * instance.getComputationalDag().vertex_comm_weight(v); + } + } + } + + auto comm_cost = total_communication * static_cast(instance.communicationCosts()) / static_cast(instance.numberOfProcessors()); + + const unsigned number_of_supersteps = schedule.numberOfSupersteps(); + + auto work_cost = cost_helpers::compute_work_costs(schedule); + auto sync_cost = static_cast>(number_of_supersteps > 1 ? number_of_supersteps - 1 : 0) * instance.synchronisationCosts(); + + return comm_cost + work_cost + sync_cost; + } +}; + +} // namespace osp diff --git a/include/osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp b/include/osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp new file mode 100644 index 00000000..acab210f --- /dev/null +++ b/include/osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp @@ -0,0 +1,70 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +*/ + +#pragma once + +#include "osp/bsp/model/cost/CostModelHelpers.hpp" +#include "osp/concepts/computational_dag_concept.hpp" +#include + +namespace osp { + +/** + * @struct TotalLambdaCommunicationCost + * @brief Implements the total lambda communication cost model. + */ +template +struct TotalLambdaCommunicationCost { + + using cost_type = double; + + cost_type operator()(const BspSchedule &schedule) const { + const auto &instance = schedule.getInstance(); + const auto &node_to_processor_assignment = schedule.assignedProcessors(); + + v_commw_t comm_costs = 0; + const double comm_multiplier = 1.0 / instance.numberOfProcessors(); + + for (const auto &v : instance.vertices()) { + if (instance.getComputationalDag().out_degree(v) == 0) + continue; + + std::unordered_set target_procs; + for (const auto &target : instance.getComputationalDag().children(v)) { + target_procs.insert(node_to_processor_assignment[target]); + } + + const unsigned source_proc = node_to_processor_assignment[v]; + const auto v_comm_cost = instance.getComputationalDag().vertex_comm_weight(v); + + for (const auto &target_proc : target_procs) { + comm_costs += v_comm_cost * instance.sendCosts(source_proc, target_proc); + } + } + + const unsigned number_of_supersteps = schedule.numberOfSupersteps(); + + auto comm_cost = comm_costs * comm_multiplier * static_cast(instance.communicationCosts()); + auto work_cost = cost_helpers::compute_work_costs(schedule); + auto sync_cost = static_cast>(number_of_supersteps > 1 ? 
number_of_supersteps - 1 : 0) * instance.synchronisationCosts(); + + return comm_cost + static_cast(work_cost) + static_cast(sync_cost); + } +}; + +} // namespace osp diff --git a/include/osp/bsp/scheduler/GreedySchedulers/BspToMaxBspConverter.hpp b/include/osp/bsp/scheduler/GreedySchedulers/BspToMaxBspConverter.hpp index 98c3ec2c..363f5c85 100644 --- a/include/osp/bsp/scheduler/GreedySchedulers/BspToMaxBspConverter.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/BspToMaxBspConverter.hpp @@ -42,8 +42,8 @@ class GreedyBspToMaxBspConverter { std::vector>>> createSuperstepLists(const BspScheduleCS& schedule, std::vector& priorities) const; public: - - MaxBspSchedule Convert(const BspSchedule& schedule) const; + + MaxBspSchedule Convert(const BspSchedule& schedule) const; MaxBspScheduleCS Convert(const BspScheduleCS& schedule) const; }; @@ -106,7 +106,7 @@ MaxBspScheduleCS GreedyBspToMaxBspConverter::Convert(const Bsp std::vector> newly_freed_comm_steps; std::vector send_sum_of_newly_free_on_proc(schedule.getInstance().numberOfProcessors(), 0), rec_sum_of_newly_free_on_proc(schedule.getInstance().numberOfProcessors(), 0); - + std::vector> comm_in_current_step; std::vector send_on_proc(schedule.getInstance().numberOfProcessors(), 0), @@ -118,10 +118,10 @@ MaxBspScheduleCS GreedyBspToMaxBspConverter::Convert(const Bsp // I. 
Select the next node (from any proc) with highest priority unsigned chosen_proc = schedule.getInstance().numberOfProcessors(); double best_prio = std::numeric_limits::max(); - + for(unsigned proc = 0; proc < schedule.getInstance().numberOfProcessors(); ++proc) { - if(!proc_list[proc][step].empty() && (chosen_proc == schedule.getInstance().numberOfProcessors() || + if(!proc_list[proc][step].empty() && (chosen_proc == schedule.getInstance().numberOfProcessors() || priorities[proc_list[proc][step].front()] < best_prio)) { chosen_proc = proc; @@ -241,7 +241,7 @@ MaxBspScheduleCS GreedyBspToMaxBspConverter::Convert(const Bsp for(const std::pair& entry : newly_freed_comm_steps) free_comm_steps_for_superstep[step].insert(entry); - + if(free_comm_steps_for_superstep[step].empty()) continue; @@ -256,8 +256,8 @@ MaxBspScheduleCS GreedyBspToMaxBspConverter::Convert(const Bsp send_on_proc.resize(schedule.getInstance().numberOfProcessors(), 0); rec_on_proc.clear(); rec_on_proc.resize(schedule.getInstance().numberOfProcessors(), 0); - - std::set> late_arriving_nodes; + + std::set> late_arriving_nodes; for(const std::pair& entry : free_comm_steps_for_superstep[step]) { schedule_max.addCommunicationScheduleEntry(entry.first, current_step - 1); @@ -293,8 +293,8 @@ MaxBspScheduleCS GreedyBspToMaxBspConverter::Convert(const Bsp max_comm_together = std::max(max_comm_together, rec_on_proc[proc]); } - cost_type work_limit = max_comm_after; - if(max_comm_together + max_work_done <= max_comm_after + std::max(max_work_done, max_comm_current + schedule.getInstance().getArchitecture().synchronisationCosts())) + cost_type work_limit = max_comm_after; + if(max_comm_together + max_work_done <= max_comm_after + std::max(max_work_done, max_comm_current) + schedule.getInstance().getArchitecture().synchronisationCosts()) { work_limit = max_comm_together; for(const std::pair& entry : comm_in_current_step) @@ -320,13 +320,13 @@ MaxBspScheduleCS GreedyBspToMaxBspConverter::Convert(const Bsp continue; 
bool has_dependency = false; - + for (const vertex_idx &parent : dag.parents(node)) { if(schedule.assignedProcessor(node) != schedule.assignedProcessor(parent) && late_arriving_nodes.find(std::make_pair(parent, proc)) != late_arriving_nodes.end()) has_dependency = true; - + if(schedule.assignedProcessor(node) == schedule.assignedProcessor(parent) && schedule.assignedSuperstep(parent) == step + 1 && brought_forward.find(parent) == brought_forward.end()) @@ -341,7 +341,7 @@ MaxBspScheduleCS GreedyBspToMaxBspConverter::Convert(const Bsp schedule_max.setAssignedSuperstep(node, current_step); work_remaining_proc_superstep[proc][step+1] -= dag.vertex_work_weight(node); --nodes_remaining_superstep[step+1]; - + for(const std::pair& entry : dependent_comm_steps_for_node[node]) free_comm_steps_for_superstep[step+1].insert(entry); } @@ -350,7 +350,7 @@ MaxBspScheduleCS GreedyBspToMaxBspConverter::Convert(const Bsp for(vertex_idx node : proc_list[proc][step+1]) if(brought_forward.find(node) == brought_forward.end()) remaining.push_back(node); - + proc_list[proc][step+1] = remaining; } diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp index 7aff3997..890f779c 100644 --- a/include/osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp @@ -18,15 +18,14 @@ limitations under the License. 
#pragma once -#include "osp/bsp/model/BspScheduleCostEvaluator.hpp" +#include "osp/bsp/model/cost/LazyCommunicationCost.hpp" #include "osp/bsp/scheduler/Scheduler.hpp" #include "osp/bsp/scheduler/Serial.hpp" -#include #include +#include namespace osp { - /** * @class GreedyMetaScheduler * @brief The GreedyMetaScheduler class represents a meta-scheduler that selects the best schedule produced from a list of @@ -35,18 +34,21 @@ namespace osp { * This class inherits from the Scheduler class and implements the computeSchedule() and getScheduleName() methods. * The computeSchedule() method iterates through a list of schedulers, computes a schedule using each one, * and returns the schedule with the minimum cost. + * + * @tparam Graph_t The graph type representing the computational DAG. + * @tparam CostModel The cost model functor to evaluate schedules. Defaults to LazyCommunicationCost. */ -template +template> class GreedyMetaScheduler : public Scheduler { Serial serial_scheduler_; - std::vector*> schedulers_; + std::vector *> schedulers_; static constexpr bool verbose = false; public: /** - * @brief Default constructor for MetaScheduler. + * @brief Default constructor for GreedyMetaScheduler. */ GreedyMetaScheduler() : Scheduler() {} @@ -56,32 +58,33 @@ class GreedyMetaScheduler : public Scheduler { ~GreedyMetaScheduler() override = default; void addSerialScheduler() { schedulers_.push_back(&serial_scheduler_); } - void addScheduler(Scheduler & s) { schedulers_.push_back(&s); } + void addScheduler(Scheduler &s) { schedulers_.push_back(&s); } void resetScheduler() { schedulers_.clear(); } RETURN_STATUS computeSchedule(BspSchedule &schedule) override { if (schedule.getInstance().getArchitecture().numberOfProcessors() == 1) { - if constexpr (verbose) std::cout << "Using serial scheduler for P=1." << std::endl; + if constexpr (verbose) + std::cout << "Using serial scheduler for P=1." 
<< std::endl; serial_scheduler_.computeSchedule(schedule); return RETURN_STATUS::OSP_SUCCESS; } - v_workw_t best_schedule_cost = std::numeric_limits>::max(); + v_workw_t best_schedule_cost = std::numeric_limits>::max(); BspSchedule current_schedule(schedule.getInstance()); - for (Scheduler* scheduler : schedulers_) { + for (Scheduler *scheduler : schedulers_) { scheduler->computeSchedule(current_schedule); - BspScheduleCostEvaluator evaluator(current_schedule); - const v_workw_t schedule_cost = evaluator.computeCosts(); + const v_workw_t schedule_cost = CostModel()(current_schedule); - if constexpr (verbose) std::cout << "Executed scheduler " << scheduler->getScheduleName() << ", costs: " << schedule_cost << ", nr. supersteps: " << current_schedule.numberOfSupersteps() << std::endl; + if constexpr (verbose) + std::cout << "Executed scheduler " << scheduler->getScheduleName() << ", costs: " << schedule_cost << ", nr. supersteps: " << current_schedule.numberOfSupersteps() << std::endl; if (schedule_cost < best_schedule_cost) { best_schedule_cost = schedule_cost; schedule = current_schedule; - if constexpr (verbose) std::cout << "New best schedule!" << std::endl; + if constexpr (verbose) + std::cout << "New best schedule!" << std::endl; } - } return RETURN_STATUS::OSP_SUCCESS; diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp index 56d66316..38fae9ff 100644 --- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp @@ -18,7 +18,6 @@ limitations under the License. #pragma once - #include #include #include @@ -30,8 +29,8 @@ limitations under the License. 
// #define TIME_THREADS_GROW_LOCAL_PARALLEL #ifdef TIME_THREADS_GROW_LOCAL_PARALLEL - #include - #include +#include +#include #endif #include "osp/auxiliary/misc.hpp" @@ -46,7 +45,7 @@ struct GrowLocalAutoCoresParallel_Params { weight_t syncCostMultiplierMinSuperstepWeight = 1; weight_t syncCostMultiplierParallelCheck = 4; - unsigned numThreads = 0; // 0 for auto + unsigned numThreads = 0; // 0 for auto unsigned maxNumThreads = UINT_MAX; // used when auto num threads }; @@ -82,7 +81,7 @@ class GrowLocalAutoCoresParallel : public Scheduler { */ virtual ~GrowLocalAutoCoresParallel() = default; - void computePartialSchedule(BspSchedule &schedule, const std::vector &topOrder, const std::vector &posInTopOrder, const VertexType startNode, const VertexType endNode, unsigned &supstep) const { + void computePartialSchedule(BspSchedule &schedule, const std::vector &topOrder, const std::vector &posInTopOrder, const VertexType startNode, const VertexType endNode, unsigned &supstep) const { #ifdef TIME_THREADS_GROW_LOCAL_PARALLEL double startTime = omp_get_wtime(); @@ -107,7 +106,7 @@ class GrowLocalAutoCoresParallel : public Scheduler { if constexpr (has_children_in_vertex_order_v) { for (VertexType vert = startNode; vert < endNode; ++vert) { for (const VertexType &chld : graph.children(vert)) { - if ( chld >= endNode ) { + if (chld >= endNode) { break; } ++predec[chld - startNode]; @@ -116,7 +115,7 @@ class GrowLocalAutoCoresParallel : public Scheduler { } else { for (VertexType vert = startNode; vert < endNode; ++vert) { for (const VertexType &chld : graph.children(vert)) { - if ( chld < endNode ) { + if (chld < endNode) { ++predec[chld - startNode]; } } @@ -127,39 +126,36 @@ class GrowLocalAutoCoresParallel : public Scheduler { VertexType vert = topOrder[index]; for (const VertexType &par : graph.parents(vert)) { VertexType posPar = posInTopOrder[par]; - if ( posPar >= startNode ) { + if (posPar >= startNode) { ++predec[index - startNode]; } } } } - - - for(VertexType 
nodePos = startNode; nodePos < endNode; nodePos++) { + for (VertexType nodePos = startNode; nodePos < endNode; nodePos++) { VertexType index = nodePos - startNode; if (predec[index] == 0) { if constexpr (has_vertices_in_top_order_v) { - ready.insert( nodePos ); + ready.insert(nodePos); } else { - ready.insert( topOrder[nodePos] ); + ready.insert(topOrder[nodePos]); } } } - std::vector> new_assignments(P); std::vector> best_new_assignments(P); - + const v_workw_t minWeightParallelCheck = params.syncCostMultiplierParallelCheck * instance.synchronisationCosts(); const v_workw_t minSuperstepWeight = params.syncCostMultiplierMinSuperstepWeight * instance.synchronisationCosts(); - + double desiredParallelism = static_cast(P); - + VertexType total_assigned = 0; supstep = 0; - while(total_assigned < N) { + while (total_assigned < N) { VertexType limit = params.minSuperstepSize; double best_score = 0; double best_parallelism = 0; @@ -169,7 +165,7 @@ class GrowLocalAutoCoresParallel : public Scheduler { bool continueSuperstepAttempts = true; - while(continueSuperstepAttempts) { + while (continueSuperstepAttempts) { for (unsigned p = 0; p < P; p++) { new_assignments[p].clear(); } @@ -178,21 +174,20 @@ class GrowLocalAutoCoresParallel : public Scheduler { for (unsigned p = 0; p < P; p++) { procReady[p].clear(); } - + readyIter = ready.begin(); VertexType new_total_assigned = 0; v_workw_t weight_limit = 0; v_workw_t total_weight_assigned = 0; - // Processor 0 - while(new_assignments[0].size() < limit) { + while (new_assignments[0].size() < limit) { VertexType chosen_node = std::numeric_limits::max(); - if(!procReady[0].empty()) { + if (!procReady[0].empty()) { chosen_node = *procReady[0].begin(); procReady[0].erase(procReady[0].begin()); - } else if( readyIter != ready.end() ) { + } else if (readyIter != ready.end()) { chosen_node = *readyIter; readyIter++; } else { @@ -221,9 +216,9 @@ class GrowLocalAutoCoresParallel : public Scheduler { } } - if ( 
schedule.assignedProcessor(succ) == UINT_MAX ) { + if (schedule.assignedProcessor(succ) == UINT_MAX) { schedule.setAssignedProcessor(succ, 0); - } else if ( schedule.assignedProcessor(succ) != 0 ) { + } else if (schedule.assignedProcessor(succ) != 0) { schedule.setAssignedProcessor(succ, P); } @@ -235,8 +230,8 @@ class GrowLocalAutoCoresParallel : public Scheduler { } --predec[succIndex]; - if(predec[succIndex] == 0) { - if( schedule.assignedProcessor(succ) == 0 ) { + if (predec[succIndex] == 0) { + if (schedule.assignedProcessor(succ) == 0) { procReady[0].insert(succ); } else { futureReady.push_back(succ); @@ -244,23 +239,22 @@ class GrowLocalAutoCoresParallel : public Scheduler { } } } - - total_weight_assigned += weight_limit; - + total_weight_assigned += weight_limit; // Processors 1 through P-1 - for(unsigned proc = 1; proc < P; ++proc) { + for (unsigned proc = 1; proc < P; ++proc) { v_workw_t current_weight_assigned = 0; - while(current_weight_assigned < weight_limit) { + while (current_weight_assigned < weight_limit) { VertexType chosen_node = std::numeric_limits::max(); - if(!procReady[proc].empty()) { + if (!procReady[proc].empty()) { chosen_node = *procReady[proc].begin(); procReady[proc].erase(procReady[proc].begin()); - } else if( readyIter != ready.end() ) { + } else if (readyIter != ready.end()) { chosen_node = *readyIter; readyIter++; - } else break; + } else + break; new_assignments[proc].push_back(chosen_node); schedule.setAssignedProcessor(chosen_node, proc); @@ -284,9 +278,9 @@ class GrowLocalAutoCoresParallel : public Scheduler { } } - if ( schedule.assignedProcessor(succ) == UINT_MAX ) { + if (schedule.assignedProcessor(succ) == UINT_MAX) { schedule.setAssignedProcessor(succ, proc); - } else if ( schedule.assignedProcessor(succ) != proc ) { + } else if (schedule.assignedProcessor(succ) != proc) { schedule.setAssignedProcessor(succ, P); } @@ -298,8 +292,8 @@ class GrowLocalAutoCoresParallel : public Scheduler { } --predec[succIndex]; - 
if(predec[succIndex] == 0) { - if( schedule.assignedProcessor(succ) == proc ) { + if (predec[succIndex] == 0) { + if (schedule.assignedProcessor(succ) == proc) { procReady[proc].insert(succ); } else { futureReady.push_back(succ); @@ -308,20 +302,18 @@ class GrowLocalAutoCoresParallel : public Scheduler { } } - weight_limit = std::max(weight_limit, current_weight_assigned); total_weight_assigned += current_weight_assigned; } bool accept_step = false; - double score = static_cast(total_weight_assigned) / static_cast( weight_limit + instance.synchronisationCosts() ); + double score = static_cast(total_weight_assigned) / static_cast(weight_limit + instance.synchronisationCosts()); double parallelism = 0; if (weight_limit > 0) { parallelism = static_cast(total_weight_assigned) / static_cast(weight_limit); } - if (score > 0.97 * best_score) { // It is possible to make this less strict, i.e. score > 0.98 * best_score. The purpose of this would be to encourage larger supersteps. best_score = std::max(best_score, score); best_parallelism = parallelism; @@ -338,25 +330,25 @@ class GrowLocalAutoCoresParallel : public Scheduler { if (weight_limit <= minSuperstepWeight) { continueSuperstepAttempts = true; - if(total_assigned + new_total_assigned == N) { + if (total_assigned + new_total_assigned == N) { accept_step = true; continueSuperstepAttempts = false; } } - if(total_assigned + new_total_assigned == N) { + if (total_assigned + new_total_assigned == N) { continueSuperstepAttempts = false; } // undo proc assingments and predec increases in any case - for(unsigned proc = 0; proc < P; ++proc) { - for(const VertexType &node : new_assignments[proc]) { + for (unsigned proc = 0; proc < P; ++proc) { + for (const VertexType &node : new_assignments[proc]) { schedule.setAssignedProcessor(node, UINT_MAX); } } - for(unsigned proc = 0; proc < P; ++proc) { - for(const VertexType &node : new_assignments[proc]) { + for (unsigned proc = 0; proc < P; ++proc) { + for (const VertexType &node : 
new_assignments[proc]) { for (const VertexType &succ : graph.children(node)) { if constexpr (has_vertices_in_top_order_v) { if constexpr (has_children_in_vertex_order_v) { @@ -386,8 +378,8 @@ class GrowLocalAutoCoresParallel : public Scheduler { } } - for(unsigned proc = 0; proc < P; ++proc) { - for(const VertexType &node : new_assignments[proc]) { + for (unsigned proc = 0; proc < P; ++proc) { + for (const VertexType &node : new_assignments[proc]) { for (const VertexType &succ : graph.children(node)) { if constexpr (has_vertices_in_top_order_v) { if constexpr (has_children_in_vertex_order_v) { @@ -410,7 +402,7 @@ class GrowLocalAutoCoresParallel : public Scheduler { } } - if(accept_step) { + if (accept_step) { best_new_assignments.swap(new_assignments); best_futureReady.swap(futureReady); best_procReady.swap(procReady); @@ -418,20 +410,20 @@ class GrowLocalAutoCoresParallel : public Scheduler { } limit++; - limit += ( limit / 2 ); + limit += (limit / 2); } // apply best iteration ready.erase(ready.begin(), bestReadyIter); ready.insert(best_futureReady.begin(), best_futureReady.end()); for (unsigned proc = 0; proc < P; proc++) { - ready.merge( best_procReady[proc] ); + ready.merge(best_procReady[proc]); } - for(unsigned proc = 0; proc < P; ++proc) { - for(const VertexType &node : best_new_assignments[proc]) { + for (unsigned proc = 0; proc < P; ++proc) { + for (const VertexType &node : best_new_assignments[proc]) { schedule.setAssignedProcessor(node, proc); - schedule.setAssignedSuperstep_noUpdateNumSuperstep(node, supstep); + schedule.setAssignedSuperstepNoUpdateNumSuperstep(node, supstep); ++total_assigned; for (const VertexType &succ : graph.children(node)) { @@ -478,16 +470,16 @@ class GrowLocalAutoCoresParallel : public Scheduler { std::cout << outputString; #endif } - + void incrementScheduleSupersteps(BspSchedule &schedule, const VertexType startNode, const VertexType endNode, const unsigned incr) const { for (VertexType node = startNode; node < endNode; 
node++) { - schedule.setAssignedSuperstep_noUpdateNumSuperstep(node, schedule.assignedSuperstep(node) + incr); + schedule.setAssignedSuperstepNoUpdateNumSuperstep(node, schedule.assignedSuperstep(node) + incr); } } void incrementScheduleSupersteps_TopOrder(BspSchedule &schedule, const std::vector &topOrder, const VertexType startIndex, const VertexType endIndex, const unsigned incr) const { for (VertexType index = startIndex; index < endIndex; index++) { const VertexType node = topOrder[index]; - schedule.setAssignedSuperstep_noUpdateNumSuperstep(node, schedule.assignedSuperstep(node) + incr); + schedule.setAssignedSuperstepNoUpdateNumSuperstep(node, schedule.assignedSuperstep(node) + incr); } } @@ -495,10 +487,10 @@ class GrowLocalAutoCoresParallel : public Scheduler { const BspInstance &instance = schedule.getInstance(); const Graph_t &graph = instance.getComputationalDag(); - + const VertexType N = instance.numberOfVertices(); - for (VertexType vert = 0; vert < N; ++vert ) { + for (VertexType vert = 0; vert < N; ++vert) { schedule.setAssignedProcessor(vert, UINT_MAX); } @@ -525,30 +517,30 @@ class GrowLocalAutoCoresParallel : public Scheduler { std::vector posInTopOrder; if constexpr (not has_vertices_in_top_order_v) { posInTopOrder = std::vector(graph.num_vertices()); - for (VertexType ind = 0; ind < static_cast( topOrder.size() ); ++ind) { - posInTopOrder[ topOrder[ind] ] = ind; + for (VertexType ind = 0; ind < static_cast(topOrder.size()); ++ind) { + posInTopOrder[topOrder[ind]] = ind; } } - #pragma omp parallel num_threads(numThreads) default(none) shared(schedule, topOrder, posInTopOrder, superstepsThread, supstepIncr, numThreads, startNodes, incr) +#pragma omp parallel num_threads(numThreads) default(none) shared(schedule, topOrder, posInTopOrder, superstepsThread, supstepIncr, numThreads, startNodes, incr) { - #pragma omp for schedule(static, 1) +#pragma omp for schedule(static, 1) for (unsigned thr = 0; thr < numThreads; thr++) { 
computePartialSchedule(schedule, topOrder, posInTopOrder, startNodes[thr], startNodes[thr + 1], superstepsThread[thr * UnsignedPadding]); } - #pragma omp master - { - for (unsigned thr = 0; thr < numThreads; thr++) { - supstepIncr[thr] = incr; - incr += superstepsThread[thr * UnsignedPadding]; +#pragma omp master + { + for (unsigned thr = 0; thr < numThreads; thr++) { + supstepIncr[thr] = incr; + incr += superstepsThread[thr * UnsignedPadding]; + } + // the value of incr is now the number of supersteps } - // the value of incr is now the number of supersteps - } - #pragma omp barrier +#pragma omp barrier - #pragma omp for schedule(static, 1) +#pragma omp for schedule(static, 1) for (unsigned thr = 0; thr < numThreads; thr++) { if constexpr (has_vertices_in_top_order_v) { incrementScheduleSupersteps(schedule, startNodes[thr], startNodes[thr + 1], supstepIncr[thr]); @@ -576,7 +568,7 @@ class GrowLocalAutoCoresParallel : public Scheduler { unsigned numThreads = params.numThreads; if (numThreads == 0) { // numThreads = static_cast(std::sqrt( static_cast((schedule.getInstance().numberOfVertices() / 1000000)))) + 1; - numThreads = static_cast(std::log2( static_cast((schedule.getInstance().numberOfVertices() / 1000)))) + 1; + numThreads = static_cast(std::log2(static_cast((schedule.getInstance().numberOfVertices() / 1000)))) + 1; } numThreads = std::min(numThreads, params.maxNumThreads); if (numThreads == 0) { diff --git a/include/osp/bsp/scheduler/IlpSchedulers/CoptCommScheduleOptimizer.hpp b/include/osp/bsp/scheduler/IlpSchedulers/CoptCommScheduleOptimizer.hpp index f3a66b70..1b8d72b9 100644 --- a/include/osp/bsp/scheduler/IlpSchedulers/CoptCommScheduleOptimizer.hpp +++ b/include/osp/bsp/scheduler/IlpSchedulers/CoptCommScheduleOptimizer.hpp @@ -38,13 +38,13 @@ class CoptCommScheduleOptimizer { static_assert(is_computational_dag_v, "CoptFullScheduler can only be used with computational DAGs."); - bool num_supersteps_can_change = true; + bool ignore_latency = false; 
unsigned int timeLimitSeconds = 600; protected: - VarArray superstep_used_var; + VarArray superstep_has_comm; VarArray max_comm_superstep_var; std::vector>> comm_processor_to_processor_superstep_node_var; @@ -67,7 +67,7 @@ class CoptCommScheduleOptimizer { virtual void setTimeLimitSeconds(unsigned int limit) { timeLimitSeconds = limit; } inline unsigned int getTimeLimitSeconds() const { return timeLimitSeconds; } - virtual void setNumSuperstepsCanChange(bool can_change_) { num_supersteps_can_change = can_change_; } + virtual void setIgnoreLatency(bool ignore_latency_) { ignore_latency = ignore_latency_; } }; @@ -110,7 +110,7 @@ bool CoptCommScheduleOptimizer::canShrinkResultingSchedule(unsigned num for (unsigned step = 0; step < number_of_supersteps - 1; step++) { - if (superstep_used_var[static_cast(step)].Get(COPT_DBLINFO_VALUE) <= 0.01) + if (superstep_has_comm[static_cast(step)].Get(COPT_DBLINFO_VALUE) <= 0.01) return true; } return false; @@ -187,13 +187,13 @@ void CoptCommScheduleOptimizer::setInitialSolution(BspScheduleCS(node)], 0); } - if(num_supersteps_can_change) + if(!ignore_latency) { std::vector comm_phase_used(num_supersteps, 0); for (auto const &[key, val] : cs) comm_phase_used[val] = 1; for (unsigned step = 0; step < num_supersteps; step++) - model.SetMipStart(superstep_used_var[static_cast(step)], comm_phase_used[step]); + model.SetMipStart(superstep_has_comm[static_cast(step)], comm_phase_used[step]); } std::vector>> send(num_supersteps, std::vector>(num_processors, 0)); @@ -227,8 +227,8 @@ void CoptCommScheduleOptimizer::setupVariablesConstraintsObjective(cons const unsigned num_vertices = static_cast(schedule.getInstance().numberOfVertices()); // variables indicating if superstep is used at all - if (num_supersteps_can_change) { - superstep_used_var = model.AddVars(static_cast(max_number_supersteps), COPT_BINARY, "superstep_used"); + if (!ignore_latency) { + superstep_has_comm = model.AddVars(static_cast(max_number_supersteps), COPT_BINARY, 
"superstep_has_comm"); } max_comm_superstep_var = model.AddVars(static_cast(max_number_supersteps), COPT_INTEGER, "max_comm_superstep"); @@ -250,7 +250,7 @@ void CoptCommScheduleOptimizer::setupVariablesConstraintsObjective(cons } } - if (num_supersteps_can_change) { + if (!ignore_latency) { unsigned M = num_processors * num_processors * num_vertices; for (unsigned int step = 0; step < schedule.numberOfSupersteps(); step++) { @@ -269,7 +269,7 @@ void CoptCommScheduleOptimizer::setupVariablesConstraintsObjective(cons } } - model.AddConstr(expr <= M * superstep_used_var[static_cast(step)]); + model.AddConstr(expr <= M * superstep_has_comm[static_cast(step)]); } } // precedence constraint: if task is computed then all of its predecessors must have been present @@ -356,11 +356,11 @@ void CoptCommScheduleOptimizer::setupVariablesConstraintsObjective(cons */ Expr expr; - if (num_supersteps_can_change) { + if (!ignore_latency) { for (unsigned int step = 0; step < max_number_supersteps; step++) { expr += schedule.getInstance().communicationCosts() * max_comm_superstep_var[static_cast(step)] + - schedule.getInstance().synchronisationCosts() * superstep_used_var[static_cast(step)]; + schedule.getInstance().synchronisationCosts() * superstep_has_comm[static_cast(step)]; } } else { diff --git a/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp b/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp index 79e22c54..aa199c45 100644 --- a/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp +++ b/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp @@ -62,17 +62,17 @@ class CoptFullScheduler : public Scheduler { private: bool allow_recomputation; - bool is_max_bsp = false; bool use_memory_constraint; + bool use_initial_schedule_recomp = false; bool use_initial_schedule = false; bool write_solutions_found; - bool use_initial_schedule_recomp = false; + bool is_max_bsp = false; unsigned timeLimitSeconds = 0; const BspScheduleCS 
*initial_schedule; const BspScheduleRecomp *initial_schedule_recomp; - + std::string write_solutions_path; std::string solution_file_prefix; @@ -172,7 +172,7 @@ class CoptFullScheduler : public Scheduler { return schedule; } - + BspScheduleRecomp constructBspScheduleRecompFromCallback() { unsigned number_of_supersteps = 0; @@ -262,6 +262,7 @@ class CoptFullScheduler : public Scheduler { if(is_max_bsp && number_of_supersteps>0) // can ignore last 2 comm phases in this case --number_of_supersteps; + schedule.getCommunicationSchedule().clear(); for (const auto &node : instance.vertices()) { for (unsigned int p_from = 0; p_from < instance.numberOfProcessors(); p_from++) { @@ -310,6 +311,7 @@ class CoptFullScheduler : public Scheduler { } } + schedule.getCommunicationSchedule().clear(); for (unsigned int node = 0; node < schedule.getInstance().numberOfVertices(); node++) { for (unsigned int p_from = 0; p_from < schedule.getInstance().numberOfProcessors(); p_from++) { @@ -405,11 +407,11 @@ class CoptFullScheduler : public Scheduler { } else { - first_at[node][initial_schedule->assignedProcessor(node)] = std::min(first_at[node][initial_schedule->assignedProcessor(node)], + first_at[node][initial_schedule->assignedProcessor(node)] = std::min(first_at[node][initial_schedule->assignedProcessor(node)], initial_schedule->assignedSuperstep(node) ); } } - + unsigned staleness = is_max_bsp ? 
2 : 1; for (const auto &node : DAG.vertices()) { @@ -452,10 +454,10 @@ class CoptFullScheduler : public Scheduler { [static_cast(node)], 1); else model.SetMipStart(comm_processor_to_processor_superstep_node_var[proc][proc][step] - [static_cast(node)], 0); + [static_cast(node)], 0); } - for (const auto &node : DAG.vertices()) { + for (const auto &node : DAG.vertices()) { for (unsigned proc = 0; proc < num_processors; proc++) { @@ -548,10 +550,19 @@ class CoptFullScheduler : public Scheduler { // variables indicating if superstep is used at all superstep_used_var = model.AddVars(static_cast(max_number_supersteps), COPT_BINARY, "superstep_used"); + VarArray superstep_has_comm, mergeable_superstep_penalty; + if(is_max_bsp) + { + // variables indicating if there is any communication in superstep + superstep_has_comm = model.AddVars(static_cast(max_number_supersteps), COPT_BINARY, "superstep_has_comm"); + // variables that incentivize the schedule to be continuous - needs to be done differently for maxBsp + mergeable_superstep_penalty = model.AddVars(static_cast(max_number_supersteps), COPT_BINARY, "mergeable_superstep_penalty"); + } + + // variables for assigments of nodes to processor and superstep node_to_processor_superstep_var = std::vector>( instance.numberOfVertices(), std::vector(instance.numberOfProcessors())); - // variables for assigments of nodes to processor and superstep for (const auto &node : instance.vertices()) { for (unsigned int processor = 0; processor < instance.numberOfProcessors(); processor++) { @@ -598,7 +609,7 @@ class CoptFullScheduler : public Scheduler { } } model.AddConstr(expr <= static_cast(instance.numberOfVertices() * instance.numberOfProcessors()) * - superstep_used_var.GetVar(static_cast(step))); + superstep_used_var[static_cast(step)]); } // nodes are assigend depending on whether recomputation is allowed or not @@ -688,6 +699,29 @@ class CoptFullScheduler : public Scheduler { } } + // synchronization cost calculation & forcing 
continuous schedule in maxBsp + if(is_max_bsp) + { + for (unsigned int step = 0; step < max_number_supersteps; step++) { + Expr expr; + for (const auto &node : instance.vertices()) { + for (unsigned int p_from = 0; p_from < instance.numberOfProcessors(); p_from++) { + for (unsigned int p_to = 0; p_to < instance.numberOfProcessors(); p_to++) { + if(p_from != p_to) + expr += comm_processor_to_processor_superstep_node_var[p_from][p_to][step][static_cast(node)]; + } + } + } + model.AddConstr(static_cast(instance.numberOfProcessors() * instance.numberOfProcessors() * instance.numberOfVertices()) * + superstep_has_comm[static_cast(step)] >= expr); + } + + // if step i and (i+1) has no comm, and (i+2) has work, then (i+1) and (i+2) are mergeable -> penalize + for (unsigned int step = 0; step < max_number_supersteps - 2; step++) + model.AddConstr(superstep_used_var[static_cast(step + 2)] - superstep_has_comm[static_cast(step)] + - superstep_has_comm[static_cast(step + 1)] <= mergeable_superstep_penalty[static_cast(step)]); + } + max_comm_superstep_var = model.AddVars(static_cast(max_number_supersteps), COPT_INTEGER, "max_comm_superstep"); // coptModel.AddVars(max_number_supersteps, 0, COPT_INFINITY, 0, COPT_INTEGER, "max_comm_superstep"); @@ -770,10 +804,10 @@ class CoptFullScheduler : public Scheduler { model.AddConstr(max_superstep_var[static_cast(step)] >= max_work_superstep_var[static_cast(step)]); if(step > 0) model.AddConstr(max_superstep_var[static_cast(step)] >= instance.communicationCosts() * max_comm_superstep_var[static_cast(step-1)]); - expr += max_superstep_var[static_cast(step)]; + - instance.synchronisationCosts() * superstep_used_var[static_cast(step)]; + expr += max_superstep_var[static_cast(step)]; + expr += instance.synchronisationCosts() * superstep_has_comm[static_cast(step)]; + expr += instance.synchronisationCosts() * mergeable_superstep_penalty[static_cast(step)]; } - } else { @@ -782,9 +816,10 @@ class CoptFullScheduler : public Scheduler { 
instance.communicationCosts() * max_comm_superstep_var[static_cast(step)] + instance.synchronisationCosts() * superstep_used_var[static_cast(step)]; } + expr -= instance.synchronisationCosts(); } - model.SetObjective(expr - instance.synchronisationCosts(), COPT_MINIMIZE); + model.SetObjective(expr, COPT_MINIMIZE); } RETURN_STATUS run_scheduler(BspScheduleCS &schedule) { @@ -824,7 +859,7 @@ class CoptFullScheduler : public Scheduler { public: CoptFullScheduler(unsigned steps = 5) - : allow_recomputation(false), use_memory_constraint(false), use_initial_schedule(false), + : allow_recomputation(false), use_memory_constraint(false), use_initial_schedule(false), write_solutions_found(false), initial_schedule(0), max_number_supersteps(steps) { // solution_callback.comm_processor_to_processor_superstep_node_var_ptr = @@ -889,7 +924,7 @@ class CoptFullScheduler : public Scheduler { return status; } } - + virtual RETURN_STATUS computeMaxBspScheduleCS(MaxBspScheduleCS &schedule) { allow_recomputation = false; is_max_bsp = true; @@ -897,7 +932,7 @@ class CoptFullScheduler : public Scheduler { } - virtual RETURN_STATUS computeScheduleCS(BspScheduleCS &schedule) override { + virtual RETURN_STATUS computeScheduleCS(BspScheduleCS &schedule) override { allow_recomputation = false; is_max_bsp = false; return run_scheduler(schedule); @@ -942,7 +977,7 @@ class CoptFullScheduler : public Scheduler { }; virtual void computeScheduleBase(const BspScheduleRecomp &schedule, Model &model) { - + if (timeLimitSeconds > 0) { model.SetDblParam(COPT_DBLPARAM_TIMELIMIT, timeLimitSeconds); } @@ -1064,6 +1099,13 @@ class CoptFullScheduler : public Scheduler { */ inline unsigned getMaxNumberOfSupersteps() const { return max_number_supersteps; } + /** + * @brief Sets the time limit for the ILP solving. + * + * @param time_limit_seconds_ The time limit in seconds. 
+ */ + inline void setTimeLimitSeconds(unsigned time_limit_seconds_) { timeLimitSeconds = time_limit_seconds_; } + /** * @brief Get the name of the schedule. * diff --git a/include/osp/bsp/scheduler/IlpSchedulers/CoptPartialScheduler.hpp b/include/osp/bsp/scheduler/IlpSchedulers/CoptPartialScheduler.hpp index 10fa2243..db9a01f3 100644 --- a/include/osp/bsp/scheduler/IlpSchedulers/CoptPartialScheduler.hpp +++ b/include/osp/bsp/scheduler/IlpSchedulers/CoptPartialScheduler.hpp @@ -64,6 +64,8 @@ class CoptPartialScheduler { std::vector>> comm_processor_to_processor_superstep_node_var; std::vector> comm_to_processor_superstep_source_var; + bool has_fixed_comm_in_preceding_step; + void setupVariablesConstraintsObjective(const BspScheduleCS& schedule, Model& model); void setInitialSolution(const BspScheduleCS& schedule, Model &model); @@ -156,9 +158,9 @@ void CoptPartialScheduler::setInitialSolution(const BspScheduleCS(node_local_ID[node])], 1); - else + else model.SetMipStart(comm_processor_to_processor_superstep_node_var[p1][p2][step][static_cast(node_local_ID[node])], 0); } } @@ -178,7 +180,7 @@ void CoptPartialScheduler::setInitialSolution(const BspScheduleCS(source_local_ID[source])], 1); else if(step > 0) model.SetMipStart(comm_to_processor_superstep_source_var[proc][step][static_cast(source_local_ID[source])], 0); @@ -296,6 +298,8 @@ void CoptPartialScheduler::setupVariablesConstraintsObjective(const Bsp */ // variables indicating if superstep is used at all superstep_used_var = model.AddVars(static_cast(max_number_supersteps), COPT_BINARY, "superstep_used"); + VarArray superstep_has_comm = model.AddVars(static_cast(max_number_supersteps+1), COPT_BINARY, "superstep_has_comm"); + VarArray has_comm_at_end = model.AddVars(1, COPT_BINARY, "has_comm_at_end"); // variables for assigments of nodes to processor and superstep node_to_processor_superstep_var = std::vector>(num_vertices, std::vector(num_processors)); @@ -333,7 +337,7 @@ void 
CoptPartialScheduler::setupVariablesConstraintsObjective(const Bsp comm_to_processor_superstep_source_var[proc][step] = model.AddVars(static_cast(num_sources), COPT_BINARY, "comm_to_processor_superstep_source"); - + if(step < max_number_supersteps) present_on_processor_superstep_source_var[proc][step] = model.AddVars(static_cast(num_sources), COPT_BINARY, "present_on_processor_superstep_source"); @@ -357,29 +361,59 @@ void CoptPartialScheduler::setupVariablesConstraintsObjective(const Bsp model.AddConstr(superstep_used_var[static_cast(step)] >= superstep_used_var[static_cast(step + 1)]); } - // superstep is used at all - unsigned large_constant = static_cast(num_vertices+num_sources) * num_processors * num_processors * 2; + // check whether superstep is used at all (work or comm), and whether superstep has any communication at all + unsigned large_constant_work = static_cast(num_vertices) * num_processors; + unsigned large_constant_comm = static_cast(num_vertices+num_sources) * num_processors * num_processors + static_cast(fixed_comm_steps.size()); for (unsigned int step = 0; step < max_number_supersteps; step++) { - Expr expr; + Expr expr_work, expr_comm; for (vertex_idx_t node = 0; node < num_vertices; node++) { for (unsigned int processor = 0; processor < num_processors; processor++) { - expr += node_to_processor_superstep_var[node][processor][static_cast(step)]; - + expr_work += node_to_processor_superstep_var[node][processor][static_cast(step)]; + for (unsigned int p_other = 0; p_other < num_processors; p_other++) if(processor != p_other) - expr += comm_processor_to_processor_superstep_node_var[processor][p_other][step][static_cast(node)]; + expr_comm += comm_processor_to_processor_superstep_node_var[processor][p_other][step][static_cast(node)]; } } for (vertex_idx_t source = 0; source < num_sources; source++) for (unsigned int processor = 0; processor < num_processors; processor++) if(source_present_before.find(std::make_pair(source, processor)) == 
source_present_before.end()) - expr += comm_to_processor_superstep_source_var[processor][step+1][static_cast(source)]; + expr_comm += comm_to_processor_superstep_source_var[processor][step+1][static_cast(source)]; + + for (unsigned index = 0; index < fixed_comm_steps.size(); ++index) + if(std::get<3>(fixed_comm_steps[index]) == start_superstep + step) + expr_comm += keep_fixed_comm_step[static_cast(index)]; - model.AddConstr(expr <= large_constant * superstep_used_var[static_cast(step)]); + model.AddConstr(expr_comm <= large_constant_comm * superstep_has_comm[static_cast(step+1)]); + model.AddConstr(expr_work <= large_constant_work * superstep_used_var[static_cast(step)]); + model.AddConstr(superstep_has_comm[static_cast(step+1)] <= superstep_used_var[static_cast(step)]); } + // check communication usage in edge case: comm phase before the segment + if(has_fixed_comm_in_preceding_step) + model.AddConstr(superstep_has_comm[0] == 1); + else { + Expr expr_comm_0; + for (vertex_idx_t source = 0; source < num_sources; source++) + for (unsigned int processor = 0; processor < num_processors; processor++) + if(source_present_before.find(std::make_pair(source, processor)) == source_present_before.end()) + expr_comm_0 += comm_to_processor_superstep_source_var[processor][0][static_cast(source)]; + for (unsigned index = 0; index < fixed_comm_steps.size(); ++index) + expr_comm_0 += 1 - keep_fixed_comm_step[static_cast(index)]; + model.AddConstr(expr_comm_0 <= (static_cast(num_sources) * num_processors + static_cast(fixed_comm_steps.size())) * superstep_has_comm[0]); + } + + // check if there is any communication at the end of the subschedule + for (unsigned int step = 0; step < max_number_supersteps - 1; step++) + { + model.AddConstr(superstep_used_var[static_cast(step)] - superstep_used_var[static_cast(step + 1)] + + superstep_has_comm[static_cast(step+1)] - 1 <= has_comm_at_end[0]); + } + model.AddConstr(superstep_used_var[static_cast(max_number_supersteps - 1)] + + 
superstep_has_comm[static_cast(max_number_supersteps)] - 1 <= has_comm_at_end[0]); + // nodes are assigend for (vertex_idx_t node = 0; node < num_vertices; node++) { @@ -421,7 +455,7 @@ void CoptPartialScheduler::setupVariablesConstraintsObjective(const Bsp } } } - + // combines two constraints: node can only be communicated if it is present; and node is present if it was computed // or communicated for (unsigned int step = 0; step < max_number_supersteps; step++) { @@ -602,8 +636,10 @@ void CoptPartialScheduler::setupVariablesConstraintsObjective(const Bsp } expr += schedule.getInstance().communicationCosts() * max_comm_superstep_var[0]; + expr += schedule.getInstance().synchronisationCosts() * superstep_has_comm[0]; + expr += schedule.getInstance().synchronisationCosts() * has_comm_at_end[0]; - model.SetObjective(expr, COPT_MINIMIZE); + model.SetObjective(expr - schedule.getInstance().synchronisationCosts(), COPT_MINIMIZE); }; template @@ -671,12 +707,12 @@ void CoptPartialScheduler::setupVertexMaps(const BspScheduleCS for(unsigned proc2 = 0; proc2 < schedule.getInstance().numberOfProcessors(); ++proc2) { if(proc1 == proc2) - continue; + continue; auto itr = schedule.getCommunicationSchedule().find(std::make_tuple(source, proc1, proc2)); if (itr != schedule.getCommunicationSchedule().end() && itr->second > end_superstep) procs_needing_this.insert(schedule.assignedProcessor(proc1)); } - + for(unsigned proc : procs_needing_this) if(first_at[source][proc] >= start_superstep && first_at[source][proc] <= end_superstep + 1) source_needed_after_on_proc.emplace_back(source_and_ID.second, proc); @@ -692,12 +728,12 @@ void CoptPartialScheduler::setupVertexMaps(const BspScheduleCS for(unsigned proc1 = 0; proc1 < schedule.getInstance().numberOfProcessors(); ++proc1) for(unsigned proc2 = 0; proc2 < schedule.getInstance().numberOfProcessors(); ++proc2) - { + { auto itr = schedule.getCommunicationSchedule().find(std::make_tuple(node, proc1, proc2)); if (itr != 
schedule.getCommunicationSchedule().end() && proc1 != proc2 && itr->second > end_superstep) procs_needing_this.insert(schedule.assignedProcessor(proc1)); } - + for(unsigned proc : procs_needing_this) if(first_at[node][proc] <= end_superstep + 1) node_needed_after_on_proc.emplace_back(node_and_ID.second, proc); @@ -705,13 +741,18 @@ void CoptPartialScheduler::setupVertexMaps(const BspScheduleCS // comm steps that just happen to be in this interval, but not connected to the nodes within + has_fixed_comm_in_preceding_step = false; for (const auto &[key, val] : schedule.getCommunicationSchedule()) { vertex_idx_t source = std::get<0>(key); - if(source_local_ID.find(source) == source_local_ID.end() && + if(source_local_ID.find(source) == source_local_ID.end() && schedule.assignedSuperstep(source) < start_superstep && val >= start_superstep - 1 && val <= end_superstep) + { fixed_comm_steps.emplace_back(std::get<0>(key), std::get<1>(key), std::get<2>(key), val); + if(val == start_superstep - 1) + has_fixed_comm_in_preceding_step = true; + } } }; diff --git a/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp b/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp index 362b0744..5d759687 100644 --- a/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp +++ b/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp @@ -289,6 +289,8 @@ class TotalCommunicationScheduler : public Scheduler { protected: unsigned int max_number_supersteps; + unsigned time_limit_seconds; + VarArray superstep_used_var; std::vector> node_to_processor_superstep_var; std::vector> edge_vars; @@ -633,7 +635,7 @@ class TotalCommunicationScheduler : public Scheduler { virtual ~TotalCommunicationScheduler() = default; - virtual RETURN_STATUS computeScheduleWithTimeLimit(BspSchedule &schedule, unsigned timeout) override { + virtual RETURN_STATUS computeScheduleWithTimeLimit(BspSchedule &schedule, unsigned timeout) { 
model.SetDblParam(COPT_DBLPARAM_TIMELIMIT, timeout); return computeSchedule(schedule); } @@ -668,7 +670,7 @@ class TotalCommunicationScheduler : public Scheduler { loadInitialSchedule(); } - + model.SetIntParam(COPT_INTPARAM_THREADS, 128); model.SetIntParam(COPT_INTPARAM_STRONGBRANCHING, 1); model.SetIntParam(COPT_INTPARAM_LPMETHOD, 1); @@ -846,6 +848,13 @@ class TotalCommunicationScheduler : public Scheduler { */ inline double bestBound() { return model.GetDblAttr(COPT_DBLATTR_BESTBND); } + /** + * @brief Sets the time limit for the ILP solving. + * + * @param time_limit_seconds_ The time limit in seconds. + */ + inline void setTimeLimitSeconds(unsigned time_limit_seconds_) { time_limit_seconds = time_limit_seconds_; } + /** * @brief Get the name of the schedule. * diff --git a/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp b/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp index f876229b..48a983a6 100644 --- a/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner */ #pragma once @@ -165,7 +165,7 @@ template RETURN_STATUS HillClimbingScheduler::improveScheduleWithStepLimit(BspSchedule &input_schedule, const unsigned stepLimit) { schedule = &input_schedule; - + CreateSupstepLists(); Init(); for (unsigned step = 0; step < stepLimit; ++step) @@ -208,7 +208,7 @@ void HillClimbingScheduler::Init() { else succSteps[node][schedule->assignedProcessor(succ)].at(schedule->assignedSuperstep(succ)) += 1; } - + // Cost data workCost.clear(); workCost.resize(M, std::vector(P, 0)); @@ -278,13 +278,12 @@ void HillClimbingScheduler::Init() { commCostPointer[step][proc] = commCostList[step].insert(entry).first; } cost_type comm_cost = schedule->getInstance().getArchitecture().communicationCosts() * commCostList[step].rbegin()->first; - if(comm_cost > 0) - comm_cost += schedule->getInstance().getArchitecture().synchronisationCosts(); - + cost_type sync_cost = (comm_cost > 0) ? schedule->getInstance().getArchitecture().synchronisationCosts() : 0; + if(schedule->getStaleness() == 1) - cost += comm_cost + work_cost[step+1]; + cost += comm_cost + work_cost[step+1] + sync_cost; else - cost += std::max(comm_cost, work_cost[step+1]); + cost += std::max(comm_cost, work_cost[step+1]) + sync_cost; } updatePromisingMoves(); @@ -320,7 +319,7 @@ void HillClimbingScheduler::updatePromisingMoves() for(unsigned proc=0; procassignedProcessor(node)!=proc && nrPredOnProc[proc]>0) ++otherProcUsed; - + if(otherProcUsed==1) for(unsigned proc=0; procassignedProcessor(node)!=proc && nrPredOnProc[proc]>0 && schedule->getInstance().isCompatible(node,proc)) @@ -381,7 +380,7 @@ void HillClimbingScheduler::updateNodeMovesEarlier(const vertex_idx nod if (schedule->assignedSuperstep(node) == 0) return; - std::set predProc; + std::set predProc; for (const vertex_idx &pred : schedule->getInstance().getComputationalDag().parents(node)) { if (schedule->assignedSuperstep(pred) == schedule->assignedSuperstep(node)) return; @@ -461,7 +460,7 @@ template void 
HillClimbingScheduler::updateMoveOptions(vertex_idx node, int where) { const Graph_t &G = schedule->getInstance().getComputationalDag(); - + updateNodeMoves(node); if(where==0) { @@ -632,7 +631,7 @@ int HillClimbingScheduler::moveCostChange(const vertex_idx node, unsign unsigned affectedStep = succSteps[node][j].begin()->first - schedule->getStaleness(); if (j == p) { - sentInc.emplace_back(affectedStep, oldProc, + sentInc.emplace_back(affectedStep, oldProc, -static_cast(schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(oldProc, j))); recInc.emplace_back(affectedStep, p, -static_cast(schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(oldProc, j))); } else if (j == oldProc) { @@ -738,8 +737,7 @@ int HillClimbingScheduler::moveCostChange(const vertex_idx node, unsign bool last_affected_empty = false; for (const unsigned sstep : affectedSteps) { cost_type oldMax = schedule->getInstance().getArchitecture().communicationCosts() * commCostList[sstep].rbegin()->first; - if(HCwithLatency && oldMax > 0) - oldMax += schedule->getInstance().getArchitecture().synchronisationCosts(); + cost_type oldSync = (HCwithLatency && oldMax > 0) ? schedule->getInstance().getArchitecture().synchronisationCosts() : 0; cost_type newMax = 0; for (unsigned j = 0; j < schedule->getInstance().getArchitecture().numberOfProcessors(); ++j) { @@ -755,9 +753,8 @@ int HillClimbingScheduler::moveCostChange(const vertex_idx node, unsign newMax = static_cast(static_cast(received[sstep][j]) + diff); } newMax *= schedule->getInstance().getArchitecture().communicationCosts(); - if(HCwithLatency && newMax > 0) - newMax += schedule->getInstance().getArchitecture().synchronisationCosts(); - + cost_type newSync = (HCwithLatency && newMax > 0) ? 
schedule->getInstance().getArchitecture().synchronisationCosts() : 0; + if(newMax == 0) { if(schedule->getStaleness() == 1) @@ -780,7 +777,7 @@ int HillClimbingScheduler::moveCostChange(const vertex_idx node, unsign oldMax = std::max(oldMax, workCostList[sstep+1].rbegin()->first); newMax = std::max(newMax, itrWork != newWorkCost.end() ? itrWork->second : workCostList[sstep+1].rbegin()->first); } - change += static_cast(newMax) - static_cast(oldMax); + change += static_cast(newMax + newSync) - static_cast(oldMax + oldSync); } changing.newCost = static_cast(static_cast(cost) + change); @@ -881,7 +878,7 @@ bool HillClimbingScheduler::Improve() { if(!canMove[static_cast(where)][node][proc]) continue; - + if(use_memory_constraint && violatesMemConstraint(node, proc, where-1)) continue; @@ -893,7 +890,7 @@ bool HillClimbingScheduler::Improve() { executeMove(node, proc, where-1, moveData); if(shrink && moveData.canShrink) Init(); - + return true; } @@ -966,13 +963,13 @@ bool HillClimbingScheduler::violatesMemConstraint(vertex_idx node, unsi if(memory_used[processor][static_cast(static_cast(schedule->assignedSuperstep(node))+where)] + schedule->getInstance().getComputationalDag().vertex_mem_weight(node) > schedule->getInstance().memoryBound(processor)) // TODO ANDRAS double check change return true; - + return false; } template void HillClimbingScheduler::CreateSupstepLists() { - + const unsigned P = schedule->getInstance().getArchitecture().numberOfProcessors(); const Graph_t &G = schedule->getInstance().getComputationalDag(); diff --git a/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing_for_comm_schedule.hpp b/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing_for_comm_schedule.hpp index fd9aa352..ba895b70 100644 --- a/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing_for_comm_schedule.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing_for_comm_schedule.hpp @@ -13,17 +13,17 @@ WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner */ #pragma once #include "osp/bsp/model/BspScheduleCS.hpp" -#include "osp/bsp/model/BspScheduleCostEvaluator.hpp" +#include "osp/bsp/model/cost/CostModelHelpers.hpp" #include "osp/bsp/scheduler/Scheduler.hpp" #include "osp/graph_algorithms/directed_graph_top_sort.hpp" -namespace osp{ +namespace osp { template class HillClimbingForCommSteps { @@ -44,8 +44,8 @@ class HillClimbingForCommSteps { std::vector> commSchedule; std::vector>> supsteplists; std::vector>> commCostList; - std::vector >::iterator>> commCostPointer; - std::vector > sent, received, commCost; + std::vector>::iterator>> commCostPointer; + std::vector> sent, received, commCost; std::vector>> commBounds; std::vector>>> commSchedSendLists; std::vector>::iterator>> commSchedSendListPointer; @@ -60,7 +60,7 @@ class HillClimbingForCommSteps { // Initialize data structures (based on current schedule) void Init(); - // compute cost change incurred by a potential move + // compute cost change incurred by a potential move int moveCostChange(vertex_idx node, unsigned p, unsigned step); // execute a move, updating the comm. 
schedule and the data structures @@ -79,11 +79,11 @@ class HillClimbingForCommSteps { virtual RETURN_STATUS improveSchedule(BspScheduleCS &input_schedule); - //call with time limit + // call with time limit virtual RETURN_STATUS improveScheduleWithTimeLimit(BspScheduleCS &input_schedule, const unsigned timeLimit); - //setting parameters - void setSteepestAscend(bool steepestAscent_) {steepestAscent = steepestAscent_;} + // setting parameters + void setSteepestAscend(bool steepestAscent_) { steepestAscent = steepestAscent_; } virtual std::string getScheduleName() const { return "HillClimbingForCommSchedule"; } }; @@ -100,11 +100,11 @@ RETURN_STATUS HillClimbingForCommSteps::improveScheduleWithTimeLimit(Bs schedule = &input_schedule; - if(schedule->numberOfSupersteps() <= 2) + if (schedule->numberOfSupersteps() <= 2) return RETURN_STATUS::OSP_SUCCESS; Init(); - //ConvertCommSchedule(); + // ConvertCommSchedule(); const std::chrono::steady_clock::time_point startTime = std::chrono::steady_clock::now(); unsigned counter = 0; @@ -119,14 +119,11 @@ RETURN_STATUS HillClimbingForCommSteps::improveScheduleWithTimeLimit(Bs } } - ConvertCommSchedule(); return RETURN_STATUS::OSP_SUCCESS; - } - // Initialization for comm. 
schedule hill climbing template void HillClimbingForCommSteps::Init() { @@ -136,7 +133,7 @@ void HillClimbingForCommSteps::Init() { const Graph_t &G = schedule->getInstance().getComputationalDag(); CreateSupstepLists(); - cost = schedule->computeCosts()-schedule->computeWorkCosts(); + cost = schedule->computeCosts(); nextSupstep = 0; commSchedule.clear(); @@ -154,13 +151,13 @@ void HillClimbingForCommSteps::Init() { commBounds.clear(); commBounds.resize(N, std::vector>(P)); commSchedSendLists.clear(); - commSchedSendLists.resize(M - 1, std::vector >>(P)); + commSchedSendLists.resize(M - 1, std::vector>>(P)); commSchedRecLists.clear(); - commSchedRecLists.resize(M - 1, std::vector >>(P)); + commSchedRecLists.resize(M - 1, std::vector>>(P)); commSchedSendListPointer.clear(); - commSchedSendListPointer.resize(N, std::vector >::iterator>(P)); + commSchedSendListPointer.resize(N, std::vector>::iterator>(P)); commSchedRecListPointer.clear(); - commSchedRecListPointer.resize(N, std::vector >::iterator>(P)); + commSchedRecListPointer.resize(N, std::vector>::iterator>(P)); // initialize to lazy comm schedule first - to make sure it's correct even if e.g. 
com scehdule has indirect sending for (unsigned step = 1; step < M; ++step) @@ -169,24 +166,22 @@ void HillClimbingForCommSteps::Init() { for (const vertex_idx &pred : G.parents(node)) if (schedule->assignedProcessor(pred) != schedule->assignedProcessor(node) && commSchedule[pred][schedule->assignedProcessor(node)] == UINT_MAX) { - commSchedule[pred][schedule->assignedProcessor(node)] = step - schedule->getStaleness(); - commBounds[pred][schedule->assignedProcessor(node)] = std::make_pair(schedule->assignedSuperstep(pred), step - schedule->getStaleness()); + commSchedule[pred][schedule->assignedProcessor(node)] = step - schedule->getStaleness(); + commBounds[pred][schedule->assignedProcessor(node)] = std::make_pair(schedule->assignedSuperstep(pred), step - schedule->getStaleness()); } // overwrite with original comm schedule, wherever possible const std::map, unsigned int> originalCommSchedule = schedule->getCommunicationSchedule(); - for(vertex_idx node = 0; node < N; ++node) - for (unsigned proc = 0; proc < P; ++proc) - { - if(commSchedule[node][proc] == UINT_MAX ) + for (vertex_idx node = 0; node < N; ++node) + for (unsigned proc = 0; proc < P; ++proc) { + if (commSchedule[node][proc] == UINT_MAX) continue; - + const auto comm_schedule_key = std::make_tuple(node, schedule->assignedProcessor(node), proc); auto mapIterator = originalCommSchedule.find(comm_schedule_key); - if (mapIterator != originalCommSchedule.end()) - { + if (mapIterator != originalCommSchedule.end()) { unsigned originalStep = mapIterator->second; - if(originalStep >= commBounds[node][proc].first && originalStep <= commBounds[node][proc].second) + if (originalStep >= commBounds[node][proc].first && originalStep <= commBounds[node][proc].second) commSchedule[node][proc] = originalStep; } @@ -197,30 +192,25 @@ void HillClimbingForCommSteps::Init() { commSchedRecLists[step][proc].emplace_front(node, proc); commSchedRecListPointer[node][proc] = commSchedRecLists[step][proc].begin(); - + 
sent[step][schedule->assignedProcessor(node)] += - schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(schedule->assignedProcessor(node), proc); + schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(schedule->assignedProcessor(node), proc); received[step][proc] += - schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(schedule->assignedProcessor(node), proc); - - + schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(schedule->assignedProcessor(node), proc); } - + for (unsigned step = 0; step < M - 1; ++step) - for (unsigned proc = 0; proc < P; ++proc) - { + for (unsigned proc = 0; proc < P; ++proc) { commCost[step][proc] = std::max(sent[step][proc], received[step][proc]); commCostPointer[step][proc] = commCostList[step].emplace(commCost[step][proc], proc).first; } // set minimum cost - differs for BSP and MaxBSP minimum_cost_per_superstep.clear(); - if(schedule->getStaleness() == 1) - minimum_cost_per_superstep.resize(M-1, 0); - else - { - BspScheduleCostEvaluator evaluator(*schedule); - minimum_cost_per_superstep = evaluator.compute_max_work_per_step_helper(); + if (schedule->getStaleness() == 1) + minimum_cost_per_superstep.resize(M - 1, 0); + else { + minimum_cost_per_superstep = cost_helpers::compute_max_work_per_step(*schedule); minimum_cost_per_superstep.erase(minimum_cost_per_superstep.begin()); } } @@ -234,13 +224,12 @@ int HillClimbingForCommSteps::moveCostChange(const vertex_idx node, con // Change at old place auto itr = commCostList[oldStep].rbegin(); - cost_type oldMax = std::max(itr->first * schedule->getInstance().getArchitecture().communicationCosts() - + schedule->getInstance().getArchitecture().synchronisationCosts(), minimum_cost_per_superstep[oldStep]); + 
cost_type oldMax = std::max(itr->first * schedule->getInstance().getArchitecture().communicationCosts(), minimum_cost_per_superstep[oldStep]) + schedule->getInstance().getArchitecture().synchronisationCosts(); cost_type maxSource = std::max(sent[oldStep][sourceProc] - schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p), received[oldStep][sourceProc]); cost_type maxTarget = std::max(sent[oldStep][p], - received[oldStep][p] - schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p)); + received[oldStep][p] - schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p)); cost_type maxOther = 0; for (; itr != commCostList[oldStep].rend(); ++itr) if (itr->second != sourceProc && itr->second != p) { @@ -249,23 +238,21 @@ int HillClimbingForCommSteps::moveCostChange(const vertex_idx node, con } cost_type newMax = std::max(std::max(maxSource, maxTarget), maxOther) * schedule->getInstance().getArchitecture().communicationCosts(); - if(newMax > 0) - newMax += schedule->getInstance().getArchitecture().synchronisationCosts(); - newMax = std::max(newMax, minimum_cost_per_superstep[oldStep]); + cost_type newSync = (newMax > 0) ? schedule->getInstance().getArchitecture().synchronisationCosts() : 0; + newMax = std::max(newMax, minimum_cost_per_superstep[oldStep]) + newSync; change += static_cast(newMax) - static_cast(oldMax); // Change at new place oldMax = commCostList[step].rbegin()->first * schedule->getInstance().getArchitecture().communicationCosts(); - if(oldMax > 0) - oldMax += schedule->getInstance().getArchitecture().synchronisationCosts(); + cost_type oldSync = (oldMax > 0) ? 
schedule->getInstance().getArchitecture().synchronisationCosts() : 0; oldMax = std::max(oldMax, minimum_cost_per_superstep[step]); - maxSource = schedule->getInstance().getArchitecture().synchronisationCosts() + schedule->getInstance().getArchitecture().communicationCosts() * - (sent[step][sourceProc] + schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p)); - maxTarget = schedule->getInstance().getArchitecture().synchronisationCosts() + schedule->getInstance().getArchitecture().communicationCosts() * - (received[step][p] + schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p)); + maxSource = schedule->getInstance().getArchitecture().communicationCosts() * + (sent[step][sourceProc] + schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p)); + maxTarget = schedule->getInstance().getArchitecture().communicationCosts() * + (received[step][p] + schedule->getInstance().getComputationalDag().vertex_comm_weight(node) * schedule->getInstance().getArchitecture().sendCosts(sourceProc, p)); newMax = std::max(std::max(oldMax, maxSource), maxTarget); - change += static_cast(newMax) - static_cast(oldMax); + change += static_cast(newMax + schedule->getInstance().getArchitecture().synchronisationCosts()) - static_cast(oldMax + oldSync); return change; } @@ -335,7 +322,7 @@ bool HillClimbingForCommSteps::Improve() { unsigned startingSupstep = nextSupstep; // iterate over supersteps - while(true) { + while (true) { auto itr = commCostList[nextSupstep].rbegin(); if (itr == commCostList[nextSupstep].crend()) @@ -343,10 +330,9 @@ bool HillClimbingForCommSteps::Improve() { // find maximal comm cost that dominates the h-relation const cost_type commMax = itr->first; - if (commMax == 0) - { - nextSupstep = (nextSupstep+1)%(M-1); - 
if(nextSupstep == startingSupstep) + if (commMax == 0) { + nextSupstep = (nextSupstep + 1) % (M - 1); + if (nextSupstep == startingSupstep) break; else continue; @@ -357,7 +343,7 @@ bool HillClimbingForCommSteps::Improve() { const unsigned maxProc = itr->second; if (sent[nextSupstep][maxProc] == commMax) - for (const std::pair& entry : commSchedSendLists[nextSupstep][maxProc]) { + for (const std::pair &entry : commSchedSendLists[nextSupstep][maxProc]) { const vertex_idx node = entry.first; const unsigned p = entry.second; // iterate over alternative supsteps to place this communication step @@ -380,7 +366,7 @@ bool HillClimbingForCommSteps::Improve() { } if (received[nextSupstep][maxProc] == commMax) - for (const std::pair& entry : commSchedRecLists[nextSupstep][maxProc]) { + for (const std::pair &entry : commSchedRecLists[nextSupstep][maxProc]) { const vertex_idx node = entry.first; const unsigned p = entry.second; // iterate over alternative supsteps to place this communication step @@ -404,8 +390,8 @@ bool HillClimbingForCommSteps::Improve() { } } - nextSupstep = (nextSupstep+1)%(M-1); - if(nextSupstep == startingSupstep) + nextSupstep = (nextSupstep + 1) % (M - 1); + if (nextSupstep == startingSupstep) break; } @@ -419,7 +405,7 @@ bool HillClimbingForCommSteps::Improve() { template void HillClimbingForCommSteps::CreateSupstepLists() { - + const unsigned P = schedule->getInstance().getArchitecture().numberOfProcessors(); const Graph_t &G = schedule->getInstance().getComputationalDag(); @@ -432,21 +418,18 @@ void HillClimbingForCommSteps::CreateSupstepLists() { const std::vector topOrder = GetTopOrder(G); for (vertex_idx node : topOrder) supsteplists[schedule->assignedSuperstep(node)][schedule->assignedProcessor(node)].push_back(node); - } template -void HillClimbingForCommSteps::ConvertCommSchedule() -{ +void HillClimbingForCommSteps::ConvertCommSchedule() { const vertex_idx N = static_cast(schedule->getInstance().getComputationalDag().num_vertices()); const 
unsigned P = schedule->getInstance().getArchitecture().numberOfProcessors(); std::map, unsigned> newCommSchedule; - for(vertex_idx node=0; node < N; ++node) - for(unsigned proc=0; proc < P; ++proc) - if(commSchedule[node][proc] != UINT_MAX) - { + for (vertex_idx node = 0; node < N; ++node) + for (unsigned proc = 0; proc < P; ++proc) + if (commSchedule[node][proc] != UINT_MAX) { const auto comm_schedule_key = std::make_tuple(node, schedule->assignedProcessor(node), proc); newCommSchedule[comm_schedule_key] = commSchedule[node][proc]; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8a6260bd..21fc5509 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -133,7 +133,7 @@ _add_test( bit_mask ) _add_test( filereader DATA ) -## scheduler +## scheduler if (COPT_FOUND) #_add_test( ilp_bsp_scheduler ) @@ -160,6 +160,8 @@ _add_test( cuthill_mckee ) _add_test( maxbsp_converter_and_hc ) +_add_test( cost_evaluation ) + ## pebbling ILPs if (COPT_FOUND) diff --git a/tests/bsp_schedule.cpp b/tests/bsp_schedule.cpp index 8d0a611d..0b587266 100644 --- a/tests/bsp_schedule.cpp +++ b/tests/bsp_schedule.cpp @@ -19,21 +19,25 @@ limitations under the License. 
#define BOOST_TEST_MODULE Bsp_Architecture #include +#include "osp/auxiliary/io/DotFileWriter.hpp" +#include "osp/auxiliary/io/arch_file_reader.hpp" +#include "osp/auxiliary/io/general_file_reader.hpp" +#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include "osp/bsp/model/BspInstance.hpp" #include "osp/bsp/model/BspSchedule.hpp" #include "osp/bsp/model/BspScheduleCS.hpp" +#include "osp/bsp/model/BspScheduleRecomp.hpp" #include "osp/bsp/model/MaxBspSchedule.hpp" #include "osp/bsp/model/MaxBspScheduleCS.hpp" -#include "osp/bsp/model/BspScheduleRecomp.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" -#include "osp/auxiliary/io/DotFileWriter.hpp" -#include "osp/auxiliary/io/arch_file_reader.hpp" -#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" -#include "osp/auxiliary/io/general_file_reader.hpp" #include #include +#include "osp/bsp/model/cost/BufferedSendingCost.hpp" +#include "osp/bsp/model/cost/LazyCommunicationCost.hpp" +#include "osp/bsp/model/cost/TotalCommunicationCost.hpp" +#include "osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp" #include "osp/bsp/scheduler/GreedySchedulers/BspLocking.hpp" #include "osp/bsp/scheduler/GreedySchedulers/CilkScheduler.hpp" #include "osp/bsp/scheduler/GreedySchedulers/EtfScheduler.hpp" @@ -70,7 +74,7 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) { BOOST_CHECK_EQUAL(instance.getComputationalDag().num_vertices(), 54); BOOST_CHECK_EQUAL(instance.getComputationalDag().num_vertex_types(), 1); - std::vector *> schedulers = {new BspLocking(), new EtfScheduler(), + std::vector *> schedulers = {new BspLocking(), new EtfScheduler(), new GreedyBspScheduler(), new GreedyChildren(), new GrowLocalAutoCores(), new VarianceFillup()}; @@ -93,8 +97,8 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) { BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); 
BOOST_CHECK_EQUAL(schedule.computeCosts(), expected_bsp_costs[i]); - BOOST_CHECK_EQUAL(schedule.computeTotalCosts(), expected_total_costs[i]); - BOOST_CHECK_EQUAL(schedule.computeBufferedSendingCosts(), expected_buffered_sending_costs[i]); + BOOST_CHECK_EQUAL(TotalCommunicationCost()(schedule), expected_total_costs[i]); + BOOST_CHECK_EQUAL(BufferedSendingCost()(schedule), expected_buffered_sending_costs[i]); BOOST_CHECK_EQUAL(schedule.numberOfSupersteps(), expected_supersteps[i]); BspScheduleCS schedule_cs(instance); @@ -118,7 +122,6 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) { BOOST_CHECK_EQUAL(RETURN_STATUS::OSP_SUCCESS, result); BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); BOOST_CHECK_EQUAL(schedule.numberOfSupersteps(), 1); - } BOOST_AUTO_TEST_CASE(test_schedule_writer) { @@ -230,7 +233,7 @@ BOOST_AUTO_TEST_CASE(test_bsp_schedule_cs) { } file_reader::readGraph((cwd / "data/spaa/tiny/instance_bicgstab.hdag").string(), - instance.getComputationalDag()); + instance.getComputationalDag()); BspSchedule schedule(instance); BspLocking scheduler; @@ -337,7 +340,7 @@ BOOST_AUTO_TEST_CASE(test_max_bsp_schedule) { BspInstance instance; instance.setNumberOfProcessors(2); - instance.setCommunicationCosts(10); // g=10 + instance.setCommunicationCosts(10); // g=10 instance.setSynchronisationCosts(100); // l=100 (not used in MaxBspSchedule cost model) auto &dag = instance.getComputationalDag(); @@ -419,7 +422,7 @@ BOOST_AUTO_TEST_CASE(test_max_bsp_schedule_cs) { BspInstance instance; instance.setNumberOfProcessors(2); - instance.setCommunicationCosts(10); // g=10 + instance.setCommunicationCosts(10); // g=10 instance.setSynchronisationCosts(100); // l=100 auto &dag = instance.getComputationalDag(); diff --git a/tests/cost_evaluation.cpp b/tests/cost_evaluation.cpp new file mode 100644 index 00000000..27f7660c --- /dev/null +++ b/tests/cost_evaluation.cpp @@ -0,0 +1,121 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +*/ + +#define BOOST_TEST_MODULE CostEvaluation +#include + +#include "osp/bsp/model/BspInstance.hpp" +#include "osp/bsp/model/BspSchedule.hpp" +#include "osp/bsp/model/cost/BufferedSendingCost.hpp" +#include "osp/bsp/model/cost/LazyCommunicationCost.hpp" +#include "osp/bsp/model/cost/TotalCommunicationCost.hpp" +#include "osp/bsp/model/cost/TotalLambdaCommunicationCost.hpp" +#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" + +using namespace osp; + +BOOST_AUTO_TEST_CASE(test_cost_models_simple_dag) { + + using graph = computational_dag_edge_idx_vector_impl_def_int_t; + + BspInstance instance; + instance.setNumberOfProcessors(2); + instance.setCommunicationCosts(10); + instance.setSynchronisationCosts(5); + + auto &dag = instance.getComputationalDag(); + dag.add_vertex(10, 1, 0); + dag.add_vertex(20, 2, 0); + dag.add_vertex(30, 3, 0); + dag.add_vertex(40, 4, 0); + dag.add_vertex(50, 5, 0); + dag.add_edge(0, 1); + dag.add_edge(0, 2); + dag.add_edge(1, 4); + dag.add_edge(2, 3); + dag.add_edge(3, 4); + + BspSchedule schedule(instance); + + schedule.setAssignedProcessor(0, 0); + schedule.setAssignedSuperstep(0, 0); + schedule.setAssignedProcessor(1, 0); + schedule.setAssignedSuperstep(1, 1); + schedule.setAssignedProcessor(2, 1); + schedule.setAssignedSuperstep(2, 1); + schedule.setAssignedProcessor(3, 1); + 
schedule.setAssignedSuperstep(3, 2); + schedule.setAssignedProcessor(4, 1); + schedule.setAssignedSuperstep(4, 3); + schedule.updateNumberOfSupersteps(); + + BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); + BOOST_CHECK_EQUAL(schedule.numberOfSupersteps(), 4); + + // Work cost (BSP model) = sum of max work per superstep across processors + // SS0: max(P0=10, P1=0) = 10 + // SS1: max(P0=20, P1=30) = 30 + // SS2: max(P0=0, P1=40) = 40 + // SS3: max(P0=0, P1=50) = 50 + // Total work = 10 + 30 + 40 + 50 = 130 + BOOST_CHECK_EQUAL(schedule.computeWorkCosts(), 130); + + // LazyCommunicationCost + // Sends/receives at step_needed - staleness (staleness=1) + // Node 0→{P1}: step_needed=1, send/rec at SS0, vol=1*1*g=10 + // Node 1→{P1}: step_needed=3, send/rec at SS2, vol=2*1*g=20 + // Max comm per step: SS0=10, SS1=0, SS2=20, SS3=0 + // Comm = 10 + 20 = 30 + // Syncs = 2 * L = 2 * 5 = 10 (only steps with comm) + // Total = 30 + 10 + 130 = 170 + BOOST_CHECK_EQUAL(LazyCommunicationCost()(schedule), 170); + + // BufferedSendingCost + // Send at producer step, receive at step_needed - staleness + // Node 0 (SS0): send to P1, vol=1*1*g=10 at SS0, rec at SS0 + // Node 1 (SS1): send to P1, vol=2*1*g=20 at SS1, rec at SS2 + // Send volumes: SS0[P0]=10, SS1[P0]=20, SS2[P0]=0, SS3[P0]=0 + // Recv volumes: SS0[P1]=10, SS1[P1]=0, SS2[P1]=20, SS3[P1]=0 + // Max comm per step: SS0=10, SS1=20, SS2=20, SS3=0 + // Comm = 10 + 20 + 20 = 50 + // Syncs = 3 * L = 3 * 5 = 15 (all steps with comm) + // Total = 50 + 15 + 130 = 195 + BOOST_CHECK_EQUAL(BufferedSendingCost()(schedule), 195); + + // TotalCommunicationCost + // Sum of cross-processor edge comm weights * g / P + // Cross edges: 0→2 (cw=1), 1→4 (cw=2) + // Total cross comm weight = (1 + 2) * 1 = 3 + // Comm cost = 3 * 10 / 2 = 15 + // Work = 130 + // Sync = 3 * 5 = 15 (number_of_supersteps - 1) + // Total = 15 + 130 + 15 = 160 + BOOST_CHECK_EQUAL(TotalCommunicationCost()(schedule), 160); + + // TotalLambdaCommunicationCost + // 
For each node, sum comm_weight * sendCosts over unique target processors + // Then multiply total by (1/P) * g + // Node 0 (P0, cw=1): target_procs={P0,P1} → 1*(0+1) = 1 + // Node 1 (P0, cw=2): target_procs={P1} → 2*1 = 2 + // Node 2 (P1, cw=3): target_procs={P1} → 3*0 = 0 + // Node 3 (P1, cw=4): target_procs={P1} → 4*0 = 0 + // comm_costs = 1+2+0+0 = 3, comm_cost = 3 * (1/2) * 10 = 15 + // Work = 130, Sync = 3 * 5 = 15 + // Total = 15 + 130 + 15 = 160 + BOOST_CHECK_EQUAL(TotalLambdaCommunicationCost()(schedule), 160); +} diff --git a/tests/ilp_bsp_scheduler.cpp b/tests/ilp_bsp_scheduler.cpp index 151fd0b7..fc6934b4 100644 --- a/tests/ilp_bsp_scheduler.cpp +++ b/tests/ilp_bsp_scheduler.cpp @@ -64,7 +64,7 @@ BOOST_AUTO_TEST_CASE(test_total) { scheduler_to.setTimeLimitSeconds(10); const auto result_to = scheduler_to.computeSchedule(schedule_to); - BOOST_CHECK_EQUAL(RETURN_STATUS::BEST_FOUND, result_to); + BOOST_CHECK(result_to == RETURN_STATUS::OSP_SUCCESS || result_to == RETURN_STATUS::BEST_FOUND); BOOST_CHECK(schedule_to.satisfiesPrecedenceConstraints()); BspSchedule schedule(instance);