From 5b97b94252fa22cfa7bd156b3dc23959111928a4 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 17 Jul 2025 14:36:44 -0700 Subject: [PATCH 1/7] Add basic GitHub workflow to sanity check compilation. --- .github/workflows/build.yml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..a429f3c --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,33 @@ +name: "Build" + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + build: + strategy: + matrix: + include: + - name: "NVHPC SDK 25.5" + image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04" + cmake_options: "-DCUDECOMP_ENABLE_NVSHMEM=1 -DCUDECOMP_BUILD_EXTRAS=1" + - name: "NVHPC SDK 25.5, no NVSHMEM" + image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04" + cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1" + + name: ${{ matrix.name }} + runs-on: ubuntu-latest + container: + image: ${matrix.image} + + steps: + - name: "Checkout code" + uses: actions/checkout@v4 + + - name: "Compile" + run: | + mkdir -p build && cd build && \ + cmake ${{ matrix.cmake_options }} .. && \ + make -j$(nproc) From 38a6006e003180ffd7d449252a13d56358e706a4 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 17 Jul 2025 14:42:27 -0700 Subject: [PATCH 2/7] Fix container image expression syntax in build workflow. 
--- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a429f3c..2778a8e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,7 @@ jobs: name: ${{ matrix.name }} runs-on: ubuntu-latest container: - image: ${matrix.image} + image: ${{ matrix.image }} steps: - name: "Checkout code" From 6f5c996a9b975102c82d0002a0de3c632431f173 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 17 Jul 2025 15:29:38 -0700 Subject: [PATCH 3/7] Add code style check with clang-format. --- .github/workflows/format.yaml | 47 +++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 .github/workflows/format.yaml diff --git a/.github/workflows/format.yaml b/.github/workflows/format.yaml new file mode 100644 index 0000000..57c6de0 --- /dev/null +++ b/.github/workflows/format.yaml @@ -0,0 +1,47 @@ +name: "Code Format" + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + clang-format: + runs-on: ubuntu-latest + name: "clang-format check" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Run clang-format check + run: | + # Install clang-format + sudo apt-get update && sudo apt-get install -y clang-format + + # Collect names of files that are not properly formatted + filelist=`find src include examples -name "*.cc" -o -name "*.cu" -o -name "*.h" -o -name "*.cuh"` + files_to_fix=() + for file in $filelist; do + if ! clang-format --dry-run --Werror "$file" 2>/dev/null; then + files_to_fix+=("$file") + fi + done + + # If any file is not properly formatted, print diff and exit with error + if [ ${#files_to_fix[@]} -gt 0 ]; then + # Print the list of files that are not properly formatted + echo "ERROR: Some files are not properly formatted. 
To resolve issues, run:" + for file in "${files_to_fix[@]}"; do + echo "clang-format -i $file" + done + echo + + for file in "${files_to_fix[@]}"; do + echo "Diff for $file:" + bash -c "clang-format $file | diff $file -; exit 0" + echo + done + + exit 1 + fi \ No newline at end of file From a58fedb4bde62cc771e1c2cfba32a79ab9b66909 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 18 Jul 2025 08:59:35 -0700 Subject: [PATCH 4/7] Run clang-format. --- examples/cc/taylor_green/tg.cu | 2 +- include/internal/comm_routines.h | 43 +++++++++++++------------------- include/internal/common.h | 9 ++++--- include/internal/exceptions.h | 18 ++++++------- include/internal/transpose.h | 2 +- src/autotune.cc | 1 - src/cudecomp.cc | 30 ++++++++-------------- 7 files changed, 44 insertions(+), 61 deletions(-) diff --git a/examples/cc/taylor_green/tg.cu b/examples/cc/taylor_green/tg.cu index f5292b7..68203e7 100644 --- a/examples/cc/taylor_green/tg.cu +++ b/examples/cc/taylor_green/tg.cu @@ -332,7 +332,7 @@ public: enum TimeScheme { RK1, RK4 }; TGSolver(int64_t N, real_t nu, real_t dt, real_t cfl, TimeScheme tscheme = RK1) - : N(N), nu(nu), dt_(dt), cfl(cfl), tscheme(tscheme) {}; + : N(N), nu(nu), dt_(dt), cfl(cfl), tscheme(tscheme){}; void finalize() { // Free memory for (int i = 0; i < 3; ++i) { diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index 1521419..789ba87 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -57,36 +57,27 @@ static inline bool canUseMpiAlltoall(const std::vector& send_count const std::vector& send_offsets, const std::vector& recv_counts, const std::vector& recv_offsets) { - auto scount = send_counts[0]; - auto rcount = recv_counts[0]; - // Check that send and recv counts are constants - for (int i = 1; i < send_counts.size(); ++i) { - if (send_counts[i] != scount) { - return false; - } - } - for (int i = 1; i < recv_counts.size(); ++i) { - if (recv_counts[i] != rcount) { - return false; 
- } - } + auto scount = send_counts[0]; + auto rcount = recv_counts[0]; + // Check that send and recv counts are constants + for (int i = 1; i < send_counts.size(); ++i) { + if (send_counts[i] != scount) { return false; } + } + for (int i = 1; i < recv_counts.size(); ++i) { + if (recv_counts[i] != rcount) { return false; } + } - // Check that offsets are contiguous and equal to counts - for (int i = 0; i < send_offsets.size(); ++i) { - if (send_offsets[i] != i * scount) { - return false; - } - } - for (int i = 0; i < recv_offsets.size(); ++i) { - if (recv_offsets[i] != i * rcount) { - return false; - } - } + // Check that offsets are contiguous and equal to counts + for (int i = 0; i < send_offsets.size(); ++i) { + if (send_offsets[i] != i * scount) { return false; } + } + for (int i = 0; i < recv_offsets.size(); ++i) { + if (recv_offsets[i] != i * rcount) { return false; } + } - return true; + return true; } - #ifdef ENABLE_NVSHMEM #define CUDECOMP_NVSHMEM_CHUNK_SZ (static_cast(1024 * 1024 * 1024)) template diff --git a/include/internal/common.h b/include/internal/common.h index 829bfc8..a299271 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -55,7 +55,7 @@ typedef std::pair, unsigned typedef std::pair, unsigned int> mnnvl_info; #endif typedef std::shared_ptr ncclComm; -} +} // namespace cudecomp // cuDecomp handle containing general information struct cudecompHandle { @@ -75,7 +75,7 @@ struct cudecompHandle { // Entries for NCCL management cudecomp::ncclComm nccl_comm; // NCCL communicator (global) cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems) - bool nccl_enable_ubr = false; // Flag to control NCCL user buffer registration usage + bool nccl_enable_ubr = false; // Flag to control NCCL user buffer registration usage std::unordered_map>> nccl_ubr_handles; // map of allocated buffer address to NCCL registration handle(s) @@ -140,8 +140,9 @@ struct cudecompGridDesc { 
cudecomp::graphCache graph_cache; // CUDA graph cache - cudecomp::ncclComm nccl_comm; // NCCL communicator (global), shared from handle - cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle + cudecomp::ncclComm nccl_comm; // NCCL communicator (global), shared from handle + cudecomp::ncclComm + nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle bool initialized = false; }; diff --git a/include/internal/exceptions.h b/include/internal/exceptions.h index 6aadb48..f1415a9 100644 --- a/include/internal/exceptions.h +++ b/include/internal/exceptions.h @@ -97,63 +97,63 @@ class BaseException : public std::exception { class InvalidUsage : public BaseException { public: InvalidUsage(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "Invalid usage.", extra_info) {}; + : BaseException(file, line, "Invalid usage.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_INVALID_USAGE; } }; class NotSupported : public BaseException { public: NotSupported(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "Not supported.", extra_info) {}; + : BaseException(file, line, "Not supported.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NOT_SUPPORTED; } }; class InternalError : public BaseException { public: InternalError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "Internal error.", extra_info) {}; + : BaseException(file, line, "Internal error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_INTERNAL_ERROR; } }; class CudaError : public BaseException { public: CudaError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "CUDA error.", extra_info) {}; + : BaseException(file, line, "CUDA 
error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_CUDA_ERROR; } }; class CutensorError : public BaseException { public: CutensorError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "cuTENSOR error.", extra_info) {}; + : BaseException(file, line, "cuTENSOR error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_CUTENSOR_ERROR; } }; class MpiError : public BaseException { public: MpiError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "MPI error.", extra_info) {}; + : BaseException(file, line, "MPI error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_MPI_ERROR; } }; class NcclError : public BaseException { public: NcclError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "NCCL error.", extra_info) {}; + : BaseException(file, line, "NCCL error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NCCL_ERROR; } }; class NvshmemError : public BaseException { public: NvshmemError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "NVSHMEM error.", extra_info) {}; + : BaseException(file, line, "NVSHMEM error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NVSHMEM_ERROR; } }; class NvmlError : public BaseException { public: NvmlError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "NVML error.", extra_info) {}; + : BaseException(file, line, "NVML error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NVML_ERROR; } }; diff --git a/include/internal/transpose.h b/include/internal/transpose.h index ad3982a..fb47c76 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -35,8 +35,8 @@ #include #include -#include 
#include +#include #include #include diff --git a/src/autotune.cc b/src/autotune.cc index 6b8a74d..b49cf0b 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -452,7 +452,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID; #endif } - } // Free test data and workspace diff --git a/src/cudecomp.cc b/src/cudecomp.cc index 39aa0fa..bd4338e 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -533,7 +533,8 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes if (!handle->nccl_comm) { handle->nccl_comm = ncclCommFromMPIComm(handle->mpi_comm); } if (!handle->nccl_local_comm) { if (grid_desc->config.pdims[0] > 0 && grid_desc->config.pdims[1] > 0) { - // If pdims are set, temporarily set up comm info stuctures to determine if we need to create a local NCCL communicator + // If pdims are set, temporarily set up comm info structures to determine if we need to create a local NCCL + // communicator grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1]; grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1]; int color_row = grid_desc->pidx[0]; @@ -550,14 +551,14 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes if ((grid_desc->row_comm_info.ngroups == 1 && grid_desc->row_comm_info.nranks > 1) || (grid_desc->col_comm_info.ngroups == 1 && grid_desc->col_comm_info.nranks > 1)) { handle->nccl_local_comm = ncclCommFromMPIComm( - handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm); + handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm); } CHECK_MPI(MPI_Comm_free(&row_comm)); CHECK_MPI(MPI_Comm_free(&col_comm)); } else { // If pdims are not set, set up local NCCL communicator for use during autotuning handle->nccl_local_comm = ncclCommFromMPIComm( - handle->mpi_clique_comm != MPI_COMM_NULL ? 
handle->mpi_clique_comm : handle->mpi_local_comm); + handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm); } } if (!handle->pl_stream) { @@ -654,16 +655,15 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes #endif if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) || haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) { - // If this grid descriptor initialized the group local NCCL communicator but does not need it, release reference to it + // If this grid descriptor initialized the group local NCCL communicator but does not need it, release reference + // to it if (grid_desc->nccl_local_comm) { if ((grid_desc->row_comm_info.ngroups > 1 || grid_desc->row_comm_info.nranks == 1) && (grid_desc->col_comm_info.ngroups > 1 || grid_desc->col_comm_info.nranks == 1)) { grid_desc->nccl_local_comm.reset(); // If handle has the only remaining reference to the local NCCL communicator, destroy it to reclaim resources - if (handle->nccl_local_comm.use_count() == 1) { - handle->nccl_local_comm.reset(); - } + if (handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); } } } } else { @@ -672,12 +672,8 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes grid_desc->nccl_local_comm.reset(); // Destroy NCCL communicators to reclaim resources if not used by other grid descriptors - if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { - handle->nccl_comm.reset(); - } - if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { - handle->nccl_local_comm.reset(); - } + if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { handle->nccl_comm.reset(); } + if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); } } *grid_desc_in = grid_desc; @@ -733,12 +729,8 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe 
grid_desc->nccl_local_comm.reset(); // Destroy NCCL communicators to reclaim resources if not used by other grid descriptors - if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { - handle->nccl_comm.reset(); - } - if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { - handle->nccl_local_comm.reset(); - } + if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { handle->nccl_comm.reset(); } + if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); } } #ifdef ENABLE_NVSHMEM From 1d4092d9f3d57e5af0c70d11f849d3e44e2f361a Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 18 Jul 2025 09:09:27 -0700 Subject: [PATCH 5/7] Add additional compilation tests against older NVHPC SDK version. --- .github/workflows/build.yml | 17 ++++++++++++----- .github/workflows/format.yaml | 8 +++++--- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2778a8e..4e62d5d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,12 +10,18 @@ jobs: strategy: matrix: include: - - name: "NVHPC SDK 25.5" + - name: "NVHPC SDK 25.5, CUDA 12.9" image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04" cmake_options: "-DCUDECOMP_ENABLE_NVSHMEM=1 -DCUDECOMP_BUILD_EXTRAS=1" - - name: "NVHPC SDK 25.5, no NVSHMEM" + - name: "NVHPC SDK 25.5, CUDA 12.9, no NVSHMEM" image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04" cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1" + - name: "NVHPC SDK 23.11, CUDA 11.8" + image: "nvcr.io/nvidia/nvhpc:22.11-devel-cuda11.8-ubuntu20.04" + cmake_options: "-DCUDECOMP_ENABLE_NVSHMEM=1 -DCUDECOMP_BUILD_EXTRAS=1" + - name: "NVHPC SDK 23.11, CUDA 11.8, no NVSHMEM" + image: "nvcr.io/nvidia/nvhpc:22.11-devel-cuda11.8-ubuntu20.04" + cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1" name: ${{ matrix.name }} runs-on: ubuntu-latest @@ -28,6 +34,7 @@ jobs: - name: "Compile" run: | - mkdir -p build && 
cd build && \ - cmake ${{ matrix.cmake_options }} .. && \ - make -j$(nproc) + mkdir -p build + cd build + cmake ${{ matrix.cmake_options }} .. + make -j$(nproc) \ No newline at end of file diff --git a/.github/workflows/format.yaml b/.github/workflows/format.yaml index 57c6de0..a67de06 100644 --- a/.github/workflows/format.yaml +++ b/.github/workflows/format.yaml @@ -31,7 +31,7 @@ jobs: # If any file is not properly formatted, print diff and exit with error if [ ${#files_to_fix[@]} -gt 0 ]; then # Print the list of files that are not properly formatted - echo "ERROR: Some files are not properly formatted. To resolve issues, run:" + echo "FAIL: Some files are not properly formatted. To resolve issues, run:" for file in "${files_to_fix[@]}"; do echo "clang-format -i $file" done @@ -42,6 +42,8 @@ jobs: bash -c "clang-format $file | diff $file -; exit 0" echo done - + exit 1 - fi \ No newline at end of file + fi + + echo "PASS: All files are properly formatted." \ No newline at end of file From 2ea4125e88ee556752db58d6af2ef62c958b0abc Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 18 Jul 2025 09:31:13 -0700 Subject: [PATCH 6/7] Update workflow to only run on pull requests. 
--- .github/workflows/build.yml | 3 +-- .github/workflows/format.yaml | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4e62d5d..fe5f0f2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,9 +1,8 @@ name: "Build" on: - push: pull_request: - workflow_dispatch: + branches: [ main ] jobs: build: diff --git a/.github/workflows/format.yaml b/.github/workflows/format.yaml index a67de06..8b95998 100644 --- a/.github/workflows/format.yaml +++ b/.github/workflows/format.yaml @@ -1,9 +1,8 @@ name: "Code Format" on: - push: pull_request: - workflow_dispatch: + branches: [ main ] jobs: clang-format: From 8663bf3538fbd402924dad1eba64abfd07f3798c Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 18 Jul 2025 10:01:17 -0700 Subject: [PATCH 7/7] Fix typo. --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fe5f0f2..2dd9bda 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,10 +15,10 @@ jobs: - name: "NVHPC SDK 25.5, CUDA 12.9, no NVSHMEM" image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04" cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1" - - name: "NVHPC SDK 23.11, CUDA 11.8" + - name: "NVHPC SDK 22.11, CUDA 11.8" image: "nvcr.io/nvidia/nvhpc:22.11-devel-cuda11.8-ubuntu20.04" cmake_options: "-DCUDECOMP_ENABLE_NVSHMEM=1 -DCUDECOMP_BUILD_EXTRAS=1" - - name: "NVHPC SDK 23.11, CUDA 11.8, no NVSHMEM" + - name: "NVHPC SDK 22.11, CUDA 11.8, no NVSHMEM" image: "nvcr.io/nvidia/nvhpc:22.11-devel-cuda11.8-ubuntu20.04" cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1"