diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..2dd9bda --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,39 @@ +name: "Build" + +on: + pull_request: + branches: [ main ] + +jobs: + build: + strategy: + matrix: + include: + - name: "NVHPC SDK 25.5, CUDA 12.9" + image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04" + cmake_options: "-DCUDECOMP_ENABLE_NVSHMEM=1 -DCUDECOMP_BUILD_EXTRAS=1" + - name: "NVHPC SDK 25.5, CUDA 12.9, no NVSHMEM" + image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04" + cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1" + - name: "NVHPC SDK 22.11, CUDA 11.8" + image: "nvcr.io/nvidia/nvhpc:22.11-devel-cuda11.8-ubuntu20.04" + cmake_options: "-DCUDECOMP_ENABLE_NVSHMEM=1 -DCUDECOMP_BUILD_EXTRAS=1" + - name: "NVHPC SDK 22.11, CUDA 11.8, no NVSHMEM" + image: "nvcr.io/nvidia/nvhpc:22.11-devel-cuda11.8-ubuntu20.04" + cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1" + + name: ${{ matrix.name }} + runs-on: ubuntu-latest + container: + image: ${{ matrix.image }} + + steps: + - name: "Checkout code" + uses: actions/checkout@v4 + + - name: "Compile" + run: | + mkdir -p build + cd build + cmake ${{ matrix.cmake_options }} .. 
+ make -j$(nproc) \ No newline at end of file diff --git a/.github/workflows/format.yaml b/.github/workflows/format.yaml new file mode 100644 index 0000000..8b95998 --- /dev/null +++ b/.github/workflows/format.yaml @@ -0,0 +1,48 @@ +name: "Code Format" + +on: + pull_request: + branches: [ main ] + +jobs: + clang-format: + runs-on: ubuntu-latest + name: "clang-format check" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Run clang-format check + run: | + # Install clang-format + sudo apt-get update && sudo apt-get install -y clang-format + + # Collect names of files that are not properly formatted + filelist=`find src include examples -name "*.cc" -o -name "*.cu" -o -name "*.h" -o -name "*.cuh"` + files_to_fix=() + for file in $filelist; do + if ! clang-format --dry-run --Werror "$file" 2>/dev/null; then + files_to_fix+=("$file") + fi + done + + # If any file is not properly formatted, print diff and exit with error + if [ ${#files_to_fix[@]} -gt 0 ]; then + # Print the list of files that are not properly formatted + echo "FAIL: Some files are not properly formatted. To resolve issues, run:" + for file in "${files_to_fix[@]}"; do + echo "clang-format -i $file" + done + echo + + for file in "${files_to_fix[@]}"; do + echo "Diff for $file:" + bash -c "clang-format $file | diff $file -; exit 0" + echo + done + + exit 1 + fi + + echo "PASS: All files are properly formatted." 
\ No newline at end of file diff --git a/examples/cc/taylor_green/tg.cu b/examples/cc/taylor_green/tg.cu index f5292b7..68203e7 100644 --- a/examples/cc/taylor_green/tg.cu +++ b/examples/cc/taylor_green/tg.cu @@ -332,7 +332,7 @@ public: enum TimeScheme { RK1, RK4 }; TGSolver(int64_t N, real_t nu, real_t dt, real_t cfl, TimeScheme tscheme = RK1) - : N(N), nu(nu), dt_(dt), cfl(cfl), tscheme(tscheme) {}; + : N(N), nu(nu), dt_(dt), cfl(cfl), tscheme(tscheme){}; void finalize() { // Free memory for (int i = 0; i < 3; ++i) { diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index 1521419..789ba87 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -57,36 +57,27 @@ static inline bool canUseMpiAlltoall(const std::vector& send_count const std::vector& send_offsets, const std::vector& recv_counts, const std::vector& recv_offsets) { - auto scount = send_counts[0]; - auto rcount = recv_counts[0]; - // Check that send and recv counts are constants - for (int i = 1; i < send_counts.size(); ++i) { - if (send_counts[i] != scount) { - return false; - } - } - for (int i = 1; i < recv_counts.size(); ++i) { - if (recv_counts[i] != rcount) { - return false; - } - } + auto scount = send_counts[0]; + auto rcount = recv_counts[0]; + // Check that send and recv counts are constants + for (int i = 1; i < send_counts.size(); ++i) { + if (send_counts[i] != scount) { return false; } + } + for (int i = 1; i < recv_counts.size(); ++i) { + if (recv_counts[i] != rcount) { return false; } + } - // Check that offsets are contiguous and equal to counts - for (int i = 0; i < send_offsets.size(); ++i) { - if (send_offsets[i] != i * scount) { - return false; - } - } - for (int i = 0; i < recv_offsets.size(); ++i) { - if (recv_offsets[i] != i * rcount) { - return false; - } - } + // Check that offsets are contiguous and equal to counts + for (int i = 0; i < send_offsets.size(); ++i) { + if (send_offsets[i] != i * scount) { return false; } 
+ } + for (int i = 0; i < recv_offsets.size(); ++i) { + if (recv_offsets[i] != i * rcount) { return false; } + } - return true; + return true; } - #ifdef ENABLE_NVSHMEM #define CUDECOMP_NVSHMEM_CHUNK_SZ (static_cast(1024 * 1024 * 1024)) template diff --git a/include/internal/common.h b/include/internal/common.h index 829bfc8..a299271 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -55,7 +55,7 @@ typedef std::pair, unsigned typedef std::pair, unsigned int> mnnvl_info; #endif typedef std::shared_ptr ncclComm; -} +} // namespace cudecomp // cuDecomp handle containing general information struct cudecompHandle { @@ -75,7 +75,7 @@ struct cudecompHandle { // Entries for NCCL management cudecomp::ncclComm nccl_comm; // NCCL communicator (global) cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems) - bool nccl_enable_ubr = false; // Flag to control NCCL user buffer registration usage + bool nccl_enable_ubr = false; // Flag to control NCCL user buffer registration usage std::unordered_map>> nccl_ubr_handles; // map of allocated buffer address to NCCL registration handle(s) @@ -140,8 +140,9 @@ struct cudecompGridDesc { cudecomp::graphCache graph_cache; // CUDA graph cache - cudecomp::ncclComm nccl_comm; // NCCL communicator (global), shared from handle - cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle + cudecomp::ncclComm nccl_comm; // NCCL communicator (global), shared from handle + cudecomp::ncclComm + nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle bool initialized = false; }; diff --git a/include/internal/exceptions.h b/include/internal/exceptions.h index 6aadb48..f1415a9 100644 --- a/include/internal/exceptions.h +++ b/include/internal/exceptions.h @@ -97,63 +97,63 @@ class BaseException : public std::exception { class InvalidUsage : public BaseException { public: 
InvalidUsage(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "Invalid usage.", extra_info) {}; + : BaseException(file, line, "Invalid usage.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_INVALID_USAGE; } }; class NotSupported : public BaseException { public: NotSupported(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "Not supported.", extra_info) {}; + : BaseException(file, line, "Not supported.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NOT_SUPPORTED; } }; class InternalError : public BaseException { public: InternalError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "Internal error.", extra_info) {}; + : BaseException(file, line, "Internal error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_INTERNAL_ERROR; } }; class CudaError : public BaseException { public: CudaError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "CUDA error.", extra_info) {}; + : BaseException(file, line, "CUDA error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_CUDA_ERROR; } }; class CutensorError : public BaseException { public: CutensorError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "cuTENSOR error.", extra_info) {}; + : BaseException(file, line, "cuTENSOR error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_CUTENSOR_ERROR; } }; class MpiError : public BaseException { public: MpiError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "MPI error.", extra_info) {}; + : BaseException(file, line, "MPI error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_MPI_ERROR; } }; class NcclError : 
public BaseException { public: NcclError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "NCCL error.", extra_info) {}; + : BaseException(file, line, "NCCL error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NCCL_ERROR; } }; class NvshmemError : public BaseException { public: NvshmemError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "NVSHMEM error.", extra_info) {}; + : BaseException(file, line, "NVSHMEM error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NVSHMEM_ERROR; } }; class NvmlError : public BaseException { public: NvmlError(const char* file, int line, const char* extra_info = nullptr) - : BaseException(file, line, "NVML error.", extra_info) {}; + : BaseException(file, line, "NVML error.", extra_info){}; cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NVML_ERROR; } }; diff --git a/include/internal/transpose.h b/include/internal/transpose.h index ad3982a..fb47c76 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -35,8 +35,8 @@ #include #include -#include #include +#include #include #include diff --git a/src/autotune.cc b/src/autotune.cc index 6b8a74d..b49cf0b 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -452,7 +452,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID; #endif } - } // Free test data and workspace diff --git a/src/cudecomp.cc b/src/cudecomp.cc index 39aa0fa..bd4338e 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -533,7 +533,8 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes if (!handle->nccl_comm) { handle->nccl_comm = ncclCommFromMPIComm(handle->mpi_comm); } if (!handle->nccl_local_comm) { if (grid_desc->config.pdims[0] > 0 && grid_desc->config.pdims[1] > 0) { - // If pdims are 
set, temporarily set up comm info stuctures to determine if we need to create a local NCCL communicator + // If pdims are set, temporarily set up comm info structures to determine if we need to create a local NCCL + // communicator grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1]; grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1]; int color_row = grid_desc->pidx[0]; @@ -550,14 +551,14 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes if ((grid_desc->row_comm_info.ngroups == 1 && grid_desc->row_comm_info.nranks > 1) || (grid_desc->col_comm_info.ngroups == 1 && grid_desc->col_comm_info.nranks > 1)) { handle->nccl_local_comm = ncclCommFromMPIComm( - handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm); + handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm); } CHECK_MPI(MPI_Comm_free(&row_comm)); CHECK_MPI(MPI_Comm_free(&col_comm)); } else { // If pdims are not set, set up local NCCL communicator for use during autotuning handle->nccl_local_comm = ncclCommFromMPIComm( - handle->mpi_clique_comm != MPI_COMM_NULL ? 
handle->mpi_clique_comm : handle->mpi_local_comm); } } if (!handle->pl_stream) { @@ -654,16 +655,15 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes #endif if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) || haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) { - // If this grid descriptor initialized the group local NCCL communicator but does not need it, release reference to it + // If this grid descriptor initialized the group local NCCL communicator but does not need it, release reference + // to it if (grid_desc->nccl_local_comm) { if ((grid_desc->row_comm_info.ngroups > 1 || grid_desc->row_comm_info.nranks == 1) && (grid_desc->col_comm_info.ngroups > 1 || grid_desc->col_comm_info.nranks == 1)) { grid_desc->nccl_local_comm.reset(); // If handle has the only remaining reference to the local NCCL communicator, destroy it to reclaim resources - if (handle->nccl_local_comm.use_count() == 1) { - handle->nccl_local_comm.reset(); - } + if (handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); } } } } else { @@ -672,12 +672,8 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes grid_desc->nccl_local_comm.reset(); // Destroy NCCL communicators to reclaim resources if not used by other grid descriptors - if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { - handle->nccl_comm.reset(); - } - if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { - handle->nccl_local_comm.reset(); - } + if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { handle->nccl_comm.reset(); } + if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); } } *grid_desc_in = grid_desc; @@ -733,12 +729,8 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe grid_desc->nccl_local_comm.reset(); // Destroy NCCL communicators to reclaim resources if not used by 
other grid descriptors - if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { - handle->nccl_comm.reset(); - } - if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { - handle->nccl_local_comm.reset(); - } + if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { handle->nccl_comm.reset(); } + if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); } } #ifdef ENABLE_NVSHMEM