Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Build workflow: compiles the project with CMake + make inside NVHPC SDK
# container images for every pull request that targets the main branch.
name: "Build"

# Trigger: pull requests against main only.
on:
pull_request:
branches: [ main ]

jobs:
build:
strategy:
matrix:
# Each entry supplies a display name, the container image to build in, and
# the CMake options passed to the Compile step. Both images are listed
# twice: once with -DCUDECOMP_ENABLE_NVSHMEM=1 and once without it.
include:
- name: "NVHPC SDK 25.5, CUDA 12.9"
image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04"
cmake_options: "-DCUDECOMP_ENABLE_NVSHMEM=1 -DCUDECOMP_BUILD_EXTRAS=1"
- name: "NVHPC SDK 25.5, CUDA 12.9, no NVSHMEM"
image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04"
cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1"
- name: "NVHPC SDK 22.11, CUDA 11.8"
image: "nvcr.io/nvidia/nvhpc:22.11-devel-cuda11.8-ubuntu20.04"
cmake_options: "-DCUDECOMP_ENABLE_NVSHMEM=1 -DCUDECOMP_BUILD_EXTRAS=1"
- name: "NVHPC SDK 22.11, CUDA 11.8, no NVSHMEM"
image: "nvcr.io/nvidia/nvhpc:22.11-devel-cuda11.8-ubuntu20.04"
cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1"

# The job is labelled with the matrix entry's name and runs inside that
# entry's container image on an ubuntu-latest runner.
name: ${{ matrix.name }}
runs-on: ubuntu-latest
container:
image: ${{ matrix.image }}

steps:
- name: "Checkout code"
uses: actions/checkout@v4

# Out-of-source build: configure from build/ against the repository root using
# the matrix entry's CMake options, then run make with one job per core
# reported by nproc.
- name: "Compile"
run: |
mkdir -p build
cd build
cmake ${{ matrix.cmake_options }} ..
make -j$(nproc)
48 changes: 48 additions & 0 deletions .github/workflows/format.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Code Format workflow: fails a pull request against main when any C++/CUDA
# source under src/, include/ or examples/ is not clang-format clean. On
# failure it prints the commands that fix each file and the formatting diff.
name: "Code Format"

on:
  pull_request:
    branches: [ main ]

jobs:
  clang-format:
    runs-on: ubuntu-latest
    name: "clang-format check"

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run clang-format check
        run: |
          # Install clang-format
          sudo apt-get update && sudo apt-get install -y clang-format

          # Collect names of files that are not properly formatted.
          # The name tests are grouped so -type f applies to all of them, and
          # the list is consumed line by line so that paths containing spaces
          # or glob characters are not split or expanded.
          filelist=$(find src include examples -type f \( -name "*.cc" -o -name "*.cu" -o -name "*.h" -o -name "*.cuh" \))
          files_to_fix=()
          while IFS= read -r file; do
            # An empty list still yields one empty line from the here-string; skip it
            [ -n "$file" ] || continue
            if ! clang-format --dry-run --Werror "$file" 2>/dev/null; then
              files_to_fix+=("$file")
            fi
          done <<< "$filelist"

          # If any file is not properly formatted, print diff and exit with error
          if [ "${#files_to_fix[@]}" -gt 0 ]; then
            # Print the list of files that are not properly formatted
            echo "FAIL: Some files are not properly formatted. To resolve issues, run:"
            for file in "${files_to_fix[@]}"; do
              echo "clang-format -i $file"
            done
            echo

            for file in "${files_to_fix[@]}"; do
              echo "Diff for $file:"
              # diff exits non-zero when it finds differences; "|| true" keeps
              # that from aborting the step before the remaining diffs print.
              clang-format "$file" | diff "$file" - || true
              echo
            done

            exit 1
          fi

          echo "PASS: All files are properly formatted."
2 changes: 1 addition & 1 deletion examples/cc/taylor_green/tg.cu
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ public:
enum TimeScheme { RK1, RK4 };

TGSolver(int64_t N, real_t nu, real_t dt, real_t cfl, TimeScheme tscheme = RK1)
: N(N), nu(nu), dt_(dt), cfl(cfl), tscheme(tscheme) {};
: N(N), nu(nu), dt_(dt), cfl(cfl), tscheme(tscheme){};
void finalize() {
// Free memory
for (int i = 0; i < 3; ++i) {
Expand Down
43 changes: 17 additions & 26 deletions include/internal/comm_routines.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,36 +57,27 @@ static inline bool canUseMpiAlltoall(const std::vector<comm_count_t>& send_count
const std::vector<comm_count_t>& send_offsets,
const std::vector<comm_count_t>& recv_counts,
const std::vector<comm_count_t>& recv_offsets) {
auto scount = send_counts[0];
auto rcount = recv_counts[0];
// Check that send and recv counts are constants
for (int i = 1; i < send_counts.size(); ++i) {
if (send_counts[i] != scount) {
return false;
}
}
for (int i = 1; i < recv_counts.size(); ++i) {
if (recv_counts[i] != rcount) {
return false;
}
}
auto scount = send_counts[0];
auto rcount = recv_counts[0];
// Check that send and recv counts are constants
for (int i = 1; i < send_counts.size(); ++i) {
if (send_counts[i] != scount) { return false; }
}
for (int i = 1; i < recv_counts.size(); ++i) {
if (recv_counts[i] != rcount) { return false; }
}

// Check that offsets are contiguous and equal to counts
for (int i = 0; i < send_offsets.size(); ++i) {
if (send_offsets[i] != i * scount) {
return false;
}
}
for (int i = 0; i < recv_offsets.size(); ++i) {
if (recv_offsets[i] != i * rcount) {
return false;
}
}
// Check that offsets are contiguous and equal to counts
for (int i = 0; i < send_offsets.size(); ++i) {
if (send_offsets[i] != i * scount) { return false; }
}
for (int i = 0; i < recv_offsets.size(); ++i) {
if (recv_offsets[i] != i * rcount) { return false; }
}

return true;
return true;
}


#ifdef ENABLE_NVSHMEM
#define CUDECOMP_NVSHMEM_CHUNK_SZ (static_cast<size_t>(1024 * 1024 * 1024))
template <typename T>
Expand Down
9 changes: 5 additions & 4 deletions include/internal/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ typedef std::pair<std::array<unsigned char, NVML_GPU_FABRIC_UUID_LEN>, unsigned
typedef std::pair<std::array<unsigned char, 1>, unsigned int> mnnvl_info;
#endif
typedef std::shared_ptr<ncclComm_t> ncclComm;
}
} // namespace cudecomp

// cuDecomp handle containing general information
struct cudecompHandle {
Expand All @@ -75,7 +75,7 @@ struct cudecompHandle {
// Entries for NCCL management
cudecomp::ncclComm nccl_comm; // NCCL communicator (global)
cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems)
bool nccl_enable_ubr = false; // Flag to control NCCL user buffer registration usage
bool nccl_enable_ubr = false; // Flag to control NCCL user buffer registration usage
std::unordered_map<void*, std::vector<std::pair<ncclComm_t, void*>>>
nccl_ubr_handles; // map of allocated buffer address to NCCL registration handle(s)

Expand Down Expand Up @@ -140,8 +140,9 @@ struct cudecompGridDesc {

cudecomp::graphCache graph_cache; // CUDA graph cache

cudecomp::ncclComm nccl_comm; // NCCL communicator (global), shared from handle
cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle
cudecomp::ncclComm nccl_comm; // NCCL communicator (global), shared from handle
cudecomp::ncclComm
nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle

bool initialized = false;
};
Expand Down
18 changes: 9 additions & 9 deletions include/internal/exceptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,63 +97,63 @@ class BaseException : public std::exception {
class InvalidUsage : public BaseException {
public:
InvalidUsage(const char* file, int line, const char* extra_info = nullptr)
: BaseException(file, line, "Invalid usage.", extra_info) {};
: BaseException(file, line, "Invalid usage.", extra_info){};
cudecompResult_t getResult() const override { return CUDECOMP_RESULT_INVALID_USAGE; }
};

class NotSupported : public BaseException {
public:
NotSupported(const char* file, int line, const char* extra_info = nullptr)
: BaseException(file, line, "Not supported.", extra_info) {};
: BaseException(file, line, "Not supported.", extra_info){};
cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NOT_SUPPORTED; }
};

class InternalError : public BaseException {
public:
InternalError(const char* file, int line, const char* extra_info = nullptr)
: BaseException(file, line, "Internal error.", extra_info) {};
: BaseException(file, line, "Internal error.", extra_info){};
cudecompResult_t getResult() const override { return CUDECOMP_RESULT_INTERNAL_ERROR; }
};

class CudaError : public BaseException {
public:
CudaError(const char* file, int line, const char* extra_info = nullptr)
: BaseException(file, line, "CUDA error.", extra_info) {};
: BaseException(file, line, "CUDA error.", extra_info){};
cudecompResult_t getResult() const override { return CUDECOMP_RESULT_CUDA_ERROR; }
};

class CutensorError : public BaseException {
public:
CutensorError(const char* file, int line, const char* extra_info = nullptr)
: BaseException(file, line, "cuTENSOR error.", extra_info) {};
: BaseException(file, line, "cuTENSOR error.", extra_info){};
cudecompResult_t getResult() const override { return CUDECOMP_RESULT_CUTENSOR_ERROR; }
};

class MpiError : public BaseException {
public:
MpiError(const char* file, int line, const char* extra_info = nullptr)
: BaseException(file, line, "MPI error.", extra_info) {};
: BaseException(file, line, "MPI error.", extra_info){};
cudecompResult_t getResult() const override { return CUDECOMP_RESULT_MPI_ERROR; }
};

class NcclError : public BaseException {
public:
NcclError(const char* file, int line, const char* extra_info = nullptr)
: BaseException(file, line, "NCCL error.", extra_info) {};
: BaseException(file, line, "NCCL error.", extra_info){};
cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NCCL_ERROR; }
};

class NvshmemError : public BaseException {
public:
NvshmemError(const char* file, int line, const char* extra_info = nullptr)
: BaseException(file, line, "NVSHMEM error.", extra_info) {};
: BaseException(file, line, "NVSHMEM error.", extra_info){};
cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NVSHMEM_ERROR; }
};

class NvmlError : public BaseException {
public:
NvmlError(const char* file, int line, const char* extra_info = nullptr)
: BaseException(file, line, "NVML error.", extra_info) {};
: BaseException(file, line, "NVML error.", extra_info){};
cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NVML_ERROR; }
};

Expand Down
2 changes: 1 addition & 1 deletion include/internal/transpose.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@
#include <cstdint>
#include <vector>

#include <cuda_runtime.h>
#include <cuda/std/complex>
#include <cuda_runtime.h>
#include <cutensor.h>
#include <mpi.h>

Expand Down
1 change: 0 additions & 1 deletion src/autotune.cc
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
#endif
}

}

// Free test data and workspace
Expand Down
30 changes: 11 additions & 19 deletions src/cudecomp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,8 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
if (!handle->nccl_comm) { handle->nccl_comm = ncclCommFromMPIComm(handle->mpi_comm); }
if (!handle->nccl_local_comm) {
if (grid_desc->config.pdims[0] > 0 && grid_desc->config.pdims[1] > 0) {
// If pdims are set, temporarily set up comm info structures to determine if we need to create a local NCCL communicator
// If pdims are set, temporarily set up comm info structures to determine if we need to create a local NCCL
// communicator
grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
int color_row = grid_desc->pidx[0];
Expand All @@ -550,14 +551,14 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
if ((grid_desc->row_comm_info.ngroups == 1 && grid_desc->row_comm_info.nranks > 1) ||
(grid_desc->col_comm_info.ngroups == 1 && grid_desc->col_comm_info.nranks > 1)) {
handle->nccl_local_comm = ncclCommFromMPIComm(
handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm);
handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm);
}
CHECK_MPI(MPI_Comm_free(&row_comm));
CHECK_MPI(MPI_Comm_free(&col_comm));
} else {
// If pdims are not set, set up local NCCL communicator for use during autotuning
handle->nccl_local_comm = ncclCommFromMPIComm(
handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm);
handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm);
}
}
if (!handle->pl_stream) {
Expand Down Expand Up @@ -654,16 +655,15 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
#endif
if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {
// If this grid descriptor initialized the group local NCCL communicator but does not need it, release reference to it
// If this grid descriptor initialized the group local NCCL communicator but does not need it, release reference
// to it
if (grid_desc->nccl_local_comm) {
if ((grid_desc->row_comm_info.ngroups > 1 || grid_desc->row_comm_info.nranks == 1) &&
(grid_desc->col_comm_info.ngroups > 1 || grid_desc->col_comm_info.nranks == 1)) {
grid_desc->nccl_local_comm.reset();

// If handle has the only remaining reference to the local NCCL communicator, destroy it to reclaim resources
if (handle->nccl_local_comm.use_count() == 1) {
handle->nccl_local_comm.reset();
}
if (handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); }
}
}
} else {
Expand All @@ -672,12 +672,8 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
grid_desc->nccl_local_comm.reset();

// Destroy NCCL communicators to reclaim resources if not used by other grid descriptors
if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) {
handle->nccl_comm.reset();
}
if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) {
handle->nccl_local_comm.reset();
}
if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { handle->nccl_comm.reset(); }
if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); }
}

*grid_desc_in = grid_desc;
Expand Down Expand Up @@ -733,12 +729,8 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
grid_desc->nccl_local_comm.reset();

// Destroy NCCL communicators to reclaim resources if not used by other grid descriptors
if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) {
handle->nccl_comm.reset();
}
if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) {
handle->nccl_local_comm.reset();
}
if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { handle->nccl_comm.reset(); }
if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); }
}

#ifdef ENABLE_NVSHMEM
Expand Down