NVIDIA · romerojosh · Jul 18, 2025 · Jul 17, 2025 · Jul 17, 2025 · Jul 17, 2025
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -0,0 +1,47 @@
+# Builds the project inside NVHPC SDK containers for pull requests targeting main.
+name: "Build"
+
+on:
+  pull_request:
+    branches: [ main ]
+
+# The build only reads the checked-out sources, so restrict the token to read access.
+permissions:
+  contents: read
+
+jobs:
+  build:
+    strategy:
+      # Run every configuration to completion so one failure does not cancel the rest.
+      fail-fast: false
+      matrix:
+        # Each entry pairs a container image with the CMake options for that build.
+        include:
+          - name: "NVHPC SDK 25.5, CUDA 12.9"
+            image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04"
+            cmake_options: "-DCUDECOMP_ENABLE_NVSHMEM=1 -DCUDECOMP_BUILD_EXTRAS=1"
+          - name: "NVHPC SDK 25.5, CUDA 12.9, no NVSHMEM"
+            image: "nvcr.io/nvidia/nvhpc:25.5-devel-cuda12.9-ubuntu22.04"
+            cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1"
+          - name: "NVHPC SDK 22.11, CUDA 11.8"
+            image: "nvcr.io/nvidia/nvhpc:22.11-devel-cuda11.8-ubuntu20.04"
+            cmake_options: "-DCUDECOMP_ENABLE_NVSHMEM=1 -DCUDECOMP_BUILD_EXTRAS=1"
+          - name: "NVHPC SDK 22.11, CUDA 11.8, no NVSHMEM"
+            image: "nvcr.io/nvidia/nvhpc:22.11-devel-cuda11.8-ubuntu20.04"
+            cmake_options: "-DCUDECOMP_BUILD_EXTRAS=1"
+
+    name: ${{ matrix.name }}
+    runs-on: ubuntu-latest
+    container:
+      image: ${{ matrix.image }}
+
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@v4
+
+      - name: "Compile"
+        run: |
+          mkdir -p build
+          cd build
+          cmake ${{ matrix.cmake_options }} ..
+          make -j$(nproc)
diff --git a/.github/workflows/format.yaml b/.github/workflows/format.yaml
@@ -0,0 +1,60 @@
+# Verifies that C++/CUDA sources are clang-format clean for pull requests targeting main.
+name: "Code Format"
+
+on:
+  pull_request:
+    branches: [ main ]
+
+# The check only reads the checked-out sources, so restrict the token to read access.
+permissions:
+  contents: read
+
+jobs:
+  clang-format:
+    runs-on: ubuntu-latest
+    name: "clang-format check"
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Run clang-format check
+        run: |
+          # Install clang-format
+          sudo apt-get update && sudo apt-get install -y clang-format
+
+          # List candidate files NUL-delimited so that paths containing whitespace or
+          # glob characters are not split or expanded. Writing the list to a file means
+          # a failing find still aborts the step under the runner's default `bash -e`.
+          filelist=$(mktemp)
+          find src include examples -type f \
+            \( -name "*.cc" -o -name "*.cu" -o -name "*.h" -o -name "*.cuh" \) -print0 > "$filelist"
+
+          # Collect names of files that are not properly formatted
+          files_to_fix=()
+          while IFS= read -r -d '' file; do
+            if ! clang-format --dry-run --Werror "$file" 2>/dev/null; then
+              files_to_fix+=("$file")
+            fi
+          done < "$filelist"
+
+          # If any file is not properly formatted, print diff and exit with error
+          if [ ${#files_to_fix[@]} -gt 0 ]; then
+            # Print the list of files that are not properly formatted
+            echo "FAIL: Some files are not properly formatted. To resolve issues, run:"
+            for file in "${files_to_fix[@]}"; do
+              echo "clang-format -i $file"
+            done
+            echo
+
+            for file in "${files_to_fix[@]}"; do
+              echo "Diff for $file:"
+              # diff exits non-zero when the files differ, which is expected here
+              clang-format "$file" | diff "$file" - || true
+              echo
+            done
+
+            exit 1
+          fi
+
+          echo "PASS: All files are properly formatted."
diff --git a/examples/cc/taylor_green/tg.cu b/examples/cc/taylor_green/tg.cu
@@ -332,7 +332,7 @@ public:
   enum TimeScheme { RK1, RK4 };
 
   TGSolver(int64_t N, real_t nu, real_t dt, real_t cfl, TimeScheme tscheme = RK1)
-      : N(N), nu(nu), dt_(dt), cfl(cfl), tscheme(tscheme) {};
+      : N(N), nu(nu), dt_(dt), cfl(cfl), tscheme(tscheme){};
   void finalize() {
     // Free memory
     for (int i = 0; i < 3; ++i) {

diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h
@@ -57,36 +57,27 @@ static inline bool canUseMpiAlltoall(const std::vector<comm_count_t>& send_count
                                      const std::vector<comm_count_t>& send_offsets,
                                      const std::vector<comm_count_t>& recv_counts,
                                      const std::vector<comm_count_t>& recv_offsets) {
-    auto scount = send_counts[0];
-    auto rcount = recv_counts[0];
-    // Check that send and recv counts are constants
-    for (int i = 1; i < send_counts.size(); ++i) {
-      if (send_counts[i] != scount) {
-        return false;
-      }
-    }
-    for (int i = 1; i < recv_counts.size(); ++i) {
-      if (recv_counts[i] != rcount) {
-        return false;
-      }
-    }
+  auto scount = send_counts[0];
+  auto rcount = recv_counts[0];
+  // Check that send and recv counts are constants
+  for (int i = 1; i < send_counts.size(); ++i) {
+    if (send_counts[i] != scount) { return false; }
+  }
+  for (int i = 1; i < recv_counts.size(); ++i) {
+    if (recv_counts[i] != rcount) { return false; }
+  }
 
-    // Check that offsets are contiguous and equal to counts
-    for (int i = 0; i < send_offsets.size(); ++i) {
-      if (send_offsets[i] != i * scount) {
-        return false;
-      }
-    }
-    for (int i = 0; i < recv_offsets.size(); ++i) {
-      if (recv_offsets[i] != i * rcount) {
-        return false;
-      }
-    }
+  // Check that offsets are contiguous and equal to counts
+  for (int i = 0; i < send_offsets.size(); ++i) {
+    if (send_offsets[i] != i * scount) { return false; }
+  }
+  for (int i = 0; i < recv_offsets.size(); ++i) {
+    if (recv_offsets[i] != i * rcount) { return false; }
+  }
 
-    return true;
+  return true;
 }
 
-
 #ifdef ENABLE_NVSHMEM
 #define CUDECOMP_NVSHMEM_CHUNK_SZ (static_cast<size_t>(1024 * 1024 * 1024))
 template <typename T>

diff --git a/include/internal/common.h b/include/internal/common.h
@@ -55,7 +55,7 @@ typedef std::pair<std::array<unsigned char, NVML_GPU_FABRIC_UUID_LEN>, unsigned
 typedef std::pair<std::array<unsigned char, 1>, unsigned int> mnnvl_info;
 #endif
 typedef std::shared_ptr<ncclComm_t> ncclComm;
-}
+} // namespace cudecomp
 
 // cuDecomp handle containing general information
 struct cudecompHandle {
@@ -75,7 +75,7 @@ struct cudecompHandle {
   // Entries for NCCL management
   cudecomp::ncclComm nccl_comm;       // NCCL communicator (global)
   cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems)
-  bool nccl_enable_ubr = false;         // Flag to control NCCL user buffer registration usage
+  bool nccl_enable_ubr = false;       // Flag to control NCCL user buffer registration usage
   std::unordered_map<void*, std::vector<std::pair<ncclComm_t, void*>>>
       nccl_ubr_handles; // map of allocated buffer address to NCCL registration handle(s)
 
@@ -140,8 +140,9 @@ struct cudecompGridDesc {
 
   cudecomp::graphCache graph_cache; // CUDA graph cache
 
-  cudecomp::ncclComm nccl_comm;       // NCCL communicator (global), shared from handle
-  cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle
+  cudecomp::ncclComm nccl_comm; // NCCL communicator (global), shared from handle
+  cudecomp::ncclComm
+      nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle
 
   bool initialized = false;
 };

diff --git a/include/internal/exceptions.h b/include/internal/exceptions.h
@@ -97,63 +97,63 @@ class BaseException : public std::exception {
 class InvalidUsage : public BaseException {
 public:
   InvalidUsage(const char* file, int line, const char* extra_info = nullptr)
-      : BaseException(file, line, "Invalid usage.", extra_info) {};
+      : BaseException(file, line, "Invalid usage.", extra_info){};
   cudecompResult_t getResult() const override { return CUDECOMP_RESULT_INVALID_USAGE; }
 };
 
 class NotSupported : public BaseException {
 public:
   NotSupported(const char* file, int line, const char* extra_info = nullptr)
-      : BaseException(file, line, "Not supported.", extra_info) {};
+      : BaseException(file, line, "Not supported.", extra_info){};
   cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NOT_SUPPORTED; }
 };
 
 class InternalError : public BaseException {
 public:
   InternalError(const char* file, int line, const char* extra_info = nullptr)
-      : BaseException(file, line, "Internal error.", extra_info) {};
+      : BaseException(file, line, "Internal error.", extra_info){};
   cudecompResult_t getResult() const override { return CUDECOMP_RESULT_INTERNAL_ERROR; }
 };
 
 class CudaError : public BaseException {
 public:
   CudaError(const char* file, int line, const char* extra_info = nullptr)
-      : BaseException(file, line, "CUDA error.", extra_info) {};
+      : BaseException(file, line, "CUDA error.", extra_info){};
   cudecompResult_t getResult() const override { return CUDECOMP_RESULT_CUDA_ERROR; }
 };
 
 class CutensorError : public BaseException {
 public:
   CutensorError(const char* file, int line, const char* extra_info = nullptr)
-      : BaseException(file, line, "cuTENSOR error.", extra_info) {};
+      : BaseException(file, line, "cuTENSOR error.", extra_info){};
   cudecompResult_t getResult() const override { return CUDECOMP_RESULT_CUTENSOR_ERROR; }
 };
 
 class MpiError : public BaseException {
 public:
   MpiError(const char* file, int line, const char* extra_info = nullptr)
-      : BaseException(file, line, "MPI error.", extra_info) {};
+      : BaseException(file, line, "MPI error.", extra_info){};
   cudecompResult_t getResult() const override { return CUDECOMP_RESULT_MPI_ERROR; }
 };
 
 class NcclError : public BaseException {
 public:
   NcclError(const char* file, int line, const char* extra_info = nullptr)
-      : BaseException(file, line, "NCCL error.", extra_info) {};
+      : BaseException(file, line, "NCCL error.", extra_info){};
   cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NCCL_ERROR; }
 };
 
 class NvshmemError : public BaseException {
 public:
   NvshmemError(const char* file, int line, const char* extra_info = nullptr)
-      : BaseException(file, line, "NVSHMEM error.", extra_info) {};
+      : BaseException(file, line, "NVSHMEM error.", extra_info){};
   cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NVSHMEM_ERROR; }
 };
 
 class NvmlError : public BaseException {
 public:
   NvmlError(const char* file, int line, const char* extra_info = nullptr)
-      : BaseException(file, line, "NVML error.", extra_info) {};
+      : BaseException(file, line, "NVML error.", extra_info){};
   cudecompResult_t getResult() const override { return CUDECOMP_RESULT_NVML_ERROR; }
 };
 

diff --git a/include/internal/transpose.h b/include/internal/transpose.h
@@ -35,8 +35,8 @@
 #include <cstdint>
 #include <vector>
 
-#include <cuda_runtime.h>
 #include <cuda/std/complex>
+#include <cuda_runtime.h>
 #include <cutensor.h>
 #include <mpi.h>
 

diff --git a/src/autotune.cc b/src/autotune.cc
@@ -452,7 +452,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
 #endif
     }
-
   }
 
   // Free test data and workspace

diff --git a/src/cudecomp.cc b/src/cudecomp.cc
@@ -533,7 +533,8 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
       if (!handle->nccl_comm) { handle->nccl_comm = ncclCommFromMPIComm(handle->mpi_comm); }
       if (!handle->nccl_local_comm) {
         if (grid_desc->config.pdims[0] > 0 && grid_desc->config.pdims[1] > 0) {
-          // If pdims are set, temporarily set up comm info stuctures to determine if we need to create a local NCCL communicator
+          // If pdims are set, temporarily set up comm info structures to determine if we need to create a local NCCL
+          // communicator
           grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
           grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
           int color_row = grid_desc->pidx[0];
@@ -550,14 +551,14 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
           if ((grid_desc->row_comm_info.ngroups == 1 && grid_desc->row_comm_info.nranks > 1) ||
               (grid_desc->col_comm_info.ngroups == 1 && grid_desc->col_comm_info.nranks > 1)) {
             handle->nccl_local_comm = ncclCommFromMPIComm(
-              handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm);
+                handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm);
           }
           CHECK_MPI(MPI_Comm_free(&row_comm));
           CHECK_MPI(MPI_Comm_free(&col_comm));
         } else {
           // If pdims are not set, set up local NCCL communicator for use during autotuning
           handle->nccl_local_comm = ncclCommFromMPIComm(
-            handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm);
+              handle->mpi_clique_comm != MPI_COMM_NULL ? handle->mpi_clique_comm : handle->mpi_local_comm);
         }
       }
       if (!handle->pl_stream) {
@@ -654,16 +655,15 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
 #endif
     if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
         haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {
-      // If this grid descriptor initialized the group local NCCL communicator but does not need it, release reference to it
+      // If this grid descriptor initialized the group local NCCL communicator but does not need it, release reference
+      // to it
       if (grid_desc->nccl_local_comm) {
         if ((grid_desc->row_comm_info.ngroups > 1 || grid_desc->row_comm_info.nranks == 1) &&
             (grid_desc->col_comm_info.ngroups > 1 || grid_desc->col_comm_info.nranks == 1)) {
           grid_desc->nccl_local_comm.reset();
 
           // If handle has the only remaining reference to the local NCCL communicator, destroy it to reclaim resources
-          if (handle->nccl_local_comm.use_count() == 1) {
-            handle->nccl_local_comm.reset();
-          }
+          if (handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); }
         }
       }
     } else {
@@ -672,12 +672,8 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
       grid_desc->nccl_local_comm.reset();
 
       // Destroy NCCL communicators to reclaim resources if not used by other grid descriptors
-      if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) {
-        handle->nccl_comm.reset();
-      }
-      if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) {
-        handle->nccl_local_comm.reset();
-      }
+      if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { handle->nccl_comm.reset(); }
+      if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); }
     }
 
     *grid_desc_in = grid_desc;
@@ -733,12 +729,8 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
       grid_desc->nccl_local_comm.reset();
 
       // Destroy NCCL communicators to reclaim resources if not used by other grid descriptors
-      if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) {
-        handle->nccl_comm.reset();
-      }
-      if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) {
-        handle->nccl_local_comm.reset();
-      }
+      if (handle->nccl_comm && handle->nccl_comm.use_count() == 1) { handle->nccl_comm.reset(); }
+      if (handle->nccl_local_comm && handle->nccl_local_comm.use_count() == 1) { handle->nccl_local_comm.reset(); }
     }
 
 #ifdef ENABLE_NVSHMEM
-Original file line number
+Diff line change
@@ Expand Up @@
           grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
     #endif
         }
       }
       // Free test data and workspace
@@ Expand Down @@