From 2789bd90936cd78635ed0ae8ab490f86b7f6cfcf Mon Sep 17 00:00:00 2001
From: jamesETsmith <james.smith9113@gmail.com>
Date: Tue, 11 Nov 2025 17:31:39 -0500
Subject: [PATCH 1/4] Bumping hipcollections to the latest version

This commit (sloppily) adds support for the latest hipcollections
release (release/rocmds-25.10). There are all sorts of potholes here:
hipcollections + libhipcxx + rocm need special care to work correctly.

I've opened issues in libhipcxx to try to address some of these, but for
now we handle them with some source file manipulation in our script to
install graphbolt dependencies.

This PR will break backward compatibility with rocm < 7, but that should
be ok because we'll probably be focused on rocm@7.9+ in the next
release.
---
 .../cuda/cooperative_minibatching_utils.cu    |  4 --
 .../src/cuda/extension/gpu_graph_cache.cu     |  4 --
 .../cuda/extension/unique_and_compact_map.cu  | 20 --------
 script/install_graphbolt_deps.sh              | 49 ++++++++++---------
 4 files changed, 27 insertions(+), 50 deletions(-)
diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu
index 969296597475..6bd0f1005875 100644
--- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu
+++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu
@@ -54,11 +54,7 @@ torch::Tensor RankAssignment(
         THRUST_CALL(
             transform, nodes_ptr, nodes_ptr + nodes.numel(), part_ids_ptr,
 
-#ifdef GRAPHBOLT_USE_HIP
-            ::proclaim_return_type
-#else
             ::cuda::proclaim_return_type
-#endif
             <part_t>(
                 [rank = static_cast<uint32_t>(rank),
                  world_size = static_cast<uint32_t>(
diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.cu b/graphbolt/src/cuda/extension/gpu_graph_cache.cu
index 46732e0aa8f4..3519f01842c2 100644
--- a/graphbolt/src/cuda/extension/gpu_graph_cache.cu
+++ b/graphbolt/src/cuda/extension/gpu_graph_cache.cu
@@ -23,11 +23,7 @@
 
 #include <cstddef>
 #ifdef GRAPHBOLT_USE_HIP
-#include <cuco/cuda_stream_ref.hpp>
 #include <hipcub/hipcub.hpp>
-namespace cuda {
-using stream_ref = cuco::cuda_stream_ref;
-}
 #define C10_CUDA_KERNEL_LAUNCH_CHECK C10_HIP_KERNEL_LAUNCH_CHECK
 #else
 #include <cub/cub.cuh>
diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu
index b305247824d1..57ce4d642dae 100644
--- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu
+++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu
@@ -25,10 +25,6 @@
 
 #ifdef GRAPHBOLT_USE_HIP
 #include <hipcub/hipcub.hpp>
-#include <cuco/cuda_stream_ref.hpp>
-namespace cuda{
-    using stream_ref = cuco::cuda_stream_ref;
-}
 #define C10_CUDA_KERNEL_LAUNCH_CHECK C10_HIP_KERNEL_LAUNCH_CHECK
 #else
 #include <cub/cub.cuh>
@@ -209,11 +205,7 @@ UniqueAndCompactBatchedHashMapBased(
         cub::ArgIndexInputIterator index_it(indexes.data_ptr<int32_t>());
         auto input_it = thrust::make_transform_iterator(
             index_it,
-            #ifdef GRAPHBOLT_USE_HIP
-            ::proclaim_return_type
-            #else
             ::cuda::proclaim_return_type
-            #endif
             <::cuda::std::tuple<int64_t*, index_t, int32_t, bool>>(
                 [=, map = map.ref(cuco::find)] __device__(auto it)
                     -> ::cuda::std::tuple<int64_t*, index_t, int32_t, bool> {
@@ -247,11 +239,7 @@ UniqueAndCompactBatchedHashMapBased(
         auto unique_ids_offsets_dev_ptr =
             unique_ids_offsets_dev.data_ptr<int64_t>();
         auto output_it = thrust::make_tabulate_output_iterator(
-            #ifdef GRAPHBOLT_USE_HIP
-            ::proclaim_return_type
-            #else
             ::cuda::proclaim_return_type
-            #endif
             <void>(
                 [=, unique_ids_ptr = unique_ids.data_ptr<index_t>(),
                  part_ids_ptr =
@@ -276,11 +264,7 @@ UniqueAndCompactBatchedHashMapBased(
             DeviceSelect::If, input_it, output_it,
             unique_ids_offsets_dev_ptr + num_batches,
             offsets_ptr[2 * num_batches],
-            #ifdef GRAPHBOLT_USE_HIP
-            ::proclaim_return_type
-            #else
             ::cuda::proclaim_return_type
-            #endif
             <bool>([] __device__(const auto& t) {
               return ::cuda::std::get<3>(t);
             }));
@@ -300,11 +284,7 @@ UniqueAndCompactBatchedHashMapBased(
                       thrust::make_zip_iterator(
                           unique_ids_offsets_dev2.data_ptr<int64_t>(),
                           unique_ids_offsets.data_ptr<int64_t>()),
-                      #ifdef GRAPHBOLT_USE_HIP
-            ::proclaim_return_type
-            #else
             ::cuda::proclaim_return_type
-            #endif
             <
                           thrust::tuple<int64_t, int64_t>>(
                           [=] __device__(const auto x) {
diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh
index d119994d6d78..27c6f21d3cb9 100644
--- a/script/install_graphbolt_deps.sh
+++ b/script/install_graphbolt_deps.sh
@@ -1,42 +1,47 @@
 #!/usr/bin/env bash
 export CC=/opt/rocm/llvm/bin/clang
 export CXX=/opt/rocm/llvm/bin/clang++
-
+set -x
 # set the install prefix to the cwd/install
 # INSTALL_PREFIX=$(pwd)/install
 INSTALL_PREFIX=/opt/rocm
 FILE_SOURCE_DIR=$(dirname $(realpath $0))
 DEPS_DIR=$(pwd)
+export CMAKE_PREFIX_PATH="/opt/rocm/hip/lib/cmake;/opt/rocm/lib/cmake"
 
-# Not installed by default
-git clone https://github.com/ROCm/libhipcxx.git 
-cd libhipcxx 
-git checkout v2.2.0 
-cmake -B build \
-        -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}
-cmake --build build --target install 
-cd ${DEPS_DIR}
+# # Not installed by default
+# git clone https://github.com/ROCm/libhipcxx.git 
+# cd libhipcxx 
+# git checkout v2.2.0 
+# cmake -B build \
+#         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}
+# cmake --build build --target install 
+# cd ${DEPS_DIR}
 
-# Need to patch for https://github.com/ROCm/rocm-libraries/issues/101.
-# Should be fixed in
-# https://github.com/ROCm/rocm-libraries/commit/e403601a2abe4a305cafd6526af2dc9bc69823e2#diff-7579081ee4dda43a07274a2397b8277bfa022af6d485ba086efc66a124ee8f5b
-git clone https://github.com/tpopp/rocThrust.git
-cd rocThrust
-git checkout 613db9a025709fb18f2a676543a17850bd231b04
-cmake -B build \
-        -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}
-cmake --build build --target install
-cd ${DEPS_DIR}
+# # Need to patch for https://github.com/ROCm/rocm-libraries/issues/101.
+# # Should be fixed in
+# # https://github.com/ROCm/rocm-libraries/commit/e403601a2abe4a305cafd6526af2dc9bc69823e2#diff-7579081ee4dda43a07274a2397b8277bfa022af6d485ba086efc66a124ee8f5b
+# git clone https://github.com/tpopp/rocThrust.git
+# cd rocThrust
+# git checkout 613db9a025709fb18f2a676543a17850bd231b04
+# cmake -B build \
+#         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}
+# cmake --build build --target install
+# cd ${DEPS_DIR}
 
-# Need to patch for https://github.com/ROCm/hipCollections/issues/7, https://github.com/ROCm/hipCollections/issues/8, https://github.com/ROCm/hipCollections/issues/9
-git clone https://github.com/tpopp/hipCollections.git 
+git clone https://github.com/ROCm/hipCollections.git -b release/rocmds-25.10
+export RAPIDS_CMAKE_SCRIPT_BRANCH=release/rocmds-25.10
 cd hipCollections
-git checkout 6e31da8fd309f229d28adde8583a30bb4efaf1b7 
 cmake -B build \
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DINSTALL_CUCO=ON -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_EXAMPLES=OFF
 cmake --build build --target install
 cd ${DEPS_DIR}
 
+# find and remove all lines in the libhipcxx that contain "#error semaphore is not supported on AMD hardware and should not be included"
+sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/semaphore
+sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/semaphore
+sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/std/semaphore
+sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/std/semaphore
 
 # if ROCM < 7.0 we also need to install rocThrust
 ROCM_VERSION=$(/opt/rocm/bin/hipconfig --version)

From 598430602a270baf0f2e9c0c0bf48b64e9ea0a67 Mon Sep 17 00:00:00 2001
From: jamesETsmith <james.smith9113@gmail.com>
Date: Wed, 12 Nov 2025 10:56:10 -0500
Subject: [PATCH 2/4] Cleaning up graphbolt install script

There are some challenges with the new hipco installation. For example,
the libhipcxx CMake config gets installed in /opt/rocm/lib/rapids/cmake
instead of /opt/rocm/lib/cmake.
---
 CMakeLists.txt                   |  1 +
 CMakePresets.json                |  2 +-
 docker/Dockerfile.ci_gpu_rocm    |  2 +-
 script/install_graphbolt_deps.sh | 66 +++++++-------------------------
 4 files changed, 17 insertions(+), 54 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 00491e4f6ad2..3dca01dac1ed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -634,6 +634,7 @@ if (BUILD_GRAPHBOLT)
         ALL
         ${CMAKE_COMMAND} -E env
         CMAKE_COMMAND=${CMAKE_CMD}
+        CMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}"
         PYTORCH_ROCM_ARCH=${CMAKE_HIP_ARCHITECTURES}
         GPU_TARGETS=${CMAKE_HIP_ARCHITECTURES}
         CMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}
diff --git a/CMakePresets.json b/CMakePresets.json
index f02ab2f6629c..00926fbcc23b 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -23,7 +23,7 @@
         "CMAKE_CXX_FLAGS": "-fdiagnostics-color=always",
         "CMAKE_HIP_FLAGS": "-ftime-trace -fdiagnostics-color=always",
         "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
-        "CMAKE_PREFIX_PATH": "/opt/rocm/lib/cmake",
+        "CMAKE_PREFIX_PATH": "/opt/rocm/lib/cmake:/opt/rocm/lib/rapids/cmake",
         "CMAKE_COLOR_DIAGNOSTICS": "ON"
       }
     },
diff --git a/docker/Dockerfile.ci_gpu_rocm b/docker/Dockerfile.ci_gpu_rocm
index 5d13c774c96d..ae74bf91e32c 100644
--- a/docker/Dockerfile.ci_gpu_rocm
+++ b/docker/Dockerfile.ci_gpu_rocm
@@ -2,7 +2,7 @@
 # Licensed under the Apache License Version 2.0"
 
 #############################################################################
-ARG BASE_IMAGE=rocm/pytorch:rocm7.0_ubuntu24.04_py3.12_pytorch_release_2.6.0
+ARG BASE_IMAGE=rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.6.0
 FROM ${BASE_IMAGE} AS dgl_build
 
 # NOTE: This dockerfile **assumes** that BASE_IMAGE comes with the appropriate
diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh
index 27c6f21d3cb9..15290bada8d1 100644
--- a/script/install_graphbolt_deps.sh
+++ b/script/install_graphbolt_deps.sh
@@ -1,33 +1,15 @@
 #!/usr/bin/env bash
-export CC=/opt/rocm/llvm/bin/clang
-export CXX=/opt/rocm/llvm/bin/clang++
+
+ROCM_ROOT=/opt/rocm
+
+export CC=${ROCM_ROOT}/llvm/bin/clang
+export CXX=${ROCM_ROOT}/llvm/bin/clang++
+
 set -x
-# set the install prefix to the cwd/install
-# INSTALL_PREFIX=$(pwd)/install
-INSTALL_PREFIX=/opt/rocm
+INSTALL_PREFIX=${ROCM_ROOT}
 FILE_SOURCE_DIR=$(dirname $(realpath $0))
 DEPS_DIR=$(pwd)
-export CMAKE_PREFIX_PATH="/opt/rocm/hip/lib/cmake;/opt/rocm/lib/cmake"
-
-# # Not installed by default
-# git clone https://github.com/ROCm/libhipcxx.git 
-# cd libhipcxx 
-# git checkout v2.2.0 
-# cmake -B build \
-#         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}
-# cmake --build build --target install 
-# cd ${DEPS_DIR}
-
-# # Need to patch for https://github.com/ROCm/rocm-libraries/issues/101.
-# # Should be fixed in
-# # https://github.com/ROCm/rocm-libraries/commit/e403601a2abe4a305cafd6526af2dc9bc69823e2#diff-7579081ee4dda43a07274a2397b8277bfa022af6d485ba086efc66a124ee8f5b
-# git clone https://github.com/tpopp/rocThrust.git
-# cd rocThrust
-# git checkout 613db9a025709fb18f2a676543a17850bd231b04
-# cmake -B build \
-#         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}
-# cmake --build build --target install
-# cd ${DEPS_DIR}
+export CMAKE_PREFIX_PATH="/opt/rocm/hip/lib/cmake;/opt/rocm/lib/cmake"  
 
 git clone https://github.com/ROCm/hipCollections.git -b release/rocmds-25.10
 export RAPIDS_CMAKE_SCRIPT_BRANCH=release/rocmds-25.10
@@ -37,35 +19,15 @@ cmake -B build \
 cmake --build build --target install
 cd ${DEPS_DIR}
 
+# TODO this is an unacceptable way to do this, hopefully we can resolve https://github.com/ROCm/libhipcxx/issues/10 quickly
+# previous version of libhipcxx allowed semaphores and we didn't have a problems so we're turning them on here
 # find and remove all lines in the libhipcxx that contain "#error semaphore is not supported on AMD hardware and should not be included"
 sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/semaphore
 sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/semaphore
 sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/std/semaphore
 sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/std/semaphore
 
-# if ROCM < 7.0 we also need to install rocThrust
-ROCM_VERSION=$(/opt/rocm/bin/hipconfig --version)
-#strip the major version from the ROCM_VERSION (before the dot)
-ROCM_VERSION=${ROCM_VERSION%%.*}
-echo "Working with ROCm Major Version: $ROCM_VERSION"
-if [ "$ROCM_VERSION" -lt "7" ]; then
-
-        # Need to patch for https://github.com/ROCm/rocm-libraries/issues/94. Fixed in https://github.com/ROCm/rocm-libraries/commit/2539bb2e1cd17d287f532a65125b662bf0b658dc
-        git clone https://github.com/tpopp/hipCUB.git
-        cd hipCUB
-        git checkout f342111197dd020f1c4210b16aa550b08992e97b
-        cmake -B build \
-                -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}
-        cmake --build build --target install
-        cd ${DEPS_DIR}
-else
-    echo "ROCm Major Version is 7.0 or higher, skipping hipCUB installation"
-    # TODO remove this once the patches are merged
-    # Right now we need to patch the rocPRIM headers to fix the build because these
-    # config headers are missing gfx942 (I've added them manually)
-    cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.
-
-fi
-
-
-
+# TODO remove this once the patches are merged
+# Right now we need to patch the rocPRIM headers to fix the build because these
+# config headers are missing gfx942 (I've added them manually)
+cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.
\ No newline at end of file

From c8c5c165fdde5f2c4301a32278931364aba866d3 Mon Sep 17 00:00:00 2001
From: jamesETsmith <james.smith9113@gmail.com>
Date: Wed, 12 Nov 2025 12:33:22 -0500
Subject: [PATCH 3/4] Formatting changes and adding dockerignore

---
 .dockerignore                                       |  8 ++++++++
 docker/Dockerfile.ci_gpu_rocm                       |  7 -------
 .../src/cuda/cooperative_minibatching_utils.cu      |  3 +--
 graphbolt/src/cuda/extension/gpu_graph_cache.cu     | 10 ++++------
 .../src/cuda/extension/unique_and_compact_map.cu    | 13 +++++--------
 script/install_graphbolt_deps.sh                    |  2 +-
 6 files changed, 19 insertions(+), 24 deletions(-)
 create mode 100644 .dockerignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 000000000000..c87e6c56f85f
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,8 @@
+build/
+graphbolt/build/
+dgl_sparse/build/
+tensoradapter/pytorch/build/
+python/build/
+python/dist/
+python/*.egg-info/
+python/libdgl.so
diff --git a/docker/Dockerfile.ci_gpu_rocm b/docker/Dockerfile.ci_gpu_rocm
index ae74bf91e32c..d8758f41a464 100644
--- a/docker/Dockerfile.ci_gpu_rocm
+++ b/docker/Dockerfile.ci_gpu_rocm
@@ -28,13 +28,6 @@ ENV DGL_SRC_DIR="/src/dgl"
 RUN mkdir -p ${DGL_SRC_DIR}
 COPY . ${DGL_SRC_DIR}
 
-# Clean up remnants of any previous builds
-RUN rm -rf ${DGL_SRC_DIR}/build
-RUN rm -rf ${DGL_SRC_DIR}/graphbolt/build
-RUN rm -rf ${DGL_SRC_DIR}/dgl_sparse/build
-RUN rm -rf ${DGL_SRC_DIR}/tensoradapter/pytorch/build
-RUN rm -rf ${DGL_SRC_DIR}/python/build ${DGL_SRC_DIR}/python/dist ${DGL_SRC_DIR}/python/*.egg-info ${DGL_SRC_DIR}/python/libdgl.so
-
 # Set GPU build targets
 ARG ARG_GPU_BUILD_TARGETS="gfx90a,gfx942"
 ENV GPU_BUILD_TARGETS=${ARG_GPU_BUILD_TARGETS}
diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu
index 6bd0f1005875..00ad8341900c 100644
--- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu
+++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu
@@ -54,8 +54,7 @@ torch::Tensor RankAssignment(
         THRUST_CALL(
             transform, nodes_ptr, nodes_ptr + nodes.numel(), part_ids_ptr,
 
-            ::cuda::proclaim_return_type
-            <part_t>(
+            ::cuda::proclaim_return_type<part_t>(
                 [rank = static_cast<uint32_t>(rank),
                  world_size = static_cast<uint32_t>(
                      world_size)] __device__(index_t id) -> part_t {
diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.cu b/graphbolt/src/cuda/extension/gpu_graph_cache.cu
index 3519f01842c2..3990ac498713 100644
--- a/graphbolt/src/cuda/extension/gpu_graph_cache.cu
+++ b/graphbolt/src/cuda/extension/gpu_graph_cache.cu
@@ -506,12 +506,10 @@ std::tuple<torch::Tensor, std::vector<torch::Tensor>> GpuGraphCache::Replace(
               }
               if (edge_id_offsets) {
                 // Append the edge ids as the last element of the output.
-                output_edge_tensors.push_back(
-                    ops::IndptrEdgeIdsImpl(
-                        output_indptr, output_indptr.scalar_type(),
-                        *edge_id_offsets,
-                        static_cast<int64_t>(
-                            static_cast<indptr_t>(output_size))));
+                output_edge_tensors.push_back(ops::IndptrEdgeIdsImpl(
+                    output_indptr, output_indptr.scalar_type(),
+                    *edge_id_offsets,
+                    static_cast<int64_t>(static_cast<indptr_t>(output_size))));
               }
 
               {
diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu
index 57ce4d642dae..6a5625634065 100644
--- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu
+++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu
@@ -205,8 +205,8 @@ UniqueAndCompactBatchedHashMapBased(
         cub::ArgIndexInputIterator index_it(indexes.data_ptr<int32_t>());
         auto input_it = thrust::make_transform_iterator(
             index_it,
-            ::cuda::proclaim_return_type
-            <::cuda::std::tuple<int64_t*, index_t, int32_t, bool>>(
+            ::cuda::proclaim_return_type<
+                ::cuda::std::tuple<int64_t*, index_t, int32_t, bool>>(
                 [=, map = map.ref(cuco::find)] __device__(auto it)
                     -> ::cuda::std::tuple<int64_t*, index_t, int32_t, bool> {
                   const auto i = it.key;
@@ -239,8 +239,7 @@ UniqueAndCompactBatchedHashMapBased(
         auto unique_ids_offsets_dev_ptr =
             unique_ids_offsets_dev.data_ptr<int64_t>();
         auto output_it = thrust::make_tabulate_output_iterator(
-            ::cuda::proclaim_return_type
-            <void>(
+            ::cuda::proclaim_return_type<void>(
                 [=, unique_ids_ptr = unique_ids.data_ptr<index_t>(),
                  part_ids_ptr =
                      part_ids ? part_ids->data_ptr<cuda::part_t>() : nullptr,
@@ -264,8 +263,7 @@ UniqueAndCompactBatchedHashMapBased(
             DeviceSelect::If, input_it, output_it,
             unique_ids_offsets_dev_ptr + num_batches,
             offsets_ptr[2 * num_batches],
-            ::cuda::proclaim_return_type
-            <bool>([] __device__(const auto& t) {
+            ::cuda::proclaim_return_type<bool>([] __device__(const auto& t) {
               return ::cuda::std::get<3>(t);
             }));
         auto unique_ids_offsets = torch::empty(
@@ -284,8 +282,7 @@ UniqueAndCompactBatchedHashMapBased(
                       thrust::make_zip_iterator(
                           unique_ids_offsets_dev2.data_ptr<int64_t>(),
                           unique_ids_offsets.data_ptr<int64_t>()),
-            ::cuda::proclaim_return_type
-            <
+                      ::cuda::proclaim_return_type<
                           thrust::tuple<int64_t, int64_t>>(
                           [=] __device__(const auto x) {
                             return thrust::make_tuple(x, x);
diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh
index 15290bada8d1..8d3c2e9fe865 100644
--- a/script/install_graphbolt_deps.sh
+++ b/script/install_graphbolt_deps.sh
@@ -30,4 +30,4 @@ sed -i '/#error semaphore is not supported on AMD hardware and should not be inc
 # TODO remove this once the patches are merged
 # Right now we need to patch the rocPRIM headers to fix the build because these
 # config headers are missing gfx942 (I've added them manually)
-cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.
\ No newline at end of file
+cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.

From 7da392ad2338694b66ba80ccf975c9a75f1c3f08 Mon Sep 17 00:00:00 2001
From: jamesETsmith <james.smith9113@gmail.com>
Date: Fri, 14 Nov 2025 12:25:32 -0500
Subject: [PATCH 4/4] Adding links to rocprim PR and better comments in
 the script to install graphbolt dependencies

---
 script/install_graphbolt_deps.sh | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/script/install_graphbolt_deps.sh b/script/install_graphbolt_deps.sh
index 8d3c2e9fe865..0a67bba493d0 100644
--- a/script/install_graphbolt_deps.sh
+++ b/script/install_graphbolt_deps.sh
@@ -19,15 +19,23 @@ cmake -B build \
 cmake --build build --target install
 cd ${DEPS_DIR}
 
-# TODO this is an unacceptable way to do this, hopefully we can resolve https://github.com/ROCm/libhipcxx/issues/10 quickly
-# previous version of libhipcxx allowed semaphores and we didn't have a problems so we're turning them on here
-# find and remove all lines in the libhipcxx that contain "#error semaphore is not supported on AMD hardware and should not be included"
+# TODO this is an unacceptable way to do this,
+# see https://github.com/ROCm/libhipcxx/issues/10 for more details
+# This was implicitly not allowed in previous releases we were using, 
+# but with v2.7.0 they are explicitly not allowed.
+
+# We only use semaphores for a counter of IO operations in graphbolt, 
+# that only runs on the host (not on the device) so we should be "safe"
+# to use this for now.
 sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/semaphore
 sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/semaphore
 sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/cuda/std/semaphore
 sed -i '/#error semaphore is not supported on AMD hardware and should not be included/d' ${INSTALL_PREFIX}/include/rapids/libhipcxx/hip/std/semaphore
 
 # TODO remove this once the patches are merged
+# the patches for this were merged in https://github.com/ROCm/rocm-libraries/pull/1883
+# but may take more time to be released.
+
 # Right now we need to patch the rocPRIM headers to fix the build because these
 # config headers are missing gfx942 (I've added them manually)
 cp ${FILE_SOURCE_DIR}/*.hpp ${INSTALL_PREFIX}/include/rocprim/device/detail/config/.