From f057d83efb99bb80bcd3ca1adf84efe21c206852 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 12 Mar 2026 13:20:10 +0100
Subject: [PATCH 1/8] Make warpspeed scan tunable

* Duplicate exclusive/sum benchmark for warpspeed since it is too different from the old implementation
* Hardcode benchmark OffsetT to uint64
* Review warpspeed scan tuning parameters and expose the relevant ones

Fixes: #7893
Fixes: #7894
---
 cub/benchmarks/bench/scan/exclusive/base.cuh  | 122 ++++++++++++------
 cub/benchmarks/bench/scan/exclusive/custom.cu |   4 +-
 cub/benchmarks/bench/scan/exclusive/sum.cu    |   6 +-
 .../bench/scan/exclusive/sum.warspeed.cu      |  32 +++++
 .../device/dispatch/tuning/tuning_scan.cuh    |  20 +--
 5 files changed, 135 insertions(+), 49 deletions(-)
 create mode 100644 cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu
diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh
index 709f5592678..b5a942e9092 100644
--- a/cub/benchmarks/bench/scan/exclusive/base.cuh
+++ b/cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 #pragma once
@@ -11,43 +11,89 @@
 #include <nvbench_helper.cuh>
 
 #if !TUNE_BASE
-#  if TUNE_TRANSPOSE == 0
-#    define TUNE_LOAD_ALGORITHM  cub::BLOCK_LOAD_DIRECT
-#    define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT
-#  else // TUNE_TRANSPOSE == 1
-#    define TUNE_LOAD_ALGORITHM  cub::BLOCK_LOAD_WARP_TRANSPOSE
-#    define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE
-#  endif // TUNE_TRANSPOSE
-
-#  if TUNE_LOAD == 0
-#    define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
-#  elif TUNE_LOAD == 1
-#    define TUNE_LOAD_MODIFIER cub::LOAD_CA
-#  endif // TUNE_LOAD
-
 template <typename AccumT>
 struct policy_hub_t
 {
-  template <int NOMINAL_BLOCK_THREADS_4B,
-            int NOMINAL_ITEMS_PER_THREAD_4B,
-            typename ComputeT,
-            cub::BlockLoadAlgorithm LOAD_ALGORITHM,
-            cub::CacheLoadModifier LOAD_MODIFIER,
-            cub::BlockStoreAlgorithm STORE_ALGORITHM,
-            cub::BlockScanAlgorithm SCAN_ALGORITHM>
-  using agent_policy_t = cub::AgentScanPolicy<
-    NOMINAL_BLOCK_THREADS_4B,
-    NOMINAL_ITEMS_PER_THREAD_4B,
-    ComputeT,
-    LOAD_ALGORITHM,
-    LOAD_MODIFIER,
-    STORE_ALGORITHM,
-    SCAN_ALGORITHM,
-    cub::detail::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
-    delay_constructor_t>;
-
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
+  struct MaxPolicy : cub::ChainedPolicy<300, policy_t, policy_t>
   {
+#  if USES_WARPSPEED()
+    struct WarpspeedPolicy
+    {
+      static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS;
+      static constexpr int num_look_ahead_items      = TUNE_NUM_LOOKBACK_ITEMS;
+      static constexpr int items_per_thread          = TUNE_ITEMS_PLUS_ONE - 1;
+
+      // the rest are fixed or derived definitions
+
+      static constexpr int num_squads           = 5;
+      static constexpr int num_threads_per_warp = 32;
+      static constexpr int num_load_warps       = 1;
+      static constexpr int num_sched_warps      = 1;
+      static constexpr int num_look_ahead_warps = 1;
+
+      static constexpr int num_total_warps =
+        2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
+      static constexpr int num_total_threads = num_total_warps * num_threads_per_warp;
+
+      static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp;
+
+      static constexpr int tile_size = items_per_thread * squad_reduce_thread_count;
+
+      // The squads cannot be static constexpr variables, as those are not device accessible
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept
+      {
+        return warpspeed::SquadDesc{0, num_reduce_and_scan_warps};
+      }
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept
+      {
+        return warpspeed::SquadDesc{1, num_reduce_and_scan_warps};
+      }
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept
+      {
+        return warpspeed::SquadDesc{2, num_load_warps};
+      }
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadSched() noexcept
+      {
+        return warpspeed::SquadDesc{3, num_sched_warps};
+      }
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLookback() noexcept
+      {
+        return warpspeed::SquadDesc{4, num_look_ahead_warps};
+      }
+    };
+#  else // USES_WARPSPEED()
+#    if TUNE_TRANSPOSE == 0
+#      define TUNE_LOAD_ALGORITHM  cub::BLOCK_LOAD_DIRECT
+#      define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT
+#    else // TUNE_TRANSPOSE == 1
+#      define TUNE_LOAD_ALGORITHM  cub::BLOCK_LOAD_WARP_TRANSPOSE
+#      define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE
+#    endif // TUNE_TRANSPOSE
+
+#    if TUNE_LOAD == 0
+#      define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
+#    elif TUNE_LOAD == 1
+#      define TUNE_LOAD_MODIFIER cub::LOAD_CA
+#    endif // TUNE_LOAD
+
+    template <int NOMINAL_BLOCK_THREADS_4B,
+              int NOMINAL_ITEMS_PER_THREAD_4B,
+              typename ComputeT,
+              cub::BlockLoadAlgorithm LOAD_ALGORITHM,
+              cub::CacheLoadModifier LOAD_MODIFIER,
+              cub::BlockStoreAlgorithm STORE_ALGORITHM,
+              cub::BlockScanAlgorithm SCAN_ALGORITHM>
+    using agent_policy_t = cub::AgentScanPolicy<
+      NOMINAL_BLOCK_THREADS_4B,
+      NOMINAL_ITEMS_PER_THREAD_4B,
+      ComputeT,
+      LOAD_ALGORITHM,
+      LOAD_MODIFIER,
+      STORE_ALGORITHM,
+      SCAN_ALGORITHM,
+      cub::detail::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
+      delay_constructor_t>;
+
     using ScanPolicyT =
       agent_policy_t<TUNE_THREADS,
                      TUNE_ITEMS,
@@ -56,9 +102,8 @@ struct policy_hub_t
                      TUNE_LOAD_MODIFIER,
                      TUNE_STORE_ALGORITHM,
                      cub::BLOCK_SCAN_WARP_SCANS>;
+#  endif // USES_WARPSPEED()
   };
-
-  using MaxPolicy = policy_t;
 };
 #endif // TUNE_BASE
 
@@ -72,6 +117,9 @@ try
   using input_it_t     = const T*;
   using output_it_t    = T*;
   using offset_t       = cub::detail::choose_offset_t<OffsetT>;
+#if USES_WARPSPEED()
+  static_assert(sizeof(offset_t) == sizeof(size_t)); // warpspeed scan uses size_t internally
+#endif // USES_WARPSPEED()
 
 #if !TUNE_BASE
   using policy_t   = policy_hub_t<accum_t>;
@@ -128,7 +176,7 @@ catch (const std::bad_alloc&)
   state.skip("Skipping: out of memory.");
 }
 
-NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, offset_types))
+NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, scan_offset_types))
   .set_name("base")
   .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4));
diff --git a/cub/benchmarks/bench/scan/exclusive/custom.cu b/cub/benchmarks/bench/scan/exclusive/custom.cu
index 057a198d34f..a54500be00c 100644
--- a/cub/benchmarks/bench/scan/exclusive/custom.cu
+++ b/cub/benchmarks/bench/scan/exclusive/custom.cu
@@ -9,5 +9,7 @@
 
 #include <nvbench_helper.cuh>
 
-using op_t = max_t;
+#define USES_WARPSPEED() 0
+using op_t              = max_t;
+using scan_offset_types = offset_types;
 #include "base.cuh"
diff --git a/cub/benchmarks/bench/scan/exclusive/sum.cu b/cub/benchmarks/bench/scan/exclusive/sum.cu
index 36b1e02b831..df3beebcd99 100644
--- a/cub/benchmarks/bench/scan/exclusive/sum.cu
+++ b/cub/benchmarks/bench/scan/exclusive/sum.cu
@@ -5,6 +5,8 @@
 
 #include <nvbench_helper.cuh>
 
+// This benchmark tunes the old, non-warpspeed scan implementation
+
 // %RANGE% TUNE_ITEMS ipt 7:24:1
 // %RANGE% TUNE_THREADS tpb 128:1024:32
 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4
@@ -13,5 +15,7 @@
 // %RANGE% TUNE_TRANSPOSE trp 0:1:1
 // %RANGE% TUNE_LOAD ld 0:1:1
 
-using op_t = ::cuda::std::plus<>;
+#define USES_WARPSPEED() 0
+using op_t              = ::cuda::std::plus<>;
+using scan_offset_types = offset_types;
 #include "base.cuh"
diff --git a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu
new file mode 100644
index 00000000000..373a07305ec
--- /dev/null
+++ b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu
@@ -0,0 +1,32 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// This tunes the warpspeed implementation of scan, which is only available on SM100+. It has entirely different tuning
+// parameters and is agnostic of the offset type. It is thus in a separate file, so we can continue to tune the old scan
+// implementation on older hardware architectures.
+
+#if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
+#  error "This benchmark does not support being compiled for multiple architectures"
+#endif
+
+#if __CUDA_ARCH_LIST__ < 1000
+#  error Warpspeed scan requires at least sm_100
+#endif // __CUDA_ARCH_LIST__ < 1000
+
+#if __cccl_ptx_isa < 860
+#  error Warpspeed scan requires at least PTX ISA 8.6
+#endif // __cccl_ptx_isa < 860
+
+#include <nvbench_helper.cuh>
+
+// %RANGE% TUNE_NUM_REDUCE_SCAN_WARPS wrps 1:8:1
+// %RANGE% TUNE_NUM_LOOKBACK_ITEMS lbi 1:8:1
+
+// TODO(bgruber): find a good range and step width, items per thread should be coprime with 32 to avoid SMEM conflicts.
+// Should we specify nominal items per thread instead?
+// %RANGE% TUNE_ITEMS_PLUS_ONE ipt 8:256:8
+
+#define USES_WARPSPEED() 1
+using op_t              = ::cuda::std::plus<>;
+using scan_offset_types = nvbench::type_list<int64_t>;
+#include "base.cuh"
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index aa7f991f328..4c9b07e7c84 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -568,6 +568,8 @@ struct policy_hub
 #if __cccl_ptx_isa >= 860
     struct WarpspeedPolicy
     {
+      // TODO(bgruber): remove everything but num_reduce_warps, num_look_ahead_items and items_per_thread from policy
+
       // Squad definitions
       static constexpr int num_squads           = 5;
       static constexpr int num_threads_per_warp = 32;
@@ -579,26 +581,24 @@ struct policy_hub
       static constexpr int num_reduce_warps    = 2;
       static constexpr int num_scan_stor_warps = 2;
 #  else // _CCCL_COMPILER(NVHPC)
-      static constexpr int num_reduce_warps    = 4;
-      static constexpr int num_scan_stor_warps = 4;
+      static constexpr int num_reduce_and_scan_warps = 4; // this can be tuned
 #  endif // _CCCL_COMPILER(NVHPC)
-      static constexpr int num_load_warps       = 1;
-      static constexpr int num_sched_warps      = 1;
-      static constexpr int num_look_ahead_warps = 1;
-
+      static constexpr int num_load_warps       = 1; // no point in being more than 1
+      static constexpr int num_sched_warps      = 1; // no point in being more than 1
+      static constexpr int num_look_ahead_warps = 1; // must be 1
       // TODO(bgruber): 5 is a bit better for complex<float>
       static constexpr int num_look_ahead_items = sizeof(AccumT) == 2 ? 3 : 4;
 
       // Deduced definitions
       static constexpr int num_total_warps =
-        num_reduce_warps + num_scan_stor_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
+        2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
       static constexpr int num_total_threads = num_total_warps * num_threads_per_warp;
 
 #  if _CCCL_COMPILER(NVHPC)
       static_assert(num_total_threads <= 256);
 #  endif // _CCCL_COMPILER(NVHPC)
 
-      static constexpr int squad_reduce_thread_count = num_reduce_warps * num_threads_per_warp;
+      static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp;
 
       // manual tuning based on cub.bench.scan.exclusive.sum.base
       // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM)
@@ -627,11 +627,11 @@ struct policy_hub
       // The squads cannot be static constexpr variables, as those are not device accessible
       [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept
       {
-        return warpspeed::SquadDesc{0, num_reduce_warps};
+        return warpspeed::SquadDesc{0, num_reduce_and_scan_warps};
       }
       [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept
       {
-        return warpspeed::SquadDesc{1, num_scan_stor_warps};
+        return warpspeed::SquadDesc{1, num_reduce_and_scan_warps};
       }
       [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept
       {

From 4dbdc0f09554df280c34f3c32c756cd8c39aea9a Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 12 Mar 2026 13:24:40 +0100
Subject: [PATCH 2/8] Comment

---
 cub/benchmarks/bench/scan/exclusive/sum.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cub/benchmarks/bench/scan/exclusive/sum.cu b/cub/benchmarks/bench/scan/exclusive/sum.cu
index df3beebcd99..a0104279f76 100644
--- a/cub/benchmarks/bench/scan/exclusive/sum.cu
+++ b/cub/benchmarks/bench/scan/exclusive/sum.cu
@@ -5,7 +5,8 @@
 
 #include <nvbench_helper.cuh>
 
-// This benchmark tunes the old, non-warpspeed scan implementation
+// This benchmark tunes the old, non-warpspeed scan implementation. Using it for benchmarking, will pick the warpspeed
+// implementation on SM100+, but it's better to use the sum.warpspeed.cu benchmark instead, which uses a single OffsetT.
 
 // %RANGE% TUNE_ITEMS ipt 7:24:1
 // %RANGE% TUNE_THREADS tpb 128:1024:32

From 88ed0eb6207bff0d5d8ad8fcd87e918c98cb33c8 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 12 Mar 2026 13:27:31 +0100
Subject: [PATCH 3/8] Comment

---
 cub/cub/device/dispatch/tuning/tuning_scan.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 4c9b07e7c84..9178d558fe5 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -568,7 +568,7 @@ struct policy_hub
 #if __cccl_ptx_isa >= 860
     struct WarpspeedPolicy
     {
-      // TODO(bgruber): remove everything but num_reduce_warps, num_look_ahead_items and items_per_thread from policy
+      // TODO(bgruber): remove everything but num_reduce_and_scan_warps, num_look_ahead_items and items_per_thread
 
       // Squad definitions
       static constexpr int num_squads           = 5;

From d25427036ee1564419f375f621f52f12e49c7c21 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 12 Mar 2026 13:27:35 +0100
Subject: [PATCH 4/8] Fix

---
 cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu
index 373a07305ec..996837ffd2e 100644
--- a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu
+++ b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu
@@ -5,6 +5,8 @@
 // parameters and is agnostic of the offset type. It is thus in a separate file, so we can continue to tune the old scan
 // implementation on older hardware architectures.
 
+#include <cuda/__cccl_config>
+
 #if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
 #  error "This benchmark does not support being compiled for multiple architectures"
 #endif

From f528d154d989d9f40e4e847380362248e1435ba1 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 12 Mar 2026 13:49:52 +0100
Subject: [PATCH 5/8] Tuning fixes

---
 cub/benchmarks/bench/scan/exclusive/base.cuh | 29 ++++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh
index b5a942e9092..e5242d5f2aa 100644
--- a/cub/benchmarks/bench/scan/exclusive/base.cuh
+++ b/cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -7,14 +7,17 @@
 
 #include <cuda/std/__functional/invoke.h>
 
-#include <look_back_helper.cuh>
 #include <nvbench_helper.cuh>
 
 #if !TUNE_BASE
+#  if !USES_WARPSPEED()
+#    include <look_back_helper.cuh>
+#  endif // !USES_WARPSPEED()
+
 template <typename AccumT>
 struct policy_hub_t
 {
-  struct MaxPolicy : cub::ChainedPolicy<300, policy_t, policy_t>
+  struct MaxPolicy : cub::ChainedPolicy<300, MaxPolicy, MaxPolicy>
   {
 #  if USES_WARPSPEED()
     struct WarpspeedPolicy
@@ -39,26 +42,28 @@ struct policy_hub_t
 
       static constexpr int tile_size = items_per_thread * squad_reduce_thread_count;
 
+      using SquadDesc = cub::detail::warpspeed::SquadDesc;
+
       // The squads cannot be static constexpr variables, as those are not device accessible
-      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadReduce() noexcept
       {
-        return warpspeed::SquadDesc{0, num_reduce_and_scan_warps};
+        return SquadDesc{0, num_reduce_and_scan_warps};
       }
-      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadScanStore() noexcept
       {
-        return warpspeed::SquadDesc{1, num_reduce_and_scan_warps};
+        return SquadDesc{1, num_reduce_and_scan_warps};
       }
-      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLoad() noexcept
       {
-        return warpspeed::SquadDesc{2, num_load_warps};
+        return SquadDesc{2, num_load_warps};
       }
-      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadSched() noexcept
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadSched() noexcept
       {
-        return warpspeed::SquadDesc{3, num_sched_warps};
+        return SquadDesc{3, num_sched_warps};
       }
-      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLookback() noexcept
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLookback() noexcept
       {
-        return warpspeed::SquadDesc{4, num_look_ahead_warps};
+        return SquadDesc{4, num_look_ahead_warps};
       }
     };
 #  else // USES_WARPSPEED()

From 2da3dd1e44b177d78c7bbd5a2805b051c67c7d5a Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 12 Mar 2026 16:19:18 +0100
Subject: [PATCH 6/8] Let the CI compile the benchmark

---
 .../bench/scan/exclusive/sum.warspeed.cu      | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu
index 996837ffd2e..3cd70b5af62 100644
--- a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu
+++ b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu
@@ -8,18 +8,18 @@
 #include <cuda/__cccl_config>
 
 #if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
-#  error "This benchmark does not support being compiled for multiple architectures"
-#endif
+#  warning "This benchmark does not support being compiled for multiple architectures. Disabling it."
+#else // _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
 
-#if __CUDA_ARCH_LIST__ < 1000
-#  error Warpspeed scan requires at least sm_100
-#endif // __CUDA_ARCH_LIST__ < 1000
+#  if __CUDA_ARCH_LIST__ < 1000
+#    warning "Warpspeed scan requires at least sm_100. Disabling it."
+#  else // __CUDA_ARCH_LIST__ < 1000
 
-#if __cccl_ptx_isa < 860
-#  error Warpspeed scan requires at least PTX ISA 8.6
-#endif // __cccl_ptx_isa < 860
+#    if __cccl_ptx_isa < 860
+#      warning "Warpspeed scan requires at least PTX ISA 8.6. Disabling it."
+#    else // if __cccl_ptx_isa < 860
 
-#include <nvbench_helper.cuh>
+#      include <nvbench_helper.cuh>
 
 // %RANGE% TUNE_NUM_REDUCE_SCAN_WARPS wrps 1:8:1
 // %RANGE% TUNE_NUM_LOOKBACK_ITEMS lbi 1:8:1
@@ -28,7 +28,11 @@
 // Should we specify nominal items per thread instead?
 // %RANGE% TUNE_ITEMS_PLUS_ONE ipt 8:256:8
 
-#define USES_WARPSPEED() 1
+#      define USES_WARPSPEED() 1
 using op_t              = ::cuda::std::plus<>;
 using scan_offset_types = nvbench::type_list<int64_t>;
-#include "base.cuh"
+#      include "base.cuh"
+
+#    endif // __cccl_ptx_isa < 860
+#  endif // __CUDA_ARCH_LIST__ < 1000
+#endif // _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1

From 275cd87da310a24db32ce790fe58c24334441036 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 12 Mar 2026 16:19:32 +0100
Subject: [PATCH 7/8] Fix filename

---
 .../bench/scan/exclusive/{sum.warspeed.cu => sum.warpspeed.cu}    | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename cub/benchmarks/bench/scan/exclusive/{sum.warspeed.cu => sum.warpspeed.cu} (100%)

diff --git a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu b/cub/benchmarks/bench/scan/exclusive/sum.warpspeed.cu
similarity index 100%
rename from cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu
rename to cub/benchmarks/bench/scan/exclusive/sum.warpspeed.cu

From 2dee3c412aacdc8d2c1aebd704fd31c2eb772dce Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 12 Mar 2026 18:07:17 +0100
Subject: [PATCH 8/8] Fix NVHPC

---
 cub/cub/device/dispatch/tuning/tuning_scan.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 9178d558fe5..a83d2bdf6ee 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -578,8 +578,7 @@ struct policy_hub
 #  if _CCCL_COMPILER(NVHPC)
       // need to reduce the number of threads to <= 256, so each thread can use up to 255 registers. This avoids an
       // error in ptxas, see also: https://github.com/NVIDIA/cccl/issues/7700.
-      static constexpr int num_reduce_warps    = 2;
-      static constexpr int num_scan_stor_warps = 2;
+      static constexpr int num_reduce_and_scan_warps = 2;
 #  else // _CCCL_COMPILER(NVHPC)
       static constexpr int num_reduce_and_scan_warps = 4; // this can be tuned
 #  endif // _CCCL_COMPILER(NVHPC)