NVIDIA · davebayer · Mar 13, 2026 · Mar 12, 2026 · Mar 12, 2026 · Mar 12, 2026
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 #pragma once
@@ -7,47 +7,98 @@
 
 #include <cuda/std/__functional/invoke.h>
 
-#include <look_back_helper.cuh>
 #include <nvbench_helper.cuh>
 
 #if !TUNE_BASE
-#  if TUNE_TRANSPOSE == 0
-#    define TUNE_LOAD_ALGORITHM  cub::BLOCK_LOAD_DIRECT
-#    define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT
-#  else // TUNE_TRANSPOSE == 1
-#    define TUNE_LOAD_ALGORITHM  cub::BLOCK_LOAD_WARP_TRANSPOSE
-#    define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE
-#  endif // TUNE_TRANSPOSE
-
-#  if TUNE_LOAD == 0
-#    define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
-#  elif TUNE_LOAD == 1
-#    define TUNE_LOAD_MODIFIER cub::LOAD_CA
-#  endif // TUNE_LOAD
+#  if !USES_WARPSPEED()
+#    include <look_back_helper.cuh>
+#  endif // !USES_WARPSPEED()
 
 template <typename AccumT>
 struct policy_hub_t
 {
-  template <int NOMINAL_BLOCK_THREADS_4B,
-            int NOMINAL_ITEMS_PER_THREAD_4B,
-            typename ComputeT,
-            cub::BlockLoadAlgorithm LOAD_ALGORITHM,
-            cub::CacheLoadModifier LOAD_MODIFIER,
-            cub::BlockStoreAlgorithm STORE_ALGORITHM,
-            cub::BlockScanAlgorithm SCAN_ALGORITHM>
-  using agent_policy_t = cub::AgentScanPolicy<
-    NOMINAL_BLOCK_THREADS_4B,
-    NOMINAL_ITEMS_PER_THREAD_4B,
-    ComputeT,
-    LOAD_ALGORITHM,
-    LOAD_MODIFIER,
-    STORE_ALGORITHM,
-    SCAN_ALGORITHM,
-    cub::detail::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
-    delay_constructor_t>;
-
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
+  struct MaxPolicy : cub::ChainedPolicy<300, MaxPolicy, MaxPolicy>
   {
+#  if USES_WARPSPEED()
+    struct WarpspeedPolicy
+    {
+      static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS;
+      static constexpr int num_look_ahead_items      = TUNE_NUM_LOOKBACK_ITEMS;
+      static constexpr int items_per_thread          = TUNE_ITEMS_PLUS_ONE - 1;
+
+      // the rest are fixed or derived definitions
+
+      static constexpr int num_squads           = 5;
+      static constexpr int num_threads_per_warp = 32;
+      static constexpr int num_load_warps       = 1;
+      static constexpr int num_sched_warps      = 1;
+      static constexpr int num_look_ahead_warps = 1;
+
+      static constexpr int num_total_warps =
+        2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
+      static constexpr int num_total_threads = num_total_warps * num_threads_per_warp;
+
+      static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp;
+
+      static constexpr int tile_size = items_per_thread * squad_reduce_thread_count;
+
+      using SquadDesc = cub::detail::warpspeed::SquadDesc;
+
+      // The squads cannot be static constexpr variables, as those are not device accessible
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadReduce() noexcept
+      {
+        return SquadDesc{0, num_reduce_and_scan_warps};
+      }
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadScanStore() noexcept
+      {
+        return SquadDesc{1, num_reduce_and_scan_warps};
+      }
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLoad() noexcept
+      {
+        return SquadDesc{2, num_load_warps};
+      }
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadSched() noexcept
+      {
+        return SquadDesc{3, num_sched_warps};
+      }
+      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLookback() noexcept
+      {
+        return SquadDesc{4, num_look_ahead_warps};
+      }
+    };
+#  else // USES_WARPSPEED()
+#    if TUNE_TRANSPOSE == 0
+#      define TUNE_LOAD_ALGORITHM  cub::BLOCK_LOAD_DIRECT
+#      define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT
+#    else // TUNE_TRANSPOSE == 1
+#      define TUNE_LOAD_ALGORITHM  cub::BLOCK_LOAD_WARP_TRANSPOSE
+#      define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE
+#    endif // TUNE_TRANSPOSE
+
+#    if TUNE_LOAD == 0
+#      define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
+#    elif TUNE_LOAD == 1
+#      define TUNE_LOAD_MODIFIER cub::LOAD_CA
+#    endif // TUNE_LOAD
+
+    template <int NOMINAL_BLOCK_THREADS_4B,
+              int NOMINAL_ITEMS_PER_THREAD_4B,
+              typename ComputeT,
+              cub::BlockLoadAlgorithm LOAD_ALGORITHM,
+              cub::CacheLoadModifier LOAD_MODIFIER,
+              cub::BlockStoreAlgorithm STORE_ALGORITHM,
+              cub::BlockScanAlgorithm SCAN_ALGORITHM>
+    using agent_policy_t = cub::AgentScanPolicy<
+      NOMINAL_BLOCK_THREADS_4B,
+      NOMINAL_ITEMS_PER_THREAD_4B,
+      ComputeT,
+      LOAD_ALGORITHM,
+      LOAD_MODIFIER,
+      STORE_ALGORITHM,
+      SCAN_ALGORITHM,
+      cub::detail::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
+      delay_constructor_t>;
+
     using ScanPolicyT =
       agent_policy_t<TUNE_THREADS,
                      TUNE_ITEMS,
@@ -56,9 +107,8 @@ struct policy_hub_t
                      TUNE_LOAD_MODIFIER,
                      TUNE_STORE_ALGORITHM,
                      cub::BLOCK_SCAN_WARP_SCANS>;
+#  endif // USES_WARPSPEED()
   };
-
-  using MaxPolicy = policy_t;
 };
 #endif // TUNE_BASE
 
@@ -72,6 +122,9 @@ try
   using input_it_t     = const T*;
   using output_it_t    = T*;
   using offset_t       = cub::detail::choose_offset_t<OffsetT>;
+#if USES_WARPSPEED()
+  static_assert(sizeof(offset_t) == sizeof(size_t)); // warpspeed scan uses size_t internally
+#endif // USES_WARPSPEED()
 
 #if !TUNE_BASE
   using policy_t   = policy_hub_t<accum_t>;
@@ -128,7 +181,7 @@ catch (const std::bad_alloc&)
   state.skip("Skipping: out of memory.");
 }
 
-NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, offset_types))
+NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, scan_offset_types))
   .set_name("base")
   .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4));
@@ -9,5 +9,7 @@
 
 #include <nvbench_helper.cuh>
 
-using op_t = max_t;
+#define USES_WARPSPEED() 0
+using op_t              = max_t;
+using scan_offset_types = offset_types;
 #include "base.cuh"
@@ -5,6 +5,9 @@
 
 #include <nvbench_helper.cuh>
 
+// This benchmark tunes the old, non-warpspeed scan implementation. Using it for benchmarking, will pick the warpspeed
+// implementation on SM100+, but it's better to use the sum.warpspeed.cu benchmark instead, which uses a single OffsetT.
+
 // %RANGE% TUNE_ITEMS ipt 7:24:1
 // %RANGE% TUNE_THREADS tpb 128:1024:32
 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4
@@ -13,5 +16,7 @@
 // %RANGE% TUNE_TRANSPOSE trp 0:1:1
 // %RANGE% TUNE_LOAD ld 0:1:1
 
-using op_t = ::cuda::std::plus<>;
+#define USES_WARPSPEED() 0
+using op_t              = ::cuda::std::plus<>;
+using scan_offset_types = offset_types;
 #include "base.cuh"
@@ -0,0 +1,38 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// This tunes the warpspeed implementation of scan, which is only available on SM100+. It has entirely different tuning
+// parameters and is agnostic of the offset type. It is thus in a separate file, so we can continue to tune the old scan
+// implementation on older hardware architectures.
+
+#include <cuda/__cccl_config>
+
+#if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
+#  warning "This benchmark does not support being compiled for multiple architectures. Disabling it."
+#else // _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
+
+#  if __CUDA_ARCH_LIST__ < 1000
+#    warning "Warpspeed scan requires at least sm_100. Disabling it."
+#  else // __CUDA_ARCH_LIST__ < 1000
+
+#    if __cccl_ptx_isa < 860
+#      warning "Warpspeed scan requires at least PTX ISA 8.6. Disabling it."
+#    else // if __cccl_ptx_isa < 860
+
+#      include <nvbench_helper.cuh>
+
+// %RANGE% TUNE_NUM_REDUCE_SCAN_WARPS wrps 1:8:1
+// %RANGE% TUNE_NUM_LOOKBACK_ITEMS lbi 1:8:1
+
+// TODO(bgruber): find a good range and step width, items per thread should be coprime with 32 to avoid SMEM conflicts.
+// Should we specify nominal items per thread instead?
+// %RANGE% TUNE_ITEMS_PLUS_ONE ipt 8:256:8
+
+#      define USES_WARPSPEED() 1
+using op_t              = ::cuda::std::plus<>;
+using scan_offset_types = nvbench::type_list<int64_t>;
+#      include "base.cuh"
+
+#    endif // __cccl_ptx_isa < 860
+#  endif // __CUDA_ARCH_LIST__ < 1000
+#endif // _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
@@ -568,6 +568,8 @@ struct policy_hub
 #if __cccl_ptx_isa >= 860
     struct WarpspeedPolicy
     {
+      // TODO(bgruber): remove everything but num_reduce_and_scan_warps, num_look_ahead_items and items_per_thread
+
       // Squad definitions
       static constexpr int num_squads           = 5;
       static constexpr int num_threads_per_warp = 32;
@@ -576,29 +578,26 @@ struct policy_hub
 #  if _CCCL_COMPILER(NVHPC)
       // need to reduce the number of threads to <= 256, so each thread can use up to 255 registers. This avoids an
       // error in ptxas, see also: https://github.com/NVIDIA/cccl/issues/7700.
-      static constexpr int num_reduce_warps    = 2;
-      static constexpr int num_scan_stor_warps = 2;
+      static constexpr int num_reduce_and_scan_warps = 2;
 #  else // _CCCL_COMPILER(NVHPC)
-      static constexpr int num_reduce_warps    = 4;
-      static constexpr int num_scan_stor_warps = 4;
+      static constexpr int num_reduce_and_scan_warps = 4; // this can be tuned
 #  endif // _CCCL_COMPILER(NVHPC)
-      static constexpr int num_load_warps       = 1;
-      static constexpr int num_sched_warps      = 1;
-      static constexpr int num_look_ahead_warps = 1;
-
+      static constexpr int num_load_warps       = 1; // no point in being more than 1
+      static constexpr int num_sched_warps      = 1; // no point in being more than 1
+      static constexpr int num_look_ahead_warps = 1; // must be 1
       // TODO(bgruber): 5 is a bit better for complex<float>
       static constexpr int num_look_ahead_items = sizeof(AccumT) == 2 ? 3 : 4;
 
       // Deduced definitions
       static constexpr int num_total_warps =
-        num_reduce_warps + num_scan_stor_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
+        2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
       static constexpr int num_total_threads = num_total_warps * num_threads_per_warp;
 
 #  if _CCCL_COMPILER(NVHPC)
       static_assert(num_total_threads <= 256);
 #  endif // _CCCL_COMPILER(NVHPC)
 
-      static constexpr int squad_reduce_thread_count = num_reduce_warps * num_threads_per_warp;
+      static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp;
 
       // manual tuning based on cub.bench.scan.exclusive.sum.base
       // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM)
@@ -627,11 +626,11 @@ struct policy_hub
       // The squads cannot be static constexpr variables, as those are not device accessible
       [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept
       {
-        return warpspeed::SquadDesc{0, num_reduce_warps};
+        return warpspeed::SquadDesc{0, num_reduce_and_scan_warps};
       }
       [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept
       {
-        return warpspeed::SquadDesc{1, num_scan_stor_warps};
+        return warpspeed::SquadDesc{1, num_reduce_and_scan_warps};
       }
       [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept
       {