From f057d83efb99bb80bcd3ca1adf84efe21c206852 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Mar 2026 13:20:10 +0100 Subject: [PATCH 1/8] Make warpspeed scan tunable * Duplicate exclusive/sum benchmark for warpspeed since it is too different from the old implementation * Hardcode benchmark OffsetT to uint64 * Review warpspeed scan tuning parameters and expose the relevant ones Fixes: #7893 Fixes: #7894 --- cub/benchmarks/bench/scan/exclusive/base.cuh | 122 ++++++++++++------ cub/benchmarks/bench/scan/exclusive/custom.cu | 4 +- cub/benchmarks/bench/scan/exclusive/sum.cu | 6 +- .../bench/scan/exclusive/sum.warspeed.cu | 32 +++++ .../device/dispatch/tuning/tuning_scan.cuh | 20 +-- 5 files changed, 135 insertions(+), 49 deletions(-) create mode 100644 cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 709f5592678..b5a942e9092 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 #pragma once @@ -11,43 +11,89 @@ #include #if !TUNE_BASE -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# elif TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD - template struct policy_hub_t { - template - using agent_policy_t = cub::AgentScanPolicy< - NOMINAL_BLOCK_THREADS_4B, - NOMINAL_ITEMS_PER_THREAD_4B, - ComputeT, - LOAD_ALGORITHM, - LOAD_MODIFIER, - STORE_ALGORITHM, - SCAN_ALGORITHM, - cub::detail::MemBoundScaling, - delay_constructor_t>; - - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> + struct MaxPolicy : cub::ChainedPolicy<300, policy_t, policy_t> { +# if USES_WARPSPEED() + struct WarpspeedPolicy + { + static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS; + static constexpr int num_look_ahead_items = TUNE_NUM_LOOKBACK_ITEMS; + static constexpr int items_per_thread = TUNE_ITEMS_PLUS_ONE - 1; + + // the rest are fixed or derived definitions + + static constexpr int num_squads = 5; + static constexpr int num_threads_per_warp = 32; + static constexpr int num_load_warps = 1; + static constexpr int num_sched_warps = 1; + static constexpr int num_look_ahead_warps = 1; + + static constexpr int num_total_warps = + 2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; + static constexpr int num_total_threads = num_total_warps * num_threads_per_warp; + + static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp; + + static constexpr int tile_size = items_per_thread * squad_reduce_thread_count; + + // The squads cannot be static constexpr variables, as those are not device accessible + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept + { + return warpspeed::SquadDesc{0, num_reduce_and_scan_warps}; + } + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept + { + return warpspeed::SquadDesc{1, num_reduce_and_scan_warps}; + } + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept + { + return warpspeed::SquadDesc{2, num_load_warps}; + } + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadSched() noexcept + { + return warpspeed::SquadDesc{3, num_sched_warps}; + } + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLookback() noexcept + { + return warpspeed::SquadDesc{4, num_look_ahead_warps}; + } + }; +# else // USES_WARPSPEED() +# if TUNE_TRANSPOSE == 0 +# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT +# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT +# else // TUNE_TRANSPOSE == 1 +# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE +# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE +# endif // TUNE_TRANSPOSE + +# if TUNE_LOAD == 0 +# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT +# elif TUNE_LOAD == 1 +# define TUNE_LOAD_MODIFIER cub::LOAD_CA +# endif // TUNE_LOAD + + template + using agent_policy_t = cub::AgentScanPolicy< + NOMINAL_BLOCK_THREADS_4B, + NOMINAL_ITEMS_PER_THREAD_4B, + ComputeT, + LOAD_ALGORITHM, + LOAD_MODIFIER, + STORE_ALGORITHM, + SCAN_ALGORITHM, + cub::detail::MemBoundScaling, + delay_constructor_t>; + using ScanPolicyT = agent_policy_t; +# endif // USES_WARPSPEED() }; - - using MaxPolicy = policy_t; }; #endif // TUNE_BASE @@ -72,6 +117,9 @@ try using input_it_t = const T*; using output_it_t = T*; using offset_t = cub::detail::choose_offset_t; +#if USES_WARPSPEED() + static_assert(sizeof(offset_t) == sizeof(size_t)); // warpspeed scan uses size_t internally +#endif // USES_WARPSPEED() #if !TUNE_BASE using policy_t = policy_hub_t; @@ -128,7 +176,7 @@ catch (const std::bad_alloc&) state.skip("Skipping: out of memory."); } -NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, offset_types)) +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, scan_offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4)); diff --git a/cub/benchmarks/bench/scan/exclusive/custom.cu b/cub/benchmarks/bench/scan/exclusive/custom.cu index 057a198d34f..a54500be00c 100644 --- a/cub/benchmarks/bench/scan/exclusive/custom.cu +++ b/cub/benchmarks/bench/scan/exclusive/custom.cu @@ -9,5 +9,7 @@ #include -using op_t = max_t; +#define USES_WARPSPEED() 0 +using op_t = max_t; +using scan_offset_types = offset_types; #include "base.cuh" diff --git a/cub/benchmarks/bench/scan/exclusive/sum.cu b/cub/benchmarks/bench/scan/exclusive/sum.cu index 36b1e02b831..df3beebcd99 100644 --- a/cub/benchmarks/bench/scan/exclusive/sum.cu +++ b/cub/benchmarks/bench/scan/exclusive/sum.cu @@ -5,6 +5,8 @@ #include +// This benchmark tunes the old, non-warpspeed scan implementation + // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 @@ -13,5 +15,7 @@ // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 -using op_t = ::cuda::std::plus<>; +#define USES_WARPSPEED() 0 +using op_t = ::cuda::std::plus<>; +using scan_offset_types = offset_types; #include "base.cuh" diff --git a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu new file mode 100644 index 00000000000..373a07305ec --- /dev/null +++ b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu @@ -0,0 +1,32 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// This tunes the warpspeed implementation of scan, which is only available on SM100+. It has entirely different tuning +// parameters and is agnostic of the offset type. It is thus in a separate file, so we can continue to tune the old scan +// implementation on older hardware architectures. + +#if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +#endif + +#if __CUDA_ARCH_LIST__ < 1000 +# error Warpspeed scan requires at least sm_100 +#endif // __CUDA_ARCH_LIST__ < 1000 + +#if __cccl_ptx_isa < 860 +# error Warpspeed scan requires at least PTX ISA 8.6 +#endif // __cccl_ptx_isa < 860 + +#include + +// %RANGE% TUNE_NUM_REDUCE_SCAN_WARPS wrps 1:8:1 +// %RANGE% TUNE_NUM_LOOKBACK_ITEMS lbi 1:8:1 + +// TODO(bgruber): find a good range and step width, items per thread should be coprime with 32 to avoid SMEM conflicts. +// Should we specify nominal items per thread instead? +// %RANGE% TUNE_ITEMS_PLUS_ONE ipt 8:256:8 + +#define USES_WARPSPEED() 1 +using op_t = ::cuda::std::plus<>; +using scan_offset_types = nvbench::type_list; +#include "base.cuh" diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index aa7f991f328..4c9b07e7c84 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -568,6 +568,8 @@ struct policy_hub #if __cccl_ptx_isa >= 860 struct WarpspeedPolicy { + // TODO(bgruber): remove everything but num_reduce_warps, num_look_ahead_items and items_per_thread from policy + // Squad definitions static constexpr int num_squads = 5; static constexpr int num_threads_per_warp = 32; @@ -579,26 +581,24 @@ struct policy_hub static constexpr int num_reduce_warps = 2; static constexpr int num_scan_stor_warps = 2; # else // _CCCL_COMPILER(NVHPC) - static constexpr int num_reduce_warps = 4; - static constexpr int num_scan_stor_warps = 4; + static constexpr int num_reduce_and_scan_warps = 4; // this can be tuned # endif // _CCCL_COMPILER(NVHPC) - static constexpr int num_load_warps = 1; - static constexpr int num_sched_warps = 1; - static constexpr int num_look_ahead_warps = 1; - + static constexpr int num_load_warps = 1; // no point in being more than 1 + static constexpr int num_sched_warps = 1; // no point in being more than 1 + static constexpr int num_look_ahead_warps = 1; // must be 1 // TODO(bgruber): 5 is a bit better for complex static constexpr int num_look_ahead_items = sizeof(AccumT) == 2 ? 3 : 4; // Deduced definitions static constexpr int num_total_warps = - num_reduce_warps + num_scan_stor_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; + 2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; static constexpr int num_total_threads = num_total_warps * num_threads_per_warp; # if _CCCL_COMPILER(NVHPC) static_assert(num_total_threads <= 256); # endif // _CCCL_COMPILER(NVHPC) - static constexpr int squad_reduce_thread_count = num_reduce_warps * num_threads_per_warp; + static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp; // manual tuning based on cub.bench.scan.exclusive.sum.base // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM) @@ -627,11 +627,11 @@ struct policy_hub // The squads cannot be static constexpr variables, as those are not device accessible [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept { - return warpspeed::SquadDesc{0, num_reduce_warps}; + return warpspeed::SquadDesc{0, num_reduce_and_scan_warps}; } [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept { - return warpspeed::SquadDesc{1, num_scan_stor_warps}; + return warpspeed::SquadDesc{1, num_reduce_and_scan_warps}; } [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept { From 4dbdc0f09554df280c34f3c32c756cd8c39aea9a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Mar 2026 13:24:40 +0100 Subject: [PATCH 2/8] Comment --- cub/benchmarks/bench/scan/exclusive/sum.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cub/benchmarks/bench/scan/exclusive/sum.cu b/cub/benchmarks/bench/scan/exclusive/sum.cu index df3beebcd99..a0104279f76 100644 --- a/cub/benchmarks/bench/scan/exclusive/sum.cu +++ b/cub/benchmarks/bench/scan/exclusive/sum.cu @@ -5,7 +5,8 @@ #include -// This benchmark tunes the old, non-warpspeed scan implementation +// This benchmark tunes the old, non-warpspeed scan implementation. Using it for benchmarking, will pick the warpspeed +// implementation on SM100+, but it's better to use the sum.warpspeed.cu benchmark instead, which uses a single OffsetT. // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 From 88ed0eb6207bff0d5d8ad8fcd87e918c98cb33c8 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Mar 2026 13:27:31 +0100 Subject: [PATCH 3/8] Comment --- cub/cub/device/dispatch/tuning/tuning_scan.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 4c9b07e7c84..9178d558fe5 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -568,7 +568,7 @@ struct policy_hub #if __cccl_ptx_isa >= 860 struct WarpspeedPolicy { - // TODO(bgruber): remove everything but num_reduce_warps, num_look_ahead_items and items_per_thread from policy + // TODO(bgruber): remove everything but num_reduce_and_scan_warps, num_look_ahead_items and items_per_thread // Squad definitions static constexpr int num_squads = 5; From d25427036ee1564419f375f621f52f12e49c7c21 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Mar 2026 13:27:35 +0100 Subject: [PATCH 4/8] Fix --- cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu index 373a07305ec..996837ffd2e 100644 --- a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu +++ b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu @@ -5,6 +5,8 @@ // parameters and is agnostic of the offset type. It is thus in a separate file, so we can continue to tune the old scan // implementation on older hardware architectures. +#include + #if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 # error "This benchmark does not support being compiled for multiple architectures" #endif From f528d154d989d9f40e4e847380362248e1435ba1 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Mar 2026 13:49:52 +0100 Subject: [PATCH 5/8] Tuning fixes --- cub/benchmarks/bench/scan/exclusive/base.cuh | 29 ++++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index b5a942e9092..e5242d5f2aa 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -7,14 +7,17 @@ #include -#include #include #if !TUNE_BASE +# if !USES_WARPSPEED() +# include +# endif // !USES_WARPSPEED() + template struct policy_hub_t { - struct MaxPolicy : cub::ChainedPolicy<300, policy_t, policy_t> + struct MaxPolicy : cub::ChainedPolicy<300, MaxPolicy, MaxPolicy> { # if USES_WARPSPEED() struct WarpspeedPolicy @@ -39,26 +42,28 @@ struct policy_hub_t static constexpr int tile_size = items_per_thread * squad_reduce_thread_count; + using SquadDesc = cub::detail::warpspeed::SquadDesc; + // The squads cannot be static constexpr variables, as those are not device accessible - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadReduce() noexcept { - return warpspeed::SquadDesc{0, num_reduce_and_scan_warps}; + return SquadDesc{0, num_reduce_and_scan_warps}; } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadScanStore() noexcept { - return warpspeed::SquadDesc{1, num_reduce_and_scan_warps}; + return SquadDesc{1, num_reduce_and_scan_warps}; } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLoad() noexcept { - return warpspeed::SquadDesc{2, num_load_warps}; + return SquadDesc{2, num_load_warps}; } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadSched() noexcept + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadSched() noexcept { - return warpspeed::SquadDesc{3, num_sched_warps}; + return SquadDesc{3, num_sched_warps}; } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLookback() noexcept + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLookback() noexcept { - return warpspeed::SquadDesc{4, num_look_ahead_warps}; + return SquadDesc{4, num_look_ahead_warps}; } }; # else // USES_WARPSPEED() From 2da3dd1e44b177d78c7bbd5a2805b051c67c7d5a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Mar 2026 16:19:18 +0100 Subject: [PATCH 6/8] Let the CI compile the benchmark --- .../bench/scan/exclusive/sum.warspeed.cu | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu index 996837ffd2e..3cd70b5af62 100644 --- a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu +++ b/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu @@ -8,18 +8,18 @@ #include #if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -#endif +# warning "This benchmark does not support being compiled for multiple architectures. Disabling it." +#else // _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 -#if __CUDA_ARCH_LIST__ < 1000 -# error Warpspeed scan requires at least sm_100 -#endif // __CUDA_ARCH_LIST__ < 1000 +# if __CUDA_ARCH_LIST__ < 1000 +# warning "Warpspeed scan requires at least sm_100. Disabling it." +# else // __CUDA_ARCH_LIST__ < 1000 -#if __cccl_ptx_isa < 860 -# error Warpspeed scan requires at least PTX ISA 8.6 -#endif // __cccl_ptx_isa < 860 +# if __cccl_ptx_isa < 860 +# warning "Warpspeed scan requires at least PTX ISA 8.6. Disabling it." +# else // if __cccl_ptx_isa < 860 -#include +# include // %RANGE% TUNE_NUM_REDUCE_SCAN_WARPS wrps 1:8:1 // %RANGE% TUNE_NUM_LOOKBACK_ITEMS lbi 1:8:1 @@ -28,7 +28,11 @@ // Should we specify nominal items per thread instead? // %RANGE% TUNE_ITEMS_PLUS_ONE ipt 8:256:8 -#define USES_WARPSPEED() 1 +# define USES_WARPSPEED() 1 using op_t = ::cuda::std::plus<>; using scan_offset_types = nvbench::type_list; -#include "base.cuh" +# include "base.cuh" + +# endif // __cccl_ptx_isa < 860 +# endif // __CUDA_ARCH_LIST__ < 1000 +#endif // _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 From 275cd87da310a24db32ce790fe58c24334441036 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Mar 2026 16:19:32 +0100 Subject: [PATCH 7/8] Fix filename --- .../bench/scan/exclusive/{sum.warspeed.cu => sum.warpspeed.cu} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cub/benchmarks/bench/scan/exclusive/{sum.warspeed.cu => sum.warpspeed.cu} (100%) diff --git a/cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu b/cub/benchmarks/bench/scan/exclusive/sum.warpspeed.cu similarity index 100% rename from cub/benchmarks/bench/scan/exclusive/sum.warspeed.cu rename to cub/benchmarks/bench/scan/exclusive/sum.warpspeed.cu From 2dee3c412aacdc8d2c1aebd704fd31c2eb772dce Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Mar 2026 18:07:17 +0100 Subject: [PATCH 8/8] Fix NVHPC --- cub/cub/device/dispatch/tuning/tuning_scan.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 9178d558fe5..a83d2bdf6ee 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -578,8 +578,7 @@ struct policy_hub # if _CCCL_COMPILER(NVHPC) // need to reduce the number of threads to <= 256, so each thread can use up to 255 registers. This avoids an // error in ptxas, see also: https://github.com/NVIDIA/cccl/issues/7700. - static constexpr int num_reduce_warps = 2; - static constexpr int num_scan_stor_warps = 2; + static constexpr int num_reduce_and_scan_warps = 2; # else // _CCCL_COMPILER(NVHPC) static constexpr int num_reduce_and_scan_warps = 4; // this can be tuned # endif // _CCCL_COMPILER(NVHPC)