diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 709f5592678..e5242d5f2aa 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 #pragma once @@ -7,47 +7,98 @@ #include -#include #include #if !TUNE_BASE -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# elif TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD +# if !USES_WARPSPEED() +# include +# endif // !USES_WARPSPEED() template struct policy_hub_t { - template - using agent_policy_t = cub::AgentScanPolicy< - NOMINAL_BLOCK_THREADS_4B, - NOMINAL_ITEMS_PER_THREAD_4B, - ComputeT, - LOAD_ALGORITHM, - LOAD_MODIFIER, - STORE_ALGORITHM, - SCAN_ALGORITHM, - cub::detail::MemBoundScaling, - delay_constructor_t>; - - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> + struct MaxPolicy : cub::ChainedPolicy<300, MaxPolicy, MaxPolicy> { +# if USES_WARPSPEED() + struct WarpspeedPolicy + { + static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS; + static constexpr int num_look_ahead_items = TUNE_NUM_LOOKBACK_ITEMS; + static constexpr int items_per_thread = TUNE_ITEMS_PLUS_ONE - 1; + + // the rest are fixed or derived definitions + + static constexpr int num_squads = 5; + static constexpr int num_threads_per_warp = 32; + static constexpr int num_load_warps = 1; + static constexpr int num_sched_warps = 1; + static constexpr int num_look_ahead_warps = 1; + + static constexpr int num_total_warps = + 2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; + static constexpr int num_total_threads = num_total_warps * num_threads_per_warp; + + static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp; + + static constexpr int tile_size = items_per_thread * squad_reduce_thread_count; + + using SquadDesc = cub::detail::warpspeed::SquadDesc; + + // The squads cannot be static constexpr variables, as those are not device accessible + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadReduce() noexcept + { + return SquadDesc{0, num_reduce_and_scan_warps}; + } + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadScanStore() noexcept + { + return SquadDesc{1, num_reduce_and_scan_warps}; + } + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLoad() noexcept + { + return SquadDesc{2, num_load_warps}; + } + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadSched() noexcept + { + return SquadDesc{3, num_sched_warps}; + } + [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLookback() noexcept + { + return SquadDesc{4, num_look_ahead_warps}; + } + }; +# else // USES_WARPSPEED() +# if TUNE_TRANSPOSE == 0 +# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT +# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT +# else // TUNE_TRANSPOSE == 1 +# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE +# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE +# endif // TUNE_TRANSPOSE + +# if TUNE_LOAD == 0 +# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT +# elif TUNE_LOAD == 1 +# define TUNE_LOAD_MODIFIER cub::LOAD_CA +# endif // TUNE_LOAD + + template + using agent_policy_t = cub::AgentScanPolicy< + NOMINAL_BLOCK_THREADS_4B, + NOMINAL_ITEMS_PER_THREAD_4B, + ComputeT, + LOAD_ALGORITHM, + LOAD_MODIFIER, + STORE_ALGORITHM, + SCAN_ALGORITHM, + cub::detail::MemBoundScaling, + delay_constructor_t>; + using ScanPolicyT = agent_policy_t; +# endif // USES_WARPSPEED() }; - - using MaxPolicy = policy_t; }; #endif // TUNE_BASE @@ -72,6 +122,9 @@ try using input_it_t = const T*; using output_it_t = T*; using offset_t = cub::detail::choose_offset_t; +#if USES_WARPSPEED() + static_assert(sizeof(offset_t) == sizeof(size_t)); // warpspeed scan uses size_t internally +#endif // USES_WARPSPEED() #if !TUNE_BASE using policy_t = policy_hub_t; @@ -128,7 +181,7 @@ catch (const std::bad_alloc&) state.skip("Skipping: out of memory."); } -NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, offset_types)) +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, scan_offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4)); diff --git a/cub/benchmarks/bench/scan/exclusive/custom.cu b/cub/benchmarks/bench/scan/exclusive/custom.cu index 057a198d34f..a54500be00c 100644 --- a/cub/benchmarks/bench/scan/exclusive/custom.cu +++ b/cub/benchmarks/bench/scan/exclusive/custom.cu @@ -9,5 +9,7 @@ #include -using op_t = max_t; +#define USES_WARPSPEED() 0 +using op_t = max_t; +using scan_offset_types = offset_types; #include "base.cuh" diff --git a/cub/benchmarks/bench/scan/exclusive/sum.cu b/cub/benchmarks/bench/scan/exclusive/sum.cu index 36b1e02b831..a0104279f76 100644 --- a/cub/benchmarks/bench/scan/exclusive/sum.cu +++ b/cub/benchmarks/bench/scan/exclusive/sum.cu @@ -5,6 +5,9 @@ #include +// This benchmark tunes the old, non-warpspeed scan implementation. Using it for benchmarking, will pick the warpspeed +// implementation on SM100+, but it's better to use the sum.warpspeed.cu benchmark instead, which uses a single OffsetT. + // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 @@ -13,5 +16,7 @@ // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 -using op_t = ::cuda::std::plus<>; +#define USES_WARPSPEED() 0 +using op_t = ::cuda::std::plus<>; +using scan_offset_types = offset_types; #include "base.cuh" diff --git a/cub/benchmarks/bench/scan/exclusive/sum.warpspeed.cu b/cub/benchmarks/bench/scan/exclusive/sum.warpspeed.cu new file mode 100644 index 00000000000..3cd70b5af62 --- /dev/null +++ b/cub/benchmarks/bench/scan/exclusive/sum.warpspeed.cu @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// This tunes the warpspeed implementation of scan, which is only available on SM100+. It has entirely different tuning +// parameters and is agnostic of the offset type. It is thus in a separate file, so we can continue to tune the old scan +// implementation on older hardware architectures. + +#include + +#if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 +# warning "This benchmark does not support being compiled for multiple architectures. Disabling it." +#else // _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 + +# if __CUDA_ARCH_LIST__ < 1000 +# warning "Warpspeed scan requires at least sm_100. Disabling it." +# else // __CUDA_ARCH_LIST__ < 1000 + +# if __cccl_ptx_isa < 860 +# warning "Warpspeed scan requires at least PTX ISA 8.6. Disabling it." +# else // if __cccl_ptx_isa < 860 + +# include + +// %RANGE% TUNE_NUM_REDUCE_SCAN_WARPS wrps 1:8:1 +// %RANGE% TUNE_NUM_LOOKBACK_ITEMS lbi 1:8:1 + +// TODO(bgruber): find a good range and step width, items per thread should be coprime with 32 to avoid SMEM conflicts. +// Should we specify nominal items per thread instead? +// %RANGE% TUNE_ITEMS_PLUS_ONE ipt 8:256:8 + +# define USES_WARPSPEED() 1 +using op_t = ::cuda::std::plus<>; +using scan_offset_types = nvbench::type_list; +# include "base.cuh" + +# endif // __cccl_ptx_isa < 860 +# endif // __CUDA_ARCH_LIST__ < 1000 +#endif // _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index aa7f991f328..a83d2bdf6ee 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -568,6 +568,8 @@ struct policy_hub #if __cccl_ptx_isa >= 860 struct WarpspeedPolicy { + // TODO(bgruber): remove everything but num_reduce_and_scan_warps, num_look_ahead_items and items_per_thread + // Squad definitions static constexpr int num_squads = 5; static constexpr int num_threads_per_warp = 32; @@ -576,29 +578,26 @@ struct policy_hub # if _CCCL_COMPILER(NVHPC) // need to reduce the number of threads to <= 256, so each thread can use up to 255 registers. This avoids an // error in ptxas, see also: https://github.com/NVIDIA/cccl/issues/7700. - static constexpr int num_reduce_warps = 2; - static constexpr int num_scan_stor_warps = 2; + static constexpr int num_reduce_and_scan_warps = 2; # else // _CCCL_COMPILER(NVHPC) - static constexpr int num_reduce_warps = 4; - static constexpr int num_scan_stor_warps = 4; + static constexpr int num_reduce_and_scan_warps = 4; // this can be tuned # endif // _CCCL_COMPILER(NVHPC) - static constexpr int num_load_warps = 1; - static constexpr int num_sched_warps = 1; - static constexpr int num_look_ahead_warps = 1; - + static constexpr int num_load_warps = 1; // no point in being more than 1 + static constexpr int num_sched_warps = 1; // no point in being more than 1 + static constexpr int num_look_ahead_warps = 1; // must be 1 // TODO(bgruber): 5 is a bit better for complex static constexpr int num_look_ahead_items = sizeof(AccumT) == 2 ? 3 : 4; // Deduced definitions static constexpr int num_total_warps = - num_reduce_warps + num_scan_stor_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; + 2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; static constexpr int num_total_threads = num_total_warps * num_threads_per_warp; # if _CCCL_COMPILER(NVHPC) static_assert(num_total_threads <= 256); # endif // _CCCL_COMPILER(NVHPC) - static constexpr int squad_reduce_thread_count = num_reduce_warps * num_threads_per_warp; + static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp; // manual tuning based on cub.bench.scan.exclusive.sum.base // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM) @@ -627,11 +626,11 @@ struct policy_hub // The squads cannot be static constexpr variables, as those are not device accessible [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept { - return warpspeed::SquadDesc{0, num_reduce_warps}; + return warpspeed::SquadDesc{0, num_reduce_and_scan_warps}; } [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept { - return warpspeed::SquadDesc{1, num_scan_stor_warps}; + return warpspeed::SquadDesc{1, num_reduce_and_scan_warps}; } [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept {