From ad1c1df7c77a15427392603d6c69d23e2774dd10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Sat, 7 Feb 2026 20:58:47 -0800 Subject: [PATCH 01/31] Base changes in scan and tests. --- c/parallel/src/scan.cu | 239 +++++---- cub/cub/detail/delay_constructor.cuh | 183 +++++++ cub/cub/device/dispatch/dispatch_scan.cuh | 239 ++++++++- .../device/dispatch/kernels/kernel_scan.cuh | 29 +- .../dispatch/tuning/tuning_radix_sort.cuh | 191 +------ .../device/dispatch/tuning/tuning_scan.cuh | 479 +++++++++++++++++- .../nvbench_helper/look_back_helper.cuh | 19 +- 7 files changed, 1086 insertions(+), 293 deletions(-) create mode 100644 cub/cub/detail/delay_constructor.cuh diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 230707e7d64..215b232a270 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -4,13 +4,12 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// #include #include -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -56,33 +56,6 @@ enum class InitKind NoInit, }; -struct scan_runtime_tuning_policy -{ - cub::detail::RuntimeScanAgentPolicy scan; - - auto Scan() const - { - return scan; - } - - void CheckLoadModifier() const - { - if (scan.LoadModifier() == cub::CacheLoadModifier::LOAD_LDG) - { - throw std::runtime_error("The memory consistency model does not apply to texture " - "accesses"); - } - } - - using MaxPolicy = scan_runtime_tuning_policy; - - template - cudaError_t Invoke(int, F& op) - { - return op.template Invoke(*this); - } -}; - static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_type_info init) { // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet @@ -120,8 +93,8 @@ std::string get_scan_kernel_name( bool force_inclusive, cccl_init_kind_t init_kind) { - std::string chained_policy_t; - check(cccl_type_name_from_nvrtc(&chained_policy_t)); + std::string policy_selector_t; + check(cccl_type_name_from_nvrtc(&policy_selector_t)); const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); @@ -162,7 +135,7 @@ std::string get_scan_kernel_name( auto tile_state_t = std::format("cub::ScanTileState<{0}>", accum_cpp_t); return std::format( "cub::detail::scan::DeviceScanKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}>", - chained_policy_t, // 0 + policy_selector_t, // 0 input_iterator_t, // 1 output_iterator_t, // 2 tile_state_t, // 3 @@ -174,20 +147,6 @@ std::string get_scan_kernel_name( init_t); // 9 } -template -struct dynamic_scan_policy_t -{ - using MaxPolicy = dynamic_scan_policy_t; - - template - cudaError_t Invoke(int device_ptx_version, F& op) - { - return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); - } - - cccl_type_info accumulator_type; -}; - struct scan_kernel_source { cccl_device_scan_build_result_t& build; @@ -245,8 +204,115 @@ try const auto output_it_value_t = cccl_type_enum_to_name(output_it.value_type.type); - std::string policy_hub_expr = std::format( - "cub::detail::scan::policy_hub<{}, {}, {}, {}, {}>", + const auto policy_sel = [&] { + using cub::detail::op_kind_t; + using cub::detail::type_t; + using cub::detail::scan::policy_selector; + using cub::detail::scan::primitive_accum; + using cub::detail::scan::primitive_op; + + auto accum_type = type_t::other; + switch (accum_t.type) + { + case CCCL_INT8: + accum_type = type_t::int8; + break; + case CCCL_INT16: + accum_type = type_t::int16; + break; + case CCCL_INT32: + accum_type = type_t::int32; + break; + case CCCL_INT64: + accum_type = type_t::int64; + break; + case CCCL_UINT8: + accum_type = type_t::uint8; + break; + case CCCL_UINT16: + accum_type = type_t::uint16; + break; + case CCCL_UINT32: + accum_type = type_t::uint32; + break; + case CCCL_UINT64: + accum_type = type_t::uint64; + break; + case CCCL_FLOAT32: + accum_type = type_t::float32; + break; + case CCCL_FLOAT64: + accum_type = type_t::float64; + break; + default: + break; + } + + auto operation_t = op_kind_t::other; + switch (op.type) + { + case CCCL_PLUS: + operation_t = op_kind_t::plus; + break; + case CCCL_MINIMUM: + operation_t = op_kind_t::min; + break; + case CCCL_MAXIMUM: + operation_t = op_kind_t::max; + break; + default: + break; + } + + auto primitive_accum_t = primitive_accum::no; + switch (accum_t.type) + { + case CCCL_INT8: + case CCCL_INT16: + case CCCL_INT32: + case CCCL_INT64: + case CCCL_UINT8: + case CCCL_UINT16: + case CCCL_UINT32: + case CCCL_UINT64: + case CCCL_FLOAT16: + case CCCL_FLOAT32: + case CCCL_FLOAT64: + case CCCL_BOOLEAN: + primitive_accum_t = primitive_accum::yes; + break; + default: + break; + } + + const auto primitive_op_t = + (op.type == CCCL_PLUS || op.type == CCCL_MINIMUM || op.type == CCCL_MAXIMUM) + ? primitive_op::yes + : primitive_op::no; + + const auto input_type = input_it.value_type.type; + const auto output_type = output_it.value_type.type; + const bool types_match = input_type == output_type && input_type == accum_t.type; + const bool benchmark_match = primitive_op_t == primitive_op::yes && types_match && input_type != CCCL_STORAGE; + + return policy_selector{ + static_cast(input_it.value_type.size), + static_cast(output_it.value_type.size), + static_cast(accum_t.size), + int{sizeof(OffsetT)}, + accum_type, + operation_t, + primitive_accum_t, + primitive_op_t, + benchmark_match}; + }(); + + // TODO(bgruber): drop this if tuning policies become formattable + std::stringstream policy_sel_str; + policy_sel_str << policy_sel(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor})); + + std::string policy_selector_expr = std::format( + "cub::detail::scan::policy_selector_from_types<{}, {}, {}, {}, {}>", input_it_value_t, output_it_value_t, accum_cpp, @@ -265,20 +331,20 @@ struct __align__({1}) storage_t {{ {2} {3} {4} -using device_scan_policy = {5}::MaxPolicy; - -#include -__device__ consteval auto& policy_generator() {{ - return ptx_json::id() - = cub::detail::scan::ScanPolicyWrapper::EncodedPolicy(); -}} +using device_scan_policy = {5}; +using namespace cub; +using namespace cub::detail::scan; +using cub::detail::delay_constructor_policy; +using cub::detail::delay_constructor_kind; +static_assert(device_scan_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6}, "Host generated and JIT compiled policy mismatch"); )XXX", input_it.value_type.size, // 0 input_it.value_type.alignment, // 1 input_iterator_src, // 2 output_iterator_src, // 3 op_src, // 4 - policy_hub_expr); // 5 + policy_selector_expr, // 5 + policy_sel_str.view()); // 6 #if false // CCCL_DEBUGGING_SWITCH fflush(stderr); @@ -302,7 +368,6 @@ __device__ consteval auto& policy_generator() {{ "-rdc=true", "-dlto", "-DCUB_DISABLE_CDP", - "-DCUB_ENABLE_POLICY_PTX_JSON", "-std=c++20"}; cccl::detail::extend_args_with_build_config(args, config); @@ -337,11 +402,6 @@ __device__ consteval auto& policy_generator() {{ auto [description_bytes_per_tile, payload_bytes_per_tile] = get_tile_state_bytes_per_tile(accum_t, accum_cpp, args.data(), args.size(), arch); - nlohmann::json runtime_policy = cub::detail::ptx_json::parse("device_scan_policy", {result.data.get(), result.size}); - - using cub::detail::RuntimeScanAgentPolicy; - auto scan_policy = RuntimeScanAgentPolicy::from_json(runtime_policy, "ScanPolicyT"); - build_ptr->cc = cc; build_ptr->cubin = (void*) result.data.release(); build_ptr->cubin_size = result.size; @@ -350,7 +410,7 @@ __device__ consteval auto& policy_generator() {{ build_ptr->init_kind = init_kind; build_ptr->description_bytes_per_tile = description_bytes_per_tile; build_ptr->payload_bytes_per_tile = payload_bytes_per_tile; - build_ptr->runtime_policy = new scan::scan_runtime_tuning_policy{scan_policy}; + build_ptr->runtime_policy = new cub::detail::scan::policy_selector{policy_sel}; return CUDA_SUCCESS; } @@ -384,31 +444,39 @@ CUresult cccl_device_scan( CUdevice cu_device; check(cuCtxGetDevice(&cu_device)); - auto exec_status = cub::DispatchScan< - indirect_arg_t, - indirect_arg_t, - indirect_arg_t, - std::conditional_t, cub::NullType, indirect_arg_t>, - cuda::std::size_t, - void, - EnforceInclusive, - scan::scan_runtime_tuning_policy, - scan::scan_kernel_source, - cub::detail::CudaDriverLauncherFactory>:: - Dispatch( + if constexpr (std::is_same_v) + { + auto exec_status = cub::detail::scan::dispatch_with_accum( d_temp_storage, *temp_storage_bytes, - d_in, - d_out, - op, + indirect_arg_t{d_in}, + indirect_arg_t{d_out}, + indirect_arg_t{op}, init, - num_items, + static_cast(num_items), stream, - {build}, - cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - *reinterpret_cast(build.runtime_policy)); - - error = static_cast(exec_status); + *static_cast(build.runtime_policy), + scan::scan_kernel_source{build}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}); + error = static_cast(exec_status); + } + else + { + indirect_arg_t init_arg{init}; + auto exec_status = cub::detail::scan::dispatch_with_accum( + d_temp_storage, + *temp_storage_bytes, + indirect_arg_t{d_in}, + indirect_arg_t{d_out}, + indirect_arg_t{op}, + init_arg, + static_cast(num_items), + stream, + *static_cast(build.runtime_policy), + scan::scan_kernel_source{build}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}); + error = static_cast(exec_status); + } } catch (const std::exception& exc) { @@ -549,7 +617,8 @@ try return CUDA_ERROR_INVALID_VALUE; } std::unique_ptr cubin(reinterpret_cast(build_ptr->cubin)); - std::unique_ptr policy(reinterpret_cast(build_ptr->runtime_policy)); + std::unique_ptr policy( + static_cast(build_ptr->runtime_policy)); check(cuLibraryUnload(build_ptr->library)); return CUDA_SUCCESS; diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh new file mode 100644 index 00000000000..44a409f497a --- /dev/null +++ b/cub/cub/detail/delay_constructor.cuh @@ -0,0 +1,183 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3 + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + +CUB_NAMESPACE_BEGIN + +namespace detail +{ +enum class delay_constructor_kind +{ + no_delay, + fixed_delay, + exponential_backoff, + exponential_backoff_jitter, + exponential_backoff_jitter_window, + exponential_backon_jitter_window, + exponential_backon_jitter, + exponential_backon +}; + +#if !_CCCL_COMPILER(NVRTC) +inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kind) +{ + switch (kind) + { + case delay_constructor_kind::no_delay: + return os << "delay_constructor_kind::no_delay"; + case delay_constructor_kind::fixed_delay: + return os << "delay_constructor_kind::fixed_delay"; + case delay_constructor_kind::exponential_backoff: + return os << "delay_constructor_kind::exponential_backoff"; + case delay_constructor_kind::exponential_backoff_jitter: + return os << "delay_constructor_kind::exponential_backoff_jitter"; + case delay_constructor_kind::exponential_backoff_jitter_window: + return os << "delay_constructor_kind::exponential_backoff_jitter_window"; + case delay_constructor_kind::exponential_backon_jitter_window: + return os << "delay_constructor_kind::exponential_backon_jitter_window"; + case delay_constructor_kind::exponential_backon_jitter: + return os << "delay_constructor_kind::exponential_backon_jitter"; + case delay_constructor_kind::exponential_backon: + return os << "delay_constructor_kind::exponential_backon"; + default: + return os << "(kind) << ">"; + } +} +#endif // !_CCCL_COMPILER(NVRTC) + +struct delay_constructor_policy +{ + delay_constructor_kind kind; + unsigned int delay; + unsigned int l2_write_latency; + + _CCCL_API constexpr friend bool operator==(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs) + { + return lhs.kind == rhs.kind && lhs.delay == rhs.delay && lhs.l2_write_latency == rhs.l2_write_latency; + } + + _CCCL_API constexpr friend bool operator!=(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const delay_constructor_policy& p) + { + return os << "delay_constructor_policy { .kind = " << p.kind << ", .delay = " << p.delay + << ", .l2_write_latency = " << p.l2_write_latency << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) +}; + +template +inline constexpr auto delay_constructor_policy_from_type = 0; + +template +inline constexpr auto delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::no_delay, 0, L2WriteLatency}; + +template +inline constexpr auto delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::fixed_delay, Delay, L2WriteLatency}; + +template +inline constexpr auto delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backoff, Delay, L2WriteLatency}; + +template +inline constexpr auto + delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter, Delay, L2WriteLatency}; + +template +inline constexpr auto + delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter_window, Delay, L2WriteLatency}; + +template +inline constexpr auto + delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, Delay, L2WriteLatency}; + +template +inline constexpr auto delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, Delay, L2WriteLatency}; + +template +inline constexpr auto delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency}; + +template +struct delay_constructor_for; + +template +struct delay_constructor_for +{ + using type = no_delay_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = fixed_delay_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backoff_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backoff_jitter_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backoff_jitter_window_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backon_jitter_window_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backon_jitter_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backon_constructor_t; +}; + +template +using delay_constructor_t = typename delay_constructor_for::type; +} // namespace detail + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 65a962d5901..fe3134e60e9 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved. -// SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 /** @@ -45,7 +45,7 @@ CUB_NAMESPACE_BEGIN namespace detail::scan { -template +_CCCL_API constexpr auto convert_policy() -> scan_policy +{ + using scan_policy_t = typename LegacyActivePolicy::ScanPolicyT; + return scan_policy{ + scan_policy_t::BLOCK_THREADS, + scan_policy_t::ITEMS_PER_THREAD, + scan_policy_t::LOAD_ALGORITHM, + scan_policy_t::LOAD_MODIFIER, + scan_policy_t::STORE_ALGORITHM, + scan_policy_t::SCAN_ALGORITHM, + detail::delay_constructor_policy_from_type}; +} + +// TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API +template +struct policy_selector_from_hub +{ + // this is only called in device code + _CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> scan_policy + { + return convert_policy(); + } +}; } // namespace detail::scan /****************************************************************************** @@ -111,6 +137,7 @@ struct DeviceScanKernelSource * Enum flag to specify whether to enforce inclusive scan. * */ +// TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0 template < typename InputIteratorT, typename OutputIteratorT, @@ -126,7 +153,7 @@ template < typename PolicyHub = detail::scan:: policy_hub, detail::it_value_t, AccumT, OffsetT, ScanOpT>, typename KernelSource = detail::scan::DeviceScanKernelSource< - typename PolicyHub::MaxPolicy, + detail::scan::policy_selector_from_hub, InputIteratorT, OutputIteratorT, ScanOpT, @@ -216,6 +243,7 @@ struct DispatchScan * @param[in] max_policy * Struct encoding chain of algorithm tuning policies */ + // TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0 CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -404,6 +432,7 @@ struct DispatchScan * @param[in] max_policy * Struct encoding chain of algorithm tuning policies */ + // TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0 template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -444,4 +473,206 @@ struct DispatchScan } }; +namespace detail::scan +{ +template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename InitValueT, + typename OffsetT, + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If<::cuda::std::is_same_v, + cub::detail::it_value_t, + typename InitValueT::value_type>>, + ForceInclusive EnforceInclusive = ForceInclusive::No, + typename PolicySelector = policy_selector_from_types, + detail::it_value_t, + AccumT, + OffsetT, + ScanOpT>, + typename KernelSource = + DeviceScanKernelSource, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +#if _CCCL_HAS_CONCEPTS() + requires scan_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitValueT init_value, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}) -> cudaError_t +{ + static_assert(::cuda::std::is_unsigned_v && sizeof(OffsetT) >= 4, + "DispatchScan only supports unsigned offset types of at least 4-bytes"); + + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(launcher_factory.PtxArchId(arch_id))) + { + return error; + } + + const scan_policy active_policy = policy_selector(arch_id); + + // Number of input tiles + const int tile_size = active_policy.block_threads * active_policy.items_per_thread; + const int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + + auto tile_state = kernel_source.TileState(); + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (const auto error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0]))) + { + return error; // bytes needed for tile status descriptors + } + + // Compute allocation pointers into the single storage blob (or compute + // the necessary size of the blob) + void* allocations[1] = {}; + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + { + return error; + } + + // Return if the caller is simply requesting the size of the storage allocation, or the problem is empty + if (d_temp_storage == nullptr || num_items == 0) + { + return cudaSuccess; + } + + // Construct the tile status interface + if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) + { + return error; + } + + // Log init_kernel configuration + constexpr int init_kernel_threads = 128; + const int init_grid_size = ::cuda::ceil_div(num_tiles, init_kernel_threads); + +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); +#endif // CUB_DEBUG_LOG + + // Invoke init_kernel to initialize tile descriptors + launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true) + .doit(kernel_source.InitKernel(), tile_state, num_tiles); + + // Check for failure to launch + if (const auto error = CubDebug(cudaPeekAtLastError())) + { + return error; + } + + // Sync the stream if specified to flush runtime errors + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (const auto error = CubDebug( + launcher_factory.MaxSmOccupancy(scan_sm_occupancy, kernel_source.ScanKernel(), active_policy.block_threads))) + { + return error; + } + + // Get max x-dimension of grid + int max_dim_x; + if (const auto error = CubDebug(launcher_factory.MaxGridDimX(max_dim_x))) + { + return error; + } + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + const int scan_grid_size = ::cuda::std::min(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { +// Log scan_kernel configuration +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " + "per thread, %d SM occupancy\n", + start_tile, + scan_grid_size, + active_policy.block_threads, + (long long) stream, + active_policy.items_per_thread, + scan_sm_occupancy); +#endif // CUB_DEBUG_LOG + + // Invoke scan_kernel + launcher_factory(scan_grid_size, active_policy.block_threads, 0, stream, /* use_pdl */ true) + .doit(kernel_source.ScanKernel(), d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items); + + // Check for failure to launch + if (const auto error = CubDebug(cudaPeekAtLastError())) + { + return error; + } + + // Sync the stream if specified to flush runtime errors + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + } + + return cudaSuccess; +} + +template < + typename AccumT, + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename InitValueT, + typename OffsetT, + ForceInclusive EnforceInclusive = ForceInclusive::No, + typename PolicySelector = policy_selector_from_types, + detail::it_value_t, + AccumT, + OffsetT, + ScanOpT>, + typename KernelSource = + DeviceScanKernelSource, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_with_accum( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitValueT init_value, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}) -> cudaError_t +{ + return dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + policy_selector, + kernel_source, + launcher_factory); +} +} // namespace detail::scan + CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index eb4cba0ada8..f0ee2a0a236 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 #pragma once @@ -14,6 +14,7 @@ #endif // no system header #include +#include #include CUB_NAMESPACE_BEGIN @@ -82,8 +83,8 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte * @brief Scan kernel entry point (multi-block) * * - * @tparam ChainedPolicyT - * Chained tuning policy + * @tparam PolicySelector + * Policy selector for tuning * * @tparam InputIteratorT * Random-access input iterator type for reading scan inputs @iterator @@ -126,7 +127,7 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte * @paramTotal num_items * number of scan items for the entire problem */ -template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) +__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( InputIteratorT d_in, OutputIteratorT d_out, @@ -146,7 +147,23 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) InitValueT init_value, OffsetT num_items) { - using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT; + static constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + static_assert(policy.load_modifier != CacheLoadModifier::LOAD_LDG, + "The memory consistency model does not apply to texture " + "accesses"); + + using ScanPolicyT = AgentScanPolicy< + policy.block_threads, + policy.items_per_thread, + AccumT, + policy.load_algorithm, + policy.load_modifier, + policy.store_algorithm, + policy.scan_algorithm, + NoScaling, + delay_constructor_t>; // Thread block type for scanning input tiles using AgentScanT = detail::scan::AgentScan< diff --git a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh index 7ae1e0bde7c..a68a2cb746b 100644 --- a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 #pragma once @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -31,129 +32,8 @@ CUB_NAMESPACE_BEGIN namespace detail::radix_sort { -enum class delay_constructor_kind -{ - no_delay, - fixed_delay, - exponential_backoff, - exponential_backoff_jitter, - exponential_backoff_jitter_window, - exponential_backon_jitter_window, - exponential_backon_jitter, - exponential_backon -}; - -#if !_CCCL_COMPILER(NVRTC) -inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kind) -{ - switch (kind) - { - case delay_constructor_kind::no_delay: - return os << "delay_constructor_kind::no_delay"; - case delay_constructor_kind::fixed_delay: - return os << "delay_constructor_kind::fixed_delay"; - case delay_constructor_kind::exponential_backoff: - return os << "delay_constructor_kind::exponential_backoff"; - case delay_constructor_kind::exponential_backoff_jitter: - return os << "delay_constructor_kind::exponential_backoff_jitter"; - case delay_constructor_kind::exponential_backoff_jitter_window: - return os << "delay_constructor_kind::exponential_backoff_jitter_window"; - case delay_constructor_kind::exponential_backon_jitter_window: - return os << "delay_constructor_kind::exponential_backon_jitter_window"; - case delay_constructor_kind::exponential_backon_jitter: - return os << "delay_constructor_kind::exponential_backon_jitter"; - case delay_constructor_kind::exponential_backon: - return os << "delay_constructor_kind::exponential_backon"; - default: - return os << "(kind) << ">"; - } -} -#endif // !_CCCL_COMPILER(NVRTC) - -struct delay_constructor_policy -{ - delay_constructor_kind kind; - unsigned int delay; - unsigned int l2_write_latency; - - _CCCL_API constexpr friend bool operator==(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs) - { - return lhs.kind == rhs.kind && lhs.delay == rhs.delay && lhs.l2_write_latency == rhs.l2_write_latency; - } - - _CCCL_API constexpr friend bool operator!=(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs) - { - return !(lhs == rhs); - } - -#if !_CCCL_COMPILER(NVRTC) - friend ::std::ostream& operator<<(::std::ostream& os, const delay_constructor_policy& p) - { - return os << "delay_constructor_policy { .kind = " << p.kind << ", .delay = " << p.delay - << ", .l2_write_latency = " << p.l2_write_latency << " }"; - } -#endif // !_CCCL_COMPILER(NVRTC) -}; - -template -inline constexpr auto delay_constructor_policy_from_type = 0; - -template -inline constexpr auto delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::no_delay, 0, L2WriteLatency}; - -template -inline constexpr auto delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::fixed_delay, Delay, L2WriteLatency}; - -template -inline constexpr auto delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backoff, Delay, L2WriteLatency}; - -template -inline constexpr auto - delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter, Delay, L2WriteLatency}; - -template -inline constexpr auto - delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter_window, Delay, L2WriteLatency}; - -template -inline constexpr auto - delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, Delay, L2WriteLatency}; - -template -inline constexpr auto delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, Delay, L2WriteLatency}; - -template -inline constexpr auto delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency}; - -// TODO(bgruber): this is modeled after , unify this -template -struct __delay_constructor_t_helper -{ -private: - using delay_constructors = ::cuda::std::__type_list< - detail::no_delay_constructor_t, - detail::fixed_delay_constructor_t, - detail::exponential_backoff_constructor_t, - detail::exponential_backoff_jitter_constructor_t, - detail::exponential_backoff_jitter_window_constructor_t, - detail::exponential_backon_jitter_window_constructor_t, - detail::exponential_backon_jitter_constructor_t, - detail::exponential_backon_constructor_t>; - -public: - using type = ::cuda::std::__type_at_c(Kind), delay_constructors>; -}; - -template -using delay_constructor_t = typename __delay_constructor_t_helper::type; +using detail::scan::make_mem_scaled_scan_policy; +using detail::scan::scan_policy; struct radix_sort_histogram_policy { @@ -268,63 +148,6 @@ _CCCL_API constexpr auto make_reg_scaled_radix_sort_onesweep_policy( store_algorithm}; } -// TODO(bgruber): move this into the scan tuning header -struct scan_policy -{ - int block_threads; - int items_per_thread; - BlockLoadAlgorithm load_algorithm; - CacheLoadModifier load_modifier; - BlockStoreAlgorithm store_algorithm; - BlockScanAlgorithm scan_algorithm; - delay_constructor_policy delay_constructor; - - _CCCL_API constexpr friend bool operator==(const scan_policy& lhs, const scan_policy& rhs) - { - return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread - && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier - && lhs.store_algorithm == rhs.store_algorithm && lhs.scan_algorithm == rhs.scan_algorithm - && lhs.delay_constructor == rhs.delay_constructor; - } - - _CCCL_API constexpr friend bool operator!=(const scan_policy& lhs, const scan_policy& rhs) - { - return !(lhs == rhs); - } - -#if !_CCCL_COMPILER(NVRTC) - friend ::std::ostream& operator<<(::std::ostream& os, const scan_policy& p) - { - return os - << "scan_policy { .block_threads = " << p.block_threads << ", .items_per_thread = " << p.items_per_thread - << ", .load_algorithm = " << p.load_algorithm << ", .load_modifier = " << p.load_modifier - << ", .store_algorithm = " << p.store_algorithm << ", .scan_algorithm = " << p.scan_algorithm - << ", .delay_constructor = " << p.delay_constructor << " }"; - } -#endif // !_CCCL_COMPILER(NVRTC) -}; - -_CCCL_API constexpr auto make_mem_scaled_scan_policy( - int nominal_4b_block_threads, - int nominal_4b_items_per_thread, - int compute_t_size, - BlockLoadAlgorithm load_algorithm, - CacheLoadModifier load_modifier, - BlockStoreAlgorithm store_algorithm, - BlockScanAlgorithm scan_algorithm, - delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}) -> scan_policy -{ - const auto scaled = scale_mem_bound(nominal_4b_block_threads, nominal_4b_items_per_thread, compute_t_size); - return scan_policy{ - scaled.block_threads, - scaled.items_per_thread, - load_algorithm, - load_modifier, - store_algorithm, - scan_algorithm, - delay_constructor}; -} - struct radix_sort_downsweep_policy { int block_threads; @@ -1778,7 +1601,7 @@ struct policy_selector return ::cuda::std::max(value_size, key_size); } - [[nodiscard]] _CCCL_API constexpr auto make_onsweep_small_key_policy(const small_key_tuning_values& tuning) const + [[nodiscard]] _CCCL_API constexpr auto make_onesweep_small_key_policy(const small_key_tuning_values& tuning) const -> radix_sort_policy { const int primary_radix_bits = (key_size > 1) ? 7 : 5; @@ -1921,12 +1744,12 @@ struct policy_selector if (arch >= ::cuda::arch_id::sm_100) { - return make_onsweep_small_key_policy(get_sm100_tuning(key_size, value_size, offset_size, key_type)); + return make_onesweep_small_key_policy(get_sm100_tuning(key_size, value_size, offset_size, key_type)); } if (arch >= ::cuda::arch_id::sm_90) { - return make_onsweep_small_key_policy(get_sm90_tuning(key_size, value_size, offset_size)); + return make_onesweep_small_key_policy(get_sm90_tuning(key_size, value_size, offset_size)); } if (arch >= ::cuda::arch_id::sm_80) diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 986cc800872..7db16f81d65 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 #pragma once @@ -18,16 +18,22 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include #include +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::scan @@ -114,6 +120,62 @@ constexpr _CCCL_HOST_DEVICE offset_size classify_offset_size() return sizeof(OffsetT) == 4 ? offset_size::_4 : sizeof(OffsetT) == 8 ? offset_size::_8 : offset_size::unknown; } +struct scan_policy +{ + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_algorithm; + CacheLoadModifier load_modifier; + BlockStoreAlgorithm store_algorithm; + BlockScanAlgorithm scan_algorithm; + delay_constructor_policy delay_constructor; + + _CCCL_API constexpr friend bool operator==(const scan_policy& lhs, const scan_policy& rhs) + { + return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread + && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier + && lhs.store_algorithm == rhs.store_algorithm && lhs.scan_algorithm == rhs.scan_algorithm + && lhs.delay_constructor == rhs.delay_constructor; + } + + _CCCL_API constexpr friend bool operator!=(const scan_policy& lhs, const scan_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const scan_policy& p) + { + return os + << "scan_policy { .block_threads = " << p.block_threads << ", .items_per_thread = " << p.items_per_thread + << ", .load_algorithm = " << p.load_algorithm << ", .load_modifier = " << p.load_modifier + << ", .store_algorithm = " << p.store_algorithm << ", .scan_algorithm = " << p.scan_algorithm + << ", .delay_constructor = " << p.delay_constructor << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) +}; + +_CCCL_API constexpr auto make_mem_scaled_scan_policy( + int nominal_4b_block_threads, + int nominal_4b_items_per_thread, + int compute_t_size, + BlockLoadAlgorithm load_algorithm, + CacheLoadModifier load_modifier, + BlockStoreAlgorithm store_algorithm, + BlockScanAlgorithm scan_algorithm, + delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}) -> scan_policy +{ + const auto scaled = scale_mem_bound(nominal_4b_block_threads, nominal_4b_items_per_thread, compute_t_size); + return scan_policy{ + scaled.block_threads, + scaled.items_per_thread, + load_algorithm, + load_modifier, + store_algorithm, + scan_algorithm, + delay_constructor}; +} + template (), @@ -255,6 +318,7 @@ struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_si #endif // clang-format on +// TODO(griwes): remove for CCCL 4.0 when we drop the public scan dispatcher template MakeScanPolicyWrapper(PolicyT polic return ScanPolicyWrapper{policy}; } +// TODO(griwes): remove this in CCCL 4.0 when we remove the public scan dispatcher template struct policy_hub { @@ -566,6 +631,418 @@ struct policy_hub using MaxPolicy = Policy1000; }; + +#if _CCCL_HAS_CONCEPTS() +template +concept scan_policy_selector = policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + +constexpr _CCCL_HOST_DEVICE delay_constructor_policy default_delay_constructor_policy(primitive_accum primitive) +{ + return primitive == primitive_accum::yes + ? delay_constructor_policy{delay_constructor_kind::fixed_delay, 350, 450} + : delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450}; +} + +struct policy_selector +{ + int input_value_size; + int output_value_size; + int accum_size; + int offset_size; + type_t accum_type; + op_kind_t operation_t; + primitive_accum primitive_accum_t; + primitive_op primitive_op_t; + bool benchmark_match; + + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy + { + const bool large_values = accum_size > 128; + const BlockLoadAlgorithm scan_transposed_load = + large_values ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; + const BlockStoreAlgorithm scan_transposed_store = + large_values ? BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; + const auto default_delay = default_delay_constructor_policy(primitive_accum_t); + + if (arch >= ::cuda::arch_id::sm_100) + { + if (benchmark_match && operation_t == op_kind_t::plus && primitive_accum_t == primitive_accum::yes) + { + if (offset_size == 4) + { + switch (input_value_size) + { + case 1: + // ipt_18.tpb_512.ns_768.dcid_7.l2w_820.trp_1.ld_0 1.188818 1.005682 1.173041 1.305288 + return make_mem_scaled_scan_policy( + 512, + 18, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 768, 820}); + case 2: + // ipt_13.tpb_512.ns_1384.dcid_7.l2w_720.trp_1.ld_0 1.128443 1.002841 1.119688 1.307692 + return make_mem_scaled_scan_policy( + 512, + 13, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 1384, 720}); + case 4: + // ipt_22.tpb_384.ns_1904.dcid_6.l2w_830.trp_1.ld_0 1.148442 0.997167 1.139902 1.462651 + return make_mem_scaled_scan_policy( + 384, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, 1904, 830}); + case 8: + // ipt_23.tpb_416.ns_772.dcid_5.l2w_710.trp_1.ld_0 1.089468 1.015581 1.085630 1.264583 + return make_mem_scaled_scan_policy( + 416, + 23, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, 772, 710}); + default: + break; + } + } + else if (offset_size == 8) + { + switch (input_value_size) + { + case 1: + // ipt_14.tpb_384.ns_228.dcid_7.l2w_775.trp_1.ld_1 1.107210 1.000000 1.100637 1.307692 + return make_mem_scaled_scan_policy( + 384, + 14, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 228, 775}); + case 2: + // todo(gonidelis): Regresses for large inputs. Find better tuning. + // ipt_13.tpb_288.ns_1520.dcid_5.l2w_895.trp_1.ld_1 1.080934 0.983509 1.077724 1.305288 + break; + case 4: + // ipt_19.tpb_416.ns_956.dcid_7.l2w_550.trp_1.ld_1 1.146142 0.994350 1.137459 1.455636 + return make_mem_scaled_scan_policy( + 416, + 19, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 956, 550}); + case 8: + if (accum_type == type_t::float64) + { + break; + } + // ipt_22.tpb_320.ns_328.dcid_2.l2w_965.trp_1.ld_0 1.080133 1.000000 1.075577 1.248963 + return make_mem_scaled_scan_policy( + 320, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backoff, 328, 965}); + default: + break; + } + } + } + } + + if (arch >= ::cuda::arch_id::sm_90) + { + if (primitive_op_t == primitive_op::yes) + { + if (primitive_accum_t == primitive_accum::yes) + { + switch (accum_size) + { + case 1: + return make_mem_scaled_scan_policy( + 192, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 168, 1140}); + case 2: + return make_mem_scaled_scan_policy( + 512, + 12, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 376, 1125}); + case 4: + if (accum_type == type_t::float32) + { + return make_mem_scaled_scan_policy( + 128, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 688, 1140}); + } + return make_mem_scaled_scan_policy( + 128, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 648, 1245}); + case 8: + if (accum_type == type_t::float64) + { + return make_mem_scaled_scan_policy( + 224, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 576, 1215}); + } + return make_mem_scaled_scan_policy( + 224, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 632, 1290}); + default: + break; + } + } + +#if _CCCL_HAS_INT128() + if (primitive_accum_t == primitive_accum::no && accum_size == 16 + && (accum_type == type_t::int128 || accum_type == type_t::uint128)) + { + return make_mem_scaled_scan_policy( + 576, + 21, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 860, 630}); + } +#endif + } + } + + if (arch >= ::cuda::arch_id::sm_80) + { + if (primitive_op_t == primitive_op::yes) + { + if (primitive_accum_t == primitive_accum::yes) + { + switch (accum_size) + { + case 1: + return make_mem_scaled_scan_policy( + 320, + 14, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 368, 725}); + case 2: + return make_mem_scaled_scan_policy( + 352, + 16, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 488, 1040}); + case 4: + if (accum_type == type_t::float32) + { + return make_mem_scaled_scan_policy( + 288, + 8, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 724, 1050}); + } + return make_mem_scaled_scan_policy( + 320, + 12, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 268, 1180}); + case 8: + if (accum_type == type_t::float64) + { + return make_mem_scaled_scan_policy( + 384, + 12, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 388, 1100}); + } + return make_mem_scaled_scan_policy( + 288, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 716, 785}); + default: + break; + } + } + +#if _CCCL_HAS_INT128() + if (primitive_accum_t == primitive_accum::no && accum_size == 16 + && (accum_type == type_t::int128 || accum_type == type_t::uint128)) + { + return make_mem_scaled_scan_policy( + 640, + 24, + accum_size, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_STORE_DIRECT, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::no_delay, 0, 1200}); + } +#endif + } + } + + if (arch >= ::cuda::arch_id::sm_75) + { + if (benchmark_match && operation_t == op_kind_t::plus && primitive_accum_t == primitive_accum::yes + && offset_size == 8 && input_value_size == 4) + { + // ipt_7.tpb_128.ns_628.dcid_1.l2w_520.trp_1.ld_0 + return make_mem_scaled_scan_policy( + 128, + 7, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 628, 520}); + } + + return make_mem_scaled_scan_policy( + 128, + 15, + accum_size, + scan_transposed_load, + LOAD_DEFAULT, + scan_transposed_store, + BLOCK_SCAN_WARP_SCANS, + default_delay); + } + + if (arch >= ::cuda::arch_id::sm_60) + { + return make_mem_scaled_scan_policy( + 128, + 15, + accum_size, + scan_transposed_load, + LOAD_DEFAULT, + scan_transposed_store, + BLOCK_SCAN_WARP_SCANS, + default_delay); + } + + return make_mem_scaled_scan_policy( + 128, + 12, + accum_size, + BLOCK_LOAD_DIRECT, + LOAD_CA, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING, + default_delay); + } +}; + +#if _CCCL_HAS_CONCEPTS() +static_assert(scan_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() + +// stateless version which can be passed to kernels +template +struct policy_selector_from_types +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy + { + constexpr bool benchmark_match = + sizeof(AccumT) == sizeof(::cuda::std::__accumulator_t) + && sizeof(InputValueT) == sizeof(OutputValueT); + + constexpr auto policies = policy_selector{ + int{sizeof(InputValueT)}, + int{sizeof(OutputValueT)}, + int{sizeof(AccumT)}, + int{sizeof(OffsetT)}, + classify_type, + classify_op, + is_primitive_accum(), + is_primitive_op(), + benchmark_match}; + return policies(arch); + } +}; } // namespace detail::scan CUB_NAMESPACE_END diff --git a/nvbench_helper/nvbench_helper/look_back_helper.cuh b/nvbench_helper/nvbench_helper/look_back_helper.cuh index bee6708a781..f7a97937dcb 100644 --- a/nvbench_helper/nvbench_helper/look_back_helper.cuh +++ b/nvbench_helper/nvbench_helper/look_back_helper.cuh @@ -1,10 +1,10 @@ -// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 #pragma once #if !TUNE_BASE -# include +# include # include @@ -12,15 +12,8 @@ # error "TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS, and TUNE_DELAY_CONSTRUCTOR_ID must be defined" # endif -using delay_constructors = nvbench::type_list< - cub::detail::no_delay_constructor_t, - cub::detail::fixed_delay_constructor_t, - cub::detail::exponential_backoff_constructor_t, - cub::detail::exponential_backoff_jitter_constructor_t, - cub::detail::exponential_backoff_jitter_window_constructor_t, - cub::detail::exponential_backon_jitter_window_constructor_t, - cub::detail::exponential_backon_jitter_constructor_t, - cub::detail::exponential_backon_constructor_t>; - -using delay_constructor_t = nvbench::tl::get; +using delay_constructor_t = + cub::detail::delay_constructor_t(TUNE_DELAY_CONSTRUCTOR_ID), + TUNE_MAGIC_NS, + TUNE_L2_WRITE_LATENCY_NS>; #endif // !TUNE_BASE From 637133997524a88797a9413c6518ab53ca5af8ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Sat, 7 Feb 2026 21:19:20 -0800 Subject: [PATCH 02/31] Update benchmarks. --- .../applications/P1/log-cdf-from-log-pdfs.cu | 60 ++++++++--------- .../P1/non-commutative-bicyclic-monoid.cu | 60 ++++++++--------- .../P1/rabin-karp-second-fingerprinting.cu | 53 +++++++-------- .../scan/applications/P1/running-min-max.cu | 59 ++++++++-------- .../P1/scan-over-unitriangular-group.cu | 60 ++++++++--------- cub/benchmarks/bench/scan/exclusive/base.cuh | 67 +++++-------------- cub/benchmarks/bench/scan/exclusive/by_key.cu | 2 +- cub/benchmarks/bench/scan/policy_selector.h | 31 +++++++++ 8 files changed, 192 insertions(+), 200 deletions(-) create mode 100644 cub/benchmarks/bench/scan/policy_selector.h diff --git a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu index f6acf8ae6c0..92f38e31892 100644 --- a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu +++ b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu @@ -34,26 +34,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* @@ -125,15 +109,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list; - using dispatch_t = - cub::DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); auto mu = static_cast(state.get_float64("Mu{io}")); @@ -161,14 +136,39 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list(elements); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); // for validation, use diff --git a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu index 07eb27be154..8ab3018ddc0 100644 --- a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu +++ b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu @@ -30,26 +30,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* Consider free monoid with two generators, ``q`` and ``p``, modulo defining relationship (``p * q == 1``). @@ -112,15 +96,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list using output_it_t = pair_t*; using offset_t = cub::detail::choose_offset_t; -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector output(elements); @@ -148,14 +123,39 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list cudaStream_t bench_stream = state.get_cuda_stream(); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); } diff --git a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu index 579ef22d211..89421692c6b 100644 --- a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu +++ b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu @@ -35,26 +35,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* Denote epsilon, the identity element, be an empty sequence, and consider @@ -307,15 +291,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector input = generate(elements); @@ -337,13 +312,26 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list( + nullptr, + tmp_size, + inp_it, + out_it, + op_t{p}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( + cub::detail::scan::dispatch_with_accum( d_tmp, tmp_size, inp_it, @@ -351,7 +339,12 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list{} +#endif // !TUNE_BASE + ); }); // for validation uncomment these two lines diff --git a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu index cf37c7f6312..1ae88d78742 100644 --- a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu +++ b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu @@ -37,26 +37,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* Given input sequence of values, compute sequence of @@ -215,15 +199,6 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) using output_it_t = pair_t*; using offset_t = cub::detail::choose_offset_t; -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector output(elements); @@ -241,13 +216,39 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) cudaStream_t bench_stream = state.get_cuda_stream().get_stream(); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, inp_it, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + inp_it, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch(d_tmp, tmp_size, inp_it, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + inp_it, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); // for verification use diff --git a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu index e2afb9ea752..6ebc38a49fc 100644 --- a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu +++ b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu @@ -34,26 +34,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { template @@ -125,15 +109,6 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) using output_it_t = tuple_t*; using offset_t = cub::detail::choose_offset_t; -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); cudaStream_t bench_stream = state.get_cuda_stream().get_stream(); @@ -158,14 +133,39 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) auto d_output = thrust::raw_pointer_cast(output.data()); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); // for validation use (recommended for integral types and smallish input sizes) diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 709f5592678..c2f572c2244 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -24,44 +24,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -template -struct policy_hub_t -{ - template - using agent_policy_t = cub::AgentScanPolicy< - NOMINAL_BLOCK_THREADS_4B, - NOMINAL_ITEMS_PER_THREAD_4B, - ComputeT, - LOAD_ALGORITHM, - LOAD_MODIFIER, - STORE_ALGORITHM, - SCAN_ALGORITHM, - cub::detail::MemBoundScaling, - delay_constructor_t>; - - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanPolicyT = - agent_policy_t; - }; - - using MaxPolicy = policy_t; -}; #endif // TUNE_BASE +#include "../policy_selector.h" + template static void basic(nvbench::state& state, nvbench::type_list) try @@ -73,15 +39,6 @@ try using output_it_t = T*; using offset_t = cub::detail::choose_offset_t; -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); if (sizeof(offset_t) == 4 && elements > std::numeric_limits::max()) { @@ -100,7 +57,7 @@ try state.add_global_memory_writes(elements); size_t tmp_size; - dispatch_t::Dispatch( + cub::detail::scan::dispatch_with_accum( nullptr, tmp_size, d_input, @@ -108,11 +65,16 @@ try op_t{}, wrapped_init_t{T{}}, static_cast(input.size()), - 0 /* stream */); + 0 /* stream */ +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); - thrust::device_vector tmp(tmp_size); + thrust::device_vector tmp(tmp_size, thrust::no_init); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( + cub::detail::scan::dispatch_with_accum( thrust::raw_pointer_cast(tmp.data()), tmp_size, d_input, @@ -120,7 +82,12 @@ try op_t{}, wrapped_init_t{T{}}, static_cast(input.size()), - launch.get_stream()); + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); } catch (const std::bad_alloc&) diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu index 09928667a52..e01363cc712 100644 --- a/cub/benchmarks/bench/scan/exclusive/by_key.cu +++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu @@ -106,7 +106,7 @@ static void scan(nvbench::state& state, nvbench::type_list(elements), 0 /* stream */); - thrust::device_vector tmp(tmp_size); + thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h new file mode 100644 index 00000000000..2d474315534 --- /dev/null +++ b/cub/benchmarks/bench/scan/policy_selector.h @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3 + +#include + +// %RANGE% TUNE_ITEMS ipt 7:24:1 +// %RANGE% TUNE_THREADS tpb 128:1024:32 +// %RANGE% TUNE_MAGIC_NS ns 0:2048:4 +// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 +// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 +// %RANGE% TUNE_TRANSPOSE trp 0:1:1 +// %RANGE% TUNE_LOAD ld 0:1:1 + +#if !TUNE_BASE +template +struct policy_selector +{ + _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::scan::scan_policy + { + return cub::detail::scan::make_mem_scaled_scan_policy( + TUNE_THREADS, + TUNE_ITEMS, + int{sizeof(AccumT)}, + TUNE_LOAD_ALGORITHM, + TUNE_LOAD_MODIFIER, + TUNE_STORE_ALGORITHM, + cub::BLOCK_SCAN_WARP_SCANS, + cub::detail::delay_constructor_policy_from_type); + } +}; +#endif // !TUNE_BASE From 9e346b5e44de1d77caa4a78cfc1efafeb6176d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Sat, 7 Feb 2026 21:22:38 -0800 Subject: [PATCH 03/31] Update copyright years. --- .../bench/scan/applications/P1/log-cdf-from-log-pdfs.cu | 2 +- .../scan/applications/P1/non-commutative-bicyclic-monoid.cu | 2 +- .../scan/applications/P1/rabin-karp-second-fingerprinting.cu | 2 +- cub/benchmarks/bench/scan/applications/P1/running-min-max.cu | 2 +- .../bench/scan/applications/P1/scan-over-unitriangular-group.cu | 2 +- cub/benchmarks/bench/scan/exclusive/base.cuh | 2 +- cub/benchmarks/bench/scan/exclusive/by_key.cu | 2 +- cub/benchmarks/bench/scan/policy_selector.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu index 92f38e31892..c3f5ebcb299 100644 --- a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu +++ b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include diff --git a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu index 8ab3018ddc0..41213cf1b0d 100644 --- a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu +++ b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include diff --git a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu index 89421692c6b..d52ca9a450f 100644 --- a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu +++ b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include diff --git a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu index 1ae88d78742..82a158c0940 100644 --- a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu +++ b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include diff --git a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu index 6ebc38a49fc..02ee4feba28 100644 --- a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu +++ b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index c2f572c2244..a425fe54d95 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 #pragma once diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu index e01363cc712..d1a50605a23 100644 --- a/cub/benchmarks/bench/scan/exclusive/by_key.cu +++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 #include diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h index 2d474315534..92ee907447e 100644 --- a/cub/benchmarks/bench/scan/policy_selector.h +++ b/cub/benchmarks/bench/scan/policy_selector.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 #include From e9467aff52ae243622c9c0e59ade359c34213b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Sat, 7 Feb 2026 21:38:34 -0800 Subject: [PATCH 04/31] c.parallel: centralize the handling of common cub types. --- c/parallel/src/radix_sort.cu | 15 +------- c/parallel/src/reduce.cu | 29 ++------------ c/parallel/src/scan.cu | 61 ++---------------------------- c/parallel/src/segmented_reduce.cu | 30 ++------------- c/parallel/src/util/types.h | 51 ++++++++++++++++++++++++- 5 files changed, 61 insertions(+), 125 deletions(-) diff --git a/c/parallel/src/radix_sort.cu b/c/parallel/src/radix_sort.cu index 84db1db7d93..eef85b4fa0c 100644 --- a/c/parallel/src/radix_sort.cu +++ b/c/parallel/src/radix_sort.cu @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -213,18 +213,7 @@ try std::string offset_t; check(cccl_type_name_from_nvrtc(&offset_t)); - // TODO(bgruber): generalize this somewhere - const auto key_type = [&] { - switch (input_keys_it.value_type.type) - { - case CCCL_FLOAT32: - return cub::detail::type_t::float32; - case CCCL_FLOAT64: - return cub::detail::type_t::float64; - default: - return cub::detail::type_t::other; - } - }(); + const auto key_type = cccl_type_enum_to_cub_type(input_keys_it.value_type.type); const auto policy_sel = cub::detail::radix_sort::policy_selector{ static_cast(input_keys_it.value_type.size), diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 8df0aeb5d8a..c673d3453c3 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -223,31 +223,8 @@ try const auto policy_sel = [&] { using namespace cub::detail; - auto accum_type = type_t::other; - if (accum_t.type == CCCL_FLOAT32) - { - accum_type = type_t::float32; - } - else if (accum_t.type == CCCL_FLOAT64) - { - accum_type = type_t::float64; - } - - auto operation_t = op_kind_t::other; - switch (op.type) - { - case CCCL_PLUS: - operation_t = op_kind_t::plus; - break; - case CCCL_MINIMUM: - operation_t = op_kind_t::min; - break; - case CCCL_MAXIMUM: - operation_t = op_kind_t::max; - break; - default: - break; - } + const auto accum_type = cccl_type_enum_to_cub_type(accum_t.type); + const auto operation_t = cccl_op_kind_to_cub_op(op.type); const int offset_size = int{sizeof(OffsetT)}; return cub::detail::reduce::policy_selector{accum_type, operation_t, offset_size, static_cast(accum_t.size)}; diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 215b232a270..6a4cb529f7c 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -205,64 +205,12 @@ try const auto output_it_value_t = cccl_type_enum_to_name(output_it.value_type.type); const auto policy_sel = [&] { - using cub::detail::op_kind_t; - using cub::detail::type_t; using cub::detail::scan::policy_selector; using cub::detail::scan::primitive_accum; using cub::detail::scan::primitive_op; - auto accum_type = type_t::other; - switch (accum_t.type) - { - case CCCL_INT8: - accum_type = type_t::int8; - break; - case CCCL_INT16: - accum_type = type_t::int16; - break; - case CCCL_INT32: - accum_type = type_t::int32; - break; - case CCCL_INT64: - accum_type = type_t::int64; - break; - case CCCL_UINT8: - accum_type = type_t::uint8; - break; - case CCCL_UINT16: - accum_type = type_t::uint16; - break; - case CCCL_UINT32: - accum_type = type_t::uint32; - break; - case CCCL_UINT64: - accum_type = type_t::uint64; - break; - case CCCL_FLOAT32: - accum_type = type_t::float32; - break; - case CCCL_FLOAT64: - accum_type = type_t::float64; - break; - default: - break; - } - - auto operation_t = op_kind_t::other; - switch (op.type) - { - case CCCL_PLUS: - operation_t = op_kind_t::plus; - break; - case CCCL_MINIMUM: - operation_t = op_kind_t::min; - break; - case CCCL_MAXIMUM: - operation_t = op_kind_t::max; - break; - default: - break; - } + const auto accum_type = cccl_type_enum_to_cub_type(accum_t.type); + const auto operation_t = cccl_op_kind_to_cub_op(op.type); auto primitive_accum_t = primitive_accum::no; switch (accum_t.type) @@ -285,10 +233,7 @@ try break; } - const auto primitive_op_t = - (op.type == CCCL_PLUS || op.type == CCCL_MINIMUM || op.type == CCCL_MAXIMUM) - ? primitive_op::yes - : primitive_op::no; + const auto primitive_op_t = (operation_t == cub::detail::op_kind_t::other) ? primitive_op::no : primitive_op::yes; const auto input_type = input_it.value_type.type; const auto output_type = output_it.value_type.type; diff --git a/c/parallel/src/segmented_reduce.cu b/c/parallel/src/segmented_reduce.cu index 93ca88066c6..a07c696ddfb 100644 --- a/c/parallel/src/segmented_reduce.cu +++ b/c/parallel/src/segmented_reduce.cu @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -154,35 +154,11 @@ try // OffsetT is checked to match have 64-bit size const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64); - // TODO(bgruber): share this with reduce.cu const auto policy_sel = [&] { using namespace cub::detail; - auto accum_type = type_t::other; - if (accum_t.type == CCCL_FLOAT32) - { - accum_type = type_t::float32; - } - else if (accum_t.type == CCCL_FLOAT64) - { - accum_type = type_t::float64; - } - - auto operation_t = op_kind_t::other; - switch (op.type) - { - case CCCL_PLUS: - operation_t = op_kind_t::plus; - break; - case CCCL_MINIMUM: - operation_t = op_kind_t::min; - break; - case CCCL_MAXIMUM: - operation_t = op_kind_t::max; - break; - default: - break; - } + const auto accum_type = cccl_type_enum_to_cub_type(accum_t.type); + const auto operation_t = cccl_op_kind_to_cub_op(op.type); const int offset_size = int{sizeof(OffsetT)}; return cub::detail::segmented_reduce::policy_selector{ diff --git a/c/parallel/src/util/types.h b/c/parallel/src/util/types.h index 10408939f80..7ff767b4a94 100644 --- a/c/parallel/src/util/types.h +++ b/c/parallel/src/util/types.h @@ -4,12 +4,14 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// #pragma once +#include + #include #include @@ -107,3 +109,50 @@ std::string cccl_type_enum_to_name(cccl_type_enum type, bool is_pointer = false) return result; } + +inline constexpr cub::detail::type_t cccl_type_enum_to_cub_type(cccl_type_enum type) +{ + switch (type) + { + case CCCL_INT8: + return cub::detail::type_t::int8; + case CCCL_INT16: + return cub::detail::type_t::int16; + case CCCL_INT32: + return cub::detail::type_t::int32; + case CCCL_INT64: + return cub::detail::type_t::int64; + case CCCL_UINT8: + return cub::detail::type_t::uint8; + case CCCL_UINT16: + return cub::detail::type_t::uint16; + case CCCL_UINT32: + return cub::detail::type_t::uint32; + case CCCL_UINT64: + return cub::detail::type_t::uint64; + case CCCL_FLOAT32: + return cub::detail::type_t::float32; + case CCCL_FLOAT64: + return cub::detail::type_t::float64; + case CCCL_FLOAT16: + case CCCL_STORAGE: + case CCCL_BOOLEAN: + default: + return cub::detail::type_t::other; + } +} + +inline constexpr cub::detail::op_kind_t cccl_op_kind_to_cub_op(cccl_op_kind_t type) +{ + switch (type) + { + case CCCL_PLUS: + return cub::detail::op_kind_t::plus; + case CCCL_MINIMUM: + return cub::detail::op_kind_t::min; + case CCCL_MAXIMUM: + return cub::detail::op_kind_t::max; + default: + return cub::detail::op_kind_t::other; + } +} From c4c0c0982f69df07e9cf6951f369110be498ebfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 12 Feb 2026 18:09:39 -0800 Subject: [PATCH 05/31] Resolve review comments. --- c/parallel/src/scan.cu | 46 ++++++------------- cub/benchmarks/bench/scan/policy_selector.h | 4 +- cub/cub/device/dispatch/dispatch_scan.cuh | 35 ++++++++++---- .../device/dispatch/tuning/tuning_scan.cuh | 8 ++-- 4 files changed, 46 insertions(+), 47 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 6a4cb529f7c..57edaf9ab6e 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -389,39 +389,19 @@ CUresult cccl_device_scan( CUdevice cu_device; check(cuCtxGetDevice(&cu_device)); - if constexpr (std::is_same_v) - { - auto exec_status = cub::detail::scan::dispatch_with_accum( - d_temp_storage, - *temp_storage_bytes, - indirect_arg_t{d_in}, - indirect_arg_t{d_out}, - indirect_arg_t{op}, - init, - static_cast(num_items), - stream, - *static_cast(build.runtime_policy), - scan::scan_kernel_source{build}, - cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}); - error = static_cast(exec_status); - } - else - { - indirect_arg_t init_arg{init}; - auto exec_status = cub::detail::scan::dispatch_with_accum( - d_temp_storage, - *temp_storage_bytes, - indirect_arg_t{d_in}, - indirect_arg_t{d_out}, - indirect_arg_t{op}, - init_arg, - static_cast(num_items), - stream, - *static_cast(build.runtime_policy), - scan::scan_kernel_source{build}, - cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}); - error = static_cast(exec_status); - } + auto exec_status = cub::detail::scan::dispatch_with_accum( + d_temp_storage, + *temp_storage_bytes, + indirect_arg_t{d_in}, + indirect_arg_t{d_out}, + indirect_arg_t{op}, + std::conditional_t, cub::NullType, indirect_arg_t>{init}, + static_cast(num_items), + stream, + *static_cast(build.runtime_policy), + scan::scan_kernel_source{build}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}); + error = static_cast(exec_status); } catch (const std::exception& exc) { diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h index 92ee907447e..386e4b41278 100644 --- a/cub/benchmarks/bench/scan/policy_selector.h +++ b/cub/benchmarks/bench/scan/policy_selector.h @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. -// SPDX-License-Identifier: BSD-3 +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index fe3134e60e9..560ad485d28 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -319,8 +319,11 @@ struct DispatchScan #endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors - launcher_factory(init_grid_size, INIT_KERNEL_THREADS, 0, stream, /* use_pdl */ true) - .doit(init_kernel, tile_state, num_tiles); + if (const auto error = CubDebug(launcher_factory(init_grid_size, INIT_KERNEL_THREADS, 0, stream, /* use_pdl */ true) + .doit(init_kernel, tile_state, num_tiles))) + { + return error; + } // Check for failure to launch if (const auto error = CubDebug(cudaPeekAtLastError())) @@ -366,8 +369,12 @@ struct DispatchScan #endif // CUB_DEBUG_LOG // Invoke scan_kernel - launcher_factory(scan_grid_size, policy.Scan().BlockThreads(), 0, stream, /* use_pdl */ true) - .doit(scan_kernel, d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items); + if (const auto error = CubDebug( + launcher_factory(scan_grid_size, policy.Scan().BlockThreads(), 0, stream, /* use_pdl */ true) + .doit(scan_kernel, d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items))) + { + return error; + } // Check for failure to launch if (const auto error = CubDebug(cudaPeekAtLastError())) @@ -521,6 +528,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( } const scan_policy active_policy = policy_selector(arch_id); +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET(NV_IS_HOST, + (std::stringstream ss; ss << active_policy; + _CubLog("Dispatching DeviceScan to arch %d with tuning: %s\n", (int) arch_id, ss.str().c_str());)) +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) // Number of input tiles const int tile_size = active_policy.block_threads * active_policy.items_per_thread; @@ -565,8 +577,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( #endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors - launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true) - .doit(kernel_source.InitKernel(), tile_state, num_tiles); + if (const auto error = CubDebug(launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true) + .doit(kernel_source.InitKernel(), tile_state, num_tiles))) + { + return error; + } // Check for failure to launch if (const auto error = CubDebug(cudaPeekAtLastError())) @@ -612,8 +627,12 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( #endif // CUB_DEBUG_LOG // Invoke scan_kernel - launcher_factory(scan_grid_size, active_policy.block_threads, 0, stream, /* use_pdl */ true) - .doit(kernel_source.ScanKernel(), d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items); + if (const auto error = CubDebug( + launcher_factory(scan_grid_size, active_policy.block_threads, 0, stream, /* use_pdl */ true) + .doit(kernel_source.ScanKernel(), d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items))) + { + return error; + } // Check for failure to launch if (const auto error = CubDebug(cudaPeekAtLastError())) diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 7db16f81d65..559111249dd 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -652,12 +652,14 @@ struct policy_selector int offset_size; type_t accum_type; op_kind_t operation_t; - primitive_accum primitive_accum_t; - primitive_op primitive_op_t; + // TODO(griwes): remove this field before policy_selector is publicly exposed bool benchmark_match; [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy { + primitive_accum primitive_accum_t = accum_type != type_t::other && accum_type != type_t::int128; + primitive_op primitive_op_t = operation_t != op_kind_t::other; + const bool large_values = accum_size > 128; const BlockLoadAlgorithm scan_transposed_load = large_values ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; @@ -1037,8 +1039,6 @@ struct policy_selector_from_types int{sizeof(OffsetT)}, classify_type, classify_op, - is_primitive_accum(), - is_primitive_op(), benchmark_match}; return policies(arch); } From 2c2db7cccc947fc1313dce798f32f8d2d52c5bd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 12 Feb 2026 18:11:11 -0800 Subject: [PATCH 06/31] Fix c.parallel radix_sort breakage. --- c/parallel/src/radix_sort.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/c/parallel/src/radix_sort.cu b/c/parallel/src/radix_sort.cu index eef85b4fa0c..c4b9717d7b5 100644 --- a/c/parallel/src/radix_sort.cu +++ b/c/parallel/src/radix_sort.cu @@ -245,6 +245,8 @@ struct __align__({3}) values_storage_t {{ using device_radix_sort_policy = {5}; using namespace cub; using namespace cub::detail::radix_sort; +using cub::detail::delay_constructor_policy; +using cub::detail::delay_constructor_kind; static_assert(device_radix_sort_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6}, "Host generated and JIT compiled policy mismatch"); )XXX", input_keys_it.value_type.size, // 0 From 2a0ddf4171762df497d1e788ca0fc4af8be66c2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 19 Feb 2026 01:15:37 +0100 Subject: [PATCH 07/31] Compilation fixes. --- cub/cub/device/dispatch/dispatch_scan.cuh | 2 +- cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 4652cecd3e7..7617eee9345 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -510,7 +510,7 @@ struct DispatchScan // TODO(bgruber): we probably need to ensure alignment of d_temp_storage _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), ""); - constexpr scan_warpspeed_policy warpspeed_policy = detail::scan::make_scan_warpspeed_policy(); + constexpr auto warpspeed_policy = detail::scan::make_scan_warpspeed_policy(); constexpr int smem_size_1_stage = detail::scan::smem_for_stages(warpspeed_policy, 1); static_assert(smem_size_1_stage <= detail::max_smem_per_block); // this is ensured by scan_use_warpspeed diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index 5b40fcc4d81..15d68d6ac90 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -82,6 +82,12 @@ struct warpspeedKernelPolicy static constexpr int num_total_threads = NumTotalThreads; static constexpr int items_per_thread = ItemsPerThread; static constexpr int tile_size = TileSize; + + static constexpr int num_reduce_warps = NumReduceWarps; + static constexpr int num_scan_stor_warps = NumScanStorWarps; + static constexpr int num_load_warps = NumLoadWarps; + static constexpr int num_sched_warps = NumSchedWarps; + static constexpr int num_look_ahead_warps = NumLookAheadWarps; }; template @@ -854,7 +860,7 @@ _CCCL_API constexpr auto smem_for_stages( (void) output_size; const auto counts = make_scan_stage_counts(num_stages); - const int align_inout = ::cuda::std::max(16, input_align, output_align); + const int align_inout = ::cuda::std::max({16, input_align, output_align}); const int inout_bytes = policy.tile_size * input_size + 16; const auto reduce_squad = policy.squadReduce(); const int sum_thread_warp = (reduce_squad.threadCount() + reduce_squad.warpCount()) * accum_size; From 22ece56ea5f43da424d7f5d9bc966ffb90cb49c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 19 Feb 2026 02:20:14 +0100 Subject: [PATCH 08/31] Go through dispatch_arch, unify dispatch paths for scan. --- cub/cub/device/dispatch/dispatch_scan.cuh | 522 ++++++++---------- .../device/dispatch/tuning/tuning_scan.cuh | 14 +- cub/test/catch2_test_env_launch_helper.h | 47 ++ 3 files changed, 275 insertions(+), 308 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 7617eee9345..672263e477e 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -23,6 +23,7 @@ #endif // no system header #include +#include #include #include #include @@ -471,20 +472,22 @@ struct DispatchScan return cudaSuccess; } -#if __cccl_ptx_isa >= 860 - template - CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm(ActivePolicyT) +#if _CCCL_CUDACC_AT_LEAST(12, 8) + template + CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm( + const detail::scan::scan_warpspeed_policy& warpspeed_policy, const PolicySelectorT& policy_selector) { - using InputT = ::cuda::std::iter_value_t; - using OutputT = ::cuda::std::iter_value_t; - using WarpspeedPolicy = typename ActivePolicyT::WarpspeedPolicy; - const int grid_dim = - static_cast(::cuda::ceil_div(num_items, static_cast(WarpspeedPolicy::tile_size))); + static_cast(::cuda::ceil_div(num_items, static_cast(warpspeed_policy.tile_size))); if (d_temp_storage == nullptr) { - temp_storage_bytes = grid_dim * kernel_source.look_ahead_tile_state_size(); + temp_storage_bytes = static_cast(grid_dim) * kernel_source.look_ahead_tile_state_size(); + return cudaSuccess; + } + + if (num_items == 0) + { return cudaSuccess; } @@ -496,8 +499,8 @@ struct DispatchScan // number of stages to have an even workload across all SMs (improves small problem sizes), assuming 1 CTA per SM // +1 since it tends to improve performance // TODO(bgruber): make the +1 a tuning parameter - [[maybe_unused]] const int max_stages_for_even_workload = - static_cast(::cuda::ceil_div(num_items, static_cast(sm_count * WarpspeedPolicy::tile_size)) + 1); + const int max_stages_for_even_workload = + static_cast(::cuda::ceil_div(num_items, static_cast(sm_count * warpspeed_policy.tile_size)) + 1); // Maximum dynamic shared memory size that we can use for temporary storage. int max_dynamic_smem_size{}; @@ -510,20 +513,31 @@ struct DispatchScan // TODO(bgruber): we probably need to ensure alignment of d_temp_storage _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), ""); - constexpr auto warpspeed_policy = detail::scan::make_scan_warpspeed_policy(); - constexpr int smem_size_1_stage = detail::scan::smem_for_stages(warpspeed_policy, 1); - static_assert(smem_size_1_stage <= detail::max_smem_per_block); // this is ensured by scan_use_warpspeed - auto scan_kernel = kernel_source.ScanKernel(); int num_stages = 1; - int smem_size = smem_size_1_stage; + int smem_size = detail::scan::smem_for_stages( + warpspeed_policy, + num_stages, + policy_selector.input_value_size, + policy_selector.input_value_alignment, + policy_selector.output_value_size, + policy_selector.output_value_alignment, + policy_selector.accum_size, + policy_selector.accum_alignment); // When launched from the host, maximize the number of stages that we can fit inside the shared memory. NV_IF_TARGET(NV_IS_HOST, ({ while (num_stages <= max_stages_for_even_workload) { - const auto next_smem_size = - detail::scan::smem_for_stages(warpspeed_policy, num_stages + 1); + const auto next_smem_size = detail::scan::smem_for_stages( + warpspeed_policy, + num_stages + 1, + policy_selector.input_value_size, + policy_selector.input_value_alignment, + policy_selector.output_value_size, + policy_selector.output_value_alignment, + policy_selector.accum_size, + policy_selector.accum_alignment); if (next_smem_size > max_dynamic_smem_size) { // This number of stages failed, so stay at the current settings @@ -576,7 +590,7 @@ struct DispatchScan // Invoke scan kernel { - constexpr int block_dim = WarpspeedPolicy::num_total_threads; + const int block_dim = warpspeed_policy.num_total_threads; # ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", grid_dim, block_dim, smem_size, (long long) stream); @@ -612,26 +626,171 @@ struct DispatchScan return cudaSuccess; } -#endif // __cccl_ptx_isa >= 860 +#endif // _CCCL_CUDACC_AT_LEAST(12, 8) - template - CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {}) + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + __invoke(PolicyGetter policy_getter, const PolicySelectorT& policy_selector) { -#if __cccl_ptx_isa >= 860 - if constexpr (detail::scan::scan_use_warpspeed< - ActivePolicyT, - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, - AccumT>) + CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter(); + + CUB_DETAIL_STATIC_ISH_ASSERT(active_policy.load_modifier != CacheLoadModifier::LOAD_LDG, + "The memory consistency model does not apply to texture accesses"); + +#if !_CCCL_CUDACC_AT_LEAST(12, 8) + (void) policy_selector; +#endif // !_CCCL_CUDACC_AT_LEAST(12, 8) + +#if _CCCL_CUDACC_AT_LEAST(12, 8) + if (kernel_source.use_warpspeed(active_policy)) + { + return __invoke_lookahead_algorithm(*active_policy.warpspeed, policy_selector); + } +#endif // _CCCL_CUDACC_AT_LEAST(12, 8) + + // Number of input tiles + const int tile_size = active_policy.block_threads * active_policy.items_per_thread; + const int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + + auto tile_state = kernel_source.TileState(); + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (const auto error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0]))) + { + return error; // bytes needed for tile status descriptors + } + + // Compute allocation pointers into the single storage blob (or compute + // the necessary size of the blob) + void* allocations[1] = {}; + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { - return __invoke_lookahead_algorithm(active_policy); + return error; } - else -#endif // __cccl_ptx_isa >= 860 + + // Return if the caller is simply requesting the size of the storage allocation, or the problem is empty + if (d_temp_storage == nullptr || num_items == 0) { - return Invoke( - kernel_source.InitKernel(), kernel_source.ScanKernel(), detail::scan::MakeScanPolicyWrapper(active_policy)); + return cudaSuccess; } + + // Construct the tile status interface + if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) + { + return error; + } + + // Log init_kernel configuration + constexpr int init_kernel_threads = 128; + const int init_grid_size = ::cuda::ceil_div(num_tiles, init_kernel_threads); + +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); +#endif // CUB_DEBUG_LOG + + // Invoke init_kernel to initialize tile descriptors + if (const auto error = CubDebug( + launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true) + .doit(kernel_source.InitKernel(), kernel_source.make_tile_state_kernel_arg(tile_state), num_tiles))) + { + return error; + } + + // Check for failure to launch + if (const auto error = CubDebug(cudaPeekAtLastError())) + { + return error; + } + + // Sync the stream if specified to flush runtime errors + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (const auto error = CubDebug( + launcher_factory.MaxSmOccupancy(scan_sm_occupancy, kernel_source.ScanKernel(), active_policy.block_threads))) + { + return error; + } + + // Get max x-dimension of grid + int max_dim_x; + if (const auto error = CubDebug(launcher_factory.MaxGridDimX(max_dim_x))) + { + return error; + } + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + const int scan_grid_size = ::cuda::std::min(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { +// Log scan_kernel configuration +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " + "per thread, %d SM occupancy\n", + start_tile, + scan_grid_size, + active_policy.block_threads, + (long long) stream, + active_policy.items_per_thread, + scan_sm_occupancy); +#endif // CUB_DEBUG_LOG + + // Invoke scan_kernel + if (const auto error = CubDebug( + launcher_factory(scan_grid_size, active_policy.block_threads, 0, stream, /* use_pdl */ true) + .doit(kernel_source.ScanKernel(), + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(d_in), + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(d_out), + kernel_source.make_tile_state_kernel_arg(tile_state), + start_tile, + scan_op, + init_value, + num_items, + /* num_stages, unused */ 1))) + { + return error; + } + + // Check for failure to launch + if (const auto error = CubDebug(cudaPeekAtLastError())) + { + return error; + } + + // Sync the stream if specified to flush runtime errors + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + } + + return cudaSuccess; + } + + template + CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT = {}) + { + struct policy_getter + { + _CCCL_API _CCCL_FORCEINLINE constexpr auto operator()() const + { + return detail::scan::convert_policy(); + } + }; + + using policy_selector_t = detail::scan::policy_selector_from_types< + detail::it_value_t, + detail::it_value_t, + AccumT, + OffsetT, + ScanOpT>; + return __invoke(policy_getter{}, policy_selector_t{}); } /** @@ -761,280 +920,41 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( return error; } - const scan_policy active_policy = policy_selector(arch_id); #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) NV_IF_TARGET(NV_IS_HOST, - (std::stringstream ss; ss << active_policy; + (std::stringstream ss; ss << policy_selector(arch_id); _CubLog("Dispatching DeviceScan to arch %d with tuning: %s\n", (int) arch_id, ss.str().c_str());)) #endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) -#if _CCCL_CUDACC_AT_LEAST(12, 8) - if (kernel_source.use_warpspeed(active_policy)) - { - const int grid_dim = - static_cast(::cuda::ceil_div(num_items, static_cast(active_policy.warpspeed->tile_size))); - - if (d_temp_storage == nullptr) - { - temp_storage_bytes = static_cast(grid_dim) * kernel_source.look_ahead_tile_state_size(); - return cudaSuccess; - } - - if (num_items == 0) - { - return cudaSuccess; - } - - int sm_count = 0; - if (const auto error = CubDebug(launcher_factory.MultiProcessorCount(sm_count))) - { - return error; - } - - const int max_stages_for_even_workload = static_cast( - ::cuda::ceil_div(num_items, static_cast(sm_count * active_policy.warpspeed->tile_size)) + 1); - - int max_dynamic_smem_size{}; - if (const auto error = - CubDebug(launcher_factory.max_dynamic_smem_size_for(max_dynamic_smem_size, kernel_source.ScanKernel()))) - { - return error; - } - - _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), ""); - - auto scan_kernel = kernel_source.ScanKernel(); - int num_stages = 1; - int smem_size = detail::scan::smem_for_stages( - *active_policy.warpspeed, - num_stages, - policy_selector.input_value_size, - policy_selector.input_value_alignment, - policy_selector.output_value_size, - policy_selector.output_value_alignment, - policy_selector.accum_size, - policy_selector.accum_alignment); - - NV_IF_TARGET(NV_IS_HOST, ({ - while (num_stages <= max_stages_for_even_workload) - { - const auto next_smem_size = detail::scan::smem_for_stages( - *active_policy.warpspeed, - num_stages + 1, - policy_selector.input_value_size, - policy_selector.input_value_alignment, - policy_selector.output_value_size, - policy_selector.output_value_alignment, - policy_selector.accum_size, - policy_selector.accum_alignment); - if (next_smem_size > max_dynamic_smem_size) - { - break; - } - - smem_size = next_smem_size; - ++num_stages; - } - - if (const auto error = launcher_factory.set_max_dynamic_smem_size_for(scan_kernel, smem_size)) - { - return error; - } - })) - - // Invoke init kernel - { - constexpr auto init_kernel_threads = 128; - const auto init_grid_size = ::cuda::ceil_div(grid_dim, init_kernel_threads); - -# ifdef CUB_DEBUG_LOG - _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, , %lld>>>()\n", - init_grid_size, - init_kernel_threads, - (long long) stream); -# endif // CUB_DEBUG_LOG - - if (const auto error = CubDebug( - launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true) - .doit(kernel_source.InitKernel(), - kernel_source.look_ahead_make_tile_state_kernel_arg(d_temp_storage), - grid_dim))) - { - return error; - } - - if (const auto error = CubDebug(cudaPeekAtLastError())) - { - return error; - } - - if (const auto error = CubDebug(detail::DebugSyncStream(stream))) - { - return error; - } - } - - // Invoke scan kernel - { - const int block_dim = active_policy.warpspeed->num_total_threads; - -# ifdef CUB_DEBUG_LOG - _CubLog("Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", grid_dim, block_dim, smem_size, (long long) stream); -# endif // CUB_DEBUG_LOG - - if (const auto error = CubDebug( - launcher_factory(grid_dim, block_dim, smem_size, stream, /* use_pdl */ true) - .doit(scan_kernel, - d_in, - d_out, - kernel_source.look_ahead_make_tile_state_kernel_arg(d_temp_storage), - /* start_tile, unused */ 0, - ::cuda::std::move(scan_op), - init_value, - num_items, - num_stages))) - { - return error; - } - - if (const auto error = CubDebug(cudaPeekAtLastError())) - { - return error; - } - - if (const auto error = CubDebug(detail::DebugSyncStream(stream))) - { - return error; - } - } - - return cudaSuccess; - } -#endif // _CCCL_CUDACC_AT_LEAST(12, 8) - - // Number of input tiles - const int tile_size = active_policy.block_threads * active_policy.items_per_thread; - const int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); - - auto tile_state = kernel_source.TileState(); - - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; - if (const auto error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0]))) + struct fake_policy { - return error; // bytes needed for tile status descriptors - } - - // Compute allocation pointers into the single storage blob (or compute - // the necessary size of the blob) - void* allocations[1] = {}; - if (const auto error = - CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) - { - return error; - } - - // Return if the caller is simply requesting the size of the storage allocation, or the problem is empty - if (d_temp_storage == nullptr || num_items == 0) - { - return cudaSuccess; - } - - // Construct the tile status interface - if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) - { - return error; - } - - // Log init_kernel configuration - constexpr int init_kernel_threads = 128; - const int init_grid_size = ::cuda::ceil_div(num_tiles, init_kernel_threads); - -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); -#endif // CUB_DEBUG_LOG - - // Invoke init_kernel to initialize tile descriptors - if (const auto error = CubDebug( - launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true) - .doit(kernel_source.InitKernel(), kernel_source.make_tile_state_kernel_arg(tile_state), num_tiles))) - { - return error; - } - - // Check for failure to launch - if (const auto error = CubDebug(cudaPeekAtLastError())) - { - return error; - } - - // Sync the stream if specified to flush runtime errors - if (const auto error = CubDebug(detail::DebugSyncStream(stream))) - { - return error; - } - - // Get SM occupancy for scan_kernel - int scan_sm_occupancy; - if (const auto error = CubDebug( - launcher_factory.MaxSmOccupancy(scan_sm_occupancy, kernel_source.ScanKernel(), active_policy.block_threads))) - { - return error; - } - - // Get max x-dimension of grid - int max_dim_x; - if (const auto error = CubDebug(launcher_factory.MaxGridDimX(max_dim_x))) - { - return error; - } - - // Run grids in epochs (in case number of tiles exceeds max x-dimension - const int scan_grid_size = ::cuda::std::min(num_tiles, max_dim_x); - for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) - { -// Log scan_kernel configuration -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " - "per thread, %d SM occupancy\n", - start_tile, - scan_grid_size, - active_policy.block_threads, - (long long) stream, - active_policy.items_per_thread, - scan_sm_occupancy); -#endif // CUB_DEBUG_LOG - - // Invoke scan_kernel - if (const auto error = CubDebug( - launcher_factory(scan_grid_size, active_policy.block_threads, 0, stream, /* use_pdl */ true) - .doit(kernel_source.ScanKernel(), - d_in, - d_out, - kernel_source.make_tile_state_kernel_arg(tile_state), - start_tile, - scan_op, - init_value, - num_items, - /* num_stages, unused */ 1))) - { - return error; - } - - // Check for failure to launch - if (const auto error = CubDebug(cudaPeekAtLastError())) - { - return error; - } - - // Sync the stream if specified to flush runtime errors - if (const auto error = CubDebug(detail::DebugSyncStream(stream))) - { - return error; - } - } - - return cudaSuccess; + using MaxPolicy = void; + }; + + return dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) { + return DispatchScan{ + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + scan_op, + init_value, + stream, + -1 /* ptx_version, not used actually */, + kernel_source, + launcher_factory} + .__invoke(policy_getter, policy_selector); + }); } template < diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index bff5bf3bc85..baa72c09e63 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -202,13 +202,13 @@ struct scan_policy << ", .delay_constructor = " << p.delay_constructor; if (p.warpspeed) { - os << ", scan_warpspeed_policy { .num_squads = " << scan_warpspeed_policy::num_squads << ", .num_reduce_warps = " - << p.warpspeed->num_reduce_warps << ", .num_scan_stor_warps = " << p.warpspeed->num_scan_stor_warps - << ", .num_load_warps = " << p.warpspeed->num_load_warps << ", .num_sched_warps = " - << p.warpspeed->num_sched_warps << ", .num_look_ahead_warps = " << p.warpspeed->num_look_ahead_warps - << ", .num_look_ahead_items = " << p.warpspeed->num_look_ahead_items << ", .num_total_threads = " - << p.warpspeed->num_total_threads << ", .items_per_thread = " << p.warpspeed->items_per_thread - << ", .tile_size = " << p.warpspeed->tile_size << "}"; + os << ", .warpspeed = scan_warpspeed_policy { .num_reduce_warps = " << p.warpspeed->num_reduce_warps + << ", .num_scan_stor_warps = " << p.warpspeed->num_scan_stor_warps << ", .num_load_warps = " + << p.warpspeed->num_load_warps << ", .num_sched_warps = " << p.warpspeed->num_sched_warps + << ", .num_look_ahead_warps = " << p.warpspeed->num_look_ahead_warps << ", .num_look_ahead_items = " + << p.warpspeed->num_look_ahead_items << ", .num_total_threads = " << p.warpspeed->num_total_threads + << ", .items_per_thread = " << p.warpspeed->items_per_thread << ", .tile_size = " << p.warpspeed->tile_size + << "}"; } return os << " }"; } diff --git a/cub/test/catch2_test_env_launch_helper.h b/cub/test/catch2_test_env_launch_helper.h index 886e53f35ed..454a29b28e1 100644 --- a/cub/test/catch2_test_env_launch_helper.h +++ b/cub/test/catch2_test_env_launch_helper.h @@ -3,6 +3,8 @@ #pragma once +#include + #include #include @@ -149,6 +151,51 @@ struct stream_registry_factory_t // Get max grid dimension return cudaDeviceGetAttribute(&max_grid_dim_x, cudaDevAttrMaxGridDimX, device_ordinal); } + + CUB_RUNTIME_FUNCTION cudaError_t MemsetAsync(void* dst, unsigned char value, size_t num_bytes, cudaStream_t stream) + { + return cudaMemsetAsync(dst, value, num_bytes, stream); + } + + CUB_RUNTIME_FUNCTION cudaError_t + MemcpyAsync(void* dst, const void* src, size_t num_bytes, cudaMemcpyKind kind, cudaStream_t stream) + { + return cudaMemcpyAsync(dst, src, num_bytes, kind, stream); + } + + CUB_RUNTIME_FUNCTION cudaError_t MaxSharedMemory(int& max_shared_memory) const + { + int device = 0; + auto error = cudaGetDevice(&device); + if (error != cudaSuccess) + { + return error; + } + + return cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, device); + } + + template + CUB_RUNTIME_FUNCTION cudaError_t max_dynamic_smem_size_for(int& max_dynamic_smem_size, Kernel kernel_ptr) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, // + ({ return cub::MaxPotentialDynamicSmemBytes(max_dynamic_smem_size, kernel_ptr); }), + ({ + cudaFuncAttributes func_attrs{}; + if (const auto error = cudaFuncGetAttributes(&func_attrs, kernel_ptr)) + { + return error; + } + max_dynamic_smem_size = func_attrs.maxDynamicSharedSizeBytes; + return cudaSuccess; + })) + } + + template + CUB_RUNTIME_FUNCTION cudaError_t set_max_dynamic_smem_size_for(Kernel kernel_ptr, int smem_size) + { + return cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + } }; struct stream_scope From d7f53336a4329d959d382e673f4f2616b1ad6d31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 19 Feb 2026 02:34:06 +0100 Subject: [PATCH 09/31] Remove cuda::std::optional from policies. --- c/parallel/src/scan.cu | 2 +- cub/cub/device/dispatch/dispatch_scan.cuh | 8 ++-- .../device/dispatch/kernels/kernel_scan.cuh | 24 +++++------ .../kernels/kernel_scan_warpspeed.cuh | 1 + .../device/dispatch/tuning/tuning_scan.cuh | 42 +++++++++++-------- 5 files changed, 42 insertions(+), 35 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index c324ccaca99..fdb2f74e4dd 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -312,7 +312,7 @@ try const bool use_warpspeed = active_policy.warpspeed && cub::detail::scan::use_warpspeed( - *active_policy.warpspeed, + active_policy.warpspeed, static_cast(input_it.value_type.size), static_cast(input_it.value_type.alignment), static_cast(output_it.value_type.size), diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 672263e477e..309b69a1348 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -122,7 +122,7 @@ struct DeviceScanKernelSource #if _CCCL_CUDACC_AT_LEAST(12, 8) if (policy.warpspeed) { - return detail::scan::use_warpspeed(*policy.warpspeed); + return detail::scan::use_warpspeed(policy.warpspeed); } #else (void) policy; @@ -141,7 +141,7 @@ struct has_warpspeed_policy> {}; template -_CCCL_API constexpr auto convert_warpspeed_policy() -> ::cuda::std::optional +_CCCL_API constexpr auto convert_warpspeed_policy() -> scan_warpspeed_policy { #if _CCCL_CUDACC_AT_LEAST(12, 8) if constexpr (has_warpspeed_policy::value) @@ -149,7 +149,7 @@ _CCCL_API constexpr auto convert_warpspeed_policy() -> ::cuda::std::optional(); } #endif // _CCCL_CUDACC_AT_LEAST(12, 8) - return ::cuda::std::nullopt; + return {}; } // TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API @@ -644,7 +644,7 @@ struct DispatchScan #if _CCCL_CUDACC_AT_LEAST(12, 8) if (kernel_source.use_warpspeed(active_policy)) { - return __invoke_lookahead_algorithm(*active_policy.warpspeed, policy_selector); + return __invoke_lookahead_algorithm(active_policy.warpspeed, policy_selector); } #endif // _CCCL_CUDACC_AT_LEAST(12, 8) diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index d9fb0b2a0f1..35db22ee147 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -67,7 +67,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(128) void DeviceScanInitKernel( #if _CCCL_CUDACC_AT_LEAST(12, 8) constexpr scan_policy policy = PolicySelectorT{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); if constexpr (policy.warpspeed - && detail::scan::use_warpspeed(*policy.warpspeed)) + && detail::scan::use_warpspeed(policy.warpspeed)) { device_scan_init_lookahead_body(tile_state.lookahead, num_tiles); } @@ -204,19 +204,19 @@ __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block #if _CCCL_CUDACC_AT_LEAST(12, 8) if constexpr (policy.warpspeed - && detail::scan::use_warpspeed(*policy.warpspeed)) + && detail::scan::use_warpspeed(policy.warpspeed)) { using WarpspeedPolicyT = warpspeedKernelPolicy< - policy.warpspeed->num_squads, - policy.warpspeed->num_reduce_warps, - policy.warpspeed->num_scan_stor_warps, - policy.warpspeed->num_load_warps, - policy.warpspeed->num_sched_warps, - policy.warpspeed->num_look_ahead_warps, - policy.warpspeed->num_look_ahead_items, - policy.warpspeed->num_total_threads, - policy.warpspeed->items_per_thread, - policy.warpspeed->tile_size>; + scan_warpspeed_policy::num_squads, + policy.warpspeed.num_reduce_warps, + policy.warpspeed.num_scan_stor_warps, + policy.warpspeed.num_load_warps, + policy.warpspeed.num_sched_warps, + policy.warpspeed.num_look_ahead_warps, + policy.warpspeed.num_look_ahead_items, + policy.warpspeed.num_total_threads, + policy.warpspeed.items_per_thread, + policy.warpspeed.tile_size>; NV_IF_TARGET( NV_PROVIDES_SM_100, ({ auto scan_params = scanKernelParams, it_value_t, AccumT>{ diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index 15d68d6ac90..e6fd0c5f21f 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -149,6 +149,7 @@ template _CCCL_API constexpr scan_warpspeed_policy make_scan_warpspeed_policy() { return scan_warpspeed_policy{ + true, WarpspeedPolicy::num_reduce_warps, WarpspeedPolicy::num_scan_stor_warps, WarpspeedPolicy::num_load_warps, diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index baa72c09e63..a379c7e667e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -31,7 +31,6 @@ #include #include #include -#include #if !_CCCL_COMPILER(NVRTC) # include @@ -127,6 +126,7 @@ struct scan_warpspeed_policy { static constexpr int num_squads = 5; + bool valid = false; int num_reduce_warps; int num_scan_stor_warps; int num_load_warps; @@ -138,6 +138,11 @@ struct scan_warpspeed_policy int items_per_thread; int tile_size; + _CCCL_API constexpr explicit operator bool() const noexcept + { + return valid; + } + _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const { return warpspeed::SquadDesc{0, num_reduce_warps}; @@ -161,11 +166,11 @@ struct scan_warpspeed_policy _CCCL_API constexpr friend bool operator==(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs) { - return lhs.num_reduce_warps == rhs.num_reduce_warps && lhs.num_scan_stor_warps == rhs.num_scan_stor_warps - && lhs.num_load_warps == rhs.num_load_warps && lhs.num_sched_warps == rhs.num_sched_warps - && lhs.num_look_ahead_warps == rhs.num_look_ahead_warps && lhs.num_look_ahead_items == rhs.num_look_ahead_items - && lhs.num_total_threads == rhs.num_total_threads && lhs.items_per_thread == rhs.items_per_thread - && lhs.tile_size == rhs.tile_size; + return lhs.valid == rhs.valid && lhs.num_reduce_warps == rhs.num_reduce_warps + && lhs.num_scan_stor_warps == rhs.num_scan_stor_warps && lhs.num_load_warps == rhs.num_load_warps + && lhs.num_sched_warps == rhs.num_sched_warps && lhs.num_look_ahead_warps == rhs.num_look_ahead_warps + && lhs.num_look_ahead_items == rhs.num_look_ahead_items && lhs.num_total_threads == rhs.num_total_threads + && lhs.items_per_thread == rhs.items_per_thread && lhs.tile_size == rhs.tile_size; } }; @@ -178,7 +183,7 @@ struct scan_policy BlockStoreAlgorithm store_algorithm; BlockScanAlgorithm scan_algorithm; delay_constructor_policy delay_constructor; - ::cuda::std::optional warpspeed = ::cuda::std::nullopt; + scan_warpspeed_policy warpspeed = {}; _CCCL_API constexpr friend bool operator==(const scan_policy& lhs, const scan_policy& rhs) { @@ -202,13 +207,13 @@ struct scan_policy << ", .delay_constructor = " << p.delay_constructor; if (p.warpspeed) { - os << ", .warpspeed = scan_warpspeed_policy { .num_reduce_warps = " << p.warpspeed->num_reduce_warps - << ", .num_scan_stor_warps = " << p.warpspeed->num_scan_stor_warps << ", .num_load_warps = " - << p.warpspeed->num_load_warps << ", .num_sched_warps = " << p.warpspeed->num_sched_warps - << ", .num_look_ahead_warps = " << p.warpspeed->num_look_ahead_warps << ", .num_look_ahead_items = " - << p.warpspeed->num_look_ahead_items << ", .num_total_threads = " << p.warpspeed->num_total_threads - << ", .items_per_thread = " << p.warpspeed->items_per_thread << ", .tile_size = " << p.warpspeed->tile_size - << "}"; + os << ", .warpspeed = scan_warpspeed_policy { .valid = " << p.warpspeed.valid << ", .num_reduce_warps = " + << p.warpspeed.num_reduce_warps << ", .num_scan_stor_warps = " << p.warpspeed.num_scan_stor_warps + << ", .num_load_warps = " << p.warpspeed.num_load_warps << ", .num_sched_warps = " + << p.warpspeed.num_sched_warps << ", .num_look_ahead_warps = " << p.warpspeed.num_look_ahead_warps + << ", .num_look_ahead_items = " << p.warpspeed.num_look_ahead_items << ", .num_total_threads = " + << p.warpspeed.num_total_threads << ", .items_per_thread = " << p.warpspeed.items_per_thread + << ", .tile_size = " << p.warpspeed.tile_size << "}"; } return os << " }"; } @@ -223,8 +228,8 @@ _CCCL_API constexpr auto make_mem_scaled_scan_policy( CacheLoadModifier load_modifier, BlockStoreAlgorithm store_algorithm, BlockScanAlgorithm scan_algorithm, - delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}, - ::cuda::std::optional warpspeed = ::cuda::std::nullopt) -> scan_policy + delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}, + scan_warpspeed_policy warpspeed = {}) -> scan_policy { const auto scaled = scale_mem_bound(nominal_4b_block_threads, nominal_4b_items_per_thread, compute_t_size); return scan_policy{ @@ -777,12 +782,13 @@ constexpr _CCCL_HOST_DEVICE delay_constructor_policy default_delay_constructor_p : delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450}; } -constexpr _CCCL_HOST_DEVICE ::cuda::std::optional +constexpr _CCCL_HOST_DEVICE scan_warpspeed_policy get_warpspeed_policy(::cuda::arch_id arch, int input_value_size, int accum_size) { if (arch >= ::cuda::arch_id::sm_100) { scan_warpspeed_policy warpspeed_policy{}; + warpspeed_policy.valid = true; warpspeed_policy.num_reduce_warps = 4; warpspeed_policy.num_scan_stor_warps = 4; @@ -806,7 +812,7 @@ get_warpspeed_policy(::cuda::arch_id arch, int input_value_size, int accum_size) return warpspeed_policy; } - return ::cuda::std::nullopt; + return {}; } struct policy_selector From 497638c0023db26546a9aa3f4d8f2ca9e991d58d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 19 Feb 2026 02:58:23 +0100 Subject: [PATCH 10/31] Pull scan_warpspeed_policy out into its own file. --- .../kernels/kernel_scan_warpspeed.cuh | 1 + .../kernels/scan_warpspeed_policy.cuh | 67 +++++++++++++++++++ .../device/dispatch/tuning/tuning_scan.cuh | 54 +-------------- 3 files changed, 69 insertions(+), 53 deletions(-) create mode 100644 cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index e6fd0c5f21f..c8b1756653b 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh new file mode 100644 index 00000000000..3409fc53866 --- /dev/null +++ b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#include + +CUB_NAMESPACE_BEGIN + +namespace detail::scan +{ +struct scan_warpspeed_policy +{ + static constexpr int num_squads = 5; + + bool valid = false; + int num_reduce_warps; + int num_scan_stor_warps; + int num_load_warps; + int num_sched_warps; + int num_look_ahead_warps; + + int num_look_ahead_items; + int num_total_threads; + int items_per_thread; + int tile_size; + + _CCCL_API constexpr explicit operator bool() const noexcept + { + return valid; + } + + _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const + { + return warpspeed::SquadDesc{0, num_reduce_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const + { + return warpspeed::SquadDesc{1, num_scan_stor_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const + { + return warpspeed::SquadDesc{2, num_load_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadSched() const + { + return warpspeed::SquadDesc{3, num_sched_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const + { + return warpspeed::SquadDesc{4, num_look_ahead_warps}; + } + + _CCCL_API constexpr friend bool operator==(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs) + { + return lhs.valid == rhs.valid && lhs.num_reduce_warps == rhs.num_reduce_warps + && lhs.num_scan_stor_warps == rhs.num_scan_stor_warps && lhs.num_load_warps == rhs.num_load_warps + && lhs.num_sched_warps == rhs.num_sched_warps && lhs.num_look_ahead_warps == rhs.num_look_ahead_warps + && lhs.num_look_ahead_items == rhs.num_look_ahead_items && lhs.num_total_threads == rhs.num_total_threads + && lhs.items_per_thread == rhs.items_per_thread && lhs.tile_size == rhs.tile_size; + } +}; +} // namespace detail::scan + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index a379c7e667e..f7ea31fdc31 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -122,58 +122,6 @@ constexpr _CCCL_HOST_DEVICE offset_size classify_offset_size() return sizeof(OffsetT) == 4 ? offset_size::_4 : sizeof(OffsetT) == 8 ? offset_size::_8 : offset_size::unknown; } -struct scan_warpspeed_policy -{ - static constexpr int num_squads = 5; - - bool valid = false; - int num_reduce_warps; - int num_scan_stor_warps; - int num_load_warps; - int num_sched_warps; - int num_look_ahead_warps; - - int num_look_ahead_items; - int num_total_threads; - int items_per_thread; - int tile_size; - - _CCCL_API constexpr explicit operator bool() const noexcept - { - return valid; - } - - _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const - { - return warpspeed::SquadDesc{0, num_reduce_warps}; - } - _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const - { - return warpspeed::SquadDesc{1, num_scan_stor_warps}; - } - _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const - { - return warpspeed::SquadDesc{2, num_load_warps}; - } - _CCCL_API constexpr warpspeed::SquadDesc squadSched() const - { - return warpspeed::SquadDesc{3, num_sched_warps}; - } - _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const - { - return warpspeed::SquadDesc{4, num_look_ahead_warps}; - } - - _CCCL_API constexpr friend bool operator==(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs) - { - return lhs.valid == rhs.valid && lhs.num_reduce_warps == rhs.num_reduce_warps - && lhs.num_scan_stor_warps == rhs.num_scan_stor_warps && lhs.num_load_warps == rhs.num_load_warps - && lhs.num_sched_warps == rhs.num_sched_warps && lhs.num_look_ahead_warps == rhs.num_look_ahead_warps - && lhs.num_look_ahead_items == rhs.num_look_ahead_items && lhs.num_total_threads == rhs.num_total_threads - && lhs.items_per_thread == rhs.items_per_thread && lhs.tile_size == rhs.tile_size; - } -}; - struct scan_policy { int block_threads; From 61db5eb8becb81a55058b17f99e085b22faefdd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 19 Feb 2026 03:09:48 +0100 Subject: [PATCH 11/31] Check for is_constant_evaluated in new dispatch. --- cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index c8b1756653b..c78dad4b27e 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -928,7 +928,7 @@ _CCCL_API constexpr bool use_warpspeed( bool output_trivially_copyable, bool output_default_constructible) { -#if defined(__CUDA_ARCH__) && __cccl_ptx_isa < 860 +#if defined(__CUDA_ARCH__) && __cccl_ptx_isa < 860 || !defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) (void) policy; (void) input_size; (void) input_align; From 5f3aedaf5bf5aa7241872b4b7de04f9992034b6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Fri, 20 Feb 2026 15:35:09 +0100 Subject: [PATCH 12/31] Fix some thinkos. --- cub/cub/device/dispatch/dispatch_scan.cuh | 18 ++++++++------ .../device/dispatch/kernels/kernel_scan.cuh | 24 +++++++++++-------- .../kernels/kernel_scan_warpspeed.cuh | 10 ++++---- 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 309b69a1348..13e9d428708 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -51,6 +51,10 @@ #include #include +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::scan @@ -496,12 +500,6 @@ struct DispatchScan { return error; } - // number of stages to have an even workload across all SMs (improves small problem sizes), assuming 1 CTA per SM - // +1 since it tends to improve performance - // TODO(bgruber): make the +1 a tuning parameter - const int max_stages_for_even_workload = - static_cast(::cuda::ceil_div(num_items, static_cast(sm_count * warpspeed_policy.tile_size)) + 1); - // Maximum dynamic shared memory size that we can use for temporary storage. int max_dynamic_smem_size{}; if (const auto error = @@ -527,6 +525,12 @@ struct DispatchScan // When launched from the host, maximize the number of stages that we can fit inside the shared memory. NV_IF_TARGET(NV_IS_HOST, ({ + // number of stages to have an even workload across all SMs (improves small problem sizes), assuming + // 1 CTA per SM +1 since it tends to improve performance + // TODO(bgruber): make the +1 a tuning parameter + const int max_stages_for_even_workload = static_cast( + ::cuda::ceil_div(num_items, static_cast(sm_count * warpspeed_policy.tile_size)) + 1); + while (num_stages <= max_stages_for_even_workload) { const auto next_smem_size = detail::scan::smem_for_stages( @@ -560,7 +564,7 @@ struct DispatchScan const auto init_grid_size = ::cuda::ceil_div(grid_dim, init_kernel_threads); # ifdef CUB_DEBUG_LOG - _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, , %lld>>>()\n", + _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index 35db22ee147..b2679711d2c 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -111,22 +111,26 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte *d_num_selected_out = 0; } } -template -[[nodiscard]] _CCCL_DEVICE_API _CCCL_CONSTEVAL int get_device_scan_launch_bounds() noexcept +template +_CCCL_API constexpr int get_device_scan_launch_bounds_helper() noexcept { + constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); #if _CCCL_CUDACC_AT_LEAST(12, 8) - if constexpr (detail::scan:: - scan_use_warpspeed) + if constexpr (policy.warpspeed + && detail::scan::use_warpspeed(policy.warpspeed)) { - return get_scan_block_threads; + return policy.warpspeed.num_total_threads; } - else #endif // _CCCL_CUDACC_AT_LEAST(12, 8) - { - return static_cast(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS); - } + return policy.block_threads; } +// need a variable template to force constant evaluation, otherwise nvcc may emit +// "bad attribute argument substitution" errors for __launch_bounds__ +template +inline constexpr int get_device_scan_launch_bounds = + get_device_scan_launch_bounds_helper(); + /** * @brief Scan kernel entry point (multi-block) * @@ -182,7 +186,7 @@ template -__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads)) +__launch_bounds__(get_device_scan_launch_bounds) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( _CCCL_GRID_CONSTANT const InputIteratorT d_in, _CCCL_GRID_CONSTANT const OutputIteratorT d_out, diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index c78dad4b27e..049a3440358 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -862,12 +862,14 @@ _CCCL_API constexpr auto smem_for_stages( (void) output_size; const auto counts = make_scan_stage_counts(num_stages); - const int align_inout = ::cuda::std::max({16, input_align, output_align}); - const int inout_bytes = policy.tile_size * input_size + 16; + const int align_inout = ::cuda::std::max({16, input_align, output_align}); + const int inout_bytes = policy.tile_size * input_size + 16; + // Match sizeof(InOutT): round up to the alignment so each stage matches SmemResource. + const int inout_stride = (inout_bytes + align_inout - 1) & ~(align_inout - 1); const auto reduce_squad = policy.squadReduce(); const int sum_thread_warp = (reduce_squad.threadCount() + reduce_squad.warpCount()) * accum_size; - void* inout_base = smemAllocator.alloc(static_cast<::cuda::std::uint32_t>(inout_bytes * num_stages), align_inout); + void* inout_base = smemAllocator.alloc(static_cast<::cuda::std::uint32_t>(inout_stride * num_stages), align_inout); void* next_block_idx_base = smemAllocator.alloc( static_cast<::cuda::std::uint32_t>(sizeof(uint4) * counts.num_block_idx_stages), alignof(uint4)); void* sum_exclusive_base = smemAllocator.alloc( @@ -876,7 +878,7 @@ _CCCL_API constexpr auto smem_for_stages( smemAllocator.alloc(static_cast<::cuda::std::uint32_t>(sum_thread_warp * num_stages), accum_align); ScanResourcesRaw res = { - warpspeed::SmemResourceRaw{syncHandler, inout_base, inout_bytes, inout_bytes, num_stages}, + warpspeed::SmemResourceRaw{syncHandler, inout_base, inout_stride, inout_stride, num_stages}, warpspeed::SmemResourceRaw{ syncHandler, next_block_idx_base, From 72239e23d8864447e880e1f3104cf5ee2e67e44a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Fri, 20 Feb 2026 16:06:47 +0100 Subject: [PATCH 13/31] Compilation fixes. --- cub/cub/device/dispatch/kernels/kernel_scan.cuh | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index b2679711d2c..cb45ef9543b 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -112,7 +112,7 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte } } template -_CCCL_API constexpr int get_device_scan_launch_bounds_helper() noexcept +_CCCL_DEVICE constexpr int get_device_scan_launch_bounds() noexcept { constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); #if _CCCL_CUDACC_AT_LEAST(12, 8) @@ -125,12 +125,6 @@ _CCCL_API constexpr int get_device_scan_launch_bounds_helper() noexcept return policy.block_threads; } -// need a variable template to force constant evaluation, otherwise nvcc may emit -// "bad attribute argument substitution" errors for __launch_bounds__ -template -inline constexpr int get_device_scan_launch_bounds = - get_device_scan_launch_bounds_helper(); - /** * @brief Scan kernel entry point (multi-block) * @@ -186,7 +180,7 @@ template -__launch_bounds__(get_device_scan_launch_bounds) +__launch_bounds__(get_device_scan_launch_bounds()) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( _CCCL_GRID_CONSTANT const InputIteratorT d_in, _CCCL_GRID_CONSTANT const OutputIteratorT d_out, @@ -210,7 +204,7 @@ __launch_bounds__(get_device_scan_launch_bounds(policy.warpspeed)) { - using WarpspeedPolicyT = warpspeedKernelPolicy< + using WarpspeedPolicyT [[maybe_unused]] = warpspeedKernelPolicy< scan_warpspeed_policy::num_squads, policy.warpspeed.num_reduce_warps, policy.warpspeed.num_scan_stor_warps, From 861b25cba42bfd89cab35294f4a8e944a16cb9b1 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 13:37:08 +0100 Subject: [PATCH 14/31] Remove %RANGE% declarations from header --- cub/benchmarks/bench/scan/policy_selector.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h index 386e4b41278..b0a18d61c95 100644 --- a/cub/benchmarks/bench/scan/policy_selector.h +++ b/cub/benchmarks/bench/scan/policy_selector.h @@ -3,14 +3,6 @@ #include -// %RANGE% TUNE_ITEMS ipt 7:24:1 -// %RANGE% TUNE_THREADS tpb 128:1024:32 -// %RANGE% TUNE_MAGIC_NS ns 0:2048:4 -// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 -// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 -// %RANGE% TUNE_TRANSPOSE trp 0:1:1 -// %RANGE% TUNE_LOAD ld 0:1:1 - #if !TUNE_BASE template struct policy_selector From 0a7d3e7e9f0488ec2706f8c9c27109c336b4d457 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 13:42:10 +0100 Subject: [PATCH 15/31] Add delay_constructor_policy to look_back_helper.cuh --- cub/benchmarks/bench/scan/policy_selector.h | 4 +++- nvbench_helper/nvbench_helper/look_back_helper.cuh | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h index b0a18d61c95..e64813929f6 100644 --- a/cub/benchmarks/bench/scan/policy_selector.h +++ b/cub/benchmarks/bench/scan/policy_selector.h @@ -3,6 +3,8 @@ #include +#include "look_back_helper.cuh" + #if !TUNE_BASE template struct policy_selector @@ -17,7 +19,7 @@ struct policy_selector TUNE_LOAD_MODIFIER, TUNE_STORE_ALGORITHM, cub::BLOCK_SCAN_WARP_SCANS, - cub::detail::delay_constructor_policy_from_type); + delay_constructor_policy); } }; #endif // !TUNE_BASE diff --git a/nvbench_helper/nvbench_helper/look_back_helper.cuh b/nvbench_helper/nvbench_helper/look_back_helper.cuh index f7a97937dcb..81dd1a7a9db 100644 --- a/nvbench_helper/nvbench_helper/look_back_helper.cuh +++ b/nvbench_helper/nvbench_helper/look_back_helper.cuh @@ -16,4 +16,7 @@ using delay_constructor_t = cub::detail::delay_constructor_t(TUNE_DELAY_CONSTRUCTOR_ID), TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>; + +inline constexpr auto delay_constructor_policy = cub::detail::delay_constructor_policy{ + static_cast(TUNE_DELAY_CONSTRUCTOR_ID), TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS}; #endif // !TUNE_BASE From 2a7e044035a7666cc1a99be3882e985f44f55466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 26 Feb 2026 13:43:56 +0100 Subject: [PATCH 16/31] CI fixes. --- c/parallel/src/scan.cu | 20 ++----------------- ci/matrix.yaml | 4 ++++ .../kernels/kernel_scan_warpspeed.cuh | 5 ++++- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index fdb2f74e4dd..5df6035c55a 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -283,24 +283,8 @@ try #if _CCCL_CUDACC_AT_LEAST(12, 8) const auto is_trivial_type = [](cccl_type_enum type) { - switch (type) - { - case CCCL_INT8: - case CCCL_INT16: - case CCCL_INT32: - case CCCL_INT64: - case CCCL_UINT8: - case CCCL_UINT16: - case CCCL_UINT32: - case CCCL_UINT64: - case CCCL_FLOAT16: - case CCCL_FLOAT32: - case CCCL_FLOAT64: - case CCCL_BOOLEAN: - return true; - default: - return false; - } + // TODO: implement actual logic here when nontrivial custom types become supported + return true; }; const bool input_contiguous = input_it.type == cccl_iterator_kind_t::CCCL_POINTER; diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 6cc5c970233..237f6a2f681 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -184,6 +184,8 @@ workflows: # c.parallel -- pinned to gcc13 to match python - {jobs: ['test'], project: ['cccl_c_parallel'], ctk: '12.X', cxx: ['gcc13', 'msvc'], gpu: ['rtx2080']} - {jobs: ['test'], project: ['cccl_c_parallel'], ctk: '13.X', cxx: ['gcc13', 'msvc'], gpu: ['rtx2080', 'l4', 'h100']} + # RTX PRO 6000 coverage (limited due to small number of runners): + - {jobs: ['test'], project: 'cccl_c_parallel', ctk: '13.X', cxx: ['gcc13'], gpu: ['rtxpro6000']} # c.experimental.stf -- pinned to gcc13 to match python - {jobs: ['test'], project: ['cccl_c_stf'], ctk: '12.X', cxx: 'gcc13', gpu: ['rtx2080']} - {jobs: ['test'], project: ['cccl_c_stf'], ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']} @@ -269,6 +271,8 @@ workflows: # c.parallel -- pinned to gcc13 to match python - {jobs: ['test'], project: ['cccl_c_parallel'], ctk: '12.X', cxx: ['gcc13', 'msvc'], gpu: ['rtx2080']} - {jobs: ['test'], project: ['cccl_c_parallel'], ctk: '13.X', cxx: ['gcc13', 'msvc'], gpu: ['rtx2080', 'l4', 'h100']} + # RTX PRO 6000 coverage (limited due to small number of runners): + - {jobs: ['test'], project: 'cccl_c_parallel', ctk: '13.X', cxx: ['gcc13'], gpu: ['rtxpro6000']} # c.experimental.stf -- pinned to gcc13 to match python - {jobs: ['test'], project: ['cccl_c_stf'], ctk: '12.X', cxx: 'gcc13', gpu: ['rtx2080']} - {jobs: ['test'], project: ['cccl_c_stf'], ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']} diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index 049a3440358..4c1918dc613 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -930,7 +930,10 @@ _CCCL_API constexpr bool use_warpspeed( bool output_trivially_copyable, bool output_default_constructible) { -#if defined(__CUDA_ARCH__) && __cccl_ptx_isa < 860 || !defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) +// We need `cuda::std::is_constant_evaluated` for the compile-time SMEM computation. And we need PTX ISA 8.6. +// MSVC + nvcc < 13.1 just fails to compile `cub.test.device.scan.lid_1.types_0` with `Internal error` and nothing else. +#if (defined(__CUDA_ARCH__) && __cccl_ptx_isa < 860) || !defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) \ + || ((_CCCL_COMPILER(MSVC) && _CCCL_CUDA_COMPILER(NVCC, <, 13, 1))) (void) policy; (void) input_size; (void) input_align; From 1353f063bde76485c5dc7db4ad772ae67c125808 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 26 Feb 2026 15:51:15 +0100 Subject: [PATCH 17/31] Refactor DeviceScan, remove warpspeed from policy_hub, address review comments. --- c/parallel/src/scan.cu | 4 +- cub/cub/device/device_scan.cuh | 91 +++++---- cub/cub/device/dispatch/dispatch_scan.cuh | 101 +++++---- .../device/dispatch/kernels/kernel_scan.cuh | 15 +- .../kernels/kernel_scan_warpspeed.cuh | 191 ++++-------------- .../device/dispatch/tuning/tuning_scan.cuh | 117 ++++------- cub/test/catch2_test_device_scan_env.cu | 37 ++-- 7 files changed, 206 insertions(+), 350 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 5df6035c55a..ab7aeb44f04 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -282,7 +282,7 @@ try const auto active_policy = policy_sel(arch_id); #if _CCCL_CUDACC_AT_LEAST(12, 8) - const auto is_trivial_type = [](cccl_type_enum type) { + const auto is_trivial_type = [](cccl_type_enum /* type */) { // TODO: implement actual logic here when nontrivial custom types become supported return true; }; @@ -450,7 +450,7 @@ CUresult cccl_device_scan( CUdevice cu_device; check(cuCtxGetDevice(&cu_device)); - auto exec_status = cub::detail::scan::dispatch_with_accum( + auto exec_status = cub::detail::scan::dispatch_with_accum( d_temp_storage, *temp_storage_bytes, indirect_arg_t{d_in}, diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 2da3ba19adf..0ef257540e6 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -53,7 +53,7 @@ struct tuning struct default_tuning : tuning { template - using fn = policy_hub; + using selector = policy_selector_from_types; }; } // namespace detail::scan @@ -137,14 +137,23 @@ struct DeviceScan cub::detail::it_value_t, typename InitValueT::value_type>>; - using policy_t = typename scan_tuning_t:: - template fn, detail::it_value_t, accum_t, offset_t, ScanOpT>; + using policy_selector_t = typename scan_tuning_t::template selector< + detail::it_value_t, + detail::it_value_t, + accum_t, + offset_t, + ScanOpT>; - using dispatch_t = - DispatchScan; - - return dispatch_t::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, static_cast(num_items), stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init, + static_cast(num_items), + stream, + policy_selector_t{}); } template , detail::InputValue, OffsetT>:: - Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - ::cuda::std::plus<>{}, - detail::InputValue(init_value), - num_items, - stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + ::cuda::std::plus<>{}, + detail::InputValue(init_value), + static_cast(num_items), + stream); } //! @rst @@ -587,14 +596,14 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan, OffsetT>::Dispatch( + return detail::scan::dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, detail::InputValue(init_value), - num_items, + static_cast(num_items), stream); } @@ -929,14 +938,14 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan, OffsetT>::Dispatch( + return detail::scan::dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, detail::InputValue(init_value), - num_items, + static_cast(num_items), stream); } @@ -1158,8 +1167,15 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan, NullType, OffsetT>::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + ::cuda::std::plus<>{}, + NullType{}, + static_cast(num_items), + stream); } //! @rst @@ -1355,8 +1371,8 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); + return detail::scan::dispatch( + d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), static_cast(num_items), stream); } //! @rst @@ -1446,23 +1462,16 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - using AccumT = ::cuda::std::__accumulator_t, InitValueT>; - return DispatchScan< - InputIteratorT, - OutputIteratorT, - ScanOpT, - detail::InputValue, - OffsetT, - AccumT, - ForceInclusive::Yes>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - detail::InputValue(init_value), - num_items, - stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + detail::InputValue(init_value), + static_cast(num_items), + stream); } //! @rst diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 13e9d428708..a50a9604ae5 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -135,27 +135,6 @@ struct DeviceScanKernelSource } }; -// TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API -template -struct has_warpspeed_policy : ::cuda::std::false_type -{}; - -template -struct has_warpspeed_policy> : ::cuda::std::true_type -{}; - -template -_CCCL_API constexpr auto convert_warpspeed_policy() -> scan_warpspeed_policy -{ -#if _CCCL_CUDACC_AT_LEAST(12, 8) - if constexpr (has_warpspeed_policy::value) - { - return make_scan_warpspeed_policy(); - } -#endif // _CCCL_CUDACC_AT_LEAST(12, 8) - return {}; -} - // TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API template _CCCL_API constexpr auto convert_policy() -> scan_policy @@ -168,12 +147,11 @@ _CCCL_API constexpr auto convert_policy() -> scan_policy scan_policy_t::LOAD_MODIFIER, scan_policy_t::STORE_ALGORITHM, scan_policy_t::SCAN_ALGORITHM, - detail::delay_constructor_policy_from_type, - convert_warpspeed_policy()}; + detail::delay_constructor_policy_from_type}; } // TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API -template +template struct policy_selector_from_hub { // this is only called in device code @@ -181,6 +159,13 @@ struct policy_selector_from_hub { return convert_policy(); } + + static constexpr int input_value_size = int{sizeof(InputValueT)}; + static constexpr int input_value_alignment = int{alignof(InputValueT)}; + static constexpr int output_value_size = int{sizeof(OutputValueT)}; + static constexpr int output_value_alignment = int{alignof(OutputValueT)}; + static constexpr int accum_size = int{sizeof(AccumT)}; + static constexpr int accum_alignment = int{alignof(AccumT)}; }; } // namespace detail::scan @@ -228,7 +213,8 @@ template < typename PolicyHub = detail::scan:: policy_hub, detail::it_value_t, AccumT, OffsetT, ScanOpT>, typename KernelSource = detail::scan::DeviceScanKernelSource< - detail::scan::policy_selector_from_hub, + detail::scan:: + policy_selector_from_hub, detail::it_value_t, AccumT>, THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, ScanOpT, @@ -880,24 +866,31 @@ struct DispatchScan namespace detail::scan { template < + ForceInclusive EnforceInclusive = ForceInclusive::No, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename OffsetT, - typename AccumT = ::cuda::std::__accumulator_t, - ::cuda::std::_If<::cuda::std::is_same_v, - cub::detail::it_value_t, - typename InitValueT::value_type>>, - ForceInclusive EnforceInclusive = ForceInclusive::No, - typename PolicySelector = policy_selector_from_types, - detail::it_value_t, - AccumT, - OffsetT, - ScanOpT>, - typename KernelSource = - DeviceScanKernelSource, + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If<::cuda::std::is_same_v, + cub::detail::it_value_t, + typename InitValueT::value_type>>, + typename PolicySelector = policy_selector_from_types, + detail::it_value_t, + AccumT, + OffsetT, + ScanOpT>, + typename KernelSource = DeviceScanKernelSource< + PolicySelector, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + ScanOpT, + InitValueT, + OffsetT, + AccumT, + EnforceInclusive>, typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> #if _CCCL_HAS_CONCEPTS() requires scan_policy_selector @@ -961,22 +954,28 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( }); } -template < - typename AccumT, - typename InputIteratorT, - typename OutputIteratorT, - typename ScanOpT, - typename InitValueT, - typename OffsetT, - ForceInclusive EnforceInclusive = ForceInclusive::No, - typename PolicySelector = policy_selector_from_types, +template , detail::it_value_t, AccumT, OffsetT, ScanOpT>, - typename KernelSource = - DeviceScanKernelSource, - typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> + typename KernelSource = DeviceScanKernelSource< + PolicySelector, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + ScanOpT, + InitValueT, + OffsetT, + AccumT, + EnforceInclusive>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_with_accum( void* d_temp_storage, size_t& temp_storage_bytes, @@ -990,7 +989,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_with_accum( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) -> cudaError_t { - return dispatch( + return dispatch( d_temp_storage, temp_storage_bytes, d_in, diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index cb45ef9543b..d703d035d2b 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -112,7 +112,7 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte } } template -_CCCL_DEVICE constexpr int get_device_scan_launch_bounds() noexcept +[[nodiscard]] _CCCL_DEVICE_API _CCCL_CONSTEVAL int get_device_scan_launch_bounds() noexcept { constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); #if _CCCL_CUDACC_AT_LEAST(12, 8) @@ -204,22 +204,11 @@ __launch_bounds__(get_device_scan_launch_bounds(policy.warpspeed)) { - using WarpspeedPolicyT [[maybe_unused]] = warpspeedKernelPolicy< - scan_warpspeed_policy::num_squads, - policy.warpspeed.num_reduce_warps, - policy.warpspeed.num_scan_stor_warps, - policy.warpspeed.num_load_warps, - policy.warpspeed.num_sched_warps, - policy.warpspeed.num_look_ahead_warps, - policy.warpspeed.num_look_ahead_items, - policy.warpspeed.num_total_threads, - policy.warpspeed.items_per_thread, - policy.warpspeed.tile_size>; NV_IF_TARGET( NV_PROVIDES_SM_100, ({ auto scan_params = scanKernelParams, it_value_t, AccumT>{ d_in, d_out, tile_state.lookahead, num_items, num_stages}; - device_scan_lookahead_body(scan_params, scan_op, init_value); + device_scan_lookahead_body(scan_params, scan_op, init_value); })); } else diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index 4c1918dc613..921036b836a 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -43,53 +44,11 @@ CUB_NAMESPACE_BEGIN namespace detail::scan { -template -struct warpspeedKernelPolicy +template +_CCCL_API constexpr scan_warpspeed_policy get_warpspeed_policy() noexcept { - static constexpr int num_squads = NumSquads; - - // The squads cannot be static constexpr variables, as those are not device accessible - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept - { - return warpspeed::SquadDesc{0, NumReduceWarps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept - { - return warpspeed::SquadDesc{1, NumScanStorWarps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept - { - return warpspeed::SquadDesc{2, NumLoadWarps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadSched() noexcept - { - return warpspeed::SquadDesc{3, NumSchedWarps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLookback() noexcept - { - return warpspeed::SquadDesc{4, NumLookAheadWarps}; - } - - static constexpr int num_look_ahead_items = NumLookAheadItems; - static constexpr int num_total_threads = NumTotalThreads; - static constexpr int items_per_thread = ItemsPerThread; - static constexpr int tile_size = TileSize; - - static constexpr int num_reduce_warps = NumReduceWarps; - static constexpr int num_scan_stor_warps = NumScanStorWarps; - static constexpr int num_load_warps = NumLoadWarps; - static constexpr int num_sched_warps = NumSchedWarps; - static constexpr int num_look_ahead_warps = NumLookAheadWarps; -}; + return PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).warpspeed; +} template struct scanKernelParams @@ -102,22 +61,27 @@ struct scanKernelParams }; // Struct holding all scan kernel resources -template +template struct ScanResources { + _CCCL_API static constexpr scan_warpspeed_policy warpspeed_policy() noexcept + { + return get_warpspeed_policy(); + } + // align to at least 16 bytes (InputT/OutputT may be aligned higher) so each stage starts correctly aligned struct alignas(::cuda::std::max({::cuda::std::size_t{16}, alignof(InputT), alignof(OutputT)})) InOutT { // the tile_size size is a multiple of the warp size, and thus for sure a multiple of 16 - static_assert(WarpspeedPolicy::tile_size % 16 == 0, "tile_size must be multiple of 16"); + static_assert(ScanResources::warpspeed_policy().tile_size % 16 == 0, "tile_size must be multiple of 16"); // therefore, unaligned inputs need exactly 16 bytes extra for overcopying (tail padding = 16 - head padding) - ::cuda::std::byte inout[WarpspeedPolicy::tile_size * sizeof(InputT) + 16]; + ::cuda::std::byte inout[ScanResources::warpspeed_policy().tile_size * sizeof(InputT) + 16]; }; static_assert(alignof(InOutT) >= alignof(InputT)); static_assert(alignof(InOutT) >= alignof(OutputT)); - using SumThreadAndWarpT = - AccumT[WarpspeedPolicy::squadReduce().threadCount() + WarpspeedPolicy::squadReduce().warpCount()]; + using SumThreadAndWarpT = AccumT[ScanResources::warpspeed_policy().squadReduce().threadCount() + + ScanResources::warpspeed_policy().squadReduce().warpCount()]; warpspeed::SmemResource smemInOut; // will also be used to stage the output (as OutputT) for the bulk copy warpspeed::SmemResource smemNextBlockIdx; @@ -146,22 +110,6 @@ _CCCL_API constexpr scan_stage_counts make_scan_stage_counts(int num_stages) return {num_block_idx_stages, 2}; } -template -_CCCL_API constexpr scan_warpspeed_policy make_scan_warpspeed_policy() -{ - return scan_warpspeed_policy{ - true, - WarpspeedPolicy::num_reduce_warps, - WarpspeedPolicy::num_scan_stor_warps, - WarpspeedPolicy::num_load_warps, - WarpspeedPolicy::num_sched_warps, - WarpspeedPolicy::num_look_ahead_warps, - WarpspeedPolicy::num_look_ahead_items, - WarpspeedPolicy::num_total_threads, - WarpspeedPolicy::items_per_thread, - WarpspeedPolicy::tile_size}; -} - template -[[nodiscard]] _CCCL_API constexpr ScanResources +template +[[nodiscard]] _CCCL_API constexpr ScanResources allocResources(warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& smemAllocator, int numStages) { - using ScanResourcesT = ScanResources; + using ScanResourcesT = ScanResources; using InOutT = typename ScanResourcesT::InOutT; using SumThreadAndWarpT = typename ScanResourcesT::SumThreadAndWarpT; @@ -217,7 +165,7 @@ allocResources(warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& sm }; setup_scan_resources( - make_scan_warpspeed_policy(), + ScanResourcesT::warpspeed_policy(), syncHandler, smemAllocator, res.smemInOut, @@ -293,7 +241,7 @@ _CCCL_DEVICE_API Tp warpScanExclusive(const Tp regInput, ScanOpT& scan_op) // warp-specialization dispatch is performed once at the start of the kernel and // not in any of the hot loops (even if that may seem the case from a first // glance at the code). -template (); + static constexpr warpspeed::SquadDesc squadReduce = policy.squadReduce(); + static constexpr warpspeed::SquadDesc squadScanStore = policy.squadScanStore(); + static constexpr warpspeed::SquadDesc squadLoad = policy.squadLoad(); + static constexpr warpspeed::SquadDesc squadSched = policy.squadSched(); + static constexpr warpspeed::SquadDesc squadLookback = policy.squadLookback(); - constexpr int tile_size = WarpspeedPolicy::tile_size; - constexpr int num_look_ahead_items = WarpspeedPolicy::num_look_ahead_items; + constexpr int tile_size = policy.tile_size; + constexpr int num_look_ahead_items = policy.num_look_ahead_items; // We might try to instantiate the kernel with hughe types which would lead to a small tile size. Ensure its never 0 - constexpr int elemPerThread = WarpspeedPolicy::items_per_thread; + constexpr int elemPerThread = policy.items_per_thread; static_assert(elemPerThread * squadReduce.threadCount() == tile_size, "Invalid tuning policy"); //////////////////////////////////////////////////////////////////////////////// @@ -329,8 +278,8 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody( warpspeed::SyncHandler syncHandler{}; warpspeed::SmemAllocator smemAllocator{}; - ScanResources res = - allocResources(syncHandler, smemAllocator, params.numStages); + ScanResources res = + allocResources(syncHandler, smemAllocator, params.numStages); syncHandler.clusterInitSync(specialRegisters); @@ -718,8 +667,7 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody( { // otherwise, issue multiple bulk copies in chunks of the input tile size // TODO(bgruber): I am sure this could be implemented a lot more efficiently - static constexpr int elem_per_chunk = - static_cast(WarpspeedPolicy::tile_size * sizeof(InputT) / sizeof(OutputT)); + static constexpr int elem_per_chunk = static_cast(policy.tile_size * sizeof(InputT) / sizeof(OutputT)); for (int chunk_offset = 0; chunk_offset < valid_items; chunk_offset += elem_per_chunk) { const int chunk_size = ::cuda::std::min(valid_items - chunk_offset, elem_per_chunk); @@ -774,14 +722,7 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody( #endif // __cccl_ptx_isa >= 860 -template -inline constexpr int get_scan_block_threads = 1; - -template -inline constexpr int get_scan_block_threads> = - ActivePolicy::WarpspeedPolicy::num_total_threads; - -template (); + // Dispatch for warp-specialization - static constexpr warpspeed::SquadDesc scanSquads[WarpspeedPolicy::num_squads] = { - WarpspeedPolicy::squadReduce(), - WarpspeedPolicy::squadScanStore(), - WarpspeedPolicy::squadLoad(), - WarpspeedPolicy::squadSched(), - WarpspeedPolicy::squadLookback(), + static constexpr warpspeed::SquadDesc scanSquads[scan_warpspeed_policy::num_squads] = { + policy.squadReduce(), + policy.squadScanStore(), + policy.squadLoad(), + policy.squadSched(), + policy.squadLookback(), }; // we need to force inline the lambda, but clang in CUDA mode only likes the GNU syntax warpspeed::squadDispatch(specialRegisters, scanSquads, [&](warpspeed::Squad squad) _CCCL_FORCEINLINE_LAMBDA { - kernelBody( + kernelBody( squad, specialRegisters, params, ::cuda::std::move(scan_op), static_cast(init_value)); }); #endif // __cccl_ptx_isa >= 860 @@ -978,54 +921,6 @@ _CCCL_API constexpr bool use_warpspeed(const scan_warpspeed_policy& policy) ::cuda::std::is_trivially_copyable_v, ::cuda::std::is_default_constructible_v); } - -#if 0 -// we check the required shared memory inside a template, so the error message shows the amount in case of failure -template -_CCCL_API constexpr void verify_smem() -{ - static_assert(RequiredSharedMemory <= max_smem_per_block, - "Single stage configuration exceeds architecture independent SMEM (48KiB)"); -} -#endif - -template -_CCCL_API constexpr auto one_stage_fits_48KiB_SMEM() -> bool -{ - using InputT = it_value_t; - using OutputT = it_value_t; - constexpr scan_warpspeed_policy policy = make_scan_warpspeed_policy(); - constexpr int smem_size_1_stage = smem_for_stages(policy, 1); -// We can turn this on to report if a single stage of the warpspeed scan would exceed 48KiB if SMEM. -#if 0 - verify_smem(); -#endif - return smem_size_1_stage <= max_smem_per_block; -} - -template -inline constexpr bool scan_use_warpspeed = false; - -// We need `cuda::std::is_constant_evaluated` for the compile-time SMEM computation. And we need PTX ISA 8.6. -// MSVC + nvcc < 13.1 just fails to compile `cub.test.device.scan.lid_1.types_0` with `Internal error` and nothing else. -#if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && __cccl_ptx_isa >= 860 \ - && !(_CCCL_COMPILER(MSVC) && _CCCL_CUDA_COMPILER(NVCC, <, 13, 1)) -template -inline constexpr bool scan_use_warpspeed> = - // for bulk copy: input and output iterators must be contiguous and their value types must be trivially copyable - THRUST_NS_QUALIFIER::is_contiguous_iterator_v - && ::cuda::std::is_trivially_copyable_v> - && THRUST_NS_QUALIFIER::is_contiguous_iterator_v - && ::cuda::std::is_trivially_copyable_v> - // for bulk copy store: we need to prepare a buffer of output types in SMEM - && ::cuda::std::is_default_constructible_v> - // need to fit one stage into 48KiB so binaries stay forward compatible with future GPUs - && one_stage_fits_48KiB_SMEM(); -#endif // defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && __cccl_ptx_isa >= 860 } // namespace detail::scan CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index f7ea31fdc31..89138cd1132 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -642,77 +642,6 @@ struct policy_hub using ScanPolicyT = decltype(select_agent_policy100>, InputValueT>(0)); - -#if __cccl_ptx_isa >= 860 - struct WarpspeedPolicy - { - // Squad definitions - static constexpr int num_squads = 5; - static constexpr int num_threads_per_warp = 32; - - // TODO(bgruber): tune this - static constexpr int num_reduce_warps = 4; - static constexpr int num_scan_stor_warps = 4; - static constexpr int num_load_warps = 1; - static constexpr int num_sched_warps = 1; - static constexpr int num_look_ahead_warps = 1; - - // TODO(bgruber): 5 is a bit better for complex - static constexpr int num_look_ahead_items = sizeof(AccumT) == 2 ? 3 : 4; - - // Deduced definitions - static constexpr int num_total_warps = - num_reduce_warps + num_scan_stor_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; - static constexpr int num_total_threads = num_total_warps * num_threads_per_warp; - - static constexpr int squad_reduce_thread_count = num_reduce_warps * num_threads_per_warp; - - // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM) - // 2-byte types and double needed special handling - static constexpr int items_per_thread = - ::cuda::std::max(256 / (sizeof(InputValueT) == 2 ? 2 : int{sizeof(AccumT)}) - 1, 1); - // TODO(bgruber): the special handling of double below is a LOT faster, but exceeds 48KiB SMEM - // clang-format off - // | F64 | I32 | 72576 | 11.295 us | 2.44% | 11.917 us | 8.02% | 0.622 us | 5.50% | SLOW | - // | F64 | I32 | 1056384 | 16.162 us | 6.24% | 15.847 us | 5.57% | -0.315 us | -1.95% | SAME | - // | F64 | I32 | 16781184 | 65.696 us | 1.64% | 60.650 us | 3.37% | -5.046 us | -7.68% | FAST | - // | F64 | I32 | 268442496 | 863.896 us | 0.22% | 679.100 us | 0.93% | -184.796 us | -21.39% | FAST | - // | F64 | I32 | 1073745792 | 3.418 ms | 0.12% | 2.662 ms | 0.46% | -755.740 us | -22.11% | FAST | - // | F64 | I64 | 72576 | 12.301 us | 8.18% | 12.987 us | 5.75% | 0.686 us | 5.58% | SAME | - // | F64 | I64 | 1056384 | 16.775 us | 5.70% | 16.091 us | 6.14% | -0.684 us | -4.08% | SAME | - // | F64 | I64 | 16781184 | 66.970 us | 1.41% | 58.024 us | 3.17% | -8.946 us | -13.36% | FAST | - // | F64 | I64 | 268442496 | 863.826 us | 0.23% | 676.465 us | 0.98% | -187.360 us | -21.69% | FAST | - // | F64 | I64 | 1073745792 | 3.419 ms | 0.11% | 2.664 ms | 0.48% | -755.409 us | -22.09% | FAST | - // | F64 | I64 | 4294975104 | 13.641 ms | 0.05% | 10.575 ms | 0.24% | -3065.815 us | -22.48% | FAST | - // clang-format on - // (256 / (sizeof(InputValueT) == 2 ? 2 : (::cuda::std::is_same_v ? 4 : sizeof(AccumT))) - - // 1); - - static constexpr int tile_size = items_per_thread * squad_reduce_thread_count; - - // The squads cannot be static constexpr variables, as those are not device accessible - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept - { - return warpspeed::SquadDesc{0, num_reduce_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept - { - return warpspeed::SquadDesc{1, num_scan_stor_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept - { - return warpspeed::SquadDesc{2, num_load_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadSched() noexcept - { - return warpspeed::SquadDesc{3, num_sched_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLookback() noexcept - { - return warpspeed::SquadDesc{4, num_look_ahead_warps}; - } - }; -#endif // __cccl_ptx_isa >= 860 }; using MaxPolicy = Policy1000; @@ -738,14 +667,17 @@ get_warpspeed_policy(::cuda::arch_id arch, int input_value_size, int accum_size) scan_warpspeed_policy warpspeed_policy{}; warpspeed_policy.valid = true; + // TODO(bgruber): tune this warpspeed_policy.num_reduce_warps = 4; warpspeed_policy.num_scan_stor_warps = 4; warpspeed_policy.num_load_warps = 1; warpspeed_policy.num_sched_warps = 1; warpspeed_policy.num_look_ahead_warps = 1; + // TODO(bgruber): 5 is a bit better for complex warpspeed_policy.num_look_ahead_items = accum_size == 2 ? 3 : 4; + // Deduced definitions const auto num_threads_per_warp = 32; const auto num_total_warps = warpspeed_policy.num_reduce_warps + warpspeed_policy.num_scan_stor_warps + warpspeed_policy.num_load_warps @@ -754,8 +686,27 @@ get_warpspeed_policy(::cuda::arch_id arch, int input_value_size, int accum_size) const auto squad_reduce_thread_count = warpspeed_policy.num_reduce_warps * num_threads_per_warp; + // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM) + // 2-byte types and double needed special handling warpspeed_policy.items_per_thread = ::cuda::std::max(256 / (input_value_size == 2 ? 2 : accum_size) - 1, 1); - warpspeed_policy.tile_size = warpspeed_policy.items_per_thread * squad_reduce_thread_count; + // TODO(bgruber): the special handling of double below is a LOT faster, but exceeds 48KiB SMEM + // clang-format off + // | F64 | I32 | 72576 | 11.295 us | 2.44% | 11.917 us | 8.02% | 0.622 us | 5.50% | SLOW | + // | F64 | I32 | 1056384 | 16.162 us | 6.24% | 15.847 us | 5.57% | -0.315 us | -1.95% | SAME | + // | F64 | I32 | 16781184 | 65.696 us | 1.64% | 60.650 us | 3.37% | -5.046 us | -7.68% | FAST | + // | F64 | I32 | 268442496 | 863.896 us | 0.22% | 679.100 us | 0.93% | -184.796 us | -21.39% | FAST | + // | F64 | I32 | 1073745792 | 3.418 ms | 0.12% | 2.662 ms | 0.46% | -755.740 us | -22.11% | FAST | + // | F64 | I64 | 72576 | 12.301 us | 8.18% | 12.987 us | 5.75% | 0.686 us | 5.58% | SAME | + // | F64 | I64 | 1056384 | 16.775 us | 5.70% | 16.091 us | 6.14% | -0.684 us | -4.08% | SAME | + // | F64 | I64 | 16781184 | 66.970 us | 1.41% | 58.024 us | 3.17% | -8.946 us | -13.36% | FAST | + // | F64 | I64 | 268442496 | 863.826 us | 0.23% | 676.465 us | 0.98% | -187.360 us | -21.69% | FAST | + // | F64 | I64 | 1073745792 | 3.419 ms | 0.11% | 2.664 ms | 0.48% | -755.409 us | -22.09% | FAST | + // | F64 | I64 | 4294975104 | 13.641 ms | 0.05% | 10.575 ms | 0.24% | -3065.815 us | -22.48% | FAST | + // clang-format on + // (256 / (sizeof(InputValueT) == 2 ? 2 : (::cuda::std::is_same_v ? 4 : sizeof(AccumT))) - + // 1); + + warpspeed_policy.tile_size = warpspeed_policy.items_per_thread * squad_reduce_thread_count; return warpspeed_policy; } @@ -1172,6 +1123,25 @@ struct policy_selector static_assert(scan_policy_selector); #endif // _CCCL_HAS_CONCEPTS() +template +struct benchmark_match_for_policy_selector +{ + static constexpr bool value = false; +}; + +template +struct benchmark_match_for_policy_selector< + ScanOpT, + InputValueT, + OutputValueT, + AccumT, + ::cuda::std::void_t<::cuda::std::__accumulator_t>> +{ + static constexpr bool value = + sizeof(AccumT) == sizeof(::cuda::std::__accumulator_t) + && sizeof(InputValueT) == sizeof(OutputValueT); +}; + // stateless version which can be passed to kernels template struct policy_selector_from_types @@ -1185,8 +1155,7 @@ struct policy_selector_from_types [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy { constexpr bool benchmark_match = - sizeof(AccumT) == sizeof(::cuda::std::__accumulator_t) - && sizeof(InputValueT) == sizeof(OutputValueT); + benchmark_match_for_policy_selector::value; constexpr auto policies = policy_selector{ input_value_size, diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu index ce295020b54..5fc417274ed 100644 --- a/cub/test/catch2_test_device_scan_env.cu +++ b/cub/test/catch2_test_device_scan_env.cu @@ -14,6 +14,7 @@ struct stream_registry_factory_t; #include +#include #include #include "catch2_test_env_launch_helper.h" @@ -45,36 +46,23 @@ struct block_size_check_t } }; -struct block_size_retreiver_t -{ - int* ptr; - - template - cudaError_t Invoke() - { - *ptr = ActivePolicyT::ScanPolicyT::BLOCK_THREADS; - return cudaSuccess; - } -}; - TEST_CASE("Device scan exclusive scan works with default environment", "[scan][device]") { using num_items_t = int; using value_t = int; using offset_t = cub::detail::choose_offset_t; - using policy_t = - cub::detail::scan::default_tuning::fn::MaxPolicy; + using selector_t = + cub::detail::scan::default_tuning::selector; int current_device{}; REQUIRE(cudaSuccess == cudaGetDevice(¤t_device)); - int ptx_version{}; - REQUIRE(cudaSuccess == cub::PtxVersion(ptx_version, current_device)); + cudaDeviceProp device_props{}; + REQUIRE(cudaSuccess == cudaGetDeviceProperties(&device_props, current_device)); - int target_block_size{}; - block_size_retreiver_t block_size_retreiver{&target_block_size}; - REQUIRE(cudaSuccess == policy_t::Invoke(ptx_version, block_size_retreiver)); + const auto target_block_size = + selector_t{}(cuda::to_arch_id(cuda::compute_capability{device_props.major, device_props.minor})).block_threads; num_items_t num_items = 1; c2h::device_vector d_block_size(1); @@ -114,8 +102,8 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev template struct scan_tuning : cub::detail::scan::tuning> { - template - struct fn + template + struct policy_hub { struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { @@ -139,6 +127,13 @@ struct scan_tuning : cub::detail::scan::tuning> using MaxPolicy = Policy500; }; + + template + using selector = + cub::detail::scan::policy_selector_from_hub, + InputValueT, + OutputValueT, + AccumT>; }; struct get_reduce_tuning_query_t From 863c8743ebaed4addc47757e836e4fe1381da3ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 26 Feb 2026 16:36:01 +0100 Subject: [PATCH 18/31] Operators for scan_warpspeed_policy. --- .../kernels/scan_warpspeed_policy.cuh | 21 +++++++++++++++++++ .../device/dispatch/tuning/tuning_scan.cuh | 8 +------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh index 3409fc53866..e4f254dcd6b 100644 --- a/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh +++ b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh @@ -7,6 +7,10 @@ #include +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::scan @@ -61,6 +65,23 @@ struct scan_warpspeed_policy && lhs.num_look_ahead_items == rhs.num_look_ahead_items && lhs.num_total_threads == rhs.num_total_threads && lhs.items_per_thread == rhs.items_per_thread && lhs.tile_size == rhs.tile_size; } + + _CCCL_API constexpr friend bool operator!=(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const scan_warpspeed_policy& p) + { + return os + << "scan_warpspeed_policy { .valid = " << p.valid << ", .num_reduce_warps = " << p.num_reduce_warps + << ", .num_scan_stor_warps = " << p.num_scan_stor_warps << ", .num_load_warps = " << p.num_load_warps + << ", .num_sched_warps = " << p.num_sched_warps << ", .num_look_ahead_warps = " << p.num_look_ahead_warps + << ", .num_look_ahead_items = " << p.num_look_ahead_items << ", .num_total_threads = " << p.num_total_threads + << ", .items_per_thread = " << p.items_per_thread << ", .tile_size = " << p.tile_size << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) }; } // namespace detail::scan diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 89138cd1132..9683694e085 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -155,13 +155,7 @@ struct scan_policy << ", .delay_constructor = " << p.delay_constructor; if (p.warpspeed) { - os << ", .warpspeed = scan_warpspeed_policy { .valid = " << p.warpspeed.valid << ", .num_reduce_warps = " - << p.warpspeed.num_reduce_warps << ", .num_scan_stor_warps = " << p.warpspeed.num_scan_stor_warps - << ", .num_load_warps = " << p.warpspeed.num_load_warps << ", .num_sched_warps = " - << p.warpspeed.num_sched_warps << ", .num_look_ahead_warps = " << p.warpspeed.num_look_ahead_warps - << ", .num_look_ahead_items = " << p.warpspeed.num_look_ahead_items << ", .num_total_threads = " - << p.warpspeed.num_total_threads << ", .items_per_thread = " << p.warpspeed.items_per_thread - << ", .tile_size = " << p.warpspeed.tile_size << "}"; + os << ", .warpspeed = " << p.warpspeed; } return os << " }"; } From 699d4bfd209127e4941b1bcad2b0aaef78ed42c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 26 Feb 2026 16:49:56 +0100 Subject: [PATCH 19/31] Fix clang build. --- cub/cub/device/dispatch/dispatch_scan.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index a50a9604ae5..f19e4f0fb73 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -768,7 +768,8 @@ struct DispatchScan { struct policy_getter { - _CCCL_API _CCCL_FORCEINLINE constexpr auto operator()() const + // host-device not api, because clang warns about exclude_from_explicit_instantiation in local types + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr auto operator()() const { return detail::scan::convert_policy(); } From ab85bfee3b3266553fd35b9dbfb892f8a99df4ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 26 Feb 2026 17:25:14 +0100 Subject: [PATCH 20/31] Fix a missed test. --- cub/test/catch2_test_device_scan_env.cu | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu index 29e47f0fd48..d489e320942 100644 --- a/cub/test/catch2_test_device_scan_env.cu +++ b/cub/test/catch2_test_device_scan_env.cu @@ -207,18 +207,17 @@ TEST_CASE("Device scan inclusive-scan works with default environment", "[scan][d using value_t = int; using offset_t = cub::detail::choose_offset_t; - using policy_t = - cub::detail::scan::default_tuning::fn::MaxPolicy; + using selector_t = + cub::detail::scan::default_tuning::selector; int current_device{}; REQUIRE(cudaSuccess == cudaGetDevice(¤t_device)); - int ptx_version{}; - REQUIRE(cudaSuccess == cub::PtxVersion(ptx_version, current_device)); + cudaDeviceProp device_props{}; + REQUIRE(cudaSuccess == cudaGetDeviceProperties(&device_props, current_device)); - int target_block_size{}; - block_size_retreiver_t block_size_retreiver{&target_block_size}; - REQUIRE(cudaSuccess == policy_t::Invoke(ptx_version, block_size_retreiver)); + const auto target_block_size = + selector_t{}(cuda::to_arch_id(cuda::compute_capability{device_props.major, device_props.minor})).block_threads; num_items_t num_items = 1; c2h::device_vector d_block_size(1); From 372bcb90239db59440aeb4de01ffc2826585ca09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 26 Feb 2026 17:57:58 +0100 Subject: [PATCH 21/31] Fix clang-cuda concept checks. --- cub/cub/device/dispatch/dispatch_scan.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index f19e4f0fb73..54490c9fde8 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -154,8 +154,9 @@ _CCCL_API constexpr auto convert_policy() -> scan_policy template struct policy_selector_from_hub { - // this is only called in device code - _CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> scan_policy + // Called from device code during dispatch, and from host code when clang-cuda evaluates + // scan_policy_selector concept checks. + _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> scan_policy { return convert_policy(); } From 27ab1c4a27d8fec43becbf457e6545977a86a7fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 26 Feb 2026 18:39:04 +0100 Subject: [PATCH 22/31] Fix classifications of bool and min/max. --- c/parallel/src/util/types.h | 3 ++- cub/cub/device/dispatch/tuning/common.cuh | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/c/parallel/src/util/types.h b/c/parallel/src/util/types.h index 7ff767b4a94..837025a69c2 100644 --- a/c/parallel/src/util/types.h +++ b/c/parallel/src/util/types.h @@ -114,6 +114,8 @@ inline constexpr cub::detail::type_t cccl_type_enum_to_cub_type(cccl_type_enum t { switch (type) { + case CCCL_BOOLEAN: + return cub::detail::type_t::boolean; case CCCL_INT8: return cub::detail::type_t::int8; case CCCL_INT16: @@ -136,7 +138,6 @@ inline constexpr cub::detail::type_t cccl_type_enum_to_cub_type(cccl_type_enum t return cub::detail::type_t::float64; case CCCL_FLOAT16: case CCCL_STORAGE: - case CCCL_BOOLEAN: default: return cub::detail::type_t::other; } diff --git a/cub/cub/device/dispatch/tuning/common.cuh b/cub/cub/device/dispatch/tuning/common.cuh index 273eedfbd82..c41686b7971 100644 --- a/cub/cub/device/dispatch/tuning/common.cuh +++ b/cub/cub/device/dispatch/tuning/common.cuh @@ -29,6 +29,7 @@ namespace detail // libcu++ enum class type_t { + boolean, int8, int16, int32, @@ -50,6 +51,8 @@ inline constexpr auto classify_type = type_t::other; template <> inline constexpr auto classify_type = ::cuda::std::is_signed_v ? type_t::int8 : type_t::uint8; template <> +inline constexpr auto classify_type = type_t::boolean; +template <> inline constexpr auto classify_type = type_t::int8; template <> inline constexpr auto classify_type = type_t::uint8; @@ -101,6 +104,12 @@ inline constexpr auto classify_op = op_kind_t::other; template inline constexpr auto classify_op<::cuda::std::plus> = op_kind_t::plus; +template +inline constexpr auto classify_op<::cuda::minimum> = op_kind_t::min; + +template +inline constexpr auto classify_op<::cuda::maximum> = op_kind_t::max; + struct iterator_info { int value_type_size; From 16bc4522733f69ad504a3933cd946706d4d15d3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Fri, 27 Feb 2026 05:33:11 +0100 Subject: [PATCH 23/31] Add missing includes. --- cub/cub/device/dispatch/tuning/common.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cub/cub/device/dispatch/tuning/common.cuh b/cub/cub/device/dispatch/tuning/common.cuh index c41686b7971..8811afa012c 100644 --- a/cub/cub/device/dispatch/tuning/common.cuh +++ b/cub/cub/device/dispatch/tuning/common.cuh @@ -18,6 +18,8 @@ #include #include +#include +#include #include #include From 565a017f9565620ece4594cdbd99cea50d04d52a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 12 Mar 2026 20:56:12 -0500 Subject: [PATCH 24/31] Codegen fixes. --- c/parallel/src/scan.cu | 2 + cub/benchmarks/bench/scan/exclusive/base.cuh | 4 +- .../kernels/kernel_scan_warpspeed.cuh | 64 +++++++++++++++++-- .../device/dispatch/tuning/tuning_scan.cuh | 23 ++++++- 4 files changed, 86 insertions(+), 7 deletions(-) diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index ab7aeb44f04..b4fac8e294b 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -264,6 +264,7 @@ try const bool types_match = input_type == output_type && input_type == accum_t.type; const bool benchmark_match = operation_t != cub::detail::op_kind_t::other && types_match && input_type != CCCL_STORAGE; + const bool accum_is_primitive_or_trivially_copy_constructible = true; return policy_selector{ static_cast(input_it.value_type.size), @@ -275,6 +276,7 @@ try int{sizeof(OffsetT)}, accum_type, operation_t, + accum_is_primitive_or_trivially_copy_constructible, benchmark_match}; }(); diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index a425fe54d95..65f4b35a6fd 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -49,8 +49,8 @@ try thrust::device_vector input = generate(elements); thrust::device_vector output(elements); - T* d_input = thrust::raw_pointer_cast(input.data()); - T* d_output = thrust::raw_pointer_cast(output.data()); + const T* d_input = thrust::raw_pointer_cast(input.data()); + T* d_output = thrust::raw_pointer_cast(output.data()); state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index 921036b836a..1869b2a4e1d 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -50,6 +50,62 @@ _CCCL_API constexpr scan_warpspeed_policy get_warpspeed_policy() noexcept return PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).warpspeed; } +template +struct static_warpspeed_policy_adapter +{ + _CCCL_API static constexpr scan_warpspeed_policy policy() noexcept + { + return get_warpspeed_policy(); + } + + _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const + { + return policy().squadReduce(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const + { + return policy().squadScanStore(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const + { + return policy().squadLoad(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadSched() const + { + return policy().squadSched(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const + { + return policy().squadLookback(); + } +}; + +struct runtime_warpspeed_policy_adapter +{ + const scan_warpspeed_policy& policy; + + _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const + { + return policy.squadReduce(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const + { + return policy.squadScanStore(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const + { + return policy.squadLoad(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadSched() const + { + return policy.squadSched(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const + { + return policy.squadLookback(); + } +}; + template struct scanKernelParams { @@ -110,13 +166,13 @@ _CCCL_API constexpr scan_stage_counts make_scan_stage_counts(int num_stages) return {num_block_idx_stages, 2}; } -template _CCCL_API constexpr void setup_scan_resources( - const Policy& policy, + const PolicyAdapter& policy, warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& smemAllocator, SmemInOutT& smemInOut, @@ -165,7 +221,7 @@ allocResources(warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& sm }; setup_scan_resources( - ScanResourcesT::warpspeed_policy(), + static_warpspeed_policy_adapter{}, syncHandler, smemAllocator, res.smemInOut, @@ -834,7 +890,7 @@ _CCCL_API constexpr auto smem_for_stages( }; setup_scan_resources( - policy, + runtime_warpspeed_policy_adapter{policy}, syncHandler, smemAllocator, res.smemInOut, diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 9accc75e657..f5d45576fbb 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -30,6 +30,7 @@ #include #include #include +#include #include #if !_CCCL_COMPILER(NVRTC) @@ -712,6 +713,7 @@ struct policy_selector int offset_size; type_t accum_type; op_kind_t operation_t; + bool accum_is_primitive_or_trivially_copy_constructible; // TODO(griwes): remove this field before policy_selector is publicly exposed bool benchmark_match; @@ -726,7 +728,7 @@ struct policy_selector large_values ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; const BlockStoreAlgorithm scan_transposed_store = large_values ? BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; - const auto default_delay = default_delay_constructor_policy(primitive_accum_t == primitive_accum::yes); + const auto default_delay = default_delay_constructor_policy(accum_is_primitive_or_trivially_copy_constructible); const auto warpspeed_policy = get_warpspeed_policy(arch, input_value_size, accum_size); @@ -947,6 +949,21 @@ struct policy_selector } } + // Keep sm_86 aligned with legacy policy_hub behavior: policy_hub resets to default policy for 86. + if (arch >= ::cuda::arch_id::sm_86) + { + return make_mem_scaled_scan_policy( + 128, + 15, + accum_size, + scan_transposed_load, + LOAD_DEFAULT, + scan_transposed_store, + BLOCK_SCAN_WARP_SCANS, + default_delay, + warpspeed_policy); + } + if (arch >= ::cuda::arch_id::sm_80) { if (primitive_op_t == primitive_op::yes) @@ -1144,6 +1161,9 @@ struct policy_selector_from_types constexpr bool benchmark_match = benchmark_match_for_policy_selector::value; + constexpr bool accum_is_primitive_or_trivially_copy_constructible = + is_primitive::value || ::cuda::std::is_trivially_copy_constructible_v; + constexpr auto policies = policy_selector{ input_value_size, input_value_alignment, @@ -1154,6 +1174,7 @@ struct policy_selector_from_types int{sizeof(OffsetT)}, classify_type, classify_op, + accum_is_primitive_or_trivially_copy_constructible, benchmark_match}; return policies(arch); } From 9cd3ca020ecab2de835132a7fd0e292e9067846f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Thu, 12 Mar 2026 21:49:16 -0500 Subject: [PATCH 25/31] Review comments. --- cub/cub/device/device_scan.cuh | 14 ++---- cub/cub/device/dispatch/dispatch_scan.cuh | 44 +++++++++---------- .../kernels/kernel_scan_warpspeed.cuh | 6 +-- cub/test/catch2_test_device_scan_env.cu | 44 +++++-------------- 4 files changed, 38 insertions(+), 70 deletions(-) diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index f6a22e99828..f584d58d158 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -49,12 +49,6 @@ struct tuning return static_cast(*this); } }; - -struct default_tuning : tuning -{ - template - using selector = policy_selector_from_types; -}; } // namespace detail::scan //! @rst @@ -124,9 +118,6 @@ struct DeviceScan ::cuda::execution::determinism::__determinism_holder_t, cudaStream_t stream) { - using scan_tuning_t = ::cuda::std::execution:: - __query_result_or_t; - // Unsigned integer type for global offsets using offset_t = detail::choose_offset_t; @@ -137,13 +128,16 @@ struct DeviceScan cub::detail::it_value_t, typename InitValueT::value_type>>; - using policy_selector_t = typename scan_tuning_t::template selector< + using default_policy_selector_t = detail::scan::policy_selector_from_types< detail::it_value_t, detail::it_value_t, accum_t, offset_t, ScanOpT>; + using policy_selector_t = ::cuda::std::execution:: + __query_result_or_t; + return detail::scan::dispatch( d_temp_storage, temp_storage_bytes, diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 54490c9fde8..6cc8c2bafbc 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -160,13 +160,6 @@ struct policy_selector_from_hub { return convert_policy(); } - - static constexpr int input_value_size = int{sizeof(InputValueT)}; - static constexpr int input_value_alignment = int{alignof(InputValueT)}; - static constexpr int output_value_size = int{sizeof(OutputValueT)}; - static constexpr int output_value_alignment = int{alignof(OutputValueT)}; - static constexpr int accum_size = int{sizeof(AccumT)}; - static constexpr int accum_alignment = int{alignof(AccumT)}; }; } // namespace detail::scan @@ -463,11 +456,14 @@ struct DispatchScan return cudaSuccess; } -#if _CCCL_CUDACC_AT_LEAST(12, 8) - template - CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm( - const detail::scan::scan_warpspeed_policy& warpspeed_policy, const PolicySelectorT& policy_selector) +#if __cccl_ptx_isa >= 860 + template + CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t + __invoke_lookahead_algorithm(PolicyGetter policy_getter, const PolicySelectorT& policy_selector) { + CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter(); + CUB_DETAIL_CONSTEXPR_ISH const auto warpspeed_policy = active_policy.warpspeed; + const int grid_dim = static_cast(::cuda::ceil_div(num_items, static_cast(warpspeed_policy.tile_size))); @@ -498,17 +494,21 @@ struct DispatchScan // TODO(bgruber): we probably need to ensure alignment of d_temp_storage _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), ""); - auto scan_kernel = kernel_source.ScanKernel(); - int num_stages = 1; - int smem_size = detail::scan::smem_for_stages( + auto scan_kernel = kernel_source.ScanKernel(); + CUB_DETAIL_CONSTEXPR_ISH int smem_size_1_stage = detail::scan::smem_for_stages( warpspeed_policy, - num_stages, + 1, policy_selector.input_value_size, policy_selector.input_value_alignment, policy_selector.output_value_size, policy_selector.output_value_alignment, policy_selector.accum_size, policy_selector.accum_alignment); + CUB_DETAIL_STATIC_ISH_ASSERT(smem_size_1_stage <= detail::max_smem_per_block, + "Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)"); + + int num_stages = 1; + int smem_size = smem_size_1_stage; // When launched from the host, maximize the number of stages that we can fit inside the shared memory. NV_IF_TARGET(NV_IS_HOST, ({ @@ -617,7 +617,7 @@ struct DispatchScan return cudaSuccess; } -#endif // _CCCL_CUDACC_AT_LEAST(12, 8) +#endif // __cccl_ptx_isa >= 860 template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t @@ -628,16 +628,12 @@ struct DispatchScan CUB_DETAIL_STATIC_ISH_ASSERT(active_policy.load_modifier != CacheLoadModifier::LOAD_LDG, "The memory consistency model does not apply to texture accesses"); -#if !_CCCL_CUDACC_AT_LEAST(12, 8) - (void) policy_selector; -#endif // !_CCCL_CUDACC_AT_LEAST(12, 8) - -#if _CCCL_CUDACC_AT_LEAST(12, 8) - if (kernel_source.use_warpspeed(active_policy)) +#if __cccl_ptx_isa >= 860 + if CUB_DETAIL_CONSTEXPR_ISH (kernel_source.use_warpspeed(active_policy)) { - return __invoke_lookahead_algorithm(active_policy.warpspeed, policy_selector); + return __invoke_lookahead_algorithm(policy_getter, policy_selector); } -#endif // _CCCL_CUDACC_AT_LEAST(12, 8) +#endif // __cccl_ptx_isa >= 860 // Number of input tiles const int tile_size = active_policy.block_threads * active_policy.items_per_thread; diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index 1869b2a4e1d..92b84623932 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -211,12 +211,12 @@ allocResources(warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& sm using InOutT = typename ScanResourcesT::InOutT; using SumThreadAndWarpT = typename ScanResourcesT::SumThreadAndWarpT; - const auto counts = make_scan_stage_counts(numStages); + const auto [num_block_idx_stages, num_sum_exclusive_cta_stages] = make_scan_stage_counts(numStages); ScanResourcesT res = { warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{numStages}), - warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{counts.num_block_idx_stages}), - warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{counts.num_sum_exclusive_cta_stages}), + warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{num_block_idx_stages}), + warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{num_sum_exclusive_cta_stages}), warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{numStages}), }; diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu index d489e320942..2ccdb97587b 100644 --- a/cub/test/catch2_test_device_scan_env.cu +++ b/cub/test/catch2_test_device_scan_env.cu @@ -55,7 +55,7 @@ TEST_CASE("Device scan exclusive scan works with default environment", "[scan][d using offset_t = cub::detail::choose_offset_t; using selector_t = - cub::detail::scan::default_tuning::selector; + cub::detail::scan::policy_selector_from_types; int current_device{}; REQUIRE(cudaSuccess == cudaGetDevice(¤t_device)); @@ -104,38 +104,16 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev template struct scan_tuning : cub::detail::scan::tuning> { - template - struct policy_hub + _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::scan::scan_policy { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> - { - struct ScanPolicyT - { - static constexpr int BLOCK_THREADS = BlockThreads; - static constexpr int ITEMS_PER_THREAD = 1; - static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE; - - static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::CacheLoadModifier::LOAD_DEFAULT; - static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = - cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE; - static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING; - - struct detail - { - using delay_constructor_t = cub::detail::default_delay_constructor_t; - }; - }; - }; - - using MaxPolicy = Policy500; - }; - - template - using selector = - cub::detail::scan::policy_selector_from_hub, - InputValueT, - OutputValueT, - AccumT>; + return {BlockThreads, + 1, + cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE, + cub::CacheLoadModifier::LOAD_DEFAULT, + cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE, + cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING, + cub::detail::delay_constructor_policy{cub::detail::delay_constructor_kind::fixed_delay, 350, 450}}; + } }; struct get_reduce_tuning_query_t @@ -208,7 +186,7 @@ TEST_CASE("Device scan inclusive-scan works with default environment", "[scan][d using offset_t = cub::detail::choose_offset_t; using selector_t = - cub::detail::scan::default_tuning::selector; + cub::detail::scan::policy_selector_from_types; int current_device{}; REQUIRE(cudaSuccess == cudaGetDevice(¤t_device)); From 81b7a7f7efc2dacb4ef1d97d42c98475c9dd2084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Fri, 13 Mar 2026 01:16:25 -0500 Subject: [PATCH 26/31] More abstraction layers to restore constexprness. --- cub/cub/device/dispatch/dispatch_scan.cuh | 78 ++++++++++++++----- .../device/dispatch/tuning/tuning_scan.cuh | 24 ++++++ 2 files changed, 81 insertions(+), 21 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 3a8ab97b60e..f5057b3c62a 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -500,18 +500,37 @@ struct DispatchScan // TODO(bgruber): we probably need to ensure alignment of d_temp_storage _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), ""); - auto scan_kernel = kernel_source.ScanKernel(); - CUB_DETAIL_CONSTEXPR_ISH int smem_size_1_stage = detail::scan::smem_for_stages( - warpspeed_policy, - 1, - policy_selector.input_value_size, - policy_selector.input_value_alignment, - policy_selector.output_value_size, - policy_selector.output_value_alignment, - policy_selector.accum_size, - policy_selector.accum_alignment); - CUB_DETAIL_STATIC_ISH_ASSERT(smem_size_1_stage <= detail::max_smem_per_block, - "Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)"); + using selector_smem_info_t = detail::scan::selector_smem_info; + + auto scan_kernel = kernel_source.ScanKernel(); + int smem_size_1_stage = 0; + if constexpr (selector_smem_info_t::has_static_layout) + { + CUB_DETAIL_CONSTEXPR_ISH int static_smem_size_1_stage = detail::scan::smem_for_stages( + warpspeed_policy, + 1, + selector_smem_info_t::input_value_size, + selector_smem_info_t::input_value_alignment, + selector_smem_info_t::output_value_size, + selector_smem_info_t::output_value_alignment, + selector_smem_info_t::accum_size, + selector_smem_info_t::accum_alignment); + CUB_DETAIL_STATIC_ISH_ASSERT(static_smem_size_1_stage <= detail::max_smem_per_block, + "Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)"); + smem_size_1_stage = static_smem_size_1_stage; + } + else + { + smem_size_1_stage = detail::scan::smem_for_stages( + warpspeed_policy, + 1, + policy_selector.input_value_size, + policy_selector.input_value_alignment, + policy_selector.output_value_size, + policy_selector.output_value_alignment, + policy_selector.accum_size, + policy_selector.accum_alignment); + } int num_stages = 1; int smem_size = smem_size_1_stage; @@ -526,15 +545,32 @@ struct DispatchScan while (num_stages <= max_stages_for_even_workload) { - const auto next_smem_size = detail::scan::smem_for_stages( - warpspeed_policy, - num_stages + 1, - policy_selector.input_value_size, - policy_selector.input_value_alignment, - policy_selector.output_value_size, - policy_selector.output_value_alignment, - policy_selector.accum_size, - policy_selector.accum_alignment); + const auto next_smem_size = [&] { + if constexpr (selector_smem_info_t::has_static_layout) + { + return detail::scan::smem_for_stages( + warpspeed_policy, + num_stages + 1, + selector_smem_info_t::input_value_size, + selector_smem_info_t::input_value_alignment, + selector_smem_info_t::output_value_size, + selector_smem_info_t::output_value_alignment, + selector_smem_info_t::accum_size, + selector_smem_info_t::accum_alignment); + } + else + { + return detail::scan::smem_for_stages( + warpspeed_policy, + num_stages + 1, + policy_selector.input_value_size, + policy_selector.input_value_alignment, + policy_selector.output_value_size, + policy_selector.output_value_alignment, + policy_selector.accum_size, + policy_selector.accum_alignment); + } + }(); if (next_smem_size > max_dynamic_smem_size) { // This number of stages failed, so stay at the current settings diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 4e75aee1fa7..c8042855284 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -1339,6 +1339,30 @@ struct policy_selector_from_types return policies(arch); } }; + +template +struct selector_smem_info +{ + static constexpr bool has_static_layout = false; +}; + +template +struct selector_smem_info> +{ + static constexpr bool has_static_layout = true; + static constexpr int input_value_size = PolicySelectorT::input_value_size; + static constexpr int input_value_alignment = PolicySelectorT::input_value_alignment; + static constexpr int output_value_size = PolicySelectorT::output_value_size; + static constexpr int output_value_alignment = PolicySelectorT::output_value_alignment; + static constexpr int accum_size = PolicySelectorT::accum_size; + static constexpr int accum_alignment = PolicySelectorT::accum_alignment; +}; } // namespace detail::scan CUB_NAMESPACE_END From 029b195cbf3b1501e86f8ae2136dac9acda03f82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Fri, 13 Mar 2026 03:22:49 -0500 Subject: [PATCH 27/31] Correctly check for the constants. --- cub/cub/device/dispatch/tuning/tuning_scan.cuh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index c8042855284..25568afc489 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1347,13 +1348,14 @@ struct selector_smem_info }; template -struct selector_smem_info> +struct selector_smem_info< + PolicySelectorT, + ::cuda::std::void_t{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{})>> { static constexpr bool has_static_layout = true; static constexpr int input_value_size = PolicySelectorT::input_value_size; From ac03691eda336c53ee23da4a3400af7c95ea3475 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Fri, 13 Mar 2026 11:41:36 -0500 Subject: [PATCH 28/31] Another abstraction layer, to remove a constexpr reference to `this`. --- cub/cub/device/dispatch/dispatch_scan.cuh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index f5057b3c62a..3f6fe35a082 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -135,6 +135,18 @@ struct DeviceScanKernelSource } }; +template +CUB_RUNTIME_FUNCTION constexpr bool +use_warpspeed_constexpr_ish(const KernelSourceT& kernel_source, const scan_policy& policy) +{ +#if defined(CUB_DEFINE_RUNTIME_POLICIES) + return kernel_source.use_warpspeed(policy); +#else + (void) kernel_source; + return KernelSourceT::use_warpspeed(policy); +#endif +} + // TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API template _CCCL_API constexpr auto convert_policy() -> scan_policy @@ -671,7 +683,7 @@ struct DispatchScan "The memory consistency model does not apply to texture accesses"); #if __cccl_ptx_isa >= 860 - if CUB_DETAIL_CONSTEXPR_ISH (kernel_source.use_warpspeed(active_policy)) + if CUB_DETAIL_CONSTEXPR_ISH (detail::scan::use_warpspeed_constexpr_ish(kernel_source, active_policy)) { return __invoke_lookahead_algorithm(policy_getter, policy_selector); } From a6ed3cd5de353ae4c93e053792bf7f7259f05aba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Fri, 13 Mar 2026 12:01:39 -0500 Subject: [PATCH 29/31] I kinda hate this but I think it has to be like this. --- cub/cub/device/dispatch/dispatch_scan.cuh | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 3f6fe35a082..8ef0ee3c71b 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -135,18 +135,6 @@ struct DeviceScanKernelSource } }; -template -CUB_RUNTIME_FUNCTION constexpr bool -use_warpspeed_constexpr_ish(const KernelSourceT& kernel_source, const scan_policy& policy) -{ -#if defined(CUB_DEFINE_RUNTIME_POLICIES) - return kernel_source.use_warpspeed(policy); -#else - (void) kernel_source; - return KernelSourceT::use_warpspeed(policy); -#endif -} - // TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API template _CCCL_API constexpr auto convert_policy() -> scan_policy @@ -683,10 +671,17 @@ struct DispatchScan "The memory consistency model does not apply to texture accesses"); #if __cccl_ptx_isa >= 860 - if CUB_DETAIL_CONSTEXPR_ISH (detail::scan::use_warpspeed_constexpr_ish(kernel_source, active_policy)) +# if defined(CUB_DEFINE_RUNTIME_POLICIES) + if (kernel_source.use_warpspeed(active_policy)) + { + return __invoke_lookahead_algorithm(policy_getter, policy_selector); + } +# else + if CUB_DETAIL_CONSTEXPR_ISH (KernelSource::use_warpspeed(active_policy)) { return __invoke_lookahead_algorithm(policy_getter, policy_selector); } +# endif #endif // __cccl_ptx_isa >= 860 // Number of input tiles From 3bb81697cc2fd75b736da38ab50e07c64bdb821f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Fri, 13 Mar 2026 12:32:28 -0500 Subject: [PATCH 30/31] Silence a warning. --- cub/cub/device/dispatch/dispatch_scan.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 8ef0ee3c71b..bf46fd0f9ba 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -663,7 +663,7 @@ struct DispatchScan template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t - __invoke(PolicyGetter policy_getter, const PolicySelectorT& policy_selector) + __invoke(PolicyGetter policy_getter, [[maybe_unused]] const PolicySelectorT& policy_selector) { CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter(); From 5dbcfd6182a8a8d23865f5d3daef1e23af4754eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= Date: Fri, 13 Mar 2026 13:51:27 -0500 Subject: [PATCH 31/31] Silence MSVC unreachable code warning. --- cub/cub/device/dispatch/dispatch_scan.cuh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index bf46fd0f9ba..0bcb0439d03 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -661,6 +661,10 @@ struct DispatchScan } #endif // __cccl_ptx_isa >= 860 + // On Windows, the `if CUB_DETAIL_CONSTEXPR_ISH` results in `warning C4702: unreachable code`. + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4702) + template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t __invoke(PolicyGetter policy_getter, [[maybe_unused]] const PolicySelectorT& policy_selector) @@ -809,6 +813,8 @@ struct DispatchScan return cudaSuccess; } + _CCCL_DIAG_POP + template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT = {}) {