From 03cb980c32842d9d1c1f6d281d30576fed4493d1 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 13 Feb 2026 13:45:01 +0100 Subject: [PATCH 01/23] Implement the new tuning API for Dispatch[Streaming]ReduceByKey Fixes: #7531, #7533 --- cub/benchmarks/bench/reduce/by_key.cu | 122 +-- .../bench/run_length_encode/encode.cu | 123 +-- cub/cub/agent/agent_reduce_by_key.cuh | 11 +- cub/cub/device/device_reduce.cuh | 40 +- cub/cub/device/device_run_length_encode.cuh | 27 +- .../dispatch/dispatch_reduce_by_key.cuh | 244 +++++- .../dispatch_streaming_reduce_by_key.cuh | 570 +++++--------- .../dispatch/tuning/tuning_reduce_by_key.cuh | 719 +++++++++++++++++- .../dispatch/tuning/tuning_rle_encode.cuh | 206 ++++- .../nvbench_helper/look_back_helper.cuh | 4 + 10 files changed, 1447 insertions(+), 619 deletions(-) diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu index 54d72ae4bb9..41b486aa752 100644 --- a/cub/benchmarks/bench/reduce/by_key.cu +++ b/cub/benchmarks/bench/reduce/by_key.cu @@ -15,32 +15,21 @@ // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# else // TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD - -struct reduce_by_key_policy_hub +struct bench_reduce_by_key_policy_selector { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> + [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> cub::detail::reduce_by_key::reduce_by_key_policy { - using ReduceByKeyPolicyT = - cub::AgentReduceByKeyPolicy; - }; - - using MaxPolicy = Policy500; + return { + TUNE_THREADS, + TUNE_ITEMS, + TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE, + , + TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA, + cub::BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy, + }; + } }; #endif // !TUNE_BASE @@ -57,31 +46,6 @@ static void reduce(nvbench::state& state, nvbench::type_list; -#else - using dispatch_t = cub::DispatchReduceByKey< - keys_input_it_t, - unique_output_it_t, - vals_input_it_t, - aggregate_output_it_t, - num_runs_output_iterator_t, - equality_op_t, - reduction_op_t, - offset_t, - accum_t>; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); @@ -100,35 +64,34 @@ static void reduce(nvbench::state& state, nvbench::type_list(elements); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - 0); + auto dispatch_on_stream = [&](cudaStream_t stream) { + return cub::detail::reduce_by_key::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in_keys, + d_out_keys, + d_in_vals, + d_out_vals, + d_num_runs_out, + equality_op_t{}, + reduction_op_t{}, + num_items, + stream +#if !TUNE_BASE + , + bench_reduce_by_key_policy_selector{} +#endif + ); + }; + + dispatch_on_stream(cudaStream_t{0}); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - 0); + dispatch_on_stream(cudaStream_t{0}); cudaDeviceSynchronize(); const OffsetT num_runs = num_runs_out[0]; @@ -140,18 +103,7 @@ static void reduce(nvbench::state& state, nvbench::type_list(1); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - launch.get_stream()); + dispatch_on_stream(launch.get_stream()); }); } diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index 51f23a424f3..da115685082 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -17,32 +17,20 @@ // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# else // TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD - -struct reduce_by_key_policy_hub +struct bench_encode_policy_selector { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> + [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> cub::detail::reduce_by_key::reduce_by_key_policy { - using ReduceByKeyPolicyT = - cub::AgentReduceByKeyPolicy; - }; - - using MaxPolicy = Policy500; + return { + TUNE_THREADS, + TUNE_ITEMS, + TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE, + TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA, + cub::BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy, + }; + } }; #endif // !TUNE_BASE @@ -65,33 +53,6 @@ static void rle(nvbench::state& state, nvbench::type_list; using accum_t = run_length_t; -#if !TUNE_BASE - using dispatch_t = cub::detail::reduce::DispatchStreamingReduceByKey< - keys_input_it_t, - unique_output_it_t, - run_length_input_it_t, - run_length_output_it_t, - num_runs_output_iterator_t, - equality_op_t, - reduction_op_t, - offset_t, - accum_t, - reduce_by_key_policy_hub>; -#else - using policy_t = cub::detail::rle::encode::policy_hub; - using dispatch_t = cub::detail::reduce::DispatchStreamingReduceByKey< - keys_input_it_t, - unique_output_it_t, - run_length_input_it_t, - run_length_output_it_t, - num_runs_output_iterator_t, - equality_op_t, - reduction_op_t, - offset_t, - accum_t, - policy_t>; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); @@ -109,35 +70,34 @@ static void rle(nvbench::state& state, nvbench::type_list(elements); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - 0); + auto dispatch_on_stream = [&](cudaStream_t stream) { + return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key( + d_temp_storage, + temp_storage_bytes, + d_in_keys, + d_out_keys, + d_in_vals, + d_out_vals, + d_num_runs_out, + equality_op_t{}, + reduction_op_t{}, + num_items, + stream +#if !TUNE_BASE + , + bench_encode_policy_selector{} +#endif + ); + }; + + dispatch_on_stream(cudaStream_t{0}); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - 0); + dispatch_on_stream(cudaStream_t{0}); cudaDeviceSynchronize(); const num_runs_t num_runs = num_runs_out[0]; @@ -148,18 +108,7 @@ static void rle(nvbench::state& state, nvbench::type_list(1); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - launch.get_stream()); + dispatch_on_stream(launch.get_stream()); }); } diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index 43f74df5f35..9566cb705e4 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -2,10 +2,9 @@ // SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 -/** - * @file cub::AgentReduceByKey implements a stateful abstraction of CUDA thread - * blocks for participating in device-wide reduce-value-by-key. - */ +//! @file +//! cub::detail::reduce_by_key::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for +//! participating in device-wide reduce-value-by-key. #pragma once @@ -92,7 +91,7 @@ struct AgentReduceByKeyPolicy * Thread block abstractions ******************************************************************************/ -namespace detail::reduce +namespace detail::reduce_by_key { /** * @brief AgentReduceByKey implements a stateful abstraction of CUDA thread @@ -771,6 +770,6 @@ struct AgentReduceByKey } } }; -} // namespace detail::reduce +} // namespace detail::reduce_by_key CUB_NAMESPACE_END diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 50101c16e0f..afbbe219bf9 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -2308,35 +2308,21 @@ public: { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey"); - // Signed integer type for global offsets - using OffsetT = detail::choose_offset_t; - - // FlagT iterator type (not used) - - // Selection op (not used) - - // Default == operator + using OffsetT = detail::choose_offset_t; using EqualityOp = ::cuda::std::equal_to<>; - return DispatchReduceByKey< - KeysInputIteratorT, - UniqueOutputIteratorT, - ValuesInputIteratorT, - AggregatesOutputIteratorT, - NumRunsOutputIteratorT, - EqualityOp, - ReductionOpT, - OffsetT>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - EqualityOp(), - reduction_op, - static_cast(num_items), - stream); + return detail::reduce_by_key::dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + EqualityOp{}, + reduction_op, + static_cast(num_items), + stream); } }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 44fac1b1cd9..d0c5d05e0fa 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -186,9 +186,9 @@ struct DeviceRunLengthEncode using key_t = cub::detail::non_void_value_t>; - using policy_t = detail::rle::encode::policy_hub; + using policy_selector_t = detail::rle::encode::policy_selector_from_types; - return detail::reduce::DispatchStreamingReduceByKey< + return detail::reduce_by_key::dispatch_streaming_reduce_by_key< InputIteratorT, UniqueOutputIteratorT, lengths_input_iterator_t, @@ -198,17 +198,18 @@ struct DeviceRunLengthEncode reduction_op, offset_t, accum_t, - policy_t>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_unique_out, - lengths_input_iterator_t((length_t) 1), - d_counts_out, - d_num_runs_out, - equality_op(), - reduction_op(), - num_items, - stream); + policy_selector_t>( + d_temp_storage, + temp_storage_bytes, + d_in, + d_unique_out, + lengths_input_iterator_t((length_t) 1), + d_counts_out, + d_num_runs_out, + equality_op(), + reduction_op(), + num_items, + stream); } //! @rst diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index cda8857f460..72564332c0e 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -20,6 +20,7 @@ #endif // no system header #include +#include #include #include #include @@ -29,13 +30,17 @@ #include +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include +#endif + CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ -namespace detail::reduce +namespace detail::reduce_by_key { template struct streaming_context @@ -106,8 +111,8 @@ struct streaming_context /** * @brief Multi-block reduce-by-key sweep kernel entry point * - * @tparam AgentReduceByKeyPolicyT - * Parameterized AgentReduceByKeyPolicyT tuning policy type + * @tparam PolicySelector + * Selects the tuning policy * * @tparam KeysInputIteratorT * Random-access input iterator type for keys @@ -167,7 +172,7 @@ struct streaming_context * @param num_items * Total number of items to select from */ -template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_THREADS)) +#if _CCCL_HAS_CONCEPTS() + requires reduce_by_key_policy_selector +#endif +__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel( KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, @@ -194,8 +202,19 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH _CCCL_GRID_CONSTANT const StreamingContextT streaming_context, vsmem_t vsmem) { + static constexpr reduce_by_key_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + using AgentReduceByKeyPolicyT = AgentReduceByKeyPolicy< + policy.block_threads, + policy.items_per_thread, + policy.load_algorithm, + policy.load_modifier, + policy.scan_algorithm, + delay_constructor_t>; + using vsmem_helper_t = vsmem_helper_default_fallback_policy_t< - typename ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT, + AgentReduceByKeyPolicyT, AgentReduceByKey, KeysInputIteratorT, UniqueOutputIteratorT, @@ -234,7 +253,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH // If applicable, hints to discard modified cache lines for vsmem vsmem_helper_t::discard_temp_storage(temp_storage); } -} // namespace detail::reduce +} // namespace detail::reduce_by_key /****************************************************************************** * Dispatch @@ -272,6 +291,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ +// TODO(bgruber): deprecate when we make the tuning API public and remove in CCCL 4.0 template ( detail::scan::DeviceCompactInitKernel, - detail::reduce::DeviceReduceByKeyKernel< - typename PolicyHub::MaxPolicy, + detail::reduce_by_key::DeviceReduceByKeyKernel< + detail::reduce_by_key::policy_selector_from_hub, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, @@ -622,4 +642,208 @@ struct DispatchReduceByKey } }; +namespace detail::reduce_by_key +{ +template < + typename KeysInputIteratorT, + typename UniqueOutputIteratorT, + typename ValuesInputIteratorT, + typename AggregatesOutputIteratorT, + typename NumRunsOutputIteratorT, + typename EqualityOpT, + typename ReductionOpT, + typename OffsetT, + typename AccumT = + ::cuda::std::__accumulator_t, it_value_t>, + typename KeyT = non_void_value_t>, + typename PolicySelector = policy_selector_from_types> +#if _CCCL_HAS_CONCEPTS() + requires reduce_by_key::reduce_by_key_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + EqualityOpT equality_op, + ReductionOpT reduction_op, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}) +{ + using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet + using ScanTileStateT = ReduceByKeyScanTileState; + [[maybe_unused]] static constexpr int INIT_KERNEL_THREADS = 128; + + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(ptx_arch_id(arch_id))) + { + return error; + } + + return detail::dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) { + constexpr reduce_by_key_policy policy = policy_getter(); // need the constexpr of vsmem_helper + +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET( + NV_IS_HOST, + (::std::stringstream ss; ss << policy; _CubLog( + "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast(arch_id), ss.str().c_str());)) +#endif + + // convert the policy into a legacy agent policy for vsmem_helper. TODO(bgruber): refactor this in the future + using AgentReduceByKeyPolicy = AgentReduceByKeyPolicy< + policy.block_threads, + policy.items_per_thread, + policy.load_algorithm, + policy.load_modifier, + policy.scan_algorithm, + delay_constructor_t>; + + using vsmem_helper_t = vsmem_helper_default_fallback_policy_t< + AgentReduceByKeyPolicy, + AgentReduceByKey, + KeysInputIteratorT, + UniqueOutputIteratorT, + ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT, + AccumT, + streaming_context_t>; + + constexpr int block_threads = vsmem_helper_t::agent_policy_t::BLOCK_THREADS; + constexpr int items_per_thread = vsmem_helper_t::agent_policy_t::ITEMS_PER_THREAD; + + // Number of input tiles + const int tile_size = block_threads * items_per_thread; + const int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + + // The amount of virtual shared memory to allocate + const auto vsmem_size = num_tiles * vsmem_helper_t::vsmem_per_block; + + size_t tile_descriptor_memory{}; + if (const auto error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, tile_descriptor_memory))) + { + return error; + } + size_t allocation_sizes[2] = {tile_descriptor_memory, vsmem_size}; + void* allocations[2] = {}; + + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + { + return error; + } + + if (d_temp_storage == nullptr) + { + return cudaSuccess; + } + + ScanTileStateT tile_state; + if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) + { + return error; + } + + const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); +#endif + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + .doit(detail::scan::DeviceCompactInitKernel, + tile_state, + num_tiles, + d_num_runs_out))) + { + return error; + } + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + if (num_items == 0) + { + return cudaSuccess; + } + + auto reduce_by_key_kernel = &DeviceReduceByKeyKernel< + PolicySelector, + KeysInputIteratorT, + UniqueOutputIteratorT, + ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + ScanTileStateT, + EqualityOpT, + ReductionOpT, + OffsetT, + AccumT, + streaming_context_t>; + int reduce_by_key_sm_occupancy{}; + if (const auto error = CubDebug(MaxSmOccupancy(reduce_by_key_sm_occupancy, reduce_by_key_kernel, block_threads))) + { + return error; + } + + int device_ordinal{}; + if (const auto error = CubDebug(cudaGetDevice(&device_ordinal))) + { + return error; + } + int max_dim_x{}; + if (const auto error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) + { + return error; + } + + const int scan_grid_size = ::cuda::std::min(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, + scan_grid_size, + block_threads, + (long long) stream, + items_per_thread, + reduce_by_key_sm_occupancy); +#endif + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream) + .doit(reduce_by_key_kernel, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + tile_state, + start_tile, + equality_op, + reduction_op, + num_items, + streaming_context_t{}, + cub::detail::vsmem_t{allocations[1]}))) + { + return error; + } + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + } + return cudaSuccess; + }); +} +} // namespace detail::reduce_by_key + CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh index 57d34ff4e7e..4fec39263a6 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh @@ -14,6 +14,7 @@ #endif // no system header #include +#include #include #include #include @@ -25,431 +26,230 @@ #include #include -#if !_CCCL_COMPILER(NVRTC) -# include -#endif // !_CCCL_COMPILER(NVRTC) +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) CUB_NAMESPACE_BEGIN -namespace detail::reduce +namespace detail::reduce_by_key { -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * @brief Utility class for dispatching the appropriately-tuned kernels for - * DeviceReduceByKey - * - * @tparam KeysInputIteratorT - * Random-access input iterator type for keys - * - * @tparam UniqueOutputIteratorT - * Random-access output iterator type for keys - * - * @tparam ValuesInputIteratorT - * Random-access input iterator type for values - * - * @tparam AggregatesOutputIteratorT - * Random-access output iterator type for values - * - * @tparam NumRunsOutputIteratorT - * Output iterator type for recording number of segments encountered - * - * @tparam EqualityOpT - * KeyT equality operator type - * - * @tparam ReductionOpT - * ValueT reduction operator type - * - * @tparam PolicyHub - * Implementation detail, do not specify directly, requirements on the - * content of this type are subject to breaking change. - */ -template , - cub::detail::it_value_t>, - typename PolicyHub = detail::reduce_by_key::policy_hub< - ReductionOpT, - AccumT, - cub::detail::non_void_value_t>>> -struct DispatchStreamingReduceByKey +template < + typename KeysInputIteratorT, + typename UniqueOutputIteratorT, + typename ValuesInputIteratorT, + typename AggregatesOutputIteratorT, + typename NumRunsOutputIteratorT, + typename EqualityOpT, + typename ReductionOpT, + typename OffsetT, + typename AccumT = + ::cuda::std::__accumulator_t, it_value_t>, + typename PolicySelector = + policy_selector_from_types>>> +#if _CCCL_HAS_CONCEPTS() + requires reduce_by_key_policy_selector +#endif +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_reduce_by_key( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + EqualityOpT equality_op, + ReductionOpT reduction_op, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}) { - //------------------------------------------------------------------------- - // Types and constants - //------------------------------------------------------------------------- - // Offsets to index items within one partition (i.e., a single kernel invocation) - using local_offset_t = ::cuda::std::int32_t; - - // If the number of items provided by the user may exceed the maximum number of items processed by a single kernel - // invocation, we may require multiple kernel invocations - static constexpr bool use_streaming_invocation = ::cuda::std::numeric_limits::max() - > ::cuda::std::numeric_limits::max(); - - // Offsets to index any item within the entire input (large enough to cover num_items) - using global_offset_t = OffsetT; - - // Type used to provide context about the current partition during a streaming invocation - using streaming_context_t = - ::cuda::std::conditional_t, - NullType>; - - // The input values type - using ValueInputT = cub::detail::it_value_t; + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(ptx_arch_id(arch_id))) + { + return error; + } - static constexpr int init_kernel_threads = 128; + const reduce_by_key_policy policy = policy_selector(arch_id); - // Tile status descriptor interface type - using ScanTileStateT = ReduceByKeyScanTileState; +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET(NV_IS_HOST, + (::std::stringstream ss; ss << policy; + _CubLog("Dispatching streaming reduce by key to arch %d with tuning: %s\n", + static_cast(arch_id), + ss.str().c_str());)) +#endif - void* d_temp_storage; - size_t& temp_storage_bytes; - KeysInputIteratorT d_keys_in; - UniqueOutputIteratorT d_unique_out; - ValuesInputIteratorT d_values_in; - AggregatesOutputIteratorT d_aggregates_out; - NumRunsOutputIteratorT d_num_runs_out; - EqualityOpT equality_op; - ReductionOpT reduction_op; - global_offset_t num_items; - cudaStream_t stream; + using local_offset_t = ::cuda::std::int32_t; + using global_offset_t = OffsetT; + static constexpr bool use_streaming_invocation = + ::cuda::std::numeric_limits::max() > ::cuda::std::numeric_limits::max(); + using streaming_context_t = ::cuda::std:: + conditional_t, NullType>; + using ScanTileStateT = ReduceByKeyScanTileState; + [[maybe_unused]] static constexpr int init_kernel_threads = 128; + + const int block_threads = policy.block_threads; + const int items_per_thread = policy.items_per_thread; + const auto tile_size = static_cast(block_threads * items_per_thread); + + auto capped_num_items_per_invocation = num_items; + if constexpr (use_streaming_invocation) + { + capped_num_items_per_invocation = static_cast(::cuda::std::numeric_limits::max()); + capped_num_items_per_invocation -= (capped_num_items_per_invocation % tile_size); + } - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchStreamingReduceByKey( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - UniqueOutputIteratorT d_unique_out, - ValuesInputIteratorT d_values_in, - AggregatesOutputIteratorT d_aggregates_out, - NumRunsOutputIteratorT d_num_runs_out, - EqualityOpT equality_op, - ReductionOpT reduction_op, - global_offset_t num_items, - cudaStream_t stream) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_keys_in(d_keys_in) - , d_unique_out(d_unique_out) - , d_values_in(d_values_in) - , d_aggregates_out(d_aggregates_out) - , d_num_runs_out(d_num_runs_out) - , equality_op(equality_op) - , reduction_op(reduction_op) - , num_items(num_items) - , stream(stream) - {} + const auto max_num_items_per_invocation = + use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items; + auto const num_partitions = + (num_items == 0) ? global_offset_t{1} : ::cuda::ceil_div(num_items, capped_num_items_per_invocation); - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- + const auto max_num_tiles = static_cast(::cuda::ceil_div(max_num_items_per_invocation, tile_size)); - template - CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t - Invoke(ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel) + size_t allocation_sizes[3]; + if (const auto error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0]))) { - using AgentReduceByKeyPolicyT = typename ActivePolicyT::ReduceByKeyPolicyT; - constexpr int block_threads = AgentReduceByKeyPolicyT::BLOCK_THREADS; - constexpr int items_per_thread = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD; - - // The upper bound of for the number of items that a single kernel invocation will ever process - auto capped_num_items_per_invocation = num_items; - if constexpr (use_streaming_invocation) - { - capped_num_items_per_invocation = - static_cast(::cuda::std::numeric_limits::max()); - // Make sure that the number of items is a multiple of tile size - capped_num_items_per_invocation -= (capped_num_items_per_invocation % (block_threads * items_per_thread)); - } - - // Across invocations, the maximum number of items that a single kernel invocation will ever process - const auto max_num_items_per_invocation = - use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items; + return error; + } + allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0}; + allocation_sizes[2] = num_partitions > 1 ? sizeof(AccumT) * 2 : size_t{0}; - // Number of invocations required to "iterate" over the total input (at least one iteration to process zero items) - auto const num_partitions = - (num_items == 0) ? global_offset_t{1} : ::cuda::ceil_div(num_items, capped_num_items_per_invocation); + void* allocations[3] = {}; + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + { + return error; + } - cudaError error = cudaSuccess; + if (d_temp_storage == nullptr) + { + return cudaSuccess; + } - // Number of input tiles - const auto tile_size = static_cast(block_threads * items_per_thread); - int max_num_tiles = static_cast(::cuda::ceil_div(max_num_items_per_invocation, tile_size)); + for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) + { + global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation; + global_offset_t current_num_items = + (partition_idx + 1 == num_partitions) ? (num_items - current_partition_offset) : capped_num_items_per_invocation; - // Specify temporary storage allocation requirements - size_t allocation_sizes[3]; - error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0])); - if (cudaSuccess != error) + const auto num_current_tiles = static_cast(::cuda::ceil_div(current_num_items, tile_size)); + ScanTileStateT tile_state; + if (const auto error = CubDebug(tile_state.Init(num_current_tiles, allocations[0], allocation_sizes[0]))) { return error; } - allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0}; - allocation_sizes[2] = num_partitions > 1 ? sizeof(AccumT) * 2 : size_t{0}; - - // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[3] = {}; - error = CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - if (cudaSuccess != error) + const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads)); +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); +#endif + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream) + .doit(&detail::scan::DeviceCompactInitKernel, + tile_state, + num_current_tiles, + d_num_runs_out))) { return error; } - if (d_temp_storage == nullptr) + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) { - // Return if the caller is simply requesting the size of the storage allocation return error; } - // Iterate over the partitions until all input is processed - for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) + if (num_items == 0) { - global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation; - global_offset_t current_num_items = - (partition_idx + 1 == num_partitions) - ? (num_items - current_partition_offset) - : capped_num_items_per_invocation; - - // Construct the tile status interface - const auto num_current_tiles = static_cast(::cuda::ceil_div(current_num_items, tile_size)); - - // Construct the tile status interface - ScanTileStateT tile_state; - error = CubDebug(tile_state.Init(num_current_tiles, allocations[0], allocation_sizes[0])); - if (cudaSuccess != error) - { - return error; - } - - // Log init_kernel configuration - int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads)); + return cudaSuccess; + } #ifdef CUB_DEBUG_LOG - _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); -#endif // CUB_DEBUG_LOG - - // Invoke init_kernel to initialize tile descriptors - error = CubDebug( - THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream) - .doit(init_kernel, tile_state, num_current_tiles, d_num_runs_out)); - if (cudaSuccess != error) - { - return error; - } + _CubLog("Invoking reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + num_current_tiles, + block_threads, + (long long) stream, + items_per_thread); +#endif + auto reduce_by_key_kernel = DeviceReduceByKeyKernel< + PolicySelector, + KeysInputIteratorT, + UniqueOutputIteratorT, + ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + ScanTileStateT, + EqualityOpT, + ReductionOpT, + local_offset_t, + AccumT, + streaming_context_t>; - // Sync the stream if specified to flush runtime errors - error = CubDebug(detail::DebugSyncStream(stream)); - if (cudaSuccess != error) - { - return error; - } - - // For empty problems we can skip the reduce_by_key_kernel - if (num_items == 0) + if constexpr (use_streaming_invocation) + { + auto tmp_num_uniques = static_cast(allocations[1]); + auto tmp_prefix = static_cast(allocations[2]); + const bool is_first_partition = (partition_idx == 0); + const bool is_last_partition = (partition_idx + 1 == num_partitions); + const int buffer_selector = partition_idx % 2; + streaming_context_t streaming_context{ + is_first_partition, + is_last_partition, + is_first_partition ? d_keys_in : d_keys_in + current_partition_offset - 1, + &tmp_prefix[buffer_selector], + &tmp_prefix[buffer_selector ^ 0x01], + &tmp_num_uniques[buffer_selector], + &tmp_num_uniques[buffer_selector ^ 0x01]}; + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) + .doit(reduce_by_key_kernel, + d_keys_in + current_partition_offset, + d_unique_out, + d_values_in + current_partition_offset, + d_aggregates_out, + d_num_runs_out, + tile_state, + 0, + equality_op, + reduction_op, + static_cast(current_num_items), + streaming_context, + detail::vsmem_t{nullptr}))) { return error; } - -// Log reduce_by_key_kernel configuration -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d " - "items per thread\n", - num_current_tiles, - block_threads, - (long long) stream, - items_per_thread); -#endif // CUB_DEBUG_LOG - - // Invoke reduce_by_key_kernel - - if constexpr (use_streaming_invocation) - { - auto tmp_num_uniques = static_cast(allocations[1]); - auto tmp_prefix = static_cast(allocations[2]); - - const bool is_first_partition = (partition_idx == 0); - const bool is_last_partition = (partition_idx + 1 == num_partitions); - const int buffer_selector = partition_idx % 2; - - streaming_context_t streaming_context{ - is_first_partition, - is_last_partition, - is_first_partition ? d_keys_in : d_keys_in + current_partition_offset - 1, - &tmp_prefix[buffer_selector], - &tmp_prefix[buffer_selector ^ 0x01], - &tmp_num_uniques[buffer_selector], - &tmp_num_uniques[buffer_selector ^ 0x01]}; - - error = CubDebug( - THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) - .doit(reduce_by_key_kernel, - d_keys_in + current_partition_offset, - d_unique_out, - d_values_in + current_partition_offset, - d_aggregates_out, - d_num_runs_out, - tile_state, - 0, - equality_op, - reduction_op, - static_cast(current_num_items), - streaming_context, - cub::detail::vsmem_t{nullptr})); - if (cudaSuccess != error) - { - return error; - } - } - else - { - error = CubDebug( - THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) - .doit(reduce_by_key_kernel, - d_keys_in + current_partition_offset, - d_unique_out, - d_values_in + current_partition_offset, - d_aggregates_out, - d_num_runs_out, - tile_state, - 0, - equality_op, - reduction_op, - static_cast(current_num_items), - NullType{}, - cub::detail::vsmem_t{nullptr})); - if (cudaSuccess != error) - { - return error; - } - } - - // Sync the stream if specified to flush runtime errors - error = CubDebug(detail::DebugSyncStream(stream)); - if (cudaSuccess != error) + } + else + { + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) + .doit(reduce_by_key_kernel, + d_keys_in + current_partition_offset, + d_unique_out, + d_values_in + current_partition_offset, + d_aggregates_out, + d_num_runs_out, + tile_state, + 0, + equality_op, + reduction_op, + static_cast(current_num_items), + NullType{}, + detail::vsmem_t{nullptr}))) { return error; } } - - return error; - } - - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() - { - return Invoke( - detail::scan::DeviceCompactInitKernel, - detail::reduce::DeviceReduceByKeyKernel< - typename PolicyHub::MaxPolicy, - KeysInputIteratorT, - UniqueOutputIteratorT, - ValuesInputIteratorT, - AggregatesOutputIteratorT, - NumRunsOutputIteratorT, - ScanTileStateT, - EqualityOpT, - ReductionOpT, - local_offset_t, - AccumT, - streaming_context_t>); - } - - /** - * Internal dispatch routine - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Pointer to the input sequence of keys - * - * @param[out] d_unique_out - * Pointer to the output sequence of unique keys (one key per run) - * - * @param[in] d_values_in - * Pointer to the input sequence of corresponding values - * - * @param[out] d_aggregates_out - * Pointer to the output sequence of value aggregates - * (one aggregate per run) - * - * @param[out] d_num_runs_out - * Pointer to total number of runs encountered - * (i.e., the length of d_unique_out) - * - * @param[in] equality_op - * KeyT equality operator - * - * @param[in] reduction_op - * ValueT reduction operator - * - * @param[in] num_items - * Total number of items to select from - * - * @param[in] stream - * CUDA stream to launch kernels within. Default is stream0. - */ - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - UniqueOutputIteratorT d_unique_out, - ValuesInputIteratorT d_values_in, - AggregatesOutputIteratorT d_aggregates_out, - NumRunsOutputIteratorT d_num_runs_out, - EqualityOpT equality_op, - ReductionOpT reduction_op, - global_offset_t num_items, - cudaStream_t stream) - { - cudaError error = cudaSuccess; - - do + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) { - // Get PTX version - int ptx_version = 0; - error = CubDebug(PtxVersion(ptx_version)); - if (cudaSuccess != error) - { - break; - } - - DispatchStreamingReduceByKey dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - equality_op, - reduction_op, - num_items, - stream); - - // Dispatch - error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); - if (cudaSuccess != error) - { - break; - } - } while (0); - - return error; + return error; + } } -}; -} // namespace detail::reduce + + return cudaSuccess; +} +} // namespace detail::reduce_by_key CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index cfaad7185da..1dd6e433206 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -18,12 +18,22 @@ #include #include #include +#include #include #include +#include #include #include +#if _CCCL_HAS_CONCEPTS() +# include +#endif // _CCCL_HAS_CONCEPTS() + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::reduce_by_key @@ -63,25 +73,25 @@ enum class accum_size }; template -constexpr primitive_key is_primitive_key() +_CCCL_API constexpr primitive_key is_primitive_key() { return detail::is_primitive::value ? primitive_key::yes : primitive_key::no; } template -constexpr primitive_accum is_primitive_accum() +_CCCL_API constexpr primitive_accum is_primitive_accum() { return detail::is_primitive::value ? primitive_accum::yes : primitive_accum::no; } template -constexpr primitive_op is_primitive_op() +_CCCL_API constexpr primitive_op is_primitive_op() { return basic_binary_op_t::value ? primitive_op::yes : primitive_op::no; } template -constexpr key_size classify_key_size() +_CCCL_API constexpr key_size classify_key_size() { return sizeof(KeyT) == 1 ? key_size::_1 : sizeof(KeyT) == 2 ? key_size::_2 @@ -93,7 +103,7 @@ constexpr key_size classify_key_size() } template -constexpr accum_size classify_accum_size() +_CCCL_API constexpr accum_size classify_accum_size() { return sizeof(AccumT) == 1 ? accum_size::_1 : sizeof(AccumT) == 2 ? accum_size::_2 @@ -104,6 +114,28 @@ constexpr accum_size classify_accum_size() : accum_size::unknown; } +_CCCL_API constexpr int size_of(key_size sz) +{ + return sz == key_size::_1 ? 1 + : sz == key_size::_2 ? 2 + : sz == key_size::_4 ? 4 + : sz == key_size::_8 ? 8 + : sz == key_size::_16 + ? 16 + : 4; +} + +_CCCL_API constexpr int size_of(accum_size sz) +{ + return sz == accum_size::_1 ? 1 + : sz == accum_size::_2 ? 2 + : sz == accum_size::_4 ? 4 + : sz == accum_size::_8 ? 8 + : sz == accum_size::_16 + ? 16 + : 4; +} + template (::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes)), + 1, + nominal_4B_items_per_thread); + return reduce_by_key_policy{ + 128, + items_per_thread, + BLOCK_LOAD_DIRECT, + load_mod, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; +} + +struct policy_selector +{ + int key_size; + int accum_size; + bool is_primitive_key_t; + bool is_primitive_accum_t; + bool is_primitive_op; + + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy + { + const int combined_input_bytes = key_size + accum_size; + const int max_input_bytes = (::cuda::std::max) (key_size, accum_size); + const auto default_ldg = [&] { + return make_default_reduce_by_key_policy(combined_input_bytes, max_input_bytes, LOAD_LDG); + }; + const auto default_load_default = [&] { + return make_default_reduce_by_key_policy(combined_input_bytes, max_input_bytes, LOAD_DEFAULT); + }; + + const bool tuned_prim = (is_primitive_key_t && is_primitive_accum_t); + + if (!is_primitive_op) + { + return default_ldg(); + } + + if (arch >= ::cuda::arch_id::sm_100 && tuned_prim) + { + if (key_size == 1 && accum_size == 1) + { + return {576, + 13, + BLOCK_LOAD_DIRECT, + LOAD_CA, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon_jitter_window, 2044, 240}}; + } + if (key_size == 1 && accum_size == 2) + { + return {224, + 10, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff_jitter_window, 224, 390}}; + } + if (key_size == 1 && accum_size == 4) + { + return {128, + 14, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 248, 285}}; + } + if (key_size == 1 && accum_size == 8) + { + return {128, + 19, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 132, 540}}; + } + if (key_size == 2 && accum_size == 1) + { + return {128, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 164, 290}}; + } + if (key_size == 2 && accum_size == 2) + { + return {256, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 180, 975}}; + } + if (key_size == 2 && accum_size == 4) + { + return {256, + 11, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 224, 550}}; + } + if (key_size == 2 && accum_size == 8) + { + return {160, + 10, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 156, 725}}; + } + if (key_size == 4 && accum_size == 1) + { + return {224, + 10, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 324, 285}}; + } + if (key_size == 4 && accum_size == 2) + { + return {256, + 11, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon_jitter_window, 1984, 115}}; + } + if (key_size == 4 && accum_size == 4) + { + return {224, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon_jitter_window, 476, 1005}}; + } + if (key_size == 4 && accum_size == 8) + { + return {256, + 10, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 1868, 145}}; + } + if (key_size == 8 && accum_size == 1) + { + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon_jitter_window, 1940, 460}}; + } + if (key_size == 8 && accum_size == 2) + { + return {224, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 392, 550}}; + } + if (key_size == 8 && accum_size == 4) + { + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 244, 475}}; + } + if (key_size == 8 && accum_size == 8) + { + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 196, 340}}; + } + } + + if (arch >= ::cuda::arch_id::sm_90 && tuned_prim) + { + if (key_size == 1 && accum_size == 1) + { + return { + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 720}}; + } + if (key_size == 1 && accum_size == 2) + { + return { + 320, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 865}}; + } + if (key_size == 1 && accum_size == 4) + { + return {192, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 735}}; + } + if (key_size == 1 && accum_size == 8) + { + return {128, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 580}}; + } + if (key_size == 1 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1100}}; + } + if (key_size == 2 && accum_size == 1) + { + return { + 128, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 985}}; + } + if (key_size == 2 && accum_size == 2) + { + return {256, + 11, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 276, 650}}; + } + if (key_size == 2 && accum_size == 4) + { + return {256, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 240, 765}}; + } + if (key_size == 2 && accum_size == 8) + { + return {128, + 19, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1190}}; + } + if (key_size == 2 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1175}}; + } + if (key_size == 4 && accum_size == 1) + { + return {256, + 13, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 404, 645}}; + } + if (key_size == 4 && accum_size == 2) + { + return {256, + 18, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1160}}; + } + if (key_size == 4 && accum_size == 4) + { + return {256, + 18, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1170}}; + } + if (key_size == 4 && accum_size == 8) + { + return {128, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1055}}; + } + if (key_size == 4 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1195}}; + } + if (key_size == 8 && accum_size == 1) + { + return { + 256, 10, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1170}}; + } + if (key_size == 8 && accum_size == 2) + { + return {256, + 9, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 236, 1030}}; + } + if (key_size == 8 && accum_size == 4) + { + return {128, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 152, 560}}; + } + if (key_size == 8 && accum_size == 8) + { + return {128, + 23, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1030}}; + } + if (key_size == 8 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1125}}; + } + if (key_size == 16 && !is_primitive_key_t && accum_size == 1) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1080}}; + } + if (key_size == 16 && !is_primitive_key_t && accum_size == 2) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 320, 1005}}; + } + if (key_size == 16 && !is_primitive_key_t && accum_size == 4) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 232, 1100}}; + } + if (key_size == 16 && !is_primitive_key_t && accum_size == 8) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1195}}; + } + if (key_size == 16 && !is_primitive_key_t && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1150}}; + } + return default_load_default(); + } + + if (arch >= ::cuda::arch_id::sm_86) + { + return default_ldg(); + } + + if (arch >= ::cuda::arch_id::sm_80 && tuned_prim) + { + if (key_size == 1 && accum_size == 1) + { + return { + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 975}}; + } + if (key_size == 1 && accum_size == 2) + { + return { + 224, 12, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 840}}; + } + if (key_size == 1 && accum_size == 4) + { + return {256, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 760}}; + } + if (key_size == 1 && accum_size == 8) + { + return { + 224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1070}}; + } + if (key_size == 1 && accum_size == 16) + { + return {128, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1175}}; + } + if (key_size == 2 && accum_size == 1) + { + return { + 256, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}}; + } + if (key_size == 2 && accum_size == 2) + { + return {224, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 640}}; + } + if (key_size == 2 && accum_size == 4) + { + return {256, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 905}}; + } + if (key_size == 2 && accum_size == 8) + { + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 810}}; + } + if (key_size == 2 && accum_size == 16) + { + return {160, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1115}}; + } + if (key_size == 4 && accum_size == 1) + { + return { + 288, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1110}}; + } + if (key_size == 4 && accum_size == 2) + { + return {192, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1200}}; + } + if (key_size == 4 && accum_size == 4) + { + return { + 256, 15, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1110}}; + } + if (key_size == 4 && accum_size == 8) + { + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1165}}; + } + if (key_size == 4 && accum_size == 16) + { + return {160, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1100}}; + } + if (key_size == 8 && accum_size == 1) + { + return {192, + 10, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1175}}; + } + if (key_size == 8 && accum_size == 2) + { + return { + 224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1075}}; + } + if (key_size == 8 && accum_size == 4) + { + return { + 384, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1040}}; + } + if (key_size == 8 && accum_size == 8) + { + return {128, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1080}}; + } + if (key_size == 8 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 430}}; + } + if (key_size == 16 && !is_primitive_key_t && accum_size == 1) + { + return { + 192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1105}}; + } + if (key_size == 16 && !is_primitive_key_t && accum_size == 2) + { + return {192, + 7, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 755}}; + } + if (key_size == 16 && !is_primitive_key_t && accum_size == 4) + { + return {192, + 7, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 535}}; + } + if (key_size == 16 && !is_primitive_key_t && accum_size == 8) + { + return { + 192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1035}}; + } + if (key_size == 16 && !is_primitive_key_t && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1090}}; + } + return default_load_default(); + } + + return default_ldg(); + } +}; + +#if _CCCL_HAS_CONCEPTS() +template +concept reduce_by_key_policy_selector = detail::policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers +template +struct policy_selector_from_hub +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_by_key_policy + { + using ReduceByKeyPolicyT = typename PolicyHub::MaxPolicy::ReduceByKeyPolicyT; + return reduce_by_key_policy{ + ReduceByKeyPolicyT::BLOCK_THREADS, + ReduceByKeyPolicyT::ITEMS_PER_THREAD, + ReduceByKeyPolicyT::LOAD_ALGORITHM, + ReduceByKeyPolicyT::LOAD_MODIFIER, + ReduceByKeyPolicyT::SCAN_ALGORITHM, + delay_constructor_policy_from_type, + }; + } +}; + +template +struct policy_selector_from_types +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy + { + return policy_selector{ + int{sizeof(KeyT)}, + int{sizeof(AccumT)}, + (is_primitive_key() == primitive_key::yes), + (is_primitive_accum() == primitive_accum::yes), + (is_primitive_op() == primitive_op::yes)}(arch); + } +}; } // namespace detail::reduce_by_key CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh index 77c393d7643..309a295bf42 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh @@ -18,17 +18,29 @@ #include #include #include +#include #include +#include #include #include +#include #include #include +#if _CCCL_HAS_CONCEPTS() +# include +#endif // _CCCL_HAS_CONCEPTS() + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::rle::encode { +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce dispatchers template (), @@ -89,6 +101,7 @@ struct sm80_tuning(), @@ -218,7 +231,8 @@ struct sm100_tuning struct policy_hub { @@ -294,6 +308,196 @@ struct policy_hub using MaxPolicy = Policy1000; }; + +// DeviceRunLengthEncode::Encode delegates to reduce by key +using rle_encode_policy = reduce_by_key::reduce_by_key_policy; + +#if _CCCL_HAS_CONCEPTS() +template +concept rle_encode_policy_selector = reduce_by_key::reduce_by_key_policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + +// TODO(bgruber): remove in CCCL 4.0 when we drop the RLE dispatchers +using reduce_by_key::policy_selector_from_hub; + +struct policy_selector +{ + length_size length_sz; + key_size key_sz; + primitive_length prim_len; + primitive_key prim_key; + int max_input_bytes; + int combined_input_bytes; + + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy + { + const bool tuned_prim = (prim_len == primitive_length::yes && prim_key == primitive_key::yes); + const bool length_4 = (length_sz == length_size::_4); + + if (arch >= ::cuda::arch_id::sm_100 && tuned_prim && length_4) + { + if (key_sz == key_size::_1) + { + return rle_encode_policy{ + 256, + 14, + BLOCK_LOAD_DIRECT, + LOAD_CA, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 468, 300}}; + } + if (key_sz == key_size::_2) + { + return rle_encode_policy{ + 224, + 14, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 376, 420}}; + } + if (key_sz == key_size::_4) + { + return rle_encode_policy{ + 256, + 14, + BLOCK_LOAD_DIRECT, + LOAD_CA, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 956, 70}}; + } + if (key_sz == key_size::_8) + { + return rle_encode_policy{ + 224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 188, 765}}; + } + } + if (arch >= ::cuda::arch_id::sm_90 && tuned_prim && length_4) + { + if (key_sz == key_size::_1) + { + return rle_encode_policy{ + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}}; + } + if (key_sz == key_size::_2) + { + return rle_encode_policy{ + 128, 22, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 775}}; + } + if (key_sz == key_size::_4) + { + return rle_encode_policy{ + 192, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 284, 480}}; + } + if (key_sz == key_size::_8) + { + return rle_encode_policy{ + 128, + 19, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 515}}; + } +#if _CCCL_HAS_INT128() + if (key_sz == key_size::_16) + { + return rle_encode_policy{ + 128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 428, 930}}; + } +#endif + } + if (arch >= ::cuda::arch_id::sm_80 && tuned_prim && length_4) + { + if (key_sz == key_size::_1) + { + return rle_encode_policy{ + 256, 14, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 640}}; + } + if (key_sz == key_size::_2) + { + return rle_encode_policy{ + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 900}}; + } + if (key_sz == key_size::_4) + { + return rle_encode_policy{ + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1080}}; + } + if (key_sz == key_size::_8) + { + return rle_encode_policy{ + 224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1075}}; + } +#if _CCCL_HAS_INT128() + if (key_sz == key_size::_16) + { + return rle_encode_policy{ + 128, + 7, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 630}}; + } +#endif + } + constexpr int nominal_4B_items_per_thread = 6; + const int items = + (max_input_bytes <= 8) + ? 6 + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); + return rle_encode_policy{ + 128, + items, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}, + }; + } +}; + +template +struct policy_selector_from_types +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy + { + constexpr policy_selector selector{ + classify_length_size(), + classify_key_size(), + is_primitive_length(), + is_primitive_key(), + static_cast((::cuda::std::max) (sizeof(LengthT), sizeof(KeyT))), + static_cast(sizeof(LengthT) + sizeof(KeyT))}; + return selector(arch); + } +}; + +#if _CCCL_HAS_CONCEPTS() +static_assert(rle_encode_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() } // namespace detail::rle::encode CUB_NAMESPACE_END diff --git a/nvbench_helper/nvbench_helper/look_back_helper.cuh b/nvbench_helper/nvbench_helper/look_back_helper.cuh index f7a97937dcb..5d31dde94dd 100644 --- a/nvbench_helper/nvbench_helper/look_back_helper.cuh +++ b/nvbench_helper/nvbench_helper/look_back_helper.cuh @@ -16,4 +16,8 @@ using delay_constructor_t = cub::detail::delay_constructor_t(TUNE_DELAY_CONSTRUCTOR_ID), TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>; + +inline constexpr auto delay_constructor_policy = cub::detail::delay_constructor_policy{ + static_cast(TUNE_DELAY_CONSTRUCTOR_ID), TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS}; + #endif // !TUNE_BASE From f045884883ebe6047f48b82803c84b95d5e272c3 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 24 Feb 2026 23:45:05 +0100 Subject: [PATCH 02/23] annot --- cub/benchmarks/bench/reduce/by_key.cu | 2 +- cub/benchmarks/bench/run_length_encode/encode.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu index 41b486aa752..f3603525bf7 100644 --- a/cub/benchmarks/bench/reduce/by_key.cu +++ b/cub/benchmarks/bench/reduce/by_key.cu @@ -17,7 +17,7 @@ #if !TUNE_BASE struct bench_reduce_by_key_policy_selector { - [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> cub::detail::reduce_by_key::reduce_by_key_policy { return { diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index da115685082..d00e16acf6e 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -19,7 +19,7 @@ #if !TUNE_BASE struct bench_encode_policy_selector { - [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> cub::detail::reduce_by_key::reduce_by_key_policy { return { From 2add7e60dbbf155911c5b7cfd25db955b34e8825 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 13:51:54 +0100 Subject: [PATCH 03/23] Comments --- .../dispatch/tuning/tuning_reduce_by_key.cuh | 48 +++++++++---------- .../dispatch/tuning/tuning_rle_encode.cuh | 13 ++--- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 1dd6e433206..357cba58181 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -38,21 +38,28 @@ CUB_NAMESPACE_BEGIN namespace detail::reduce_by_key { +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers enum class primitive_key { no, yes }; + +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers enum class primitive_accum { no, yes }; + +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers enum class primitive_op { no, yes }; + +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers enum class key_size { _1, @@ -62,6 +69,8 @@ enum class key_size _16, unknown }; + +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers enum class accum_size { _1, @@ -72,24 +81,28 @@ enum class accum_size unknown }; +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template _CCCL_API constexpr primitive_key is_primitive_key() { return detail::is_primitive::value ? primitive_key::yes : primitive_key::no; } +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template _CCCL_API constexpr primitive_accum is_primitive_accum() { return detail::is_primitive::value ? primitive_accum::yes : primitive_accum::no; } +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template _CCCL_API constexpr primitive_op is_primitive_op() { return basic_binary_op_t::value ? primitive_op::yes : primitive_op::no; } +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template _CCCL_API constexpr key_size classify_key_size() { @@ -102,6 +115,7 @@ _CCCL_API constexpr key_size classify_key_size() : key_size::unknown; } +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template _CCCL_API constexpr accum_size classify_accum_size() { @@ -114,28 +128,7 @@ _CCCL_API constexpr accum_size classify_accum_size() : accum_size::unknown; } -_CCCL_API constexpr int size_of(key_size sz) -{ - return sz == key_size::_1 ? 1 - : sz == key_size::_2 ? 2 - : sz == key_size::_4 ? 4 - : sz == key_size::_8 ? 8 - : sz == key_size::_16 - ? 16 - : 4; -} - -_CCCL_API constexpr int size_of(accum_size sz) -{ - return sz == accum_size::_1 ? 1 - : sz == accum_size::_2 ? 2 - : sz == accum_size::_4 ? 4 - : sz == accum_size::_8 ? 8 - : sz == accum_size::_16 - ? 16 - : 4; -} - +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template ; }; +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template ; }; +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template ; // }; +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template struct policy_hub { @@ -980,6 +976,11 @@ struct reduce_by_key_policy #endif // !_CCCL_COMPILER(NVRTC) }; +#if _CCCL_HAS_CONCEPTS() +template +concept reduce_by_key_policy_selector = detail::policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + _CCCL_HOST_DEVICE constexpr reduce_by_key_policy make_default_reduce_by_key_policy(int combined_input_bytes, int max_input_bytes, CacheLoadModifier load_mod) { @@ -1587,8 +1588,7 @@ struct policy_selector }; #if _CCCL_HAS_CONCEPTS() -template -concept reduce_by_key_policy_selector = detail::policy_selector; +static_assert(reduce_by_key_policy_selector); #endif // _CCCL_HAS_CONCEPTS() // TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh index 309a295bf42..2fdd3bbbe90 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh @@ -40,7 +40,7 @@ CUB_NAMESPACE_BEGIN namespace detail::rle::encode { -// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce dispatchers +// TODO(bgruber): remove in CCCL 4.0 when we drop the CUB dispatchers template (), @@ -101,7 +101,7 @@ struct sm80_tuning(), @@ -162,6 +162,7 @@ struct sm90_tuning(), @@ -479,6 +480,10 @@ struct policy_selector } }; +#if _CCCL_HAS_CONCEPTS() +static_assert(rle_encode_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() + template struct policy_selector_from_types { @@ -494,10 +499,6 @@ struct policy_selector_from_types return selector(arch); } }; - -#if _CCCL_HAS_CONCEPTS() -static_assert(rle_encode_policy_selector); -#endif // _CCCL_HAS_CONCEPTS() } // namespace detail::rle::encode CUB_NAMESPACE_END From 2f3240b7ebe3e3bf785b9d3de79bde773798f035 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 13:55:45 +0100 Subject: [PATCH 04/23] Fix --- cub/benchmarks/bench/reduce/by_key.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu index f3603525bf7..dfe1d9549c4 100644 --- a/cub/benchmarks/bench/reduce/by_key.cu +++ b/cub/benchmarks/bench/reduce/by_key.cu @@ -24,7 +24,6 @@ struct bench_reduce_by_key_policy_selector TUNE_THREADS, TUNE_ITEMS, TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE, - , TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA, cub::BLOCK_SCAN_WARP_SCANS, delay_constructor_policy, From d7b0edc85f4f822d27d1ad072587dcd54506605b Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 14:52:39 +0100 Subject: [PATCH 05/23] MSVC? --- cub/cub/device/dispatch/dispatch_reduce_by_key.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 72564332c0e..820d03a6d7b 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -685,7 +685,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( } return detail::dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) { - constexpr reduce_by_key_policy policy = policy_getter(); // need the constexpr of vsmem_helper + static constexpr reduce_by_key_policy policy = policy_getter(); // need the constexpr of vsmem_helper #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) NV_IF_TARGET( From 132e464d8a02ffbc8feb224c4c24c3d9594b882d Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 15:14:11 +0100 Subject: [PATCH 06/23] SASS fixes --- .../dispatch/tuning/tuning_reduce_by_key.cuh | 61 +++++++++---------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 357cba58181..2b3a95b49ff 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -981,49 +981,44 @@ template concept reduce_by_key_policy_selector = detail::policy_selector; #endif // _CCCL_HAS_CONCEPTS() -_CCCL_HOST_DEVICE constexpr reduce_by_key_policy -make_default_reduce_by_key_policy(int combined_input_bytes, int max_input_bytes, CacheLoadModifier load_mod) -{ - constexpr int nominal_4B_items_per_thread = 6; - const int items_per_thread = - (max_input_bytes <= 8) - ? 6 - : ::cuda::std::clamp(static_cast(::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes)), - 1, - nominal_4B_items_per_thread); - return reduce_by_key_policy{ - 128, - items_per_thread, - BLOCK_LOAD_DIRECT, - load_mod, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; -} - struct policy_selector { int key_size; int accum_size; + + // TODO(bgruber): we want to get rid of the following three and just assume by default that types behave "primitive". + // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because + // then users can opt-out of tunings again bool is_primitive_key_t; bool is_primitive_accum_t; bool is_primitive_op; - [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy + _CCCL_API constexpr auto __make_default_reduce_by_key_policy(CacheLoadModifier load_mod) const -> reduce_by_key_policy { - const int combined_input_bytes = key_size + accum_size; - const int max_input_bytes = (::cuda::std::max) (key_size, accum_size); - const auto default_ldg = [&] { - return make_default_reduce_by_key_policy(combined_input_bytes, max_input_bytes, LOAD_LDG); - }; - const auto default_load_default = [&] { - return make_default_reduce_by_key_policy(combined_input_bytes, max_input_bytes, LOAD_DEFAULT); - }; + constexpr int nominal_4B_items_per_thread = 6; + const int combined_input_bytes = key_size + accum_size; + const int max_input_bytes = (::cuda::std::max) (key_size, accum_size); + const int items_per_thread = + (max_input_bytes <= 8) + ? 6 + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); + return reduce_by_key_policy{ + 128, + items_per_thread, + BLOCK_LOAD_DIRECT, + load_mod, + BLOCK_SCAN_WARP_SCANS, + default_reduce_by_key_delay_constructor_policy(accum_size, sizeof(int), true)}; + } + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy + { const bool tuned_prim = (is_primitive_key_t && is_primitive_accum_t); if (!is_primitive_op) { - return default_ldg(); + return __make_default_reduce_by_key_policy(LOAD_LDG); } if (arch >= ::cuda::arch_id::sm_100 && tuned_prim) @@ -1385,12 +1380,12 @@ struct policy_selector BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1150}}; } - return default_load_default(); + return __make_default_reduce_by_key_policy(LOAD_DEFAULT); } if (arch >= ::cuda::arch_id::sm_86) { - return default_ldg(); + return __make_default_reduce_by_key_policy(LOAD_LDG); } if (arch >= ::cuda::arch_id::sm_80 && tuned_prim) @@ -1580,10 +1575,10 @@ struct policy_selector BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1090}}; } - return default_load_default(); + return __make_default_reduce_by_key_policy(LOAD_DEFAULT); } - return default_ldg(); + return __make_default_reduce_by_key_policy(LOAD_LDG); } }; From 641f3563a0f65f3963855a5f87da6e1bd7513215 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 15:20:11 +0100 Subject: [PATCH 07/23] const fix --- cub/benchmarks/bench/reduce/by_key.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu index dfe1d9549c4..0c92a820788 100644 --- a/cub/benchmarks/bench/reduce/by_key.cu +++ b/cub/benchmarks/bench/reduce/by_key.cu @@ -55,9 +55,9 @@ static void reduce(nvbench::state& state, nvbench::type_list out_keys(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); - KeyT* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); + const KeyT* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); KeyT* d_out_keys = thrust::raw_pointer_cast(out_keys.data()); - ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data()); + const ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data()); ValueT* d_out_vals = thrust::raw_pointer_cast(out_vals.data()); OffsetT* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); From e213ef2a48f29936f255c442df80999ea9abd17d Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 15:47:12 +0100 Subject: [PATCH 08/23] Fix is primitive conditions --- cub/benchmarks/bench/reduce/by_key.cu | 14 ++++---------- cub/cub/device/dispatch/dispatch_reduce_by_key.cuh | 8 ++++++-- .../dispatch/tuning/tuning_reduce_by_key.cuh | 11 ++++++----- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu index 0c92a820788..e02945740cc 100644 --- a/cub/benchmarks/bench/reduce/by_key.cu +++ b/cub/benchmarks/bench/reduce/by_key.cu @@ -35,15 +35,9 @@ struct bench_reduce_by_key_policy_selector template static void reduce(nvbench::state& state, nvbench::type_list) { - using keys_input_it_t = const KeyT*; - using unique_output_it_t = KeyT*; - using vals_input_it_t = const ValueT*; - using aggregate_output_it_t = ValueT*; - using num_runs_output_iterator_t = OffsetT*; - using equality_op_t = ::cuda::std::equal_to<>; - using reduction_op_t = ::cuda::std::plus<>; - using accum_t = ValueT; - using offset_t = OffsetT; + using equality_op_t = ::cuda::std::equal_to<>; + using reduction_op_t = ::cuda::std::plus<>; + using offset_t = OffsetT; const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; @@ -66,7 +60,7 @@ static void reduce(nvbench::state& state, nvbench::type_list(elements); auto dispatch_on_stream = [&](cudaStream_t stream) { - return cub::detail::reduce_by_key::dispatch( + return cub::detail::reduce_by_key::dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 820d03a6d7b..e10d7d24eec 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -645,6 +646,7 @@ struct DispatchReduceByKey namespace detail::reduce_by_key { template < + typename OverrideAccumT = use_default, typename KeysInputIteratorT, typename UniqueOutputIteratorT, typename ValuesInputIteratorT, @@ -653,8 +655,10 @@ template < typename EqualityOpT, typename ReductionOpT, typename OffsetT, - typename AccumT = - ::cuda::std::__accumulator_t, it_value_t>, + typename AccumT = ::cuda::std::conditional_t< + !::cuda::std::is_same_v, + OverrideAccumT, + ::cuda::std::__accumulator_t, it_value_t>>, typename KeyT = non_void_value_t>, typename PolicySelector = policy_selector_from_types> #if _CCCL_HAS_CONCEPTS() diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 2b3a95b49ff..d31810c9158 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -1014,14 +1014,15 @@ struct policy_selector [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy { - const bool tuned_prim = (is_primitive_key_t && is_primitive_accum_t); - + // bail out if we don't know the operation. TODO(bgruber): drop this check when we make the tuning API public if (!is_primitive_op) { return __make_default_reduce_by_key_policy(LOAD_LDG); } - if (arch >= ::cuda::arch_id::sm_100 && tuned_prim) + const bool use_tuning = (is_primitive_key_t || key_size == 16) && (is_primitive_accum_t || accum_size == 16); + + if (arch >= ::cuda::arch_id::sm_100 && use_tuning) { if (key_size == 1 && accum_size == 1) { @@ -1169,7 +1170,7 @@ struct policy_selector } } - if (arch >= ::cuda::arch_id::sm_90 && tuned_prim) + if (arch >= ::cuda::arch_id::sm_90 && use_tuning) { if (key_size == 1 && accum_size == 1) { @@ -1388,7 +1389,7 @@ struct policy_selector return __make_default_reduce_by_key_policy(LOAD_LDG); } - if (arch >= ::cuda::arch_id::sm_80 && tuned_prim) + if (arch >= ::cuda::arch_id::sm_80 && use_tuning) { if (key_size == 1 && accum_size == 1) { From 8bd1b347efee4326f1576d6c3f557fbb194e34c3 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 16:24:44 +0100 Subject: [PATCH 09/23] Refactor --- cub/cub/device/dispatch/dispatch_reduce_by_key.cuh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index e10d7d24eec..321cd2536f5 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -680,7 +680,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( { using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet using ScanTileStateT = ReduceByKeyScanTileState; - [[maybe_unused]] static constexpr int INIT_KERNEL_THREADS = 128; + [[maybe_unused]] static constexpr int init_kernel_threads = 128; ::cuda::arch_id arch_id{}; if (const auto error = CubDebug(ptx_arch_id(arch_id))) @@ -758,12 +758,12 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( return error; } - const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); + const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_tiles, init_kernel_threads)); #ifdef CUB_DEBUG_LOG - _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); #endif if (const auto error = CubDebug( - THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream) .doit(detail::scan::DeviceCompactInitKernel, tile_state, num_tiles, @@ -793,6 +793,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( OffsetT, AccumT, streaming_context_t>; + int reduce_by_key_sm_occupancy{}; if (const auto error = CubDebug(MaxSmOccupancy(reduce_by_key_sm_occupancy, reduce_by_key_kernel, block_threads))) { From ef27cd4d32f97bffcf3a17fc862131d2e9d7add1 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 18:33:51 +0100 Subject: [PATCH 10/23] More SASS fixes --- cub/cub/detail/delay_constructor.cuh | 20 +- .../dispatch/tuning/tuning_reduce_by_key.cuh | 839 +++++++++--------- .../tuning/tuning_rle_non_trivial_runs.cuh | 11 +- 3 files changed, 457 insertions(+), 413 deletions(-) diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh index 23d0c6d571e..9d939d4563f 100644 --- a/cub/cub/detail/delay_constructor.cuh +++ b/cub/cub/detail/delay_constructor.cuh @@ -193,15 +193,27 @@ struct delay_constructor_for using delay_constructor_t = typename delay_constructor_for::type; -_CCCL_API constexpr auto -default_reduce_by_key_delay_constructor_policy(int key_size, int value_size, bool value_is_primitive) +_CCCL_API constexpr auto default_delay_constructor_policy(bool is_primitive_or_trivially_copyable) { - if (value_is_primitive && (value_size + key_size < 16)) + if (is_primitive_or_trivially_copyable) { - return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450}; + return delay_constructor_policy{delay_constructor_kind::fixed_delay, 350, 450}; } return delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450}; } + +_CCCL_API constexpr auto default_reduce_by_key_delay_constructor_policy( + int key_size, + int value_size, + bool key_is_primitive_or_trivially_copyable, + bool value_is_primitive_or_trivially_copyable) +{ + if (value_is_primitive_or_trivially_copyable && (value_size + key_size < 16)) + { + return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450}; + } + return default_delay_constructor_policy(key_is_primitive_or_trivially_copyable); +} } // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index d31810c9158..9ec45f56906 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -986,12 +986,13 @@ struct policy_selector int key_size; int accum_size; - // TODO(bgruber): we want to get rid of the following three and just assume by default that types behave "primitive". + // TODO(bgruber): we want to get rid of the following four and just assume by default that types behave "primitive". // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because // then users can opt-out of tunings again - bool is_primitive_key_t; - bool is_primitive_accum_t; - bool is_primitive_op; + bool key_is_primitive; + bool key_is_trivially_copyable; + bool accum_is_primitive; + bool op_is_primitive; _CCCL_API constexpr auto __make_default_reduce_by_key_policy(CacheLoadModifier load_mod) const -> reduce_by_key_policy { @@ -1009,20 +1010,21 @@ struct policy_selector BLOCK_LOAD_DIRECT, load_mod, BLOCK_SCAN_WARP_SCANS, - default_reduce_by_key_delay_constructor_policy(accum_size, sizeof(int), true)}; + default_reduce_by_key_delay_constructor_policy( + accum_size, sizeof(int), key_is_primitive || key_is_trivially_copyable, true)}; } [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy { // bail out if we don't know the operation. TODO(bgruber): drop this check when we make the tuning API public - if (!is_primitive_op) + if (!op_is_primitive) { return __make_default_reduce_by_key_policy(LOAD_LDG); } - const bool use_tuning = (is_primitive_key_t || key_size == 16) && (is_primitive_accum_t || accum_size == 16); + const bool use_tuning = (key_is_primitive || key_size == 16) && (accum_is_primitive || accum_size == 16); - if (arch >= ::cuda::arch_id::sm_100 && use_tuning) + if (arch >= ::cuda::arch_id::sm_100 && use_tuning) // if we don't have a tuning, fall back to SM90 { if (key_size == 1 && accum_size == 1) { @@ -1170,217 +1172,226 @@ struct policy_selector } } - if (arch >= ::cuda::arch_id::sm_90 && use_tuning) + if (arch >= ::cuda::arch_id::sm_90) { - if (key_size == 1 && accum_size == 1) - { - return { - 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 720}}; - } - if (key_size == 1 && accum_size == 2) - { - return { - 320, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 865}}; - } - if (key_size == 1 && accum_size == 4) - { - return {192, - 14, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 735}}; - } - if (key_size == 1 && accum_size == 8) - { - return {128, - 13, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 580}}; - } - if (key_size == 1 && accum_size == 16) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1100}}; - } - if (key_size == 2 && accum_size == 1) - { - return { - 128, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 985}}; - } - if (key_size == 2 && accum_size == 2) - { - return {256, - 11, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 276, 650}}; - } - if (key_size == 2 && accum_size == 4) - { - return {256, - 14, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 240, 765}}; - } - if (key_size == 2 && accum_size == 8) - { - return {128, - 19, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1190}}; - } - if (key_size == 2 && accum_size == 16) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1175}}; - } - if (key_size == 4 && accum_size == 1) - { - return {256, - 13, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 404, 645}}; - } - if (key_size == 4 && accum_size == 2) - { - return {256, - 18, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1160}}; - } - if (key_size == 4 && accum_size == 4) - { - return {256, - 18, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1170}}; - } - if (key_size == 4 && accum_size == 8) - { - return {128, - 13, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1055}}; - } - if (key_size == 4 && accum_size == 16) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1195}}; - } - if (key_size == 8 && accum_size == 1) - { - return { - 256, 10, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1170}}; - } - if (key_size == 8 && accum_size == 2) - { - return {256, - 9, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 236, 1030}}; - } - if (key_size == 8 && accum_size == 4) - { - return {128, - 13, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 152, 560}}; - } - if (key_size == 8 && accum_size == 8) - { - return {128, - 23, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1030}}; - } - if (key_size == 8 && accum_size == 16) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1125}}; - } - if (key_size == 16 && !is_primitive_key_t && accum_size == 1) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1080}}; - } - if (key_size == 16 && !is_primitive_key_t && accum_size == 2) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 320, 1005}}; - } - if (key_size == 16 && !is_primitive_key_t && accum_size == 4) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 232, 1100}}; - } - if (key_size == 16 && !is_primitive_key_t && accum_size == 8) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1195}}; - } - if (key_size == 16 && !is_primitive_key_t && accum_size == 16) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1150}}; - } + if (use_tuning) + { + if (key_size == 1 && accum_size == 1) + { + return { + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 720}}; + } + if (key_size == 1 && accum_size == 2) + { + return { + 320, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 865}}; + } + if (key_size == 1 && accum_size == 4) + { + return {192, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 735}}; + } + if (key_size == 1 && accum_size == 8) + { + return {128, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 580}}; + } + if (key_size == 1 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1100}}; + } + if (key_size == 2 && accum_size == 1) + { + return { + 128, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 985}}; + } + if (key_size == 2 && accum_size == 2) + { + return {256, + 11, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 276, 650}}; + } + if (key_size == 2 && accum_size == 4) + { + return {256, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 240, 765}}; + } + if (key_size == 2 && accum_size == 8) + { + return {128, + 19, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1190}}; + } + if (key_size == 2 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1175}}; + } + if (key_size == 4 && accum_size == 1) + { + return {256, + 13, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 404, 645}}; + } + if (key_size == 4 && accum_size == 2) + { + return {256, + 18, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1160}}; + } + if (key_size == 4 && accum_size == 4) + { + return {256, + 18, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1170}}; + } + if (key_size == 4 && accum_size == 8) + { + return {128, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1055}}; + } + if (key_size == 4 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1195}}; + } + if (key_size == 8 && accum_size == 1) + { + return {256, + 10, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1170}}; + } + if (key_size == 8 && accum_size == 2) + { + return {256, + 9, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 236, 1030}}; + } + if (key_size == 8 && accum_size == 4) + { + return {128, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 152, 560}}; + } + if (key_size == 8 && accum_size == 8) + { + return {128, + 23, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1030}}; + } + if (key_size == 8 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1125}}; + } + if (key_size == 16 && accum_size == 1) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1080}}; + } + if (key_size == 16 && accum_size == 2) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 320, 1005}}; + } + if (key_size == 16 && accum_size == 4) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 232, 1100}}; + } + if (key_size == 16 && accum_size == 8) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1195}}; + } + if (key_size == 16 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1150}}; + } + } + + // no tuning, use a default one return __make_default_reduce_by_key_policy(LOAD_DEFAULT); } @@ -1389,193 +1400,206 @@ struct policy_selector return __make_default_reduce_by_key_policy(LOAD_LDG); } - if (arch >= ::cuda::arch_id::sm_80 && use_tuning) + if (arch >= ::cuda::arch_id::sm_80) { - if (key_size == 1 && accum_size == 1) - { - return { - 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 975}}; - } - if (key_size == 1 && accum_size == 2) - { - return { - 224, 12, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 840}}; - } - if (key_size == 1 && accum_size == 4) - { - return {256, - 15, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 760}}; - } - if (key_size == 1 && accum_size == 8) - { - return { - 224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1070}}; - } - if (key_size == 1 && accum_size == 16) - { - return {128, - 9, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1175}}; - } - if (key_size == 2 && accum_size == 1) - { - return { - 256, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}}; - } - if (key_size == 2 && accum_size == 2) - { - return {224, - 14, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 640}}; - } - if (key_size == 2 && accum_size == 4) - { - return {256, - 14, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 905}}; - } - if (key_size == 2 && accum_size == 8) - { - return {224, - 9, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 810}}; - } - if (key_size == 2 && accum_size == 16) - { - return {160, - 9, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1115}}; - } - if (key_size == 4 && accum_size == 1) - { - return { - 288, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1110}}; - } - if (key_size == 4 && accum_size == 2) - { - return {192, - 15, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1200}}; - } - if (key_size == 4 && accum_size == 4) - { - return { - 256, 15, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1110}}; - } - if (key_size == 4 && accum_size == 8) - { - return {224, - 9, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1165}}; - } - if (key_size == 4 && accum_size == 16) - { - return {160, - 9, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1100}}; - } - if (key_size == 8 && accum_size == 1) - { - return {192, - 10, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1175}}; - } - if (key_size == 8 && accum_size == 2) - { - return { - 224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1075}}; - } - if (key_size == 8 && accum_size == 4) - { - return { - 384, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1040}}; - } - if (key_size == 8 && accum_size == 8) - { - return {128, - 14, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1080}}; - } - if (key_size == 8 && accum_size == 16) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 430}}; - } - if (key_size == 16 && !is_primitive_key_t && accum_size == 1) - { - return { - 192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1105}}; - } - if (key_size == 16 && !is_primitive_key_t && accum_size == 2) - { - return {192, - 7, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 755}}; - } - if (key_size == 16 && !is_primitive_key_t && accum_size == 4) - { - return {192, - 7, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 535}}; - } - if (key_size == 16 && !is_primitive_key_t && accum_size == 8) - { - return { - 192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1035}}; - } - if (key_size == 16 && !is_primitive_key_t && accum_size == 16) - { - return {128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1090}}; - } + if (use_tuning) + { + if (key_size == 1 && accum_size == 1) + { + return { + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 975}}; + } + if (key_size == 1 && accum_size == 2) + { + return { + 224, 12, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 840}}; + } + if (key_size == 1 && accum_size == 4) + { + return {256, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 760}}; + } + if (key_size == 1 && accum_size == 8) + { + return { + 224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1070}}; + } + if (key_size == 1 && accum_size == 16) + { + return {128, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1175}}; + } + if (key_size == 2 && accum_size == 1) + { + return { + 256, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}}; + } + if (key_size == 2 && accum_size == 2) + { + return {224, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 640}}; + } + if (key_size == 2 && accum_size == 4) + { + return {256, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 905}}; + } + if (key_size == 2 && accum_size == 8) + { + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 810}}; + } + if (key_size == 2 && accum_size == 16) + { + return {160, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1115}}; + } + if (key_size == 4 && accum_size == 1) + { + return {288, + 11, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1110}}; + } + if (key_size == 4 && accum_size == 2) + { + return {192, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1200}}; + } + if (key_size == 4 && accum_size == 4) + { + return {256, + 15, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1110}}; + } + if (key_size == 4 && accum_size == 8) + { + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1165}}; + } + if (key_size == 4 && accum_size == 16) + { + return {160, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1100}}; + } + if (key_size == 8 && accum_size == 1) + { + return {192, + 10, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1175}}; + } + if (key_size == 8 && accum_size == 2) + { + return { + 224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1075}}; + } + if (key_size == 8 && accum_size == 4) + { + return { + 384, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1040}}; + } + if (key_size == 8 && accum_size == 8) + { + return {128, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1080}}; + } + if (key_size == 8 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 430}}; + } + if (key_size == 16 && accum_size == 1) + { + return { + 192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1105}}; + } + if (key_size == 16 && accum_size == 2) + { + return {192, + 7, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 755}}; + } + if (key_size == 16 && accum_size == 4) + { + return {192, + 7, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 535}}; + } + if (key_size == 16 && accum_size == 8) + { + return { + 192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1035}}; + } + if (key_size == 16 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1090}}; + } + } + + // no tuning, use a default one return __make_default_reduce_by_key_policy(LOAD_DEFAULT); } @@ -1613,9 +1637,10 @@ struct policy_selector_from_types return policy_selector{ int{sizeof(KeyT)}, int{sizeof(AccumT)}, - (is_primitive_key() == primitive_key::yes), - (is_primitive_accum() == primitive_accum::yes), - (is_primitive_op() == primitive_op::yes)}(arch); + is_primitive_v, + ::cuda::std::is_trivially_copyable_v, + is_primitive_v, + basic_binary_op_t::value}(arch); } }; } // namespace detail::reduce_by_key diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index 3ff351a2bd4..97f1d9a03e6 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -374,6 +374,7 @@ struct policy_selector type_t key_type; bool length_is_primitive; bool key_is_primitive; // TODO(bgruber): can probably be derived from key_type + bool key_is_trivially_copyable; _CCCL_API constexpr auto make_default_policy(BlockLoadAlgorithm block_load_alg, int delay_ctor_key_size, CacheLoadModifier load_mod) const @@ -388,7 +389,8 @@ struct policy_selector load_mod, true, BLOCK_SCAN_WARP_SCANS, - default_reduce_by_key_delay_constructor_policy(delay_ctor_key_size, sizeof(int), true)}; + default_reduce_by_key_delay_constructor_policy( + delay_ctor_key_size, sizeof(int), key_is_primitive || key_is_trivially_copyable, true)}; } [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy @@ -600,7 +602,12 @@ struct policy_selector_from_types [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy { constexpr policy_selector selector{ - sizeof(LengthT), int{sizeof(KeyT)}, classify_type, is_primitive_v, is_primitive_v}; + sizeof(LengthT), + int{sizeof(KeyT)}, + classify_type, + is_primitive_v, + is_primitive_v, + ::cuda::std::is_trivially_copyable_v}; return selector(arch); } }; From 74f0d052887956f3e36278d5708074fea6e008c1 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 21:57:16 +0100 Subject: [PATCH 11/23] comments and fix --- .../dispatch/tuning/tuning_reduce_by_key.cuh | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 9ec45f56906..c1baddd6f7f 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include #include #include @@ -762,7 +762,6 @@ template struct sm100_tuning { // ipt_14.tpb_224.trp_1.ld_0.ns_476.dcid_5.l2w_1005 1.187378 1.119705 1.185397 1.258420 - static constexpr int items = 14; static constexpr int threads = 224; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; @@ -923,7 +922,7 @@ struct policy_hub struct Policy1000 : ChainedPolicy<1000, Policy1000, Policy900> { - // Use values from tuning if a specialization exists, otherwise pick the default + // Use values from tuning if a specialization exists, otherwise fall back to SM90 template static auto select_agent_policy(int) -> AgentReduceByKeyPolicy, is_primitive_v, ::cuda::std::is_trivially_copyable_v, is_primitive_v, From bfbdc85e9eadf9917afde789259ac606d2d9d2f9 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 23:32:48 +0100 Subject: [PATCH 12/23] RLE fixdes --- .../dispatch/tuning/tuning_reduce_by_key.cuh | 13 +- .../dispatch/tuning/tuning_rle_encode.cuh | 241 ++++++++++-------- 2 files changed, 141 insertions(+), 113 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index c1baddd6f7f..c2230e9ef9b 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -994,7 +994,7 @@ struct policy_selector bool accum_is_primitive; bool op_is_primitive; - _CCCL_API constexpr auto __make_default_reduce_by_key_policy(CacheLoadModifier load_mod) const -> reduce_by_key_policy + _CCCL_API constexpr auto __make_default_policy(CacheLoadModifier load_mod) const -> reduce_by_key_policy { constexpr int nominal_4B_items_per_thread = 6; const int combined_input_bytes = key_size + accum_size; @@ -1019,7 +1019,7 @@ struct policy_selector // bail out if we don't know the operation. TODO(bgruber): drop this check when we make the tuning API public if (!op_is_primitive) { - return __make_default_reduce_by_key_policy(LOAD_LDG); + return __make_default_policy(LOAD_LDG); } const bool use_tuning = (key_is_primitive || key_size == 16) && (accum_is_primitive || accum_size == 16); @@ -1408,12 +1408,12 @@ struct policy_selector } // no tuning, use a default one - return __make_default_reduce_by_key_policy(LOAD_DEFAULT); + return __make_default_policy(LOAD_DEFAULT); } if (arch >= ::cuda::arch_id::sm_86) { - return __make_default_reduce_by_key_policy(LOAD_LDG); + return __make_default_policy(LOAD_LDG); } if (arch >= ::cuda::arch_id::sm_80) @@ -1616,10 +1616,11 @@ struct policy_selector } // no tuning, use a default one - return __make_default_reduce_by_key_policy(LOAD_DEFAULT); + return __make_default_policy(LOAD_DEFAULT); } - return __make_default_reduce_by_key_policy(LOAD_LDG); + // for SM50 + return __make_default_policy(LOAD_LDG); } }; diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh index 2fdd3bbbe90..14f9b58dea9 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh @@ -323,21 +323,38 @@ using reduce_by_key::policy_selector_from_hub; struct policy_selector { - length_size length_sz; - key_size key_sz; - primitive_length prim_len; - primitive_key prim_key; + int length_size; + int key_size; + type_t key_t; + bool length_is_primitive; + bool key_is_primitive; int max_input_bytes; int combined_input_bytes; - [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy + _CCCL_API constexpr auto __make_default_policy(CacheLoadModifier load_mod) const -> rle_encode_policy { - const bool tuned_prim = (prim_len == primitive_length::yes && prim_key == primitive_key::yes); - const bool length_4 = (length_sz == length_size::_4); + constexpr int nominal_4B_items_per_thread = 6; + const int items_per_thread = + (max_input_bytes <= 8) + ? 6 + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); + return rle_encode_policy{ + 128, + items_per_thread, + BLOCK_LOAD_DIRECT, + load_mod, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}, + }; + } - if (arch >= ::cuda::arch_id::sm_100 && tuned_prim && length_4) + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy + { + // if we don't have a tuning for SM100, fall back to SM90 + if (arch >= ::cuda::arch_id::sm_100 && length_is_primitive && length_size == 4 && key_is_primitive) { - if (key_sz == key_size::_1) + if (key_size == 1) { return rle_encode_policy{ 256, @@ -347,7 +364,7 @@ struct policy_selector BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::exponential_backon, 468, 300}}; } - if (key_sz == key_size::_2) + if (key_size == 2) { return rle_encode_policy{ 224, @@ -357,7 +374,7 @@ struct policy_selector BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::exponential_backon, 376, 420}}; } - if (key_sz == key_size::_4) + if (key_size == 4) { return rle_encode_policy{ 256, @@ -367,7 +384,7 @@ struct policy_selector BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::exponential_backon, 956, 70}}; } - if (key_sz == key_size::_8) + if (key_size == 8) { return rle_encode_policy{ 224, @@ -378,105 +395,114 @@ struct policy_selector {delay_constructor_kind::exponential_backoff, 188, 765}}; } } - if (arch >= ::cuda::arch_id::sm_90 && tuned_prim && length_4) + + if (arch >= ::cuda::arch_id::sm_90) { - if (key_sz == key_size::_1) + if (length_is_primitive && length_size == 4) { - return rle_encode_policy{ - 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}}; + if (key_is_primitive && key_size == 1) + { + return rle_encode_policy{ + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}}; + } + if (key_is_primitive && key_size == 2) + { + return rle_encode_policy{ + 128, 22, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 775}}; + } + if (key_is_primitive && key_size == 4) + { + return rle_encode_policy{ + 192, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 284, 480}}; + } + if (key_is_primitive && key_size == 8) + { + return rle_encode_policy{ + 128, + 19, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 515}}; + } + if (key_t == type_t::int128 || key_t == type_t::uint128) + { + return rle_encode_policy{ + 128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 428, 930}}; + } } - if (key_sz == key_size::_2) - { - return rle_encode_policy{ - 128, 22, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 775}}; - } - if (key_sz == key_size::_4) - { - return rle_encode_policy{ - 192, - 14, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 284, 480}}; - } - if (key_sz == key_size::_8) - { - return rle_encode_policy{ - 128, - 19, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 515}}; - } -#if _CCCL_HAS_INT128() - if (key_sz == key_size::_16) - { - return rle_encode_policy{ - 128, - 11, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 428, 930}}; - } -#endif + + // no tuning, use a default one + return __make_default_policy(LOAD_DEFAULT); } - if (arch >= ::cuda::arch_id::sm_80 && tuned_prim && length_4) + + if (arch >= ::cuda::arch_id::sm_86) { - if (key_sz == key_size::_1) - { - return rle_encode_policy{ - 256, 14, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 640}}; - } - if (key_sz == key_size::_2) - { - return rle_encode_policy{ - 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 900}}; - } - if (key_sz == key_size::_4) - { - return rle_encode_policy{ - 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1080}}; - } - if (key_sz == key_size::_8) - { - return rle_encode_policy{ - 224, - 9, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 1075}}; - } -#if _CCCL_HAS_INT128() - if (key_sz == key_size::_16) + return __make_default_policy(LOAD_LDG); + } + + if (arch >= ::cuda::arch_id::sm_80) + { + if (length_is_primitive && length_size == 4) { - return rle_encode_policy{ - 128, - 7, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::no_delay, 0, 630}}; + if (key_is_primitive && key_size == 1) + { + return rle_encode_policy{ + 256, 14, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 640}}; + } + if (key_is_primitive && key_size == 2) + { + return rle_encode_policy{ + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 900}}; + } + if (key_is_primitive && key_size == 4) + { + return rle_encode_policy{ + 256, + 13, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1080}}; + } + if (key_is_primitive && key_size == 8) + { + return rle_encode_policy{ + 224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1075}}; + } + if (key_t == type_t::int128 || key_t == type_t::uint128) + { + return rle_encode_policy{ + 128, + 7, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 630}}; + } } -#endif + + // no tuning, use a default one + return __make_default_policy(LOAD_DEFAULT); } - constexpr int nominal_4B_items_per_thread = 6; - const int items = - (max_input_bytes <= 8) - ? 6 - : ::cuda::std::clamp( - ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); - return rle_encode_policy{ - 128, - items, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}, - }; + + // for SM50 + return __make_default_policy(LOAD_LDG); } }; @@ -490,10 +516,11 @@ struct policy_selector_from_types [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy { constexpr policy_selector selector{ - classify_length_size(), - classify_key_size(), - is_primitive_length(), - is_primitive_key(), + int{sizeof(LengthT)}, + int{sizeof(KeyT)}, + classify_type, + is_primitive_v, + is_primitive_v, static_cast((::cuda::std::max) (sizeof(LengthT), sizeof(KeyT))), static_cast(sizeof(LengthT) + sizeof(KeyT))}; return selector(arch); From eff865fab9cd4db8f9abe697ff5a22fd89b0eee0 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 25 Feb 2026 23:48:15 +0100 Subject: [PATCH 13/23] const --- cub/benchmarks/bench/run_length_encode/encode.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index d00e16acf6e..650da0437a2 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -62,7 +62,7 @@ static void rle(nvbench::state& state, nvbench::type_list out_keys(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); - T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); + const T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); T* d_out_keys = thrust::raw_pointer_cast(out_keys.data()); auto d_out_vals = thrust::raw_pointer_cast(out_vals.data()); auto d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); From 475ba6fa540038d3f7486603b024b9b8a50d279c Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 26 Feb 2026 00:06:21 +0100 Subject: [PATCH 14/23] override accum_t --- cub/benchmarks/bench/run_length_encode/encode.cu | 13 ++++--------- .../dispatch/dispatch_streaming_reduce_by_key.cuh | 9 ++++++--- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index 650da0437a2..4addc163fbb 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -44,14 +44,9 @@ static void rle(nvbench::state& state, nvbench::type_list; - using run_length_output_it_t = run_length_t*; - using num_runs_output_iterator_t = num_runs_t*; - using equality_op_t = ::cuda::std::equal_to<>; - using reduction_op_t = ::cuda::std::plus<>; - using accum_t = run_length_t; + using run_length_input_it_t = thrust::constant_iterator; + using equality_op_t = ::cuda::std::equal_to<>; + using reduction_op_t = ::cuda::std::plus<>; const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; @@ -73,7 +68,7 @@ static void rle(nvbench::state& state, nvbench::type_list(elements); auto dispatch_on_stream = [&](cudaStream_t stream) { - return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key( + return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key( d_temp_storage, temp_storage_bytes, d_in_keys, diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh index 4fec39263a6..2adc6651354 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh @@ -14,7 +14,7 @@ #endif // no system header #include -#include +#include #include #include #include @@ -35,6 +35,7 @@ CUB_NAMESPACE_BEGIN namespace detail::reduce_by_key { template < + typename OverrideAccumT = use_default, typename KeysInputIteratorT, typename UniqueOutputIteratorT, typename ValuesInputIteratorT, @@ -43,8 +44,10 @@ template < typename EqualityOpT, typename ReductionOpT, typename OffsetT, - typename AccumT = - ::cuda::std::__accumulator_t, it_value_t>, + typename AccumT = ::cuda::std::conditional_t< + !::cuda::std::is_same_v, + OverrideAccumT, + ::cuda::std::__accumulator_t, it_value_t>>, typename PolicySelector = policy_selector_from_types Date: Thu, 26 Feb 2026 00:25:36 +0100 Subject: [PATCH 15/23] rle fixes --- cub/benchmarks/bench/run_length_encode/encode.cu | 12 +++++++----- .../device/dispatch/tuning/tuning_rle_encode.cuh | 14 +++++++------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index 4addc163fbb..37227aa7649 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -47,6 +47,7 @@ static void rle(nvbench::state& state, nvbench::type_list; using equality_op_t = ::cuda::std::equal_to<>; using reduction_op_t = ::cuda::std::plus<>; + using accum_t = run_length_t; const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; @@ -68,7 +69,7 @@ static void rle(nvbench::state& state, nvbench::type_list(elements); auto dispatch_on_stream = [&](cudaStream_t stream) { - return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key( + return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key( d_temp_storage, temp_storage_bytes, d_in_keys, @@ -79,11 +80,12 @@ static void rle(nvbench::state& state, nvbench::type_list {} +#else // TUNE_BASE bench_encode_policy_selector{} -#endif +#endif // TUNE_BASE ); }; diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh index 14f9b58dea9..fe01147de4c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh @@ -327,13 +327,14 @@ struct policy_selector int key_size; type_t key_t; bool length_is_primitive; + bool length_is_trivially_copyable; bool key_is_primitive; - int max_input_bytes; - int combined_input_bytes; _CCCL_API constexpr auto __make_default_policy(CacheLoadModifier load_mod) const -> rle_encode_policy { constexpr int nominal_4B_items_per_thread = 6; + const int combined_input_bytes = length_size + key_size; + const int max_input_bytes = (::cuda::std::max) (length_size, key_size); const int items_per_thread = (max_input_bytes <= 8) ? 6 @@ -345,8 +346,8 @@ struct policy_selector BLOCK_LOAD_DIRECT, load_mod, BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}, - }; + default_reduce_by_key_delay_constructor_policy( + length_size, int{sizeof(int)}, length_is_primitive || length_is_trivially_copyable, true)}; } [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy @@ -520,9 +521,8 @@ struct policy_selector_from_types int{sizeof(KeyT)}, classify_type, is_primitive_v, - is_primitive_v, - static_cast((::cuda::std::max) (sizeof(LengthT), sizeof(KeyT))), - static_cast(sizeof(LengthT) + sizeof(KeyT))}; + ::cuda::std::is_trivially_copyable_v, + is_primitive_v}; return selector(arch); } }; From dd5f4bc30fd176fec383bd8c3c03197d62c04ccc Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 26 Feb 2026 00:36:06 +0100 Subject: [PATCH 16/23] Comment --- cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh index fe01147de4c..20c52d20f84 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh @@ -326,6 +326,10 @@ struct policy_selector int length_size; int key_size; type_t key_t; + + // TODO(bgruber): we want to get rid of the following three and just assume by default that types behave "primitive". + // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because + // then users can opt-out of tunings again bool length_is_primitive; bool length_is_trivially_copyable; bool key_is_primitive; From c9585d49456c29ab0dd8a9efd109abeee1b72144 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 26 Feb 2026 09:27:47 +0100 Subject: [PATCH 17/23] Fix --- cub/cub/device/device_run_length_encode.cuh | 23 +++++++-------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index d0c5d05e0fa..8b1373afdc5 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -188,28 +188,19 @@ struct DeviceRunLengthEncode using policy_selector_t = detail::rle::encode::policy_selector_from_types; - return detail::reduce_by_key::dispatch_streaming_reduce_by_key< - InputIteratorT, - UniqueOutputIteratorT, - lengths_input_iterator_t, - LengthsOutputIteratorT, - NumRunsOutputIteratorT, - equality_op, - reduction_op, - offset_t, - accum_t, - policy_selector_t>( + return detail::reduce_by_key::dispatch_streaming_reduce_by_key( d_temp_storage, temp_storage_bytes, d_in, d_unique_out, - lengths_input_iterator_t((length_t) 1), + lengths_input_iterator_t(length_t{1}), d_counts_out, d_num_runs_out, - equality_op(), - reduction_op(), - num_items, - stream); + equality_op{}, + reduction_op{}, + static_cast(num_items), + stream, + policy_selector_t{}); } //! @rst From e2ea871d27124b08e97ff66ad2dd30412ac6d118 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 26 Feb 2026 09:32:54 +0100 Subject: [PATCH 18/23] Review --- cub/cub/device/dispatch/dispatch_reduce_by_key.cuh | 2 +- .../device/dispatch/dispatch_streaming_reduce_by_key.cuh | 7 ++++--- cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh | 5 +---- cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh | 5 +---- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 321cd2536f5..e1bb6357833 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -33,7 +33,7 @@ #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) # include -#endif +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) CUB_NAMESPACE_BEGIN diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh index 2adc6651354..5d959df2c4c 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh @@ -17,15 +17,16 @@ #include #include #include -#include #include -#include #include #include -#include #include +#include +#include +#include + #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) # include #endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index c2230e9ef9b..64c96bc77d6 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -25,10 +25,7 @@ #include #include #include - -#if _CCCL_HAS_CONCEPTS() -# include -#endif // _CCCL_HAS_CONCEPTS() +#include #if !_CCCL_COMPILER(NVRTC) # include diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh index 20c52d20f84..5083ed52b85 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh @@ -27,10 +27,7 @@ #include #include #include - -#if _CCCL_HAS_CONCEPTS() -# include -#endif // _CCCL_HAS_CONCEPTS() +#include #if !_CCCL_COMPILER(NVRTC) # include From 668966eba5e023aaa259e50be5059dd44eccfb0c Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 26 Feb 2026 18:07:41 +0100 Subject: [PATCH 19/23] MSVC --- .../dispatch/dispatch_reduce_by_key.cuh | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index e1bb6357833..fada316a417 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -698,19 +698,16 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast(arch_id), ss.str().c_str());)) #endif - // convert the policy into a legacy agent policy for vsmem_helper. TODO(bgruber): refactor this in the future - using AgentReduceByKeyPolicy = AgentReduceByKeyPolicy< - policy.block_threads, - policy.items_per_thread, - policy.load_algorithm, - policy.load_modifier, - policy.scan_algorithm, - delay_constructor_t>; - using vsmem_helper_t = vsmem_helper_default_fallback_policy_t< - AgentReduceByKeyPolicy, + // convert the policy into a legacy agent policy for vsmem_helper. TODO(bgruber): refactor this in the future + AgentReduceByKeyPolicy>, AgentReduceByKey, KeysInputIteratorT, UniqueOutputIteratorT, From 224597a65ec23e6f95073c276b7e2df69a2ae3dd Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 26 Feb 2026 19:26:03 +0100 Subject: [PATCH 20/23] M S V C --- .../dispatch/dispatch_reduce_by_key.cuh | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index fada316a417..638e3056dc2 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -698,16 +698,20 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast(arch_id), ss.str().c_str());)) #endif + // Convert the policy into a legacy agent policy for vsmem_helper. Use a value and decltype it again, to workaround + // MSVC bug that just turns the type to `int`. TODO(bgruber): refactor this in the future + [[maybe_unused]] AgentReduceByKeyPolicy< + policy.block_threads, + policy.items_per_thread, + policy.load_algorithm, + policy.load_modifier, + policy.scan_algorithm, + delay_constructor_t> agent_policy; + using vsmem_helper_t = vsmem_helper_default_fallback_policy_t< - // convert the policy into a legacy agent policy for vsmem_helper. TODO(bgruber): refactor this in the future - AgentReduceByKeyPolicy>, + decltype(agent_policy), AgentReduceByKey, KeysInputIteratorT, UniqueOutputIteratorT, From ab404c69f957ebe2eddce36e4fa68725129c1887 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 27 Feb 2026 13:02:11 +0100 Subject: [PATCH 21/23] M S V C --- .../dispatch/dispatch_reduce_by_key.cuh | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 638e3056dc2..f88c853628b 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -645,6 +645,22 @@ struct DispatchReduceByKey namespace detail::reduce_by_key { +// we move the conversion out of the lambda below, so MSVC can compile the code +template +_CCCL_API constexpr auto convert_to_agent_policy(PolicyGetter policy_getter) +{ + constexpr reduce_by_key_policy policy = policy_getter(); + return AgentReduceByKeyPolicy< + policy.block_threads, + policy.items_per_thread, + policy.load_algorithm, + policy.load_modifier, + policy.scan_algorithm, + delay_constructor_t>{}; +} + template < typename OverrideAccumT = use_default, typename KeysInputIteratorT, @@ -698,20 +714,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast(arch_id), ss.str().c_str());)) #endif - // Convert the policy into a legacy agent policy for vsmem_helper. Use a value and decltype it again, to workaround - // MSVC bug that just turns the type to `int`. TODO(bgruber): refactor this in the future - [[maybe_unused]] AgentReduceByKeyPolicy< - policy.block_threads, - policy.items_per_thread, - policy.load_algorithm, - policy.load_modifier, - policy.scan_algorithm, - delay_constructor_t> agent_policy; - + // TODO(bgruber): refactor this in the future using vsmem_helper_t = vsmem_helper_default_fallback_policy_t< - decltype(agent_policy), + decltype(convert_to_agent_policy(policy_getter)), AgentReduceByKey, KeysInputIteratorT, UniqueOutputIteratorT, From 186ac061c38cda8c20e5ddef3362508d2debc856 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 27 Feb 2026 14:00:47 +0100 Subject: [PATCH 22/23] Silence warning --- cub/cub/device/dispatch/dispatch_reduce_by_key.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index f88c853628b..84bd03e465c 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -705,12 +705,10 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( } return detail::dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) { - static constexpr reduce_by_key_policy policy = policy_getter(); // need the constexpr of vsmem_helper - #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) NV_IF_TARGET( NV_IS_HOST, - (::std::stringstream ss; ss << policy; _CubLog( + (::std::stringstream ss; ss << policy_getter(); _CubLog( "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast(arch_id), ss.str().c_str());)) #endif From 87f00f775b6fa3dc594a12f5ee0484642d6ecaa1 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 27 Feb 2026 16:02:55 +0100 Subject: [PATCH 23/23] M S V C --- .../dispatch/dispatch_reduce_by_key.cuh | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 84bd03e465c..14fbf3ae15c 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -645,20 +645,25 @@ struct DispatchReduceByKey namespace detail::reduce_by_key { -// we move the conversion out of the lambda below, so MSVC can compile the code -template -_CCCL_API constexpr auto convert_to_agent_policy(PolicyGetter policy_getter) +// we move the conversion of the policy to the agent policy and its use out of the lambda below, so MSVC does not ICE +template +_CCCL_API auto determine_threads_items_vsmem(PolicyGetter policy_getter) { + // TODO(bgruber): refactor this in the future constexpr reduce_by_key_policy policy = policy_getter(); - return AgentReduceByKeyPolicy< - policy.block_threads, - policy.items_per_thread, - policy.load_algorithm, - policy.load_modifier, - policy.scan_algorithm, - delay_constructor_t>{}; + using Policy = AgentReduceByKeyPolicy< + policy.block_threads, + policy.items_per_thread, + policy.load_algorithm, + policy.load_modifier, + policy.scan_algorithm, + delay_constructor_t>; + using vsmem_helper_t = vsmem_helper_default_fallback_policy_t; + return ::cuda::std::tuple{vsmem_helper_t::agent_policy_t::BLOCK_THREADS, + vsmem_helper_t::agent_policy_t::ITEMS_PER_THREAD, + vsmem_helper_t::vsmem_per_block}; } template < @@ -712,10 +717,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast(arch_id), ss.str().c_str());)) #endif - // TODO(bgruber): refactor this in the future - using vsmem_helper_t = vsmem_helper_default_fallback_policy_t< - decltype(convert_to_agent_policy(policy_getter)), - AgentReduceByKey, + const auto [block_threads, items_per_thread, vsmem_per_block] = determine_threads_items_vsmem< + decltype(policy_getter), KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, @@ -725,17 +728,14 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( ReductionOpT, OffsetT, AccumT, - streaming_context_t>; - - constexpr int block_threads = vsmem_helper_t::agent_policy_t::BLOCK_THREADS; - constexpr int items_per_thread = vsmem_helper_t::agent_policy_t::ITEMS_PER_THREAD; + streaming_context_t>(policy_getter); // Number of input tiles const int tile_size = block_threads * items_per_thread; const int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); // The amount of virtual shared memory to allocate - const auto vsmem_size = num_tiles * vsmem_helper_t::vsmem_per_block; + const auto vsmem_size = num_tiles * vsmem_per_block; size_t tile_descriptor_memory{}; if (const auto error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, tile_descriptor_memory)))