From 45a669bd75914b431557ca38a76ae2ee620c3802 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 13 Feb 2026 15:32:29 +0100 Subject: [PATCH 1/7] Implement the new tuning API for DeviceRleDispatch Fixes: #7532 --- .../run_length_encode/non_trivial_runs.cu | 109 +++---- cub/cub/device/device_run_length_encode.cuh | 27 +- cub/cub/device/dispatch/dispatch_rle.cuh | 245 +++++++++++++++- .../tuning/tuning_rle_non_trivial_runs.cuh | 272 ++++++++++++++++++ ...vice_run_length_encode_non_trivial_runs.cu | 1 + 5 files changed, 556 insertions(+), 98 deletions(-) diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu index 4abf201cbb8..7b4073256b9 100644 --- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -16,33 +16,21 @@ // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# else // TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD - -struct device_rle_policy_hub +struct bench_rle_policy_selector { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> + [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> cub::detail::rle::non_trivial_runs::rle_non_trivial_runs_policy { - using RleSweepPolicyT = - cub::AgentRlePolicy; - }; - - using MaxPolicy = Policy500; + return { + TUNE_THREADS, + TUNE_ITEMS, + TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE, + TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA, + static_cast(TUNE_TIME_SLICING), + cub::BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy, + }; + } }; #endif // !TUNE_BASE @@ -60,25 +48,6 @@ static void rle(nvbench::state& state, nvbench::type_list; -#if !TUNE_BASE - using dispatch_t = - cub::DeviceRleDispatch; -#else - using dispatch_t = - cub::DeviceRleDispatch; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); @@ -95,31 +64,32 @@ static void rle(nvbench::state& state, nvbench::type_list(elements); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_offsets, - d_out_lengths, - d_num_runs_out, - equality_op_t{}, - elements, - 0); + auto dispatch_on_stream = [&](cudaStream_t stream) { + cub::detail::rle::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in_keys, + d_out_offsets, + d_out_lengths, + d_num_runs_out, + equality_op_t{}, + num_items, + stream +#if !TUNE_BASE + , + bench_rle_policy_selector{} +#endif + ); + }; + + dispatch_on_stream(cudaStream_t{0}); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_offsets, - d_out_lengths, - d_num_runs_out, - equality_op_t{}, - elements, - 0); + dispatch_on_stream(cudaStream_t{0}); cudaDeviceSynchronize(); const OffsetT num_runs = num_runs_out[0]; @@ -130,16 +100,7 @@ static void rle(nvbench::state& state, nvbench::type_list(1); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_offsets, - d_out_lengths, - d_num_runs_out, - equality_op_t{}, - elements, - launch.get_stream()); + dispatch_on_stream(launch.get_stream().get_stream()); }); } diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 691e0a28c30..f5aa4016c1e 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -327,25 +327,18 @@ struct DeviceRunLengthEncode { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceRunLengthEncode::NonTrivialRuns"); - // Offset type used for global offsets using offset_t = detail::choose_signed_offset_t; using equality_op = ::cuda::std::equal_to<>; - - return DeviceRleDispatch< - InputIteratorT, - OffsetsOutputIteratorT, - LengthsOutputIteratorT, - NumRunsOutputIteratorT, - equality_op, - offset_t>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - equality_op{}, - num_items, - stream); + return detail::rle::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + equality_op{}, + static_cast(num_items), + stream); } }; diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index a5b490287ab..4184ad4646a 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -21,6 +21,7 @@ #endif // no system header #include +#include #include #include #include @@ -29,6 +30,10 @@ #include +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include +#endif + #include #include #include @@ -110,8 +115,8 @@ struct streaming_context * Otherwise performs flag-based selection if FlagIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) * - * @tparam AgentRlePolicyT - * Parameterized AgentRlePolicyT tuning policy type + * @tparam PolicySelector + * Selects the tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator @@ -158,7 +163,7 @@ struct streaming_context * @param num_tiles * Total number of tiles for the entire problem */ -template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREADS)) +#if _CCCL_HAS_CONCEPTS() + requires non_trivial_runs::rle_non_trivial_runs_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRleSweepKernel( InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, @@ -180,9 +188,19 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context) { - using AgentRlePolicyT = typename ChainedPolicyT::ActivePolicy::RleSweepPolicyT; + static constexpr non_trivial_runs::rle_non_trivial_runs_policy policy = + PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + using AgentRlePolicyT = + AgentRlePolicy>; - // Thread block type for selecting data from input tiles using AgentRleT = AgentRle +struct policy_selector_from_hub +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> non_trivial_runs::rle_non_trivial_runs_policy + { + using RleSweepPolicyT = typename PolicyHub::MaxPolicy::RleSweepPolicyT; + return non_trivial_runs::rle_non_trivial_runs_policy{ + RleSweepPolicyT::BLOCK_THREADS, + RleSweepPolicyT::ITEMS_PER_THREAD, + RleSweepPolicyT::LOAD_ALGORITHM, + RleSweepPolicyT::LOAD_MODIFIER, + RleSweepPolicyT::STORE_WARP_TIME_SLICING, + RleSweepPolicyT::SCAN_ALGORITHM, + delay_constructor_policy_from_type, + }; + } +}; } // namespace detail::rle /****************************************************************************** @@ -231,6 +269,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ +// TODO(bgruber): deprecate when we make the tuning API public and remove in CCCL 4.0 template ( detail::scan::DeviceCompactInitKernel, detail::rle::DeviceRleSweepKernel< - typename PolicyHub::MaxPolicy, + detail::rle::policy_selector_from_hub, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, @@ -601,4 +640,196 @@ struct DeviceRleDispatch } }; +namespace detail::rle +{ +template , + typename key_t = it_value_t, + typename PolicySelector = non_trivial_runs::policy_selector_from_types> +#if _CCCL_HAS_CONCEPTS() + requires non_trivial_runs::rle_non_trivial_runs_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OffsetsOutputIteratorT d_offsets_out, + LengthsOutputIteratorT d_lengths_out, + NumRunsOutputIteratorT d_num_runs_out, + EqualityOpT equality_op, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}) +{ + using local_offset_t = ::cuda::std::int32_t; + using global_offset_t = OffsetT; + static constexpr bool use_streaming_invocation = + ::cuda::std::numeric_limits::max() > ::cuda::std::numeric_limits::max(); + using streaming_context_t = ::cuda::std:: + conditional_t, NullType>; + using ScanTileStateT = ReduceByKeyScanTileState; + static constexpr int init_kernel_threads = 128; + + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(ptx_arch_id(arch_id))) + { + return error; + } + +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET( + NV_IS_HOST, + (::std::stringstream ss; ss << policy_selector(arch_id); + _CubLog("Dispatching DeviceRle to arch %d with tuning: %s\n", static_cast(arch_id), ss.str().c_str());)) +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + + const non_trivial_runs::rle_non_trivial_runs_policy policy = policy_selector(arch_id); + const int block_threads = policy.block_threads; + const int items_per_thread = policy.items_per_thread; + const auto tile_size = static_cast(block_threads * items_per_thread); + + auto capped_num_items_per_invocation = num_items; + if constexpr (use_streaming_invocation) + { + capped_num_items_per_invocation = static_cast(::cuda::std::numeric_limits::max()); + capped_num_items_per_invocation -= (capped_num_items_per_invocation % tile_size); + } + + const auto max_num_items_per_invocation = + use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items; + const auto num_partitions = + (capped_num_items_per_invocation == 0) + ? global_offset_t{1} + : ::cuda::ceil_div(num_items, capped_num_items_per_invocation); + + const int max_num_tiles = static_cast(::cuda::ceil_div(max_num_items_per_invocation, tile_size)); + + size_t allocation_sizes[3]; + if (const auto error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0]))) + { + return error; + } + allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0}; + allocation_sizes[2] = num_partitions > 1 ? sizeof(length_t) * 2 : size_t{0}; + + void* allocations[3] = {}; + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + { + return error; + } + + if (d_temp_storage == nullptr) + { + return cudaSuccess; + } + + for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) + { + global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation; + global_offset_t current_num_items = + (partition_idx + 1 == num_partitions) ? (num_items - current_partition_offset) : capped_num_items_per_invocation; + + const auto num_current_tiles = static_cast(::cuda::ceil_div(current_num_items, tile_size)); + ScanTileStateT tile_status; + if (const auto error = CubDebug(tile_status.Init(num_current_tiles, allocations[0], allocation_sizes[0]))) + { + return error; + } + + const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads)); +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", + init_grid_size, + init_kernel_threads, + (long long) stream); +#endif + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream) + .doit(&detail::scan::template DeviceCompactInitKernel, + tile_status, + num_current_tiles, + d_num_runs_out))) + { + return error; + } + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + if (num_items <= 1) + { + return cudaSuccess; + } +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking device_rle_sweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + num_current_tiles, + block_threads, + (long long) stream, + items_per_thread); +#endif + auto device_rle_sweep_kernel = &detail::rle::DeviceRleSweepKernel< + PolicySelector, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + NumRunsOutputIteratorT, + ScanTileStateT, + EqualityOpT, + local_offset_t, + global_offset_t, + streaming_context_t>; + + auto streaming_context = [&] { + if constexpr (use_streaming_invocation) + { + auto tmp_num_uniques = static_cast(allocations[1]); + auto tmp_prefix = static_cast(allocations[2]); + const bool is_first_partition = (partition_idx == 0); + const bool is_last_partition = (partition_idx + 1 == num_partitions); + const int buffer_selector = partition_idx % 2; + return streaming_context_t{ + is_first_partition, + is_last_partition, + current_partition_offset, + &tmp_prefix[buffer_selector], + &tmp_prefix[buffer_selector ^ 0x01], + &tmp_num_uniques[buffer_selector], + &tmp_num_uniques[buffer_selector ^ 0x01]}; + } + else + { + return NullType{}; + } + }(); + + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) + .doit(device_rle_sweep_kernel, + d_in + current_partition_offset, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + tile_status, + equality_op, + static_cast(current_num_items), + num_current_tiles, + streaming_context))) + { + return error; + } + + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + } + return cudaSuccess; +} +} // namespace detail::rle CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index 6d75fe750ce..5e435e017e0 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -18,15 +18,28 @@ #include #include #include +#include #include +#include #include +#include +#include #include +#if _CCCL_HAS_CONCEPTS() +# include +#endif // _CCCL_HAS_CONCEPTS() + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::rle::non_trivial_runs { +// TODO(bgruber): remove in CCCL 4.0 when we drop all CUB dispatchers template (), @@ -92,6 +105,7 @@ struct sm80_tuning(), @@ -157,6 +171,7 @@ struct sm90_tuning(), @@ -238,6 +253,7 @@ struct sm100_tuning struct policy_hub { @@ -311,6 +327,262 @@ struct policy_hub using MaxPolicy = Policy1000; }; + +struct rle_non_trivial_runs_policy +{ + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_algorithm; + CacheLoadModifier load_modifier; + bool store_with_time_slicing; + BlockScanAlgorithm scan_algorithm; + delay_constructor_policy delay_constructor; + + [[nodiscard]] _CCCL_API constexpr friend bool + operator==(const rle_non_trivial_runs_policy& lhs, const rle_non_trivial_runs_policy& rhs) + { + return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread + && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier + && lhs.store_with_time_slicing == rhs.store_with_time_slicing && lhs.scan_algorithm == rhs.scan_algorithm + && lhs.delay_constructor == rhs.delay_constructor; + } + + [[nodiscard]] _CCCL_API constexpr friend bool + operator!=(const rle_non_trivial_runs_policy& lhs, const rle_non_trivial_runs_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const rle_non_trivial_runs_policy& p) + { + return os + << "rle_non_trivial_runs_policy { .block_threads = " << p.block_threads + << ", .items_per_thread = " << p.items_per_thread << ", .load_algorithm = " << p.load_algorithm + << ", .load_modifier = " << p.load_modifier << ", .store_with_time_slicing = " << p.store_with_time_slicing + << ", .scan_algorithm = " << p.scan_algorithm << ", .delay_constructor = " << p.delay_constructor << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) +}; + +#if _CCCL_HAS_CONCEPTS() +template +concept rle_non_trivial_runs_policy_selector = detail::policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + +struct policy_selector +{ + length_size length_sz; + key_size key_sz; + primitive_length prim_len; + primitive_key prim_key; + int key_type_size; + + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy + { + const bool tuned_prim = (prim_len == primitive_length::yes && prim_key == primitive_key::yes); + const bool length_4 = (length_sz == length_size::_4); + + if (arch >= ::cuda::arch_id::sm_100 && tuned_prim && length_4) + { + if (key_sz == key_size::_1) + { + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } + if (key_sz == key_size::_2) + { + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } + if (key_sz == key_size::_4) + { + return rle_non_trivial_runs_policy{ + 224, + 13, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } + if (key_sz == key_size::_8) + { + return rle_non_trivial_runs_policy{ + 256, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } + } + if (arch >= ::cuda::arch_id::sm_90 && tuned_prim && length_4) + { + if (key_sz == key_size::_1) + { + return rle_non_trivial_runs_policy{ + 256, + 18, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } + if (key_sz == key_size::_2) + { + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } + if (key_sz == key_size::_4) + { + return rle_non_trivial_runs_policy{ + 256, + 18, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } + if (key_sz == key_size::_8) + { + return rle_non_trivial_runs_policy{ + 224, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } +#if _CCCL_HAS_INT128() + if (key_sz == key_size::_16) + { + return rle_non_trivial_runs_policy{ + 192, + 12, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } +#endif + } + if (arch >= ::cuda::arch_id::sm_80 && tuned_prim && length_4) + { + if (key_sz == key_size::_1) + { + return rle_non_trivial_runs_policy{ + 192, + 20, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } + if (key_sz == key_size::_2) + { + return rle_non_trivial_runs_policy{ + 192, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } + if (key_sz == key_size::_4) + { + return rle_non_trivial_runs_policy{ + 224, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } + if (key_sz == key_size::_8) + { + return rle_non_trivial_runs_policy{ + 256, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } +#if _CCCL_HAS_INT128() + if (key_sz == key_size::_16) + { + return rle_non_trivial_runs_policy{ + 192, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}}; + } +#endif + } + constexpr int nominal_4B_items_per_thread = 15; + const int items_per_thread = + ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / key_type_size, 1, nominal_4B_items_per_thread); + return rle_non_trivial_runs_policy{ + 96, + items_per_thread, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 350, 450}, + }; + } +}; + +template +struct policy_selector_from_types +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy + { + constexpr policy_selector selector{ + classify_length_size(), + classify_key_size(), + is_primitive_length(), + is_primitive_key(), + static_cast(sizeof(KeyT))}; + return selector(arch); + } +}; + +#if _CCCL_HAS_CONCEPTS() +static_assert(rle_non_trivial_runs_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() } // namespace detail::rle::non_trivial_runs CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu index a84fc5315a1..82586867413 100644 --- a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu +++ b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu @@ -261,6 +261,7 @@ struct device_rle_policy_hub using MaxPolicy = Policy500; }; +// TODO(bgruber): rewrite this test to use the new tuning API once it's available struct CustomDeviceRunLengthEncode { template Date: Fri, 13 Feb 2026 18:08:12 +0100 Subject: [PATCH 2/7] Fix input type --- cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu index 7b4073256b9..fc902393c9d 100644 --- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -57,7 +57,7 @@ static void rle(nvbench::state& state, nvbench::type_list out_lengths(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); - T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); + const T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); offset_t* d_out_offsets = thrust::raw_pointer_cast(out_offsets.data()); run_length_t* d_out_lengths = thrust::raw_pointer_cast(out_lengths.data()); offset_t* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); From 0ce0058cabeb3f734c59565e53925cc8e914bc1d Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sun, 22 Feb 2026 14:40:48 +0100 Subject: [PATCH 3/7] Add reduce_by_key delay constructor --- cub/cub/detail/delay_constructor.cuh | 26 ++++++++++++++++++- .../tuning/tuning_rle_non_trivial_runs.cuh | 3 +-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh index 44a409f497a..9325f1f258c 100644 --- a/cub/cub/detail/delay_constructor.cuh +++ b/cub/cub/detail/delay_constructor.cuh @@ -32,7 +32,8 @@ enum class delay_constructor_kind exponential_backoff_jitter_window, exponential_backon_jitter_window, exponential_backon_jitter, - exponential_backon + exponential_backon, + reduce_by_key }; #if !_CCCL_COMPILER(NVRTC) @@ -44,6 +45,8 @@ inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kin return os << "delay_constructor_kind::no_delay"; case delay_constructor_kind::fixed_delay: return os << "delay_constructor_kind::fixed_delay"; + case delay_constructor_kind::reduce_by_key: + return os << "delay_constructor_kind::reduce_by_key"; case delay_constructor_kind::exponential_backoff: return os << "delay_constructor_kind::exponential_backoff"; case delay_constructor_kind::exponential_backoff_jitter: @@ -125,6 +128,11 @@ template inline constexpr auto delay_constructor_policy_from_type> = delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency}; +template +inline constexpr auto + delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::reduce_by_key, Delay, L2WriteLatency}; + template struct delay_constructor_for; @@ -140,6 +148,12 @@ struct delay_constructor_for; }; +template +struct delay_constructor_for +{ + using type = reduce_by_key_delay_constructor_t; +}; + template struct delay_constructor_for { @@ -178,6 +192,16 @@ struct delay_constructor_for using delay_constructor_t = typename delay_constructor_for::type; + +_CCCL_API constexpr auto +default_reduce_by_key_delay_constructor_policy(int key_size, int value_size, bool value_is_primitive) +{ + if (value_is_primitive && (value_size + key_size < 16)) + { + return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450}; + } + return delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450}; +} } // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index 5e435e017e0..58db6928051 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -560,8 +560,7 @@ struct policy_selector LOAD_LDG, true, BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}, - }; + default_reduce_by_key_delay_constructor_policy(key_type_size, sizeof(int), true)}; } }; From 2ae7a6858b91a4709bc7348507845e7fdd7aac0a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sun, 22 Feb 2026 18:29:44 +0100 Subject: [PATCH 4/7] no SASS diff --- .../tuning/tuning_rle_non_trivial_runs.cuh | 381 ++++++++++-------- 1 file changed, 203 insertions(+), 178 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index 58db6928051..0bf98be5cdc 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -372,195 +372,224 @@ concept rle_non_trivial_runs_policy_selector = detail::policy_selector rle_non_trivial_runs_policy { - const bool tuned_prim = (prim_len == primitive_length::yes && prim_key == primitive_key::yes); - const bool length_4 = (length_sz == length_size::_4); - - if (arch >= ::cuda::arch_id::sm_100 && tuned_prim && length_4) - { - if (key_sz == key_size::_1) - { - return rle_non_trivial_runs_policy{ - 224, - 20, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_CA, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } - if (key_sz == key_size::_2) - { - return rle_non_trivial_runs_policy{ - 224, - 20, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } - if (key_sz == key_size::_4) - { + auto make_default_policy = + [&](BlockLoadAlgorithm block_load_alg, int delay_ctor_key_size, CacheLoadModifier load_mod) { + constexpr int nominal_4B_items_per_thread = 15; + const int items_per_thread = + ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / key_size, 1, nominal_4B_items_per_thread); return rle_non_trivial_runs_policy{ - 224, - 13, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - false, + 96, + items_per_thread, + block_load_alg, + load_mod, + true, BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } - if (key_sz == key_size::_8) + default_reduce_by_key_delay_constructor_policy(delay_ctor_key_size, sizeof(int), true)}; + }; + + if (arch >= ::cuda::arch_id::sm_100) + { + if (length_is_primitive && key_is_primitive && length_size == 4) { - return rle_non_trivial_runs_policy{ - 256, - 15, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; + if (key_size == 1) + { + // ipt_20.tpb_224.trp_1.ts_0.ld_1.ns_64.dcid_2.l2w_315 1.119878 1.003690 1.130067 1.338983 + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 64, 315}}; + } + if (key_size == 2) + { + // ipt_20.tpb_224.trp_1.ts_0.ld_0.ns_116.dcid_7.l2w_340 1.146528 1.072769 1.152390 1.333333 + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 116, 340}}; + } + if (key_size == 4) + { + // ipt_13.tpb_224.trp_0.ts_0.ld_0.ns_252.dcid_2.l2w_470 1.113202 1.003690 1.133114 1.349296 + return rle_non_trivial_runs_policy{ + 224, + 13, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 252, 470}}; + } + if (key_size == 8 && key_type != type_t::float64) // fall back to SM90 for double + { + // ipt_15.tpb_256.trp_1.ts_0.ld_0.ns_28.dcid_2.l2w_520 1.114944 1.033189 1.122360 1.252083 + return rle_non_trivial_runs_policy{ + 256, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 28, 520}}; + } } + + // no tuning for SM100, fall-through to SM90 } - if (arch >= ::cuda::arch_id::sm_90 && tuned_prim && length_4) + + if (arch >= ::cuda::arch_id::sm_90) { - if (key_sz == key_size::_1) - { - return rle_non_trivial_runs_policy{ - 256, - 18, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } - if (key_sz == key_size::_2) - { - return rle_non_trivial_runs_policy{ - 224, - 20, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } - if (key_sz == key_size::_4) - { - return rle_non_trivial_runs_policy{ - 256, - 18, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } - if (key_sz == key_size::_8) - { - return rle_non_trivial_runs_policy{ - 224, - 14, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } -#if _CCCL_HAS_INT128() - if (key_sz == key_size::_16) + if (length_is_primitive && length_size == 4) { - return rle_non_trivial_runs_policy{ - 192, - 12, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; + if (key_is_primitive && key_size == 1) + { + return rle_non_trivial_runs_policy{ + 256, + 18, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 385}}; + } + if (key_is_primitive && key_size == 2) + { + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 675}}; + } + if (key_is_primitive && key_size == 4) + { + return rle_non_trivial_runs_policy{ + 256, + 18, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 695}}; + } + if (key_is_primitive && key_size == 8) + { + return rle_non_trivial_runs_policy{ + 224, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 840}}; + } + if (key_type == type_t::int128 || key_type == type_t::uint128) + { + return rle_non_trivial_runs_policy{ + 288, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 484, 1150}}; + } } -#endif + + // no tuning for SM90, use a default policy + return make_default_policy(BLOCK_LOAD_WARP_TRANSPOSE, length_size, LOAD_DEFAULT); } - if (arch >= ::cuda::arch_id::sm_80 && tuned_prim && length_4) + + if (arch >= ::cuda::arch_id::sm_86) { - if (key_sz == key_size::_1) - { - return rle_non_trivial_runs_policy{ - 192, - 20, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } - if (key_sz == key_size::_2) - { - return rle_non_trivial_runs_policy{ - 192, - 20, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } - if (key_sz == key_size::_4) - { - return rle_non_trivial_runs_policy{ - 224, - 15, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } - if (key_sz == key_size::_8) - { - return rle_non_trivial_runs_policy{ - 256, - 13, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; - } -#if _CCCL_HAS_INT128() - if (key_sz == key_size::_16) + // TODO(bgruber): I think we want `LengthT` instead of `int` + return make_default_policy(BLOCK_LOAD_DIRECT, sizeof(int), LOAD_LDG); + } + + if (arch >= ::cuda::arch_id::sm_80) + { + if (length_is_primitive && length_size == 4) { - return rle_non_trivial_runs_policy{ - 192, - 13, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS, - {delay_constructor_kind::fixed_delay, 350, 450}}; + if (key_is_primitive && key_size == 1) + { + return rle_non_trivial_runs_policy{ + 192, + 20, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 630}}; + } + if (key_is_primitive && key_size == 2) + { + return rle_non_trivial_runs_policy{ + 192, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1015}}; + } + if (key_is_primitive && key_size == 4) + { + return rle_non_trivial_runs_policy{ + 224, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 915}}; + } + if (key_is_primitive && key_size == 8) + { + return rle_non_trivial_runs_policy{ + 256, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1065}}; + } + if (key_type == type_t::int128 || key_type == type_t::uint128) + { + return rle_non_trivial_runs_policy{ + 192, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1050}}; + } } -#endif + // no tuning for SM80, use a default policy + return make_default_policy(BLOCK_LOAD_WARP_TRANSPOSE, length_size, LOAD_DEFAULT); } - constexpr int nominal_4B_items_per_thread = 15; - const int items_per_thread = - ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / key_type_size, 1, nominal_4B_items_per_thread); - return rle_non_trivial_runs_policy{ - 96, - items_per_thread, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - true, - BLOCK_SCAN_WARP_SCANS, - default_reduce_by_key_delay_constructor_policy(key_type_size, sizeof(int), true)}; + + // default is from SM50 + return make_default_policy(BLOCK_LOAD_DIRECT, int{sizeof(int)}, LOAD_LDG); } }; @@ -570,11 +599,7 @@ struct policy_selector_from_types [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy { constexpr policy_selector selector{ - classify_length_size(), - classify_key_size(), - is_primitive_length(), - is_primitive_key(), - static_cast(sizeof(KeyT))}; + sizeof(LengthT), int{sizeof(KeyT)}, classify_type, is_primitive_v, is_primitive_v}; return selector(arch); } }; From 5f250416c720842764afbe45e542e932356b251a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sun, 22 Feb 2026 18:48:15 +0100 Subject: [PATCH 5/7] Refactoring and fixes --- cub/cub/detail/delay_constructor.cuh | 16 +++--- cub/cub/device/device_run_length_encode.cuh | 6 +-- cub/cub/device/dispatch/dispatch_rle.cuh | 35 +++++++------ .../tuning/tuning_rle_non_trivial_runs.cuh | 15 +++--- ...vice_run_length_encode_non_trivial_runs.cu | 49 +++++++------------ 5 files changed, 54 insertions(+), 67 deletions(-) diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh index 9325f1f258c..23d0c6d571e 100644 --- a/cub/cub/detail/delay_constructor.cuh +++ b/cub/cub/detail/delay_constructor.cuh @@ -45,8 +45,6 @@ inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kin return os << "delay_constructor_kind::no_delay"; case delay_constructor_kind::fixed_delay: return os << "delay_constructor_kind::fixed_delay"; - case delay_constructor_kind::reduce_by_key: - return os << "delay_constructor_kind::reduce_by_key"; case delay_constructor_kind::exponential_backoff: return os << "delay_constructor_kind::exponential_backoff"; case delay_constructor_kind::exponential_backoff_jitter: @@ -59,6 +57,8 @@ inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kin return os << "delay_constructor_kind::exponential_backon_jitter"; case delay_constructor_kind::exponential_backon: return os << "delay_constructor_kind::exponential_backon"; + case delay_constructor_kind::reduce_by_key: + return os << "delay_constructor_kind::reduce_by_key"; default: return os << "(kind) << ">"; } @@ -148,12 +148,6 @@ struct delay_constructor_for; }; -template -struct delay_constructor_for -{ - using type = reduce_by_key_delay_constructor_t; -}; - template struct delay_constructor_for { @@ -190,6 +184,12 @@ struct delay_constructor_for; }; +template +struct delay_constructor_for +{ + using type = reduce_by_key_delay_constructor_t; +}; + template using delay_constructor_t = typename delay_constructor_for::type; diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index f5aa4016c1e..44fac1b1cd9 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -327,8 +327,8 @@ struct DeviceRunLengthEncode { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceRunLengthEncode::NonTrivialRuns"); - using offset_t = detail::choose_signed_offset_t; - using equality_op = ::cuda::std::equal_to<>; + using global_offset_t = detail::choose_signed_offset_t; + using equality_op = ::cuda::std::equal_to<>; return detail::rle::dispatch( d_temp_storage, temp_storage_bytes, @@ -337,7 +337,7 @@ struct DeviceRunLengthEncode d_lengths_out, d_num_runs_out, equality_op{}, - static_cast(num_items), + static_cast(num_items), stream); } }; diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 4184ad4646a..2854b88833c 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -681,17 +681,17 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( return error; } + const non_trivial_runs::rle_non_trivial_runs_policy active_policy = policy_selector(arch_id); #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) NV_IF_TARGET( NV_IS_HOST, - (::std::stringstream ss; ss << policy_selector(arch_id); + (::std::stringstream ss; ss << active_policy; _CubLog("Dispatching DeviceRle to arch %d with tuning: %s\n", static_cast(arch_id), ss.str().c_str());)) #endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) - const non_trivial_runs::rle_non_trivial_runs_policy policy = policy_selector(arch_id); - const int block_threads = policy.block_threads; - const int items_per_thread = policy.items_per_thread; - const auto tile_size = static_cast(block_threads * items_per_thread); + const int block_threads = active_policy.block_threads; + const int items_per_thread = active_policy.items_per_thread; + const auto tile_size = static_cast(block_threads * items_per_thread); auto capped_num_items_per_invocation = num_items; if constexpr (use_streaming_invocation) @@ -751,7 +751,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( #endif if (const auto error = CubDebug( THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream) - .doit(&detail::scan::template DeviceCompactInitKernel, + .doit(&detail::scan::DeviceCompactInitKernel, tile_status, num_current_tiles, d_num_runs_out))) @@ -773,17 +773,6 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( (long long) stream, items_per_thread); #endif - auto device_rle_sweep_kernel = &detail::rle::DeviceRleSweepKernel< - PolicySelector, - InputIteratorT, - OffsetsOutputIteratorT, - LengthsOutputIteratorT, - NumRunsOutputIteratorT, - ScanTileStateT, - EqualityOpT, - local_offset_t, - global_offset_t, - streaming_context_t>; auto streaming_context = [&] { if constexpr (use_streaming_invocation) @@ -810,7 +799,17 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( if (const auto error = CubDebug( THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) - .doit(device_rle_sweep_kernel, + .doit(&detail::rle::DeviceRleSweepKernel< + PolicySelector, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + NumRunsOutputIteratorT, + ScanTileStateT, + EqualityOpT, + local_offset_t, + global_offset_t, + streaming_context_t>, d_in + current_partition_offset, d_offsets_out, d_lengths_out, diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index 0bf98be5cdc..215f44587de 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -23,7 +23,6 @@ #include #include -#include #include #include @@ -336,7 +335,7 @@ struct rle_non_trivial_runs_policy CacheLoadModifier load_modifier; bool store_with_time_slicing; BlockScanAlgorithm scan_algorithm; - delay_constructor_policy delay_constructor; + delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}; [[nodiscard]] _CCCL_API constexpr friend bool operator==(const rle_non_trivial_runs_policy& lhs, const rle_non_trivial_runs_policy& rhs) @@ -372,12 +371,12 @@ concept rle_non_trivial_runs_policy_selector = detail::policy_selector rle_non_trivial_runs_policy { @@ -593,6 +592,10 @@ struct policy_selector } }; +#if _CCCL_HAS_CONCEPTS() +static_assert(rle_non_trivial_runs_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() + template struct policy_selector_from_types { @@ -603,10 +606,6 @@ struct policy_selector_from_types return selector(arch); } }; - -#if _CCCL_HAS_CONCEPTS() -static_assert(rle_non_trivial_runs_policy_selector); -#endif // _CCCL_HAS_CONCEPTS() } // namespace detail::rle::non_trivial_runs CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu index 82586867413..ffb9c6fc783 100644 --- a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu +++ b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu @@ -247,21 +247,18 @@ C2H_TEST("DeviceRunLengthEncode::NonTrivialRuns can handle pointers", "[device][ // Guard against #293 template -struct device_rle_policy_hub +struct device_rle_policy_selector { static constexpr int threads = 96; static constexpr int items = 15; - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> + _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> cub::detail::rle::non_trivial_runs::rle_non_trivial_runs_policy { - using RleSweepPolicyT = cub:: - AgentRlePolicy; - }; - - using MaxPolicy = Policy500; + return {threads, items, cub::BLOCK_LOAD_DIRECT, cub::LOAD_DEFAULT, TimeSlicing, cub::BLOCK_SCAN_WARP_SCANS}; + } }; -// TODO(bgruber): rewrite this test to use the new tuning API once it's available struct CustomDeviceRunLengthEncode { template ; // Default == operator - - return cub::DeviceRleDispatch>:: - Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - EqualityOp(), - num_items, - stream); + return cub::detail::rle::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + cuda::std::equal_to<>{}, // Default == operator + num_items, + stream, + device_rle_policy_selector{}); } }; @@ -309,9 +298,9 @@ using time_slicing = c2h::type_list; C2H_TEST("DeviceRunLengthEncode::NonTrivialRuns does not run out of memory", "[device][run_length_encode]", time_slicing) { using type = typename c2h::get<0, TestType>; - using policy_hub_t = device_rle_policy_hub; + using policy_sel_t = device_rle_policy_selector; - constexpr int tile_size = policy_hub_t::threads * policy_hub_t::items; + constexpr int tile_size = policy_sel_t::threads * policy_sel_t::items; constexpr int num_items = 2 * tile_size; constexpr int magic_number = num_items + 1; From 8a5bb4d7116d36ab86ef0fe13f772c6c1e77d4db Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sun, 22 Feb 2026 18:51:11 +0100 Subject: [PATCH 6/7] Try to work around MSVC --- cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index 215f44587de..2f9d8730970 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -382,7 +382,7 @@ struct policy_selector { auto make_default_policy = [&](BlockLoadAlgorithm block_load_alg, int delay_ctor_key_size, CacheLoadModifier load_mod) { - constexpr int nominal_4B_items_per_thread = 15; + const int nominal_4B_items_per_thread = 15; const int items_per_thread = ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / key_size, 1, nominal_4B_items_per_thread); return rle_non_trivial_runs_policy{ From 490d98ba6ea9d69a02248901ed5dddc0c23f2417 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 23 Feb 2026 00:33:46 +0100 Subject: [PATCH 7/7] MSVC --- .../tuning/tuning_rle_non_trivial_runs.cuh | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index 2f9d8730970..87d8aadf084 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -378,23 +378,24 @@ struct policy_selector bool length_is_primitive; bool key_is_primitive; // TODO(bgruber): can probably be derived from key_type - [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy + _CCCL_API constexpr auto + make_default_policy(BlockLoadAlgorithm block_load_alg, int delay_ctor_key_size, CacheLoadModifier load_mod) const { - auto make_default_policy = - [&](BlockLoadAlgorithm block_load_alg, int delay_ctor_key_size, CacheLoadModifier load_mod) { - const int nominal_4B_items_per_thread = 15; - const int items_per_thread = - ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / key_size, 1, nominal_4B_items_per_thread); - return rle_non_trivial_runs_policy{ - 96, - items_per_thread, - block_load_alg, - load_mod, - true, - BLOCK_SCAN_WARP_SCANS, - default_reduce_by_key_delay_constructor_policy(delay_ctor_key_size, sizeof(int), true)}; - }; + const int nominal_4B_items_per_thread = 15; + const int items_per_thread = + ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / key_size, 1, nominal_4B_items_per_thread); + return rle_non_trivial_runs_policy{ + 96, + items_per_thread, + block_load_alg, + load_mod, + true, + BLOCK_SCAN_WARP_SCANS, + default_reduce_by_key_delay_constructor_policy(delay_ctor_key_size, sizeof(int), true)}; + } + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy + { if (arch >= ::cuda::arch_id::sm_100) { if (length_is_primitive && key_is_primitive && length_size == 4)