diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu index 4abf201cbb8..fc902393c9d 100644 --- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -16,33 +16,21 @@ // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# else // TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD - -struct device_rle_policy_hub +struct bench_rle_policy_selector { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> + [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> cub::detail::rle::non_trivial_runs::rle_non_trivial_runs_policy { - using RleSweepPolicyT = - cub::AgentRlePolicy; - }; - - using MaxPolicy = Policy500; + return { + TUNE_THREADS, + TUNE_ITEMS, + TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE, + TUNE_LOAD == 0 ? 
cub::LOAD_DEFAULT : cub::LOAD_CA, + static_cast(TUNE_TIME_SLICING), + cub::BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy, + }; + } }; #endif // !TUNE_BASE @@ -60,25 +48,6 @@ static void rle(nvbench::state& state, nvbench::type_list; -#if !TUNE_BASE - using dispatch_t = - cub::DeviceRleDispatch; -#else - using dispatch_t = - cub::DeviceRleDispatch; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); @@ -88,38 +57,39 @@ static void rle(nvbench::state& state, nvbench::type_list out_lengths(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); - T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); + const T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); offset_t* d_out_offsets = thrust::raw_pointer_cast(out_offsets.data()); run_length_t* d_out_lengths = thrust::raw_pointer_cast(out_lengths.data()); offset_t* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; + const offset_t num_items = static_cast(elements); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_offsets, - d_out_lengths, - d_num_runs_out, - equality_op_t{}, - elements, - 0); + auto dispatch_on_stream = [&](cudaStream_t stream) { + cub::detail::rle::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in_keys, + d_out_offsets, + d_out_lengths, + d_num_runs_out, + equality_op_t{}, + num_items, + stream +#if !TUNE_BASE + , + bench_rle_policy_selector{} +#endif + ); + }; + + dispatch_on_stream(cudaStream_t{0}); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_offsets, - d_out_lengths, - d_num_runs_out, - 
equality_op_t{}, - elements, - 0); + dispatch_on_stream(cudaStream_t{0}); cudaDeviceSynchronize(); const OffsetT num_runs = num_runs_out[0]; @@ -130,16 +100,7 @@ static void rle(nvbench::state& state, nvbench::type_list(1); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_offsets, - d_out_lengths, - d_num_runs_out, - equality_op_t{}, - elements, - launch.get_stream()); + dispatch_on_stream(launch.get_stream().get_stream()); }); } diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh index 44a409f497a..23d0c6d571e 100644 --- a/cub/cub/detail/delay_constructor.cuh +++ b/cub/cub/detail/delay_constructor.cuh @@ -32,7 +32,8 @@ enum class delay_constructor_kind exponential_backoff_jitter_window, exponential_backon_jitter_window, exponential_backon_jitter, - exponential_backon + exponential_backon, + reduce_by_key }; #if !_CCCL_COMPILER(NVRTC) @@ -56,6 +57,8 @@ inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kin return os << "delay_constructor_kind::exponential_backon_jitter"; case delay_constructor_kind::exponential_backon: return os << "delay_constructor_kind::exponential_backon"; + case delay_constructor_kind::reduce_by_key: + return os << "delay_constructor_kind::reduce_by_key"; default: return os << "(kind) << ">"; } @@ -125,6 +128,11 @@ template inline constexpr auto delay_constructor_policy_from_type> = delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency}; +template +inline constexpr auto + delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::reduce_by_key, Delay, L2WriteLatency}; + template struct delay_constructor_for; @@ -176,8 +184,24 @@ struct delay_constructor_for; }; +template +struct delay_constructor_for +{ + using type = reduce_by_key_delay_constructor_t; +}; + template using 
delay_constructor_t = typename delay_constructor_for::type; + +_CCCL_API constexpr auto +default_reduce_by_key_delay_constructor_policy(int key_size, int value_size, bool value_is_primitive) +{ + if (value_is_primitive && (value_size + key_size < 16)) + { + return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450}; + } + return delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450}; +} } // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 691e0a28c30..44fac1b1cd9 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -327,25 +327,18 @@ struct DeviceRunLengthEncode { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceRunLengthEncode::NonTrivialRuns"); - // Offset type used for global offsets - using offset_t = detail::choose_signed_offset_t; - using equality_op = ::cuda::std::equal_to<>; - - return DeviceRleDispatch< - InputIteratorT, - OffsetsOutputIteratorT, - LengthsOutputIteratorT, - NumRunsOutputIteratorT, - equality_op, - offset_t>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - equality_op{}, - num_items, - stream); + using global_offset_t = detail::choose_signed_offset_t; + using equality_op = ::cuda::std::equal_to<>; + return detail::rle::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + equality_op{}, + static_cast(num_items), + stream); } }; diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index a5b490287ab..2854b88833c 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -21,6 +21,7 @@ #endif // no system header #include +#include #include #include #include @@ -29,6 +30,10 @@ #include +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include 
+#endif + #include #include #include @@ -110,8 +115,8 @@ struct streaming_context * Otherwise performs flag-based selection if FlagIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) * - * @tparam AgentRlePolicyT - * Parameterized AgentRlePolicyT tuning policy type + * @tparam PolicySelector + * Selects the tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator @@ -158,7 +163,7 @@ struct streaming_context * @param num_tiles * Total number of tiles for the entire problem */ -template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREADS)) +#if _CCCL_HAS_CONCEPTS() + requires non_trivial_runs::rle_non_trivial_runs_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRleSweepKernel( InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, @@ -180,9 +188,19 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context) { - using AgentRlePolicyT = typename ChainedPolicyT::ActivePolicy::RleSweepPolicyT; + static constexpr non_trivial_runs::rle_non_trivial_runs_policy policy = + PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + using AgentRlePolicyT = + AgentRlePolicy>; - // Thread block type for selecting data from input tiles using AgentRleT = AgentRle +struct policy_selector_from_hub +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> non_trivial_runs::rle_non_trivial_runs_policy + { + using RleSweepPolicyT = typename PolicyHub::MaxPolicy::RleSweepPolicyT; + return non_trivial_runs::rle_non_trivial_runs_policy{ + RleSweepPolicyT::BLOCK_THREADS, + RleSweepPolicyT::ITEMS_PER_THREAD, + RleSweepPolicyT::LOAD_ALGORITHM, + RleSweepPolicyT::LOAD_MODIFIER, + 
RleSweepPolicyT::STORE_WARP_TIME_SLICING, + RleSweepPolicyT::SCAN_ALGORITHM, + delay_constructor_policy_from_type, + }; + } +}; } // namespace detail::rle /****************************************************************************** @@ -231,6 +269,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ +// TODO(bgruber): deprecate when we make the tuning API public and remove in CCCL 4.0 template ( detail::scan::DeviceCompactInitKernel, detail::rle::DeviceRleSweepKernel< - typename PolicyHub::MaxPolicy, + detail::rle::policy_selector_from_hub, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, @@ -601,4 +640,195 @@ struct DeviceRleDispatch } }; +namespace detail::rle +{ +template , + typename key_t = it_value_t, + typename PolicySelector = non_trivial_runs::policy_selector_from_types> +#if _CCCL_HAS_CONCEPTS() + requires non_trivial_runs::rle_non_trivial_runs_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OffsetsOutputIteratorT d_offsets_out, + LengthsOutputIteratorT d_lengths_out, + NumRunsOutputIteratorT d_num_runs_out, + EqualityOpT equality_op, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}) +{ + using local_offset_t = ::cuda::std::int32_t; + using global_offset_t = OffsetT; + static constexpr bool use_streaming_invocation = + ::cuda::std::numeric_limits::max() > ::cuda::std::numeric_limits::max(); + using streaming_context_t = ::cuda::std:: + conditional_t, NullType>; + using ScanTileStateT = ReduceByKeyScanTileState; + static constexpr int init_kernel_threads = 128; + + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(ptx_arch_id(arch_id))) + { + return error; + } + + const 
non_trivial_runs::rle_non_trivial_runs_policy active_policy = policy_selector(arch_id); +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET( + NV_IS_HOST, + (::std::stringstream ss; ss << active_policy; + _CubLog("Dispatching DeviceRle to arch %d with tuning: %s\n", static_cast<int>(arch_id), ss.str().c_str());)) +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + + const int block_threads = active_policy.block_threads; + const int items_per_thread = active_policy.items_per_thread; + const auto tile_size = static_cast<global_offset_t>(block_threads * items_per_thread); + + auto capped_num_items_per_invocation = num_items; + if constexpr (use_streaming_invocation) + { + capped_num_items_per_invocation = static_cast<global_offset_t>(::cuda::std::numeric_limits<local_offset_t>::max()); + capped_num_items_per_invocation -= (capped_num_items_per_invocation % tile_size); + } + + const auto max_num_items_per_invocation = + use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items; + const auto num_partitions = + (capped_num_items_per_invocation == 0) + ? global_offset_t{1} + : ::cuda::ceil_div(num_items, capped_num_items_per_invocation); + + const int max_num_tiles = static_cast<int>(::cuda::ceil_div(max_num_items_per_invocation, tile_size)); + + size_t allocation_sizes[3]; + if (const auto error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0]))) + { + return error; + } + allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0}; + allocation_sizes[2] = num_partitions > 1 ? 
sizeof(length_t) * 2 : size_t{0}; + + void* allocations[3] = {}; + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + { + return error; + } + + if (d_temp_storage == nullptr) + { + return cudaSuccess; + } + + for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) + { + global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation; + global_offset_t current_num_items = + (partition_idx + 1 == num_partitions) ? (num_items - current_partition_offset) : capped_num_items_per_invocation; + + const auto num_current_tiles = static_cast<int>(::cuda::ceil_div(current_num_items, tile_size)); + ScanTileStateT tile_status; + if (const auto error = CubDebug(tile_status.Init(num_current_tiles, allocations[0], allocation_sizes[0]))) + { + return error; + } + + const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads)); +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", + init_grid_size, + init_kernel_threads, + (long long) stream); +#endif + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream) + .doit(&detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>, + tile_status, + num_current_tiles, + d_num_runs_out))) + { + return error; + } + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + if (num_items <= 1) + { + return cudaSuccess; + } +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking device_rle_sweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + num_current_tiles, + block_threads, + (long long) stream, + items_per_thread); +#endif + + auto streaming_context = [&] { + if constexpr (use_streaming_invocation) + { + auto tmp_num_uniques = static_cast<global_offset_t*>(allocations[1]); + auto tmp_prefix = static_cast<length_t*>(allocations[2]); + const bool is_first_partition = 
(partition_idx == 0); + const bool is_last_partition = (partition_idx + 1 == num_partitions); + const int buffer_selector = partition_idx % 2; + return streaming_context_t{ + is_first_partition, + is_last_partition, + current_partition_offset, + &tmp_prefix[buffer_selector], + &tmp_prefix[buffer_selector ^ 0x01], + &tmp_num_uniques[buffer_selector], + &tmp_num_uniques[buffer_selector ^ 0x01]}; + } + else + { + return NullType{}; + } + }(); + + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) + .doit(&detail::rle::DeviceRleSweepKernel< + PolicySelector, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + NumRunsOutputIteratorT, + ScanTileStateT, + EqualityOpT, + local_offset_t, + global_offset_t, + streaming_context_t>, + d_in + current_partition_offset, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + tile_status, + equality_op, + static_cast(current_num_items), + num_current_tiles, + streaming_context))) + { + return error; + } + + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + } + return cudaSuccess; +} +} // namespace detail::rle CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index 6d75fe750ce..87d8aadf084 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -18,15 +18,27 @@ #include #include #include +#include #include +#include #include +#include #include +#if _CCCL_HAS_CONCEPTS() +# include +#endif // _CCCL_HAS_CONCEPTS() + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::rle::non_trivial_runs { +// TODO(bgruber): remove in CCCL 4.0 when we drop all CUB dispatchers template (), @@ -92,6 +104,7 @@ struct sm80_tuning(), @@ -157,6 +170,7 @@ struct sm90_tuning(), @@ 
-238,6 +252,7 @@ struct sm100_tuning struct policy_hub { @@ -311,6 +326,287 @@ struct policy_hub using MaxPolicy = Policy1000; }; + +struct rle_non_trivial_runs_policy +{ + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_algorithm; + CacheLoadModifier load_modifier; + bool store_with_time_slicing; + BlockScanAlgorithm scan_algorithm; + delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}; + + [[nodiscard]] _CCCL_API constexpr friend bool + operator==(const rle_non_trivial_runs_policy& lhs, const rle_non_trivial_runs_policy& rhs) + { + return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread + && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier + && lhs.store_with_time_slicing == rhs.store_with_time_slicing && lhs.scan_algorithm == rhs.scan_algorithm + && lhs.delay_constructor == rhs.delay_constructor; + } + + [[nodiscard]] _CCCL_API constexpr friend bool + operator!=(const rle_non_trivial_runs_policy& lhs, const rle_non_trivial_runs_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const rle_non_trivial_runs_policy& p) + { + return os + << "rle_non_trivial_runs_policy { .block_threads = " << p.block_threads + << ", .items_per_thread = " << p.items_per_thread << ", .load_algorithm = " << p.load_algorithm + << ", .load_modifier = " << p.load_modifier << ", .store_with_time_slicing = " << p.store_with_time_slicing + << ", .scan_algorithm = " << p.scan_algorithm << ", .delay_constructor = " << p.delay_constructor << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) +}; + +#if _CCCL_HAS_CONCEPTS() +template +concept rle_non_trivial_runs_policy_selector = detail::policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + +struct policy_selector +{ + // TODO(bgruber): refactor the key information + int length_size; + int key_size; + type_t key_type; + bool length_is_primitive; 
+ bool key_is_primitive; // TODO(bgruber): can probably be derived from key_type + + _CCCL_API constexpr auto + make_default_policy(BlockLoadAlgorithm block_load_alg, int delay_ctor_key_size, CacheLoadModifier load_mod) const + { + const int nominal_4B_items_per_thread = 15; + const int items_per_thread = + ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / key_size, 1, nominal_4B_items_per_thread); + return rle_non_trivial_runs_policy{ + 96, + items_per_thread, + block_load_alg, + load_mod, + true, + BLOCK_SCAN_WARP_SCANS, + default_reduce_by_key_delay_constructor_policy(delay_ctor_key_size, sizeof(int), true)}; + } + + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy + { + if (arch >= ::cuda::arch_id::sm_100) + { + if (length_is_primitive && key_is_primitive && length_size == 4) + { + if (key_size == 1) + { + // ipt_20.tpb_224.trp_1.ts_0.ld_1.ns_64.dcid_2.l2w_315 1.119878 1.003690 1.130067 1.338983 + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 64, 315}}; + } + if (key_size == 2) + { + // ipt_20.tpb_224.trp_1.ts_0.ld_0.ns_116.dcid_7.l2w_340 1.146528 1.072769 1.152390 1.333333 + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 116, 340}}; + } + if (key_size == 4) + { + // ipt_13.tpb_224.trp_0.ts_0.ld_0.ns_252.dcid_2.l2w_470 1.113202 1.003690 1.133114 1.349296 + return rle_non_trivial_runs_policy{ + 224, + 13, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 252, 470}}; + } + if (key_size == 8 && key_type != type_t::float64) // fall back to SM90 for double + { + // ipt_15.tpb_256.trp_1.ts_0.ld_0.ns_28.dcid_2.l2w_520 1.114944 1.033189 1.122360 1.252083 + return rle_non_trivial_runs_policy{ 
+ 256, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 28, 520}}; + } + } + + // no tuning for SM100, fall-through to SM90 + } + + if (arch >= ::cuda::arch_id::sm_90) + { + if (length_is_primitive && length_size == 4) + { + if (key_is_primitive && key_size == 1) + { + return rle_non_trivial_runs_policy{ + 256, + 18, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 385}}; + } + if (key_is_primitive && key_size == 2) + { + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 675}}; + } + if (key_is_primitive && key_size == 4) + { + return rle_non_trivial_runs_policy{ + 256, + 18, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 695}}; + } + if (key_is_primitive && key_size == 8) + { + return rle_non_trivial_runs_policy{ + 224, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 840}}; + } + if (key_type == type_t::int128 || key_type == type_t::uint128) + { + return rle_non_trivial_runs_policy{ + 288, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 484, 1150}}; + } + } + + // no tuning for SM90, use a default policy + return make_default_policy(BLOCK_LOAD_WARP_TRANSPOSE, length_size, LOAD_DEFAULT); + } + + if (arch >= ::cuda::arch_id::sm_86) + { + // TODO(bgruber): I think we want `LengthT` instead of `int` + return make_default_policy(BLOCK_LOAD_DIRECT, sizeof(int), LOAD_LDG); + } + + if (arch >= ::cuda::arch_id::sm_80) + { + if (length_is_primitive && length_size == 4) + { + if (key_is_primitive && key_size == 1) + { + return rle_non_trivial_runs_policy{ + 192, + 20, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + 
false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 630}}; + } + if (key_is_primitive && key_size == 2) + { + return rle_non_trivial_runs_policy{ + 192, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1015}}; + } + if (key_is_primitive && key_size == 4) + { + return rle_non_trivial_runs_policy{ + 224, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 915}}; + } + if (key_is_primitive && key_size == 8) + { + return rle_non_trivial_runs_policy{ + 256, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1065}}; + } + if (key_type == type_t::int128 || key_type == type_t::uint128) + { + return rle_non_trivial_runs_policy{ + 192, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1050}}; + } + } + // no tuning for SM80, use a default policy + return make_default_policy(BLOCK_LOAD_WARP_TRANSPOSE, length_size, LOAD_DEFAULT); + } + + // default is from SM50 + return make_default_policy(BLOCK_LOAD_DIRECT, int{sizeof(int)}, LOAD_LDG); + } +}; + +#if _CCCL_HAS_CONCEPTS() +static_assert(rle_non_trivial_runs_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() + +template +struct policy_selector_from_types +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy + { + constexpr policy_selector selector{ + sizeof(LengthT), int{sizeof(KeyT)}, classify_type, is_primitive_v, is_primitive_v}; + return selector(arch); + } +}; } // namespace detail::rle::non_trivial_runs CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu index a84fc5315a1..ffb9c6fc783 100644 --- 
a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu +++ b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu @@ -247,18 +247,16 @@ C2H_TEST("DeviceRunLengthEncode::NonTrivialRuns can handle pointers", "[device][ // Guard against #293 template -struct device_rle_policy_hub +struct device_rle_policy_selector { static constexpr int threads = 96; static constexpr int items = 15; - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> + _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> cub::detail::rle::non_trivial_runs::rle_non_trivial_runs_policy { - using RleSweepPolicyT = cub:: - AgentRlePolicy; - }; - - using MaxPolicy = Policy500; + return {threads, items, cub::BLOCK_LOAD_DIRECT, cub::LOAD_DEFAULT, TimeSlicing, cub::BLOCK_SCAN_WARP_SCANS}; + } }; struct CustomDeviceRunLengthEncode @@ -275,28 +273,20 @@ struct CustomDeviceRunLengthEncode OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, - int num_items, + int num_items, // Signed integer type for global offsets cudaStream_t stream = 0) { - using OffsetT = int; // Signed integer type for global offsets - using EqualityOp = cuda::std::equal_to<>; // Default == operator - - return cub::DeviceRleDispatch>:: - Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - EqualityOp(), - num_items, - stream); + return cub::detail::rle::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + cuda::std::equal_to<>{}, // Default == operator + num_items, + stream, + device_rle_policy_selector{}); } }; @@ -308,9 +298,9 @@ using time_slicing = c2h::type_list; C2H_TEST("DeviceRunLengthEncode::NonTrivialRuns does not run out of memory", "[device][run_length_encode]", time_slicing) { using type = typename c2h::get<0, TestType>; - using policy_hub_t = device_rle_policy_hub; + using policy_sel_t = 
device_rle_policy_selector; - constexpr int tile_size = policy_hub_t::threads * policy_hub_t::items; + constexpr int tile_size = policy_sel_t::threads * policy_sel_t::items; constexpr int num_items = 2 * tile_size; constexpr int magic_number = num_items + 1;