diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu index 54d72ae4bb9..e02945740cc 100644 --- a/cub/benchmarks/bench/reduce/by_key.cu +++ b/cub/benchmarks/bench/reduce/by_key.cu @@ -15,72 +15,29 @@ // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# else // TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD - -struct reduce_by_key_policy_hub +struct bench_reduce_by_key_policy_selector { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> cub::detail::reduce_by_key::reduce_by_key_policy { - using ReduceByKeyPolicyT = - cub::AgentReduceByKeyPolicy; - }; - - using MaxPolicy = Policy500; + return { + TUNE_THREADS, + TUNE_ITEMS, + TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE, + TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA, + cub::BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy, + }; + } }; #endif // !TUNE_BASE template static void reduce(nvbench::state& state, nvbench::type_list) { - using keys_input_it_t = const KeyT*; - using unique_output_it_t = KeyT*; - using vals_input_it_t = const ValueT*; - using aggregate_output_it_t = ValueT*; - using num_runs_output_iterator_t = OffsetT*; - using equality_op_t = ::cuda::std::equal_to<>; - using reduction_op_t = ::cuda::std::plus<>; - using accum_t = ValueT; - using offset_t = OffsetT; - -#if !TUNE_BASE - using dispatch_t = cub::DispatchReduceByKey< - keys_input_it_t, - unique_output_it_t, - vals_input_it_t, - aggregate_output_it_t, - num_runs_output_iterator_t, - equality_op_t, - reduction_op_t, - offset_t, - accum_t, - reduce_by_key_policy_hub>; -#else - using dispatch_t = cub::DispatchReduceByKey< - keys_input_it_t, - unique_output_it_t, - vals_input_it_t, - aggregate_output_it_t, - num_runs_output_iterator_t, - equality_op_t, - reduction_op_t, - offset_t, - accum_t>; -#endif + using equality_op_t = ::cuda::std::equal_to<>; + using reduction_op_t = ::cuda::std::plus<>; + using offset_t = OffsetT; const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; @@ -92,43 +49,42 @@ static void reduce(nvbench::state& state, nvbench::type_list out_keys(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); - KeyT* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); + const KeyT* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); KeyT* d_out_keys = thrust::raw_pointer_cast(out_keys.data()); - ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data()); + const ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data()); ValueT* d_out_vals = thrust::raw_pointer_cast(out_vals.data()); OffsetT* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; + const offset_t num_items = static_cast(elements); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - 0); + auto dispatch_on_stream = [&](cudaStream_t stream) { + return cub::detail::reduce_by_key::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in_keys, + d_out_keys, + d_in_vals, + d_out_vals, + d_num_runs_out, + equality_op_t{}, + reduction_op_t{}, + num_items, + stream +#if !TUNE_BASE + , + bench_reduce_by_key_policy_selector{} +#endif + ); + }; + + dispatch_on_stream(cudaStream_t{0}); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - 0); + dispatch_on_stream(cudaStream_t{0}); cudaDeviceSynchronize(); const OffsetT num_runs = num_runs_out[0]; @@ -140,18 +96,7 @@ static void reduce(nvbench::state& state, nvbench::type_list(1); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - launch.get_stream()); + dispatch_on_stream(launch.get_stream()); }); } diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index 51f23a424f3..37227aa7649 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -17,32 +17,20 @@ // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# else // TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD - -struct reduce_by_key_policy_hub +struct bench_encode_policy_selector { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> cub::detail::reduce_by_key::reduce_by_key_policy { - using ReduceByKeyPolicyT = - cub::AgentReduceByKeyPolicy; - }; - - using MaxPolicy = Policy500; + return { + TUNE_THREADS, + TUNE_ITEMS, + TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE, + TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA, + cub::BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy, + }; + } }; #endif // !TUNE_BASE @@ -56,41 +44,10 @@ static void rle(nvbench::state& state, nvbench::type_list; - using run_length_output_it_t = run_length_t*; - using num_runs_output_iterator_t = num_runs_t*; - using equality_op_t = ::cuda::std::equal_to<>; - using reduction_op_t = ::cuda::std::plus<>; - using accum_t = run_length_t; - -#if !TUNE_BASE - using dispatch_t = cub::detail::reduce::DispatchStreamingReduceByKey< - keys_input_it_t, - unique_output_it_t, - run_length_input_it_t, - run_length_output_it_t, - num_runs_output_iterator_t, - equality_op_t, - reduction_op_t, - offset_t, - accum_t, - reduce_by_key_policy_hub>; -#else - using policy_t = cub::detail::rle::encode::policy_hub; - using dispatch_t = cub::detail::reduce::DispatchStreamingReduceByKey< - keys_input_it_t, - unique_output_it_t, - run_length_input_it_t, - run_length_output_it_t, - num_runs_output_iterator_t, - equality_op_t, - reduction_op_t, - offset_t, - accum_t, - policy_t>; -#endif + using run_length_input_it_t = thrust::constant_iterator; + using equality_op_t = ::cuda::std::equal_to<>; + using reduction_op_t = ::cuda::std::plus<>; + using accum_t = run_length_t; const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; @@ -101,7 +58,7 @@ static void rle(nvbench::state& state, nvbench::type_list out_keys(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); - T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); + const T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); T* d_out_keys = thrust::raw_pointer_cast(out_keys.data()); auto d_out_vals = thrust::raw_pointer_cast(out_vals.data()); auto d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); @@ -109,35 +66,35 @@ static void rle(nvbench::state& state, nvbench::type_list(elements); + + auto dispatch_on_stream = [&](cudaStream_t stream) { + return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key( + d_temp_storage, + temp_storage_bytes, + d_in_keys, + d_out_keys, + d_in_vals, + d_out_vals, + d_num_runs_out, + equality_op_t{}, + reduction_op_t{}, + num_items, + stream, +#if TUNE_BASE + cub::detail::rle::encode::policy_selector_from_types {} +#else // TUNE_BASE + bench_encode_policy_selector{} +#endif // TUNE_BASE + ); + }; - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - 0); + dispatch_on_stream(cudaStream_t{0}); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - 0); + dispatch_on_stream(cudaStream_t{0}); cudaDeviceSynchronize(); const num_runs_t num_runs = num_runs_out[0]; @@ -148,18 +105,7 @@ static void rle(nvbench::state& state, nvbench::type_list(1); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_keys, - d_in_vals, - d_out_vals, - d_num_runs_out, - equality_op_t{}, - reduction_op_t{}, - elements, - launch.get_stream()); + dispatch_on_stream(launch.get_stream()); }); } diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index 43f74df5f35..9566cb705e4 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -2,10 +2,9 @@ // SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 -/** - * @file cub::AgentReduceByKey implements a stateful abstraction of CUDA thread - * blocks for participating in device-wide reduce-value-by-key. - */ +//! @file +//! cub::detail::reduce_by_key::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for +//! participating in device-wide reduce-value-by-key. #pragma once @@ -92,7 +91,7 @@ struct AgentReduceByKeyPolicy * Thread block abstractions ******************************************************************************/ -namespace detail::reduce +namespace detail::reduce_by_key { /** * @brief AgentReduceByKey implements a stateful abstraction of CUDA thread @@ -771,6 +770,6 @@ struct AgentReduceByKey } } }; -} // namespace detail::reduce +} // namespace detail::reduce_by_key CUB_NAMESPACE_END diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh index 23d0c6d571e..9d939d4563f 100644 --- a/cub/cub/detail/delay_constructor.cuh +++ b/cub/cub/detail/delay_constructor.cuh @@ -193,15 +193,27 @@ struct delay_constructor_for using delay_constructor_t = typename delay_constructor_for::type; -_CCCL_API constexpr auto -default_reduce_by_key_delay_constructor_policy(int key_size, int value_size, bool value_is_primitive) +_CCCL_API constexpr auto default_delay_constructor_policy(bool is_primitive_or_trivially_copyable) { - if (value_is_primitive && (value_size + key_size < 16)) + if (is_primitive_or_trivially_copyable) { - return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450}; + return delay_constructor_policy{delay_constructor_kind::fixed_delay, 350, 450}; } return delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450}; } + +_CCCL_API constexpr auto default_reduce_by_key_delay_constructor_policy( + int key_size, + int value_size, + bool key_is_primitive_or_trivially_copyable, + bool value_is_primitive_or_trivially_copyable) +{ + if (value_is_primitive_or_trivially_copyable && (value_size + key_size < 16)) + { + return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450}; + } + return default_delay_constructor_policy(key_is_primitive_or_trivially_copyable); +} } // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 50101c16e0f..afbbe219bf9 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -2308,35 +2308,21 @@ public: { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey"); - // Signed integer type for global offsets - using OffsetT = detail::choose_offset_t; - - // FlagT iterator type (not used) - - // Selection op (not used) - - // Default == operator + using OffsetT = detail::choose_offset_t; using EqualityOp = ::cuda::std::equal_to<>; - return DispatchReduceByKey< - KeysInputIteratorT, - UniqueOutputIteratorT, - ValuesInputIteratorT, - AggregatesOutputIteratorT, - NumRunsOutputIteratorT, - EqualityOp, - ReductionOpT, - OffsetT>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - EqualityOp(), - reduction_op, - static_cast(num_items), - stream); + return detail::reduce_by_key::dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + EqualityOp{}, + reduction_op, + static_cast(num_items), + stream); } }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 44fac1b1cd9..8b1373afdc5 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -186,29 +186,21 @@ struct DeviceRunLengthEncode using key_t = cub::detail::non_void_value_t>; - using policy_t = detail::rle::encode::policy_hub; + using policy_selector_t = detail::rle::encode::policy_selector_from_types; - return detail::reduce::DispatchStreamingReduceByKey< - InputIteratorT, - UniqueOutputIteratorT, - lengths_input_iterator_t, - LengthsOutputIteratorT, - NumRunsOutputIteratorT, - equality_op, - reduction_op, - offset_t, - accum_t, - policy_t>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_unique_out, - lengths_input_iterator_t((length_t) 1), - d_counts_out, - d_num_runs_out, - equality_op(), - reduction_op(), - num_items, - stream); + return detail::reduce_by_key::dispatch_streaming_reduce_by_key( + d_temp_storage, + temp_storage_bytes, + d_in, + d_unique_out, + lengths_input_iterator_t(length_t{1}), + d_counts_out, + d_num_runs_out, + equality_op{}, + reduction_op{}, + static_cast(num_items), + stream, + policy_selector_t{}); } //! @rst diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index cda8857f460..14fbf3ae15c 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -20,6 +20,8 @@ #endif // no system header #include +#include +#include #include #include #include @@ -29,13 +31,17 @@ #include +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ -namespace detail::reduce +namespace detail::reduce_by_key { template struct streaming_context @@ -106,8 +112,8 @@ struct streaming_context /** * @brief Multi-block reduce-by-key sweep kernel entry point * - * @tparam AgentReduceByKeyPolicyT - * Parameterized AgentReduceByKeyPolicyT tuning policy type + * @tparam PolicySelector + * Selects the tuning policy * * @tparam KeysInputIteratorT * Random-access input iterator type for keys @@ -167,7 +173,7 @@ struct streaming_context * @param num_items * Total number of items to select from */ -template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_THREADS)) +#if _CCCL_HAS_CONCEPTS() + requires reduce_by_key_policy_selector +#endif +__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel( KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, @@ -194,8 +203,19 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH _CCCL_GRID_CONSTANT const StreamingContextT streaming_context, vsmem_t vsmem) { + static constexpr reduce_by_key_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + using AgentReduceByKeyPolicyT = AgentReduceByKeyPolicy< + policy.block_threads, + policy.items_per_thread, + policy.load_algorithm, + policy.load_modifier, + policy.scan_algorithm, + delay_constructor_t>; + using vsmem_helper_t = vsmem_helper_default_fallback_policy_t< - typename ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT, + AgentReduceByKeyPolicyT, AgentReduceByKey, KeysInputIteratorT, UniqueOutputIteratorT, @@ -234,7 +254,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH // If applicable, hints to discard modified cache lines for vsmem vsmem_helper_t::discard_temp_storage(temp_storage); } -} // namespace detail::reduce +} // namespace detail::reduce_by_key /****************************************************************************** * Dispatch @@ -272,6 +292,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ +// TODO(bgruber): deprecate when we make the tuning API public and remove in CCCL 4.0 template ( detail::scan::DeviceCompactInitKernel, - detail::reduce::DeviceReduceByKeyKernel< - typename PolicyHub::MaxPolicy, + detail::reduce_by_key::DeviceReduceByKeyKernel< + detail::reduce_by_key::policy_selector_from_hub, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, @@ -622,4 +643,216 @@ struct DispatchReduceByKey } }; +namespace detail::reduce_by_key +{ +// we move the conversion of the policy to the agent policy and its use out of the lambda below, so MSVC does not ICE +template +_CCCL_API auto determine_threads_items_vsmem(PolicyGetter policy_getter) +{ + // TODO(bgruber): refactor this in the future + constexpr reduce_by_key_policy policy = policy_getter(); + using Policy = AgentReduceByKeyPolicy< + policy.block_threads, + policy.items_per_thread, + policy.load_algorithm, + policy.load_modifier, + policy.scan_algorithm, + delay_constructor_t>; + using vsmem_helper_t = vsmem_helper_default_fallback_policy_t; + return ::cuda::std::tuple{vsmem_helper_t::agent_policy_t::BLOCK_THREADS, + vsmem_helper_t::agent_policy_t::ITEMS_PER_THREAD, + vsmem_helper_t::vsmem_per_block}; +} + +template < + typename OverrideAccumT = use_default, + typename KeysInputIteratorT, + typename UniqueOutputIteratorT, + typename ValuesInputIteratorT, + typename AggregatesOutputIteratorT, + typename NumRunsOutputIteratorT, + typename EqualityOpT, + typename ReductionOpT, + typename OffsetT, + typename AccumT = ::cuda::std::conditional_t< + !::cuda::std::is_same_v, + OverrideAccumT, + ::cuda::std::__accumulator_t, it_value_t>>, + typename KeyT = non_void_value_t>, + typename PolicySelector = policy_selector_from_types> +#if _CCCL_HAS_CONCEPTS() + requires reduce_by_key::reduce_by_key_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + EqualityOpT equality_op, + ReductionOpT reduction_op, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}) +{ + using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet + using ScanTileStateT = ReduceByKeyScanTileState; + [[maybe_unused]] static constexpr int init_kernel_threads = 128; + + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(ptx_arch_id(arch_id))) + { + return error; + } + + return detail::dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) { +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET( + NV_IS_HOST, + (::std::stringstream ss; ss << policy_getter(); _CubLog( + "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast(arch_id), ss.str().c_str());)) +#endif + + const auto [block_threads, items_per_thread, vsmem_per_block] = determine_threads_items_vsmem< + decltype(policy_getter), + KeysInputIteratorT, + UniqueOutputIteratorT, + ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT, + AccumT, + streaming_context_t>(policy_getter); + + // Number of input tiles + const int tile_size = block_threads * items_per_thread; + const int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + + // The amount of virtual shared memory to allocate + const auto vsmem_size = num_tiles * vsmem_per_block; + + size_t tile_descriptor_memory{}; + if (const auto error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, tile_descriptor_memory))) + { + return error; + } + size_t allocation_sizes[2] = {tile_descriptor_memory, vsmem_size}; + void* allocations[2] = {}; + + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + { + return error; + } + + if (d_temp_storage == nullptr) + { + return cudaSuccess; + } + + ScanTileStateT tile_state; + if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) + { + return error; + } + + const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_tiles, init_kernel_threads)); +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); +#endif + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream) + .doit(detail::scan::DeviceCompactInitKernel, + tile_state, + num_tiles, + d_num_runs_out))) + { + return error; + } + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + if (num_items == 0) + { + return cudaSuccess; + } + + auto reduce_by_key_kernel = &DeviceReduceByKeyKernel< + PolicySelector, + KeysInputIteratorT, + UniqueOutputIteratorT, + ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + ScanTileStateT, + EqualityOpT, + ReductionOpT, + OffsetT, + AccumT, + streaming_context_t>; + + int reduce_by_key_sm_occupancy{}; + if (const auto error = CubDebug(MaxSmOccupancy(reduce_by_key_sm_occupancy, reduce_by_key_kernel, block_threads))) + { + return error; + } + + int device_ordinal{}; + if (const auto error = CubDebug(cudaGetDevice(&device_ordinal))) + { + return error; + } + int max_dim_x{}; + if (const auto error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) + { + return error; + } + + const int scan_grid_size = ::cuda::std::min(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, + scan_grid_size, + block_threads, + (long long) stream, + items_per_thread, + reduce_by_key_sm_occupancy); +#endif + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream) + .doit(reduce_by_key_kernel, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + tile_state, + start_tile, + equality_op, + reduction_op, + num_items, + streaming_context_t{}, + cub::detail::vsmem_t{allocations[1]}))) + { + return error; + } + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + } + return cudaSuccess; + }); +} +} // namespace detail::reduce_by_key + CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh index 57d34ff4e7e..5d959df2c4c 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh @@ -14,442 +14,246 @@ #endif // no system header #include +#include #include #include -#include #include -#include #include #include -#include #include -#if !_CCCL_COMPILER(NVRTC) -# include -#endif // !_CCCL_COMPILER(NVRTC) +#include +#include +#include + +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) CUB_NAMESPACE_BEGIN -namespace detail::reduce +namespace detail::reduce_by_key { -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * @brief Utility class for dispatching the appropriately-tuned kernels for - * DeviceReduceByKey - * - * @tparam KeysInputIteratorT - * Random-access input iterator type for keys - * - * @tparam UniqueOutputIteratorT - * Random-access output iterator type for keys - * - * @tparam ValuesInputIteratorT - * Random-access input iterator type for values - * - * @tparam AggregatesOutputIteratorT - * Random-access output iterator type for values - * - * @tparam NumRunsOutputIteratorT - * Output iterator type for recording number of segments encountered - * - * @tparam EqualityOpT - * KeyT equality operator type - * - * @tparam ReductionOpT - * ValueT reduction operator type - * - * @tparam PolicyHub - * Implementation detail, do not specify directly, requirements on the - * content of this type are subject to breaking change. - */ -template , - cub::detail::it_value_t>, - typename PolicyHub = detail::reduce_by_key::policy_hub< - ReductionOpT, - AccumT, - cub::detail::non_void_value_t>>> -struct DispatchStreamingReduceByKey +template < + typename OverrideAccumT = use_default, + typename KeysInputIteratorT, + typename UniqueOutputIteratorT, + typename ValuesInputIteratorT, + typename AggregatesOutputIteratorT, + typename NumRunsOutputIteratorT, + typename EqualityOpT, + typename ReductionOpT, + typename OffsetT, + typename AccumT = ::cuda::std::conditional_t< + !::cuda::std::is_same_v, + OverrideAccumT, + ::cuda::std::__accumulator_t, it_value_t>>, + typename PolicySelector = + policy_selector_from_types>>> +#if _CCCL_HAS_CONCEPTS() + requires reduce_by_key_policy_selector +#endif +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_reduce_by_key( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + EqualityOpT equality_op, + ReductionOpT reduction_op, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}) { - //------------------------------------------------------------------------- - // Types and constants - //------------------------------------------------------------------------- - // Offsets to index items within one partition (i.e., a single kernel invocation) - using local_offset_t = ::cuda::std::int32_t; - - // If the number of items provided by the user may exceed the maximum number of items processed by a single kernel - // invocation, we may require multiple kernel invocations - static constexpr bool use_streaming_invocation = ::cuda::std::numeric_limits::max() - > ::cuda::std::numeric_limits::max(); - - // Offsets to index any item within the entire input (large enough to cover num_items) - using global_offset_t = OffsetT; - - // Type used to provide context about the current partition during a streaming invocation - using streaming_context_t = - ::cuda::std::conditional_t, - NullType>; - - // The input values type - using ValueInputT = cub::detail::it_value_t; + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(ptx_arch_id(arch_id))) + { + return error; + } - static constexpr int init_kernel_threads = 128; + const reduce_by_key_policy policy = policy_selector(arch_id); - // Tile status descriptor interface type - using ScanTileStateT = ReduceByKeyScanTileState; +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET(NV_IS_HOST, + (::std::stringstream ss; ss << policy; + _CubLog("Dispatching streaming reduce by key to arch %d with tuning: %s\n", + static_cast(arch_id), + ss.str().c_str());)) +#endif - void* d_temp_storage; - size_t& temp_storage_bytes; - KeysInputIteratorT d_keys_in; - UniqueOutputIteratorT d_unique_out; - ValuesInputIteratorT d_values_in; - AggregatesOutputIteratorT d_aggregates_out; - NumRunsOutputIteratorT d_num_runs_out; - EqualityOpT equality_op; - ReductionOpT reduction_op; - global_offset_t num_items; - cudaStream_t stream; + using local_offset_t = ::cuda::std::int32_t; + using global_offset_t = OffsetT; + static constexpr bool use_streaming_invocation = + ::cuda::std::numeric_limits::max() > ::cuda::std::numeric_limits::max(); + using streaming_context_t = ::cuda::std:: + conditional_t, NullType>; + using ScanTileStateT = ReduceByKeyScanTileState; + [[maybe_unused]] static constexpr int init_kernel_threads = 128; + + const int block_threads = policy.block_threads; + const int items_per_thread = policy.items_per_thread; + const auto tile_size = static_cast(block_threads * items_per_thread); + + auto capped_num_items_per_invocation = num_items; + if constexpr (use_streaming_invocation) + { + capped_num_items_per_invocation = static_cast(::cuda::std::numeric_limits::max()); + capped_num_items_per_invocation -= (capped_num_items_per_invocation % tile_size); + } - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchStreamingReduceByKey( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - UniqueOutputIteratorT d_unique_out, - ValuesInputIteratorT d_values_in, - AggregatesOutputIteratorT d_aggregates_out, - NumRunsOutputIteratorT d_num_runs_out, - EqualityOpT equality_op, - ReductionOpT reduction_op, - global_offset_t num_items, - cudaStream_t stream) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_keys_in(d_keys_in) - , d_unique_out(d_unique_out) - , d_values_in(d_values_in) - , d_aggregates_out(d_aggregates_out) - , d_num_runs_out(d_num_runs_out) - , equality_op(equality_op) - , reduction_op(reduction_op) - , num_items(num_items) - , stream(stream) - {} + const auto max_num_items_per_invocation = + use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items; + auto const num_partitions = + (num_items == 0) ? global_offset_t{1} : ::cuda::ceil_div(num_items, capped_num_items_per_invocation); - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- + const auto max_num_tiles = static_cast(::cuda::ceil_div(max_num_items_per_invocation, tile_size)); - template - CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t - Invoke(ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel) + size_t allocation_sizes[3]; + if (const auto error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0]))) { - using AgentReduceByKeyPolicyT = typename ActivePolicyT::ReduceByKeyPolicyT; - constexpr int block_threads = AgentReduceByKeyPolicyT::BLOCK_THREADS; - constexpr int items_per_thread = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD; - - // The upper bound of for the number of items that a single kernel invocation will ever process - auto capped_num_items_per_invocation = num_items; - if constexpr (use_streaming_invocation) - { - capped_num_items_per_invocation = - static_cast(::cuda::std::numeric_limits::max()); - // Make sure that the number of items is a multiple of tile size - capped_num_items_per_invocation -= (capped_num_items_per_invocation % (block_threads * items_per_thread)); - } - - // Across invocations, the maximum number of items that a single kernel invocation will ever process - const auto max_num_items_per_invocation = - use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items; + return error; + } + allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0}; + allocation_sizes[2] = num_partitions > 1 ? sizeof(AccumT) * 2 : size_t{0}; - // Number of invocations required to "iterate" over the total input (at least one iteration to process zero items) - auto const num_partitions = - (num_items == 0) ? global_offset_t{1} : ::cuda::ceil_div(num_items, capped_num_items_per_invocation); + void* allocations[3] = {}; + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + { + return error; + } - cudaError error = cudaSuccess; + if (d_temp_storage == nullptr) + { + return cudaSuccess; + } - // Number of input tiles - const auto tile_size = static_cast(block_threads * items_per_thread); - int max_num_tiles = static_cast(::cuda::ceil_div(max_num_items_per_invocation, tile_size)); + for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) + { + global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation; + global_offset_t current_num_items = + (partition_idx + 1 == num_partitions) ? (num_items - current_partition_offset) : capped_num_items_per_invocation; - // Specify temporary storage allocation requirements - size_t allocation_sizes[3]; - error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0])); - if (cudaSuccess != error) + const auto num_current_tiles = static_cast(::cuda::ceil_div(current_num_items, tile_size)); + ScanTileStateT tile_state; + if (const auto error = CubDebug(tile_state.Init(num_current_tiles, allocations[0], allocation_sizes[0]))) { return error; } - allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0}; - allocation_sizes[2] = num_partitions > 1 ? sizeof(AccumT) * 2 : size_t{0}; - // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[3] = {}; - - error = CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - if (cudaSuccess != error) + const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads)); +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); +#endif + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream) + .doit(&detail::scan::DeviceCompactInitKernel, + tile_state, + num_current_tiles, + d_num_runs_out))) { return error; } - if (d_temp_storage == nullptr) + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) { - // Return if the caller is simply requesting the size of the storage allocation return error; } - // Iterate over the partitions until all input is processed - for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) + if (num_items == 0) { - global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation; - global_offset_t current_num_items = - (partition_idx + 1 == num_partitions) - ? (num_items - current_partition_offset) - : capped_num_items_per_invocation; - - // Construct the tile status interface - const auto num_current_tiles = static_cast(::cuda::ceil_div(current_num_items, tile_size)); - - // Construct the tile status interface - ScanTileStateT tile_state; - error = CubDebug(tile_state.Init(num_current_tiles, allocations[0], allocation_sizes[0])); - if (cudaSuccess != error) - { - return error; - } - - // Log init_kernel configuration - int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads)); + return cudaSuccess; + } #ifdef CUB_DEBUG_LOG - _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); -#endif // CUB_DEBUG_LOG - - // Invoke init_kernel to initialize tile descriptors - error = CubDebug( - THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream) - .doit(init_kernel, tile_state, num_current_tiles, d_num_runs_out)); - if (cudaSuccess != error) - { - return error; - } - - // Sync the stream if specified to flush runtime errors - error = CubDebug(detail::DebugSyncStream(stream)); - if (cudaSuccess != error) - { - return error; - } + _CubLog("Invoking reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + num_current_tiles, + block_threads, + (long long) stream, + items_per_thread); +#endif + auto reduce_by_key_kernel = DeviceReduceByKeyKernel< + PolicySelector, + KeysInputIteratorT, + UniqueOutputIteratorT, + ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + ScanTileStateT, + EqualityOpT, + ReductionOpT, + local_offset_t, + AccumT, + streaming_context_t>; - // For empty problems we can skip the reduce_by_key_kernel - if (num_items == 0) + if constexpr (use_streaming_invocation) + { + auto tmp_num_uniques = static_cast(allocations[1]); + auto tmp_prefix = static_cast(allocations[2]); + const bool is_first_partition = (partition_idx == 0); + const bool is_last_partition = (partition_idx + 1 == num_partitions); + const int buffer_selector = partition_idx % 2; + streaming_context_t streaming_context{ + is_first_partition, + is_last_partition, + is_first_partition ? d_keys_in : d_keys_in + current_partition_offset - 1, + &tmp_prefix[buffer_selector], + &tmp_prefix[buffer_selector ^ 0x01], + &tmp_num_uniques[buffer_selector], + &tmp_num_uniques[buffer_selector ^ 0x01]}; + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) + .doit(reduce_by_key_kernel, + d_keys_in + current_partition_offset, + d_unique_out, + d_values_in + current_partition_offset, + d_aggregates_out, + d_num_runs_out, + tile_state, + 0, + equality_op, + reduction_op, + static_cast(current_num_items), + streaming_context, + detail::vsmem_t{nullptr}))) { return error; } - -// Log reduce_by_key_kernel configuration -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d " - "items per thread\n", - num_current_tiles, - block_threads, - (long long) stream, - items_per_thread); -#endif // CUB_DEBUG_LOG - - // Invoke reduce_by_key_kernel - - if constexpr (use_streaming_invocation) - { - auto tmp_num_uniques = static_cast(allocations[1]); - auto tmp_prefix = static_cast(allocations[2]); - - const bool is_first_partition = (partition_idx == 0); - const bool is_last_partition = (partition_idx + 1 == num_partitions); - const int buffer_selector = partition_idx % 2; - - streaming_context_t streaming_context{ - is_first_partition, - is_last_partition, - is_first_partition ? d_keys_in : d_keys_in + current_partition_offset - 1, - &tmp_prefix[buffer_selector], - &tmp_prefix[buffer_selector ^ 0x01], - &tmp_num_uniques[buffer_selector], - &tmp_num_uniques[buffer_selector ^ 0x01]}; - - error = CubDebug( - THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) - .doit(reduce_by_key_kernel, - d_keys_in + current_partition_offset, - d_unique_out, - d_values_in + current_partition_offset, - d_aggregates_out, - d_num_runs_out, - tile_state, - 0, - equality_op, - reduction_op, - static_cast(current_num_items), - streaming_context, - cub::detail::vsmem_t{nullptr})); - if (cudaSuccess != error) - { - return error; - } - } - else - { - error = CubDebug( - THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) - .doit(reduce_by_key_kernel, - d_keys_in + current_partition_offset, - d_unique_out, - d_values_in + current_partition_offset, - d_aggregates_out, - d_num_runs_out, - tile_state, - 0, - equality_op, - reduction_op, - static_cast(current_num_items), - NullType{}, - cub::detail::vsmem_t{nullptr})); - if (cudaSuccess != error) - { - return error; - } - } - - // Sync the stream if specified to flush runtime errors - error = CubDebug(detail::DebugSyncStream(stream)); - if (cudaSuccess != error) + } + else + { + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) + .doit(reduce_by_key_kernel, + d_keys_in + current_partition_offset, + d_unique_out, + d_values_in + current_partition_offset, + d_aggregates_out, + d_num_runs_out, + tile_state, + 0, + equality_op, + reduction_op, + static_cast(current_num_items), + NullType{}, + detail::vsmem_t{nullptr}))) { return error; } } - - return error; - } - - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() - { - return Invoke( - detail::scan::DeviceCompactInitKernel, - detail::reduce::DeviceReduceByKeyKernel< - typename PolicyHub::MaxPolicy, - KeysInputIteratorT, - UniqueOutputIteratorT, - ValuesInputIteratorT, - AggregatesOutputIteratorT, - NumRunsOutputIteratorT, - ScanTileStateT, - EqualityOpT, - ReductionOpT, - local_offset_t, - AccumT, - streaming_context_t>); - } - - /** - * Internal dispatch routine - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Pointer to the input sequence of keys - * - * @param[out] d_unique_out - * Pointer to the output sequence of unique keys (one key per run) - * - * @param[in] d_values_in - * Pointer to the input sequence of corresponding values - * - * @param[out] d_aggregates_out - * Pointer to the output sequence of value aggregates - * (one aggregate per run) - * - * @param[out] d_num_runs_out - * Pointer to total number of runs encountered - * (i.e., the length of d_unique_out) - * - * @param[in] equality_op - * KeyT equality operator - * - * @param[in] reduction_op - * ValueT reduction operator - * - * @param[in] num_items - * Total number of items to select from - * - * @param[in] stream - * CUDA stream to launch kernels within. Default is stream0. - */ - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - UniqueOutputIteratorT d_unique_out, - ValuesInputIteratorT d_values_in, - AggregatesOutputIteratorT d_aggregates_out, - NumRunsOutputIteratorT d_num_runs_out, - EqualityOpT equality_op, - ReductionOpT reduction_op, - global_offset_t num_items, - cudaStream_t stream) - { - cudaError error = cudaSuccess; - - do + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) { - // Get PTX version - int ptx_version = 0; - error = CubDebug(PtxVersion(ptx_version)); - if (cudaSuccess != error) - { - break; - } - - DispatchStreamingReduceByKey dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - equality_op, - reduction_op, - num_items, - stream); - - // Dispatch - error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); - if (cudaSuccess != error) - { - break; - } - } while (0); - - return error; + return error; + } } -}; -} // namespace detail::reduce + + return cudaSuccess; +} +} // namespace detail::reduce_by_key CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index cfaad7185da..64c96bc77d6 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -17,32 +17,46 @@ #include #include #include -#include +#include +#include #include #include +#include #include #include +#include + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif CUB_NAMESPACE_BEGIN namespace detail::reduce_by_key { +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers enum class primitive_key { no, yes }; + +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers enum class primitive_accum { no, yes }; + +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers enum class primitive_op { no, yes }; + +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers enum class key_size { _1, @@ -52,6 +66,8 @@ enum class key_size _16, unknown }; + +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers enum class accum_size { _1, @@ -62,26 +78,30 @@ enum class accum_size unknown }; +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template -constexpr primitive_key is_primitive_key() +_CCCL_API constexpr primitive_key is_primitive_key() { return detail::is_primitive::value ? primitive_key::yes : primitive_key::no; } +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template -constexpr primitive_accum is_primitive_accum() +_CCCL_API constexpr primitive_accum is_primitive_accum() { return detail::is_primitive::value ? primitive_accum::yes : primitive_accum::no; } +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template -constexpr primitive_op is_primitive_op() +_CCCL_API constexpr primitive_op is_primitive_op() { return basic_binary_op_t::value ? primitive_op::yes : primitive_op::no; } +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template -constexpr key_size classify_key_size() +_CCCL_API constexpr key_size classify_key_size() { return sizeof(KeyT) == 1 ? key_size::_1 : sizeof(KeyT) == 2 ? key_size::_2 @@ -92,8 +112,9 @@ constexpr key_size classify_key_size() : key_size::unknown; } +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template -constexpr accum_size classify_accum_size() +_CCCL_API constexpr accum_size classify_accum_size() { return sizeof(AccumT) == 1 ? accum_size::_1 : sizeof(AccumT) == 2 ? accum_size::_2 @@ -104,6 +125,7 @@ constexpr accum_size classify_accum_size() : accum_size::unknown; } +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template ; }; +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template ; }; +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template struct sm100_tuning { // ipt_14.tpb_224.trp_1.ld_0.ns_476.dcid_5.l2w_1005 1.187378 1.119705 1.185397 1.258420 - static constexpr int items = 14; static constexpr int threads = 224; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; @@ -833,6 +856,7 @@ struct sm100_tuning; // }; +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers template struct policy_hub { @@ -895,7 +919,7 @@ struct policy_hub struct Policy1000 : ChainedPolicy<1000, Policy1000, Policy900> { - // Use values from tuning if a specialization exists, otherwise pick the default + // Use values from tuning if a specialization exists, otherwise fall back to SM90 template static auto select_agent_policy(int) -> AgentReduceByKeyPolicy +concept reduce_by_key_policy_selector = detail::policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + +struct policy_selector +{ + int key_size; + int accum_size; + type_t accum_t; + + // TODO(bgruber): we want to get rid of the following four and just assume by default that types behave "primitive". + // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because + // then users can opt-out of tunings again + bool key_is_primitive; + bool key_is_trivially_copyable; + bool accum_is_primitive; + bool op_is_primitive; + + _CCCL_API constexpr auto __make_default_policy(CacheLoadModifier load_mod) const -> reduce_by_key_policy + { + constexpr int nominal_4B_items_per_thread = 6; + const int combined_input_bytes = key_size + accum_size; + const int max_input_bytes = (::cuda::std::max) (key_size, accum_size); + const int items_per_thread = + (max_input_bytes <= 8) + ? 6 + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); + return reduce_by_key_policy{ + 128, + items_per_thread, + BLOCK_LOAD_DIRECT, + load_mod, + BLOCK_SCAN_WARP_SCANS, + default_reduce_by_key_delay_constructor_policy( + accum_size, sizeof(int), key_is_primitive || key_is_trivially_copyable, true)}; + } + + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy + { + // bail out if we don't know the operation. TODO(bgruber): drop this check when we make the tuning API public + if (!op_is_primitive) + { + return __make_default_policy(LOAD_LDG); + } + + const bool use_tuning = (key_is_primitive || key_size == 16) && (accum_is_primitive || accum_size == 16); + + if (arch >= ::cuda::arch_id::sm_100 && use_tuning) // if we don't have a tuning, fall back to SM90 + { + if (key_size == 1 && accum_size == 1) + { + // ipt_13.tpb_576.trp_0.ld_1.ns_2044.dcid_5.l2w_240 1.161888 0.848558 1.134941 1.299109 + return {576, + 13, + BLOCK_LOAD_DIRECT, + LOAD_CA, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon_jitter_window, 2044, 240}}; + } + if (key_size == 1 && accum_size == 2) + { + // ipt_10.tpb_224.trp_0.ld_0.ns_244.dcid_4.l2w_390 1.313932 1.260540 1.319588 1.427374 + return {224, + 10, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff_jitter_window, 224, 390}}; + } + if (key_size == 1 && accum_size == 4) + { + // ipt_14.tpb_128.trp_0.ld_0.ns_248.dcid_2.l2w_285 1.118109 1.051534 1.134336 1.326788 + return {128, + 14, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 248, 285}}; + } + if (key_size == 1 && accum_size == 8) + { + // ipt_19.tpb_128.trp_1.ld_0.ns_132.dcid_1.l2w_540 1.113820 1.002404 1.105014 1.202296 + return {128, + 19, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 132, 540}}; + } + if (key_size == 2 && accum_size == 1) + { + // ipt_14.tpb_128.trp_1.ld_0.ns_164.dcid_2.l2w_290 1.239579 1.119705 1.239111 1.313112 + return {128, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 164, 290}}; + } + if (key_size == 2 && accum_size == 2) + { + // ipt_14.tpb_256.trp_1.ld_0.ns_180.dcid_2.l2w_975 1.145635 1.012658 1.139956 1.251546 + return {256, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 180, 975}}; + } + if (key_size == 2 && accum_size == 4 && accum_t != type_t::float32) // I16, F32, I32 regressed, fall back to SM90 + { + // ipt_11.tpb_256.trp_0.ld_0.ns_224.dcid_2.l2w_550 1.066293 1.000109 1.073092 1.181818 + return {256, + 11, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 224, 550}}; + } + if (key_size == 2 && accum_size == 8) + { + // ipt_10.tpb_160.trp_1.ld_0.ns_156.dcid_1.l2w_725 1.045007 1.002105 1.049690 1.141827 + return {160, + 10, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 156, 725}}; + } + if (key_size == 4 && accum_size == 1) + { + // ipt_10.tpb_224.trp_0.ld_0.ns_324.dcid_2.l2w_285 1.157217 1.073724 1.166510 1.356940 + return {224, + 10, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 324, 285}}; + } + if (key_size == 4 && accum_size == 2) + { + // ipt_11.tpb_256.trp_0.ld_0.ns_1984.dcid_5.l2w_115 1.214155 1.128842 1.214093 1.364476 + return {256, + 11, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon_jitter_window, 1984, 115}}; + } + if (key_size == 4 && accum_size == 4) + { + // ipt_14.tpb_224.trp_1.ld_0.ns_476.dcid_5.l2w_1005 1.187378 1.119705 1.185397 1.258420 + return {224, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon_jitter_window, 476, 1005}}; + } + if (key_size == 4 && accum_size == 8) + { + // ipt_10.tpb_256.trp_1.ld_0.ns_1868.dcid_7.l2w_145 1.142915 1.020581 1.137459 1.237913 + return {256, + 10, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 1868, 145}}; + } + if (key_size == 8 && accum_size == 1) + { + // ipt_9.tpb_224.trp_1.ld_0.ns_1940.dcid_5.l2w_460 1.157294 1.075650 1.153566 1.250729 + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon_jitter_window, 1940, 460}}; + } + if (key_size == 8 && accum_size == 2) + { + // ipt_11.tpb_224.trp_1.ld_1.ns_392.dcid_2.l2w_550 1.104034 1.007212 1.099543 1.220401 + return {224, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 392, 550}}; + } + if (key_size == 8 && accum_size == 4) + { + // ipt_9.tpb_224.trp_1.ld_0.ns_244.dcid_2.l2w_475 1.130098 1.000000 1.130661 1.215722 + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 244, 475}}; + } + if (key_size == 8 && accum_size == 8) + { + // ipt_9.tpb_224.trp_1.ld_0.ns_196.dcid_2.l2w_340 1.272056 1.142857 1.262499 1.352941 + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 196, 340}}; + } + } + + if (arch >= ::cuda::arch_id::sm_90) + { + if (use_tuning) + { + if (key_size == 1 && accum_size == 1) + { + return { + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 720}}; + } + if (key_size == 1 && accum_size == 2) + { + return { + 320, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 865}}; + } + if (key_size == 1 && accum_size == 4) + { + return {192, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 735}}; + } + if (key_size == 1 && accum_size == 8) + { + return {128, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 580}}; + } + if (key_size == 1 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1100}}; + } + if (key_size == 2 && accum_size == 1) + { + return { + 128, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 985}}; + } + if (key_size == 2 && accum_size == 2) + { + return {256, + 11, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 276, 650}}; + } + if (key_size == 2 && accum_size == 4) + { + return {256, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 240, 765}}; + } + if (key_size == 2 && accum_size == 8) + { + return {128, + 19, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1190}}; + } + if (key_size == 2 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1175}}; + } + if (key_size == 4 && accum_size == 1) + { + return {256, + 13, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 404, 645}}; + } + if (key_size == 4 && accum_size == 2) + { + return {256, + 18, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1160}}; + } + if (key_size == 4 && accum_size == 4) + { + return {256, + 18, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1170}}; + } + if (key_size == 4 && accum_size == 8) + { + return {128, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1055}}; + } + if (key_size == 4 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1195}}; + } + if (key_size == 8 && accum_size == 1) + { + return {256, + 10, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1170}}; + } + if (key_size == 8 && accum_size == 2) + { + return {256, + 9, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 236, 1030}}; + } + if (key_size == 8 && accum_size == 4) + { + return {128, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 152, 560}}; + } + if (key_size == 8 && accum_size == 8) + { + return {128, + 23, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1030}}; + } + if (key_size == 8 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1125}}; + } + if (key_size == 16 && accum_size == 1) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1080}}; + } + if (key_size == 16 && accum_size == 2) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 320, 1005}}; + } + if (key_size == 16 && accum_size == 4) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 232, 1100}}; + } + if (key_size == 16 && accum_size == 8) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1195}}; + } + if (key_size == 16 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1150}}; + } + } + + // no tuning, use a default one + return __make_default_policy(LOAD_DEFAULT); + } + + if (arch >= ::cuda::arch_id::sm_86) + { + return __make_default_policy(LOAD_LDG); + } + + if (arch >= ::cuda::arch_id::sm_80) + { + if (use_tuning) + { + if (key_size == 1 && accum_size == 1) + { + return { + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 975}}; + } + if (key_size == 1 && accum_size == 2) + { + return { + 224, 12, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 840}}; + } + if (key_size == 1 && accum_size == 4) + { + return {256, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 760}}; + } + if (key_size == 1 && accum_size == 8) + { + return { + 224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1070}}; + } + if (key_size == 1 && accum_size == 16) + { + return {128, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1175}}; + } + if (key_size == 2 && accum_size == 1) + { + return { + 256, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}}; + } + if (key_size == 2 && accum_size == 2) + { + return {224, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 640}}; + } + if (key_size == 2 && accum_size == 4) + { + return {256, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 905}}; + } + if (key_size == 2 && accum_size == 8) + { + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 810}}; + } + if (key_size == 2 && accum_size == 16) + { + return {160, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1115}}; + } + if (key_size == 4 && accum_size == 1) + { + return {288, + 11, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1110}}; + } + if (key_size == 4 && accum_size == 2) + { + return {192, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1200}}; + } + if (key_size == 4 && accum_size == 4) + { + return {256, + 15, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1110}}; + } + if (key_size == 4 && accum_size == 8) + { + return {224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1165}}; + } + if (key_size == 4 && accum_size == 16) + { + return {160, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1100}}; + } + if (key_size == 8 && accum_size == 1) + { + return {192, + 10, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1175}}; + } + if (key_size == 8 && accum_size == 2) + { + return { + 224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1075}}; + } + if (key_size == 8 && accum_size == 4) + { + return { + 384, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1040}}; + } + if (key_size == 8 && accum_size == 8) + { + return {128, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1080}}; + } + if (key_size == 8 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 430}}; + } + if (key_size == 16 && accum_size == 1) + { + return { + 192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1105}}; + } + if (key_size == 16 && accum_size == 2) + { + return {192, + 7, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 755}}; + } + if (key_size == 16 && accum_size == 4) + { + return {192, + 7, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 535}}; + } + if (key_size == 16 && accum_size == 8) + { + return { + 192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1035}}; + } + if (key_size == 16 && accum_size == 16) + { + return {128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1090}}; + } + } + + // no tuning, use a default one + return __make_default_policy(LOAD_DEFAULT); + } + + // for SM50 + return __make_default_policy(LOAD_LDG); + } +}; + +#if _CCCL_HAS_CONCEPTS() +static_assert(reduce_by_key_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() + +// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers +template +struct policy_selector_from_hub +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_by_key_policy + { + using ReduceByKeyPolicyT = typename PolicyHub::MaxPolicy::ReduceByKeyPolicyT; + return reduce_by_key_policy{ + ReduceByKeyPolicyT::BLOCK_THREADS, + ReduceByKeyPolicyT::ITEMS_PER_THREAD, + ReduceByKeyPolicyT::LOAD_ALGORITHM, + ReduceByKeyPolicyT::LOAD_MODIFIER, + ReduceByKeyPolicyT::SCAN_ALGORITHM, + delay_constructor_policy_from_type, + }; + } +}; + +template +struct policy_selector_from_types +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy + { + return policy_selector{ + int{sizeof(KeyT)}, + int{sizeof(AccumT)}, + classify_type, + is_primitive_v, + ::cuda::std::is_trivially_copyable_v, + is_primitive_v, + basic_binary_op_t::value}(arch); + } +}; } // namespace detail::reduce_by_key CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh index 77c393d7643..5083ed52b85 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh @@ -18,17 +18,26 @@ #include #include #include +#include #include +#include #include #include +#include #include #include +#include + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif CUB_NAMESPACE_BEGIN namespace detail::rle::encode { +// TODO(bgruber): remove in CCCL 4.0 when we drop the CUB dispatchers template (), @@ -89,6 +98,7 @@ struct sm80_tuning(), @@ -149,6 +159,7 @@ struct sm90_tuning(), @@ -218,7 +229,8 @@ struct sm100_tuning struct policy_hub { @@ -294,6 +306,227 @@ struct policy_hub using MaxPolicy = Policy1000; }; + +// DeviceRunLengthEncode::Encode delegates to reduce by key +using rle_encode_policy = reduce_by_key::reduce_by_key_policy; + +#if _CCCL_HAS_CONCEPTS() +template +concept rle_encode_policy_selector = reduce_by_key::reduce_by_key_policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + +// TODO(bgruber): remove in CCCL 4.0 when we drop the RLE dispatchers +using reduce_by_key::policy_selector_from_hub; + +struct policy_selector +{ + int length_size; + int key_size; + type_t key_t; + + // TODO(bgruber): we want to get rid of the following three and just assume by default that types behave "primitive". + // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because + // then users can opt-out of tunings again + bool length_is_primitive; + bool length_is_trivially_copyable; + bool key_is_primitive; + + _CCCL_API constexpr auto __make_default_policy(CacheLoadModifier load_mod) const -> rle_encode_policy + { + constexpr int nominal_4B_items_per_thread = 6; + const int combined_input_bytes = length_size + key_size; + const int max_input_bytes = (::cuda::std::max) (length_size, key_size); + const int items_per_thread = + (max_input_bytes <= 8) + ? 6 + : ::cuda::std::clamp( + ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread); + return rle_encode_policy{ + 128, + items_per_thread, + BLOCK_LOAD_DIRECT, + load_mod, + BLOCK_SCAN_WARP_SCANS, + default_reduce_by_key_delay_constructor_policy( + length_size, int{sizeof(int)}, length_is_primitive || length_is_trivially_copyable, true)}; + } + + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy + { + // if we don't have a tuning for SM100, fall back to SM90 + if (arch >= ::cuda::arch_id::sm_100 && length_is_primitive && length_size == 4 && key_is_primitive) + { + if (key_size == 1) + { + return rle_encode_policy{ + 256, + 14, + BLOCK_LOAD_DIRECT, + LOAD_CA, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 468, 300}}; + } + if (key_size == 2) + { + return rle_encode_policy{ + 224, + 14, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 376, 420}}; + } + if (key_size == 4) + { + return rle_encode_policy{ + 256, + 14, + BLOCK_LOAD_DIRECT, + LOAD_CA, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 956, 70}}; + } + if (key_size == 8) + { + return rle_encode_policy{ + 224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 188, 765}}; + } + } + + if (arch >= ::cuda::arch_id::sm_90) + { + if (length_is_primitive && length_size == 4) + { + if (key_is_primitive && key_size == 1) + { + return rle_encode_policy{ + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}}; + } + if (key_is_primitive && key_size == 2) + { + return rle_encode_policy{ + 128, 22, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 775}}; + } + if (key_is_primitive && key_size == 4) + { + return rle_encode_policy{ + 192, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 284, 480}}; + } + if (key_is_primitive && key_size == 8) + { + return rle_encode_policy{ + 128, + 19, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 515}}; + } + if (key_t == type_t::int128 || key_t == type_t::uint128) + { + return rle_encode_policy{ + 128, + 11, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 428, 930}}; + } + } + + // no tuning, use a default one + return __make_default_policy(LOAD_DEFAULT); + } + + if (arch >= ::cuda::arch_id::sm_86) + { + return __make_default_policy(LOAD_LDG); + } + + if (arch >= ::cuda::arch_id::sm_80) + { + if (length_is_primitive && length_size == 4) + { + if (key_is_primitive && key_size == 1) + { + return rle_encode_policy{ + 256, 14, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 640}}; + } + if (key_is_primitive && key_size == 2) + { + return rle_encode_policy{ + 256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 900}}; + } + if (key_is_primitive && key_size == 4) + { + return rle_encode_policy{ + 256, + 13, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1080}}; + } + if (key_is_primitive && key_size == 8) + { + return rle_encode_policy{ + 224, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1075}}; + } + if (key_t == type_t::int128 || key_t == type_t::uint128) + { + return rle_encode_policy{ + 128, + 7, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 630}}; + } + } + + // no tuning, use a default one + return __make_default_policy(LOAD_DEFAULT); + } + + // for SM50 + return __make_default_policy(LOAD_LDG); + } +}; + +#if _CCCL_HAS_CONCEPTS() +static_assert(rle_encode_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() + +template +struct policy_selector_from_types +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy + { + constexpr policy_selector selector{ + int{sizeof(LengthT)}, + int{sizeof(KeyT)}, + classify_type, + is_primitive_v, + ::cuda::std::is_trivially_copyable_v, + is_primitive_v}; + return selector(arch); + } +}; } // namespace detail::rle::encode CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index 3ff351a2bd4..97f1d9a03e6 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -374,6 +374,7 @@ struct policy_selector type_t key_type; bool length_is_primitive; bool key_is_primitive; // TODO(bgruber): can probably be derived from key_type + bool key_is_trivially_copyable; _CCCL_API constexpr auto make_default_policy(BlockLoadAlgorithm block_load_alg, int delay_ctor_key_size, CacheLoadModifier load_mod) const @@ -388,7 +389,8 @@ struct policy_selector load_mod, true, BLOCK_SCAN_WARP_SCANS, - default_reduce_by_key_delay_constructor_policy(delay_ctor_key_size, sizeof(int), true)}; + default_reduce_by_key_delay_constructor_policy( + delay_ctor_key_size, sizeof(int), key_is_primitive || key_is_trivially_copyable, true)}; } [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy @@ -600,7 +602,12 @@ struct policy_selector_from_types [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy { constexpr policy_selector selector{ - sizeof(LengthT), int{sizeof(KeyT)}, classify_type, is_primitive_v, is_primitive_v}; + sizeof(LengthT), + int{sizeof(KeyT)}, + classify_type, + is_primitive_v, + is_primitive_v, + ::cuda::std::is_trivially_copyable_v}; return selector(arch); } }; diff --git a/nvbench_helper/nvbench_helper/look_back_helper.cuh b/nvbench_helper/nvbench_helper/look_back_helper.cuh index f7a97937dcb..5d31dde94dd 100644 --- a/nvbench_helper/nvbench_helper/look_back_helper.cuh +++ b/nvbench_helper/nvbench_helper/look_back_helper.cuh @@ -16,4 +16,8 @@ using delay_constructor_t = cub::detail::delay_constructor_t(TUNE_DELAY_CONSTRUCTOR_ID), TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>; + +inline constexpr auto delay_constructor_policy = cub::detail::delay_constructor_policy{ + static_cast(TUNE_DELAY_CONSTRUCTOR_ID), TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS}; + #endif // !TUNE_BASE