diff --git a/c/parallel/src/radix_sort.cu b/c/parallel/src/radix_sort.cu index 84db1db7d93..4f4ca7275b2 100644 --- a/c/parallel/src/radix_sort.cu +++ b/c/parallel/src/radix_sort.cu @@ -255,6 +255,7 @@ struct __align__({3}) values_storage_t {{ {4} using device_radix_sort_policy = {5}; using namespace cub; +using namespace cub::detail; using namespace cub::detail::radix_sort; static_assert(device_radix_sort_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6}, "Host generated and JIT compiled policy mismatch"); )XXX", diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh new file mode 100644 index 00000000000..44a409f497a --- /dev/null +++ b/cub/cub/detail/delay_constructor.cuh @@ -0,0 +1,183 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3 + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + +CUB_NAMESPACE_BEGIN + +namespace detail +{ +enum class delay_constructor_kind +{ + no_delay, + fixed_delay, + exponential_backoff, + exponential_backoff_jitter, + exponential_backoff_jitter_window, + exponential_backon_jitter_window, + exponential_backon_jitter, + exponential_backon +}; + +#if !_CCCL_COMPILER(NVRTC) +inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kind) +{ + switch (kind) + { + case delay_constructor_kind::no_delay: + return os << "delay_constructor_kind::no_delay"; + case delay_constructor_kind::fixed_delay: + return os << "delay_constructor_kind::fixed_delay"; + case delay_constructor_kind::exponential_backoff: + return os << "delay_constructor_kind::exponential_backoff"; + case delay_constructor_kind::exponential_backoff_jitter: + return os << "delay_constructor_kind::exponential_backoff_jitter"; + case delay_constructor_kind::exponential_backoff_jitter_window: + return os << "delay_constructor_kind::exponential_backoff_jitter_window"; + case delay_constructor_kind::exponential_backon_jitter_window: + return os << "delay_constructor_kind::exponential_backon_jitter_window"; + case delay_constructor_kind::exponential_backon_jitter: + return os << "delay_constructor_kind::exponential_backon_jitter"; + case delay_constructor_kind::exponential_backon: + return os << "delay_constructor_kind::exponential_backon"; + default: + return os << "(kind) << ">"; + } +} +#endif // !_CCCL_COMPILER(NVRTC) + +struct delay_constructor_policy +{ + delay_constructor_kind kind; + unsigned int delay; + unsigned int l2_write_latency; + + _CCCL_API constexpr friend bool operator==(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs) + { + return lhs.kind == rhs.kind && lhs.delay == rhs.delay && lhs.l2_write_latency == rhs.l2_write_latency; + } + + _CCCL_API constexpr friend bool operator!=(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const delay_constructor_policy& p) + { + return os << "delay_constructor_policy { .kind = " << p.kind << ", .delay = " << p.delay + << ", .l2_write_latency = " << p.l2_write_latency << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) +}; + +template +inline constexpr auto delay_constructor_policy_from_type = 0; + +template +inline constexpr auto delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::no_delay, 0, L2WriteLatency}; + +template +inline constexpr auto delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::fixed_delay, Delay, L2WriteLatency}; + +template +inline constexpr auto delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backoff, Delay, L2WriteLatency}; + +template +inline constexpr auto + delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter, Delay, L2WriteLatency}; + +template +inline constexpr auto + delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter_window, Delay, L2WriteLatency}; + +template +inline constexpr auto + delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, Delay, L2WriteLatency}; + +template +inline constexpr auto delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, Delay, L2WriteLatency}; + +template +inline constexpr auto delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency}; + +template +struct delay_constructor_for; + +template +struct delay_constructor_for +{ + using type = no_delay_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = fixed_delay_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backoff_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backoff_jitter_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backoff_jitter_window_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backon_jitter_window_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backon_jitter_constructor_t; +}; + +template +struct delay_constructor_for +{ + using type = exponential_backon_constructor_t; +}; + +template +using delay_constructor_t = typename delay_constructor_for::type; +} // namespace detail + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh index 7ae1e0bde7c..8d3f0ec852d 100644 --- a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -31,130 +32,6 @@ CUB_NAMESPACE_BEGIN namespace detail::radix_sort { -enum class delay_constructor_kind -{ - no_delay, - fixed_delay, - exponential_backoff, - exponential_backoff_jitter, - exponential_backoff_jitter_window, - exponential_backon_jitter_window, - exponential_backon_jitter, - exponential_backon -}; - -#if !_CCCL_COMPILER(NVRTC) -inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kind) -{ - switch (kind) - { - case delay_constructor_kind::no_delay: - return os << "delay_constructor_kind::no_delay"; - case delay_constructor_kind::fixed_delay: - return os << "delay_constructor_kind::fixed_delay"; - case delay_constructor_kind::exponential_backoff: - return os << "delay_constructor_kind::exponential_backoff"; - case delay_constructor_kind::exponential_backoff_jitter: - return os << "delay_constructor_kind::exponential_backoff_jitter"; - case delay_constructor_kind::exponential_backoff_jitter_window: - return os << "delay_constructor_kind::exponential_backoff_jitter_window"; - case delay_constructor_kind::exponential_backon_jitter_window: - return os << "delay_constructor_kind::exponential_backon_jitter_window"; - case delay_constructor_kind::exponential_backon_jitter: - return os << "delay_constructor_kind::exponential_backon_jitter"; - case delay_constructor_kind::exponential_backon: - return os << "delay_constructor_kind::exponential_backon"; - default: - return os << "(kind) << ">"; - } -} -#endif // !_CCCL_COMPILER(NVRTC) - -struct delay_constructor_policy -{ - delay_constructor_kind kind; - unsigned int delay; - unsigned int l2_write_latency; - - _CCCL_API constexpr friend bool operator==(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs) - { - return lhs.kind == rhs.kind && lhs.delay == rhs.delay && lhs.l2_write_latency == rhs.l2_write_latency; - } - - _CCCL_API constexpr friend bool operator!=(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs) - { - return !(lhs == rhs); - } - -#if !_CCCL_COMPILER(NVRTC) - friend ::std::ostream& operator<<(::std::ostream& os, const delay_constructor_policy& p) - { - return os << "delay_constructor_policy { .kind = " << p.kind << ", .delay = " << p.delay - << ", .l2_write_latency = " << p.l2_write_latency << " }"; - } -#endif // !_CCCL_COMPILER(NVRTC) -}; - -template -inline constexpr auto delay_constructor_policy_from_type = 0; - -template -inline constexpr auto delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::no_delay, 0, L2WriteLatency}; - -template -inline constexpr auto delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::fixed_delay, Delay, L2WriteLatency}; - -template -inline constexpr auto delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backoff, Delay, L2WriteLatency}; - -template -inline constexpr auto - delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter, Delay, L2WriteLatency}; - -template -inline constexpr auto - delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter_window, Delay, L2WriteLatency}; - -template -inline constexpr auto - delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, Delay, L2WriteLatency}; - -template -inline constexpr auto delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, Delay, L2WriteLatency}; - -template -inline constexpr auto delay_constructor_policy_from_type> = - delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency}; - -// TODO(bgruber): this is modeled after , unify this -template -struct __delay_constructor_t_helper -{ -private: - using delay_constructors = ::cuda::std::__type_list< - detail::no_delay_constructor_t, - detail::fixed_delay_constructor_t, - detail::exponential_backoff_constructor_t, - detail::exponential_backoff_jitter_constructor_t, - detail::exponential_backoff_jitter_window_constructor_t, - detail::exponential_backon_jitter_window_constructor_t, - detail::exponential_backon_jitter_constructor_t, - detail::exponential_backon_constructor_t>; - -public: - using type = ::cuda::std::__type_at_c(Kind), delay_constructors>; -}; - -template -using delay_constructor_t = typename __delay_constructor_t_helper::type; - struct radix_sort_histogram_policy { int block_threads; @@ -1778,7 +1655,7 @@ struct policy_selector return ::cuda::std::max(value_size, key_size); } - [[nodiscard]] _CCCL_API constexpr auto make_onsweep_small_key_policy(const small_key_tuning_values& tuning) const + [[nodiscard]] _CCCL_API constexpr auto make_onesweep_small_key_policy(const small_key_tuning_values& tuning) const -> radix_sort_policy { const int primary_radix_bits = (key_size > 1) ? 7 : 5; @@ -1921,12 +1798,12 @@ struct policy_selector if (arch >= ::cuda::arch_id::sm_100) { - return make_onsweep_small_key_policy(get_sm100_tuning(key_size, value_size, offset_size, key_type)); + return make_onesweep_small_key_policy(get_sm100_tuning(key_size, value_size, offset_size, key_type)); } if (arch >= ::cuda::arch_id::sm_90) { - return make_onsweep_small_key_policy(get_sm90_tuning(key_size, value_size, offset_size)); + return make_onesweep_small_key_policy(get_sm90_tuning(key_size, value_size, offset_size)); } if (arch >= ::cuda::arch_id::sm_80) diff --git a/nvbench_helper/nvbench_helper/look_back_helper.cuh b/nvbench_helper/nvbench_helper/look_back_helper.cuh index bee6708a781..f7a97937dcb 100644 --- a/nvbench_helper/nvbench_helper/look_back_helper.cuh +++ b/nvbench_helper/nvbench_helper/look_back_helper.cuh @@ -1,10 +1,10 @@ -// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 #pragma once #if !TUNE_BASE -# include +# include # include @@ -12,15 +12,8 @@ # error "TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS, and TUNE_DELAY_CONSTRUCTOR_ID must be defined" # endif -using delay_constructors = nvbench::type_list< - cub::detail::no_delay_constructor_t, - cub::detail::fixed_delay_constructor_t, - cub::detail::exponential_backoff_constructor_t, - cub::detail::exponential_backoff_jitter_constructor_t, - cub::detail::exponential_backoff_jitter_window_constructor_t, - cub::detail::exponential_backon_jitter_window_constructor_t, - cub::detail::exponential_backon_jitter_constructor_t, - cub::detail::exponential_backon_constructor_t>; - -using delay_constructor_t = nvbench::tl::get; +using delay_constructor_t = + cub::detail::delay_constructor_t(TUNE_DELAY_CONSTRUCTOR_ID), + TUNE_MAGIC_NS, + TUNE_L2_WRITE_LATENCY_NS>; #endif // !TUNE_BASE