Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions c/parallel/src/radix_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ struct __align__({3}) values_storage_t {{
{4}
using device_radix_sort_policy = {5};
using namespace cub;
using namespace cub::detail;
using namespace cub::detail::radix_sort;
static_assert(device_radix_sort_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6}, "Host generated and JIT compiled policy mismatch");
)XXX",
Expand Down
183 changes: 183 additions & 0 deletions cub/cub/detail/delay_constructor.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
// SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/agent/single_pass_scan_operators.cuh>

#if !_CCCL_COMPILER(NVRTC)
# include <ostream>
#endif

CUB_NAMESPACE_BEGIN

namespace detail
{
enum class delay_constructor_kind
{
no_delay,
fixed_delay,
exponential_backoff,
exponential_backoff_jitter,
exponential_backoff_jitter_window,
exponential_backon_jitter_window,
exponential_backon_jitter,
exponential_backon
};

#if !_CCCL_COMPILER(NVRTC)
inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kind)
{
switch (kind)
{
case delay_constructor_kind::no_delay:
return os << "delay_constructor_kind::no_delay";
case delay_constructor_kind::fixed_delay:
return os << "delay_constructor_kind::fixed_delay";
case delay_constructor_kind::exponential_backoff:
return os << "delay_constructor_kind::exponential_backoff";
case delay_constructor_kind::exponential_backoff_jitter:
return os << "delay_constructor_kind::exponential_backoff_jitter";
case delay_constructor_kind::exponential_backoff_jitter_window:
return os << "delay_constructor_kind::exponential_backoff_jitter_window";
case delay_constructor_kind::exponential_backon_jitter_window:
return os << "delay_constructor_kind::exponential_backon_jitter_window";
case delay_constructor_kind::exponential_backon_jitter:
return os << "delay_constructor_kind::exponential_backon_jitter";
case delay_constructor_kind::exponential_backon:
return os << "delay_constructor_kind::exponential_backon";
default:
return os << "<unknown delay_constructor_kind: " << static_cast<int>(kind) << ">";
}
}
#endif // !_CCCL_COMPILER(NVRTC)

struct delay_constructor_policy
{
delay_constructor_kind kind;
unsigned int delay;
unsigned int l2_write_latency;

_CCCL_API constexpr friend bool operator==(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs)
{
return lhs.kind == rhs.kind && lhs.delay == rhs.delay && lhs.l2_write_latency == rhs.l2_write_latency;
}

_CCCL_API constexpr friend bool operator!=(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs)
{
return !(lhs == rhs);
}

#if !_CCCL_COMPILER(NVRTC)
friend ::std::ostream& operator<<(::std::ostream& os, const delay_constructor_policy& p)
{
return os << "delay_constructor_policy { .kind = " << p.kind << ", .delay = " << p.delay
<< ", .l2_write_latency = " << p.l2_write_latency << " }";
}
#endif // !_CCCL_COMPILER(NVRTC)
};

template <typename DelayConstructor>
inline constexpr auto delay_constructor_policy_from_type = 0;

template <unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<no_delay_constructor_t<L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::no_delay, 0, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<fixed_delay_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::fixed_delay, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<exponential_backoff_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backoff, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto
delay_constructor_policy_from_type<exponential_backoff_jitter_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto
delay_constructor_policy_from_type<exponential_backoff_jitter_window_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter_window, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto
delay_constructor_policy_from_type<exponential_backon_jitter_window_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<exponential_backon_jitter_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<exponential_backon_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency};

template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for;

template <unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for<delay_constructor_kind::no_delay, Delay, L2WriteLatency>
{
using type = no_delay_constructor_t<L2WriteLatency>;
};

template <unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for<delay_constructor_kind::fixed_delay, Delay, L2WriteLatency>
{
using type = fixed_delay_constructor_t<Delay, L2WriteLatency>;
};

template <unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for<delay_constructor_kind::exponential_backoff, Delay, L2WriteLatency>
{
using type = exponential_backoff_constructor_t<Delay, L2WriteLatency>;
};

template <unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for<delay_constructor_kind::exponential_backoff_jitter, Delay, L2WriteLatency>
{
using type = exponential_backoff_jitter_constructor_t<Delay, L2WriteLatency>;
};

template <unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for<delay_constructor_kind::exponential_backoff_jitter_window, Delay, L2WriteLatency>
{
using type = exponential_backoff_jitter_window_constructor_t<Delay, L2WriteLatency>;
};

template <unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for<delay_constructor_kind::exponential_backon_jitter_window, Delay, L2WriteLatency>
{
using type = exponential_backon_jitter_window_constructor_t<Delay, L2WriteLatency>;
};

template <unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for<delay_constructor_kind::exponential_backon_jitter, Delay, L2WriteLatency>
{
using type = exponential_backon_jitter_constructor_t<Delay, L2WriteLatency>;
};

template <unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for<delay_constructor_kind::exponential_backon, Delay, L2WriteLatency>
{
using type = exponential_backon_constructor_t<Delay, L2WriteLatency>;
};

template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
using delay_constructor_t = typename delay_constructor_for<Kind, Delay, L2WriteLatency>::type;
} // namespace detail

CUB_NAMESPACE_END
131 changes: 4 additions & 127 deletions cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cub/agent/agent_radix_sort_onesweep.cuh>
#include <cub/agent/agent_radix_sort_upsweep.cuh>
#include <cub/agent/agent_scan.cuh>
#include <cub/detail/delay_constructor.cuh>
#include <cub/device/dispatch/tuning/common.cuh>
#include <cub/util_device.cuh>

Expand All @@ -31,130 +32,6 @@
CUB_NAMESPACE_BEGIN
namespace detail::radix_sort
{
enum class delay_constructor_kind
{
no_delay,
fixed_delay,
exponential_backoff,
exponential_backoff_jitter,
exponential_backoff_jitter_window,
exponential_backon_jitter_window,
exponential_backon_jitter,
exponential_backon
};

#if !_CCCL_COMPILER(NVRTC)
inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kind)
{
switch (kind)
{
case delay_constructor_kind::no_delay:
return os << "delay_constructor_kind::no_delay";
case delay_constructor_kind::fixed_delay:
return os << "delay_constructor_kind::fixed_delay";
case delay_constructor_kind::exponential_backoff:
return os << "delay_constructor_kind::exponential_backoff";
case delay_constructor_kind::exponential_backoff_jitter:
return os << "delay_constructor_kind::exponential_backoff_jitter";
case delay_constructor_kind::exponential_backoff_jitter_window:
return os << "delay_constructor_kind::exponential_backoff_jitter_window";
case delay_constructor_kind::exponential_backon_jitter_window:
return os << "delay_constructor_kind::exponential_backon_jitter_window";
case delay_constructor_kind::exponential_backon_jitter:
return os << "delay_constructor_kind::exponential_backon_jitter";
case delay_constructor_kind::exponential_backon:
return os << "delay_constructor_kind::exponential_backon";
default:
return os << "<unknown delay_constructor_kind: " << static_cast<int>(kind) << ">";
}
}
#endif // !_CCCL_COMPILER(NVRTC)

struct delay_constructor_policy
{
delay_constructor_kind kind;
unsigned int delay;
unsigned int l2_write_latency;

_CCCL_API constexpr friend bool operator==(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs)
{
return lhs.kind == rhs.kind && lhs.delay == rhs.delay && lhs.l2_write_latency == rhs.l2_write_latency;
}

_CCCL_API constexpr friend bool operator!=(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs)
{
return !(lhs == rhs);
}

#if !_CCCL_COMPILER(NVRTC)
friend ::std::ostream& operator<<(::std::ostream& os, const delay_constructor_policy& p)
{
return os << "delay_constructor_policy { .kind = " << p.kind << ", .delay = " << p.delay
<< ", .l2_write_latency = " << p.l2_write_latency << " }";
}
#endif // !_CCCL_COMPILER(NVRTC)
};

template <typename DelayConstructor>
inline constexpr auto delay_constructor_policy_from_type = 0;

template <unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<no_delay_constructor_t<L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::no_delay, 0, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<fixed_delay_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::fixed_delay, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<exponential_backoff_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backoff, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto
delay_constructor_policy_from_type<exponential_backoff_jitter_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto
delay_constructor_policy_from_type<exponential_backoff_jitter_window_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter_window, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto
delay_constructor_policy_from_type<exponential_backon_jitter_window_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<exponential_backon_jitter_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<exponential_backon_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency};

// TODO(bgruber): this is modeled after <look_back_helper.cuh>, unify this
template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
struct __delay_constructor_t_helper
{
private:
using delay_constructors = ::cuda::std::__type_list<
detail::no_delay_constructor_t<L2WriteLatency>,
detail::fixed_delay_constructor_t<Delay, L2WriteLatency>,
detail::exponential_backoff_constructor_t<Delay, L2WriteLatency>,
detail::exponential_backoff_jitter_constructor_t<Delay, L2WriteLatency>,
detail::exponential_backoff_jitter_window_constructor_t<Delay, L2WriteLatency>,
detail::exponential_backon_jitter_window_constructor_t<Delay, L2WriteLatency>,
detail::exponential_backon_jitter_constructor_t<Delay, L2WriteLatency>,
detail::exponential_backon_constructor_t<Delay, L2WriteLatency>>;

public:
using type = ::cuda::std::__type_at_c<static_cast<int>(Kind), delay_constructors>;
};

template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
using delay_constructor_t = typename __delay_constructor_t_helper<Kind, Delay, L2WriteLatency>::type;

struct radix_sort_histogram_policy
{
int block_threads;
Expand Down Expand Up @@ -1778,7 +1655,7 @@ struct policy_selector
return ::cuda::std::max(value_size, key_size);
}

[[nodiscard]] _CCCL_API constexpr auto make_onsweep_small_key_policy(const small_key_tuning_values& tuning) const
[[nodiscard]] _CCCL_API constexpr auto make_onesweep_small_key_policy(const small_key_tuning_values& tuning) const
-> radix_sort_policy
{
const int primary_radix_bits = (key_size > 1) ? 7 : 5;
Expand Down Expand Up @@ -1921,12 +1798,12 @@ struct policy_selector

if (arch >= ::cuda::arch_id::sm_100)
{
return make_onsweep_small_key_policy(get_sm100_tuning(key_size, value_size, offset_size, key_type));
return make_onesweep_small_key_policy(get_sm100_tuning(key_size, value_size, offset_size, key_type));
}

if (arch >= ::cuda::arch_id::sm_90)
{
return make_onsweep_small_key_policy(get_sm90_tuning(key_size, value_size, offset_size));
return make_onesweep_small_key_policy(get_sm90_tuning(key_size, value_size, offset_size));
}

if (arch >= ::cuda::arch_id::sm_80)
Expand Down
19 changes: 6 additions & 13 deletions nvbench_helper/nvbench_helper/look_back_helper.cuh
Original file line number Diff line number Diff line change
@@ -1,26 +1,19 @@
// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

#pragma once

#if !TUNE_BASE
# include <cub/agent/single_pass_scan_operators.cuh>
# include <cub/detail/delay_constructor.cuh>

# include <nvbench_helper.cuh>

# if !defined(TUNE_MAGIC_NS) || !defined(TUNE_L2_WRITE_LATENCY_NS) || !defined(TUNE_DELAY_CONSTRUCTOR_ID)
# error "TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS, and TUNE_DELAY_CONSTRUCTOR_ID must be defined"
# endif

using delay_constructors = nvbench::type_list<
cub::detail::no_delay_constructor_t<TUNE_L2_WRITE_LATENCY_NS>,
cub::detail::fixed_delay_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
cub::detail::exponential_backoff_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
cub::detail::exponential_backoff_jitter_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
cub::detail::exponential_backoff_jitter_window_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
cub::detail::exponential_backon_jitter_window_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
cub::detail::exponential_backon_jitter_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
cub::detail::exponential_backon_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>>;

using delay_constructor_t = nvbench::tl::get<TUNE_DELAY_CONSTRUCTOR_ID, delay_constructors>;
using delay_constructor_t =
cub::detail::delay_constructor_t<static_cast<cub::detail::delay_constructor_kind>(TUNE_DELAY_CONSTRUCTOR_ID),
TUNE_MAGIC_NS,
TUNE_L2_WRITE_LATENCY_NS>;
#endif // !TUNE_BASE
Loading