Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 36 additions & 75 deletions cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,33 +16,21 @@
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

#if !TUNE_BASE
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
# else // TUNE_TRANSPOSE == 1
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
# endif // TUNE_TRANSPOSE

# if TUNE_LOAD == 0
# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
# else // TUNE_LOAD == 1
# define TUNE_LOAD_MODIFIER cub::LOAD_CA
# endif // TUNE_LOAD

struct device_rle_policy_hub
struct bench_rle_policy_selector
{
struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
[[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const
-> cub::detail::rle::non_trivial_runs::rle_non_trivial_runs_policy
{
using RleSweepPolicyT =
cub::AgentRlePolicy<TUNE_THREADS,
TUNE_ITEMS,
TUNE_LOAD_ALGORITHM,
TUNE_LOAD_MODIFIER,
TUNE_TIME_SLICING,
cub::BLOCK_SCAN_WARP_SCANS,
delay_constructor_t>;
};

using MaxPolicy = Policy500;
return {
TUNE_THREADS,
TUNE_ITEMS,
TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE,
TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA,
static_cast<bool>(TUNE_TIME_SLICING),
cub::BLOCK_SCAN_WARP_SCANS,
delay_constructor_policy,
};
}
};
#endif // !TUNE_BASE

Expand All @@ -60,25 +48,6 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
using num_runs_output_iterator_t = offset_t*;
using equality_op_t = ::cuda::std::equal_to<>;

#if !TUNE_BASE
using dispatch_t =
cub::DeviceRleDispatch<keys_input_it_t,
offset_output_it_t,
length_output_it_t,
num_runs_output_iterator_t,
equality_op_t,
offset_t,
device_rle_policy_hub>;
#else
using dispatch_t =
cub::DeviceRleDispatch<keys_input_it_t,
offset_output_it_t,
length_output_it_t,
num_runs_output_iterator_t,
equality_op_t,
offset_t>;
#endif

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
constexpr std::size_t min_segment_size = 1;
const std::size_t max_segment_size = static_cast<std::size_t>(state.get_int64("MaxSegSize"));
Expand All @@ -88,38 +57,39 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
thrust::device_vector<run_length_t> out_lengths(elements);
thrust::device_vector<T> in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size);

T* d_in_keys = thrust::raw_pointer_cast(in_keys.data());
const T* d_in_keys = thrust::raw_pointer_cast(in_keys.data());
offset_t* d_out_offsets = thrust::raw_pointer_cast(out_offsets.data());
run_length_t* d_out_lengths = thrust::raw_pointer_cast(out_lengths.data());
offset_t* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data());

std::uint8_t* d_temp_storage{};
std::size_t temp_storage_bytes{};
const offset_t num_items = static_cast<offset_t>(elements);

dispatch_t::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_offsets,
d_out_lengths,
d_num_runs_out,
equality_op_t{},
elements,
0);
auto dispatch_on_stream = [&](cudaStream_t stream) {
cub::detail::rle::dispatch(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_offsets,
d_out_lengths,
d_num_runs_out,
equality_op_t{},
num_items,
stream
#if !TUNE_BASE
,
bench_rle_policy_selector{}
#endif
);
};

dispatch_on_stream(cudaStream_t{0});

thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

dispatch_t::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_offsets,
d_out_lengths,
d_num_runs_out,
equality_op_t{},
elements,
0);
dispatch_on_stream(cudaStream_t{0});
cudaDeviceSynchronize();
const OffsetT num_runs = num_runs_out[0];

Expand All @@ -130,16 +100,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
state.add_global_memory_writes<OffsetT>(1);

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in_keys,
d_out_offsets,
d_out_lengths,
d_num_runs_out,
equality_op_t{},
elements,
launch.get_stream());
dispatch_on_stream(launch.get_stream().get_stream());
});
}

Expand Down
26 changes: 25 additions & 1 deletion cub/cub/detail/delay_constructor.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ enum class delay_constructor_kind
exponential_backoff_jitter_window,
exponential_backon_jitter_window,
exponential_backon_jitter,
exponential_backon
exponential_backon,
reduce_by_key
};

#if !_CCCL_COMPILER(NVRTC)
Expand All @@ -56,6 +57,8 @@ inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kin
return os << "delay_constructor_kind::exponential_backon_jitter";
case delay_constructor_kind::exponential_backon:
return os << "delay_constructor_kind::exponential_backon";
case delay_constructor_kind::reduce_by_key:
return os << "delay_constructor_kind::reduce_by_key";
default:
return os << "<unknown delay_constructor_kind: " << static_cast<int>(kind) << ">";
}
Expand Down Expand Up @@ -125,6 +128,11 @@ template <unsigned int Delay, unsigned int L2WriteLatency>
inline constexpr auto delay_constructor_policy_from_type<exponential_backon_constructor_t<Delay, L2WriteLatency>> =
delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency};

template <unsigned int Delay, unsigned int L2WriteLatency, unsigned int GridThreshold>
inline constexpr auto
delay_constructor_policy_from_type<reduce_by_key_delay_constructor_t<Delay, L2WriteLatency, GridThreshold>> =
delay_constructor_policy{delay_constructor_kind::reduce_by_key, Delay, L2WriteLatency};

template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for;

Expand Down Expand Up @@ -176,8 +184,24 @@ struct delay_constructor_for<delay_constructor_kind::exponential_backon, Delay,
using type = exponential_backon_constructor_t<Delay, L2WriteLatency>;
};

template <unsigned int Delay, unsigned int L2WriteLatency>
struct delay_constructor_for<delay_constructor_kind::reduce_by_key, Delay, L2WriteLatency>
{
using type = reduce_by_key_delay_constructor_t<Delay, L2WriteLatency>;
};

template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
using delay_constructor_t = typename delay_constructor_for<Kind, Delay, L2WriteLatency>::type;

_CCCL_API constexpr auto
default_reduce_by_key_delay_constructor_policy(int key_size, int value_size, bool value_is_primitive)
{
if (value_is_primitive && (value_size + key_size < 16))
{
return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450};
}
return delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450};
}
} // namespace detail

CUB_NAMESPACE_END
31 changes: 12 additions & 19 deletions cub/cub/device/device_run_length_encode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -327,25 +327,18 @@ struct DeviceRunLengthEncode
{
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceRunLengthEncode::NonTrivialRuns");

// Offset type used for global offsets
using offset_t = detail::choose_signed_offset_t<NumItemsT>;
using equality_op = ::cuda::std::equal_to<>;

return DeviceRleDispatch<
InputIteratorT,
OffsetsOutputIteratorT,
LengthsOutputIteratorT,
NumRunsOutputIteratorT,
equality_op,
offset_t>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_offsets_out,
d_lengths_out,
d_num_runs_out,
equality_op{},
num_items,
stream);
using global_offset_t = detail::choose_signed_offset_t<NumItemsT>;
using equality_op = ::cuda::std::equal_to<>;
return detail::rle::dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_offsets_out,
d_lengths_out,
d_num_runs_out,
equality_op{},
static_cast<global_offset_t>(num_items),
stream);
}
};

Expand Down
Loading
Loading