Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions cub/benchmarks/bench/transform/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@
#include <nvbench_helper.cuh>

#if !TUNE_BASE
// TODO(bgruber): can we get by without the base class?
struct policy_selector : cub::detail::transform::tuning<policy_selector>
struct policy_selector
{
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy
{
Expand Down
63 changes: 24 additions & 39 deletions cub/cub/device/device_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,24 +59,6 @@ inline constexpr bool is_non_deterministic_v =

namespace reduce
{
struct get_tuning_query_t
{};

template <class Derived>
struct tuning
{
[[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived
{
return static_cast<const Derived&>(*this);
}
};

struct default_rfa_tuning : tuning<default_rfa_tuning>
{
template <class AccumT, class Offset, class OpT>
using fn = detail::rfa::policy_selector_from_types<AccumT>;
};

template <typename ExtremumOutIteratorT, typename IndexOutIteratorT>
struct unzip_and_write_arg_extremum_op
{
Expand Down Expand Up @@ -145,10 +127,9 @@ private:
using offset_t = detail::choose_offset_t<NumItemsT>;
using accum_t = ::cuda::std::
__accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
TuningEnvT,
detail::reduce::get_tuning_query_t,
detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;
using default_policy_selector = detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>;
using policy_selector =
::cuda::std::execution::__query_result_or_t<TuningEnvT, detail::reduce::reduce_policy, default_policy_selector>;

return detail::reduce::dispatch<accum_t>(
d_temp_storage,
Expand All @@ -160,7 +141,7 @@ private:
init,
stream,
transform_op,
reduce_tuning_t{});
policy_selector{});
}

template <typename TuningEnvT,
Expand All @@ -183,16 +164,22 @@ private:
cudaStream_t stream)
{
using offset_t = detail::choose_offset_t<NumItemsT>;

using reduce_tuning_t = ::cuda::std::execution::
__query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_rfa_tuning>;

using accum_t = ::cuda::std::
using accum_t = ::cuda::std::
__accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
using default_policy_selector = detail::rfa::policy_selector_from_types<accum_t>;
using policy_selector =
::cuda::std::execution::__query_result_or_t<TuningEnvT, detail::rfa::rfa_policy, default_policy_selector>;

return detail::rfa::dispatch<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t, policy_t>(
d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), init, stream, transform_op);
return detail::rfa::dispatch<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t>(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: Do we actually need the explicit template arguments beyond accum_t?

Suggested change
return detail::rfa::dispatch<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t>(
return detail::rfa::dispatch<accum_t>(

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need to verify this case by case. There are some algorithms that want to override the accumulator that the dispatch function deduces.

d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
static_cast<offset_t>(num_items),
init,
stream,
transform_op,
policy_selector{});
}

template <typename TuningEnvT,
Expand All @@ -214,13 +201,11 @@ private:
::cuda::execution::determinism::not_guaranteed_t,
cudaStream_t stream)
{
using offset_t = detail::choose_offset_t<NumItemsT>;
using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;

using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
TuningEnvT,
detail::reduce::get_tuning_query_t,
detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;
using offset_t = detail::choose_offset_t<NumItemsT>;
using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
using default_policy_selector = detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>;
using policy_selector =
::cuda::std::execution::__query_result_or_t<TuningEnvT, detail::reduce::reduce_policy, default_policy_selector>;

return detail::reduce::dispatch_nondeterministic<accum_t>(
d_temp_storage,
Expand All @@ -232,7 +217,7 @@ private:
init,
stream,
transform_op,
reduce_tuning_t{});
policy_selector{});
}

public:
Expand Down
24 changes: 2 additions & 22 deletions cub/cub/device/device_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,6 @@ struct ::cuda::proclaims_copyable_arguments<CUB_NS_QUALIFIER::detail::__return_c
{};

CUB_NAMESPACE_BEGIN
namespace detail::transform
{
// TODO(bgruber): can we get by without the tuning base class? Since we have transform_policy_selector, could we enrich
// get_tuning_query_t to just check if the environment has a type that fulfills transform_policy_selector?
struct get_tuning_query_t
{};

template <class PolicySelector>
// TODO(bgruber): we cannot check the concept here because PolicySelector is usually an incomplete type still
// #if _CCCL_HAS_CONCEPTS()
// requires transform_policy_selector<PolicySelector>
// #endif // _CCCL_HAS_CONCEPTS()
struct tuning
{
[[nodiscard]] _CCCL_TRIVIAL_API constexpr auto query(const get_tuning_query_t&) const noexcept -> PolicySelector
{
return static_cast<const PolicySelector&>(*this);
}
};
} // namespace detail::transform

//! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input
//! sequences into an output sequence.
struct DeviceTransform
Expand Down Expand Up @@ -105,8 +84,9 @@ private:
::cuda::std::is_same_v<Predicate, detail::transform::always_true_predicate>,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut>;

using policy_selector = ::cuda::std::execution::
__query_result_or_t<tuning_env, detail::transform::get_tuning_query_t, default_policy_selector>;
__query_result_or_t<tuning_env, detail::transform::transform_policy, default_policy_selector>;

#if _CCCL_HAS_CONCEPTS()
static_assert(detail::transform::transform_policy_selector<policy_selector>);
Expand Down
24 changes: 10 additions & 14 deletions cub/test/catch2_test_device_reduce_env.cu
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ TEST_CASE("Device sum works with default environment", "[reduce][device]")
}

template <int BlockThreads>
struct reduce_tuning : cub::detail::reduce::tuning<reduce_tuning<BlockThreads>>
struct reduce_tuning
{
_CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::reduce::reduce_policy
{
Expand All @@ -115,20 +115,16 @@ struct reduce_tuning : cub::detail::reduce::tuning<reduce_tuning<BlockThreads>>
}
};

struct get_scan_tuning_query_t
struct unrelated_policy
{};

struct scan_tuning
struct unrelated_tuning
{
[[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_scan_tuning_query_t&) const noexcept
// should never be called
auto operator()(cuda::arch_id /*arch*/) const -> unrelated_policy
{
return *this;
throw 1337;
}

// Make sure this is not used
template <class /* AccumT */, class /* Offset */, class /* OpT */>
struct fn
{};
};

using block_sizes = c2h::type_list<cuda::std::integral_constant<int, 32>, cuda::std::integral_constant<int, 64>>;
Expand All @@ -143,8 +139,8 @@ C2H_TEST("Device reduce can be tuned", "[reduce][device]", block_sizes)
auto d_in = cuda::constant_iterator(1);
auto d_out = thrust::device_vector<int>(1);

// We are expecting that `scan_tuning` is ignored
auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, scan_tuning{});
// We are expecting that `unrelated_tuning` is ignored
auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, unrelated_tuning{});

REQUIRE(cudaSuccess == cub::DeviceReduce::Reduce(d_in, d_out.begin(), num_items, block_size_check, 0, env));
REQUIRE(d_out[0] == num_items);
Expand All @@ -159,8 +155,8 @@ C2H_TEST("Device sum can be tuned", "[reduce][device]", block_sizes)
auto d_in = cuda::constant_iterator(1);
auto d_out = thrust::device_vector<int>(1);

// We are expecting that `scan_tuning` is ignored
auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, scan_tuning{});
// We are expecting that `unrelated_tuning` is ignored
auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, unrelated_tuning{});

REQUIRE(cudaSuccess == cub::DeviceReduce::Sum(d_in, d_out.begin(), num_items, env));
REQUIRE(d_out[0] == num_items);
Expand Down
51 changes: 14 additions & 37 deletions cub/test/catch2_test_device_reduce_nondeterministic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,42 +29,19 @@ using float_type_list =
#endif
>;

template <int NOMINAL_BLOCK_THREADS_4B, int NOMINAL_ITEMS_PER_THREAD_4B>
struct AgentReducePolicy
{
/// Number of items per vectorized load
static constexpr int VECTOR_LOAD_LENGTH = 4;

/// Cooperative block-wide reduction algorithm to use
static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM =
cub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC;

/// Cache load modifier for reading input elements
static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::CacheLoadModifier::LOAD_DEFAULT;
constexpr static int ITEMS_PER_THREAD = NOMINAL_ITEMS_PER_THREAD_4B;
constexpr static int BLOCK_THREADS = NOMINAL_BLOCK_THREADS_4B;
};

template <int ItemsPerThread, int BlockSize>
struct hub_t
struct custom_policy_selector
{
struct Policy : cub::ChainedPolicy<300, Policy, Policy>
_CCCL_API constexpr auto operator()(::cuda::arch_id) const -> cub::detail::reduce::reduce_policy
{
constexpr static int ITEMS_PER_THREAD = ItemsPerThread;

using ReducePolicy = AgentReducePolicy<BlockSize, ItemsPerThread>;

// SingleTilePolicy
using SingleTilePolicy = ReducePolicy;

// SegmentedReducePolicy
using SegmentedReducePolicy = ReducePolicy;

// ReduceNondeterministicPolicy
using ReduceNondeterministicPolicy = ReducePolicy;
};

using MaxPolicy = Policy;
auto rp = cub::detail::reduce::agent_reduce_policy{
BlockSize,
ItemsPerThread,
4,
cub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC,
cub::CacheLoadModifier::LOAD_DEFAULT};
return {rp, rp, rp};
}
};

C2H_TEST("Nondeterministic Device reduce works with float and double on gpu",
Expand Down Expand Up @@ -157,11 +134,11 @@ C2H_TEST("Nondeterministic Device reduce works with float and double on gpu with
c2h::device_vector<type> d_output_p1(1);
c2h::device_vector<type> d_output_p2(1);

auto env1 = cuda::std::execution::env{
cuda::execution::require(cuda::execution::determinism::not_guaranteed), cuda::execution::__tune(hub_t<1, 128>{})};
auto env1 = cuda::std::execution::env{cuda::execution::require(cuda::execution::determinism::not_guaranteed),
cuda::execution::__tune(custom_policy_selector<1, 128>{})};

auto env2 = cuda::std::execution::env{
cuda::execution::require(cuda::execution::determinism::not_guaranteed), cuda::execution::__tune(hub_t<2, 256>{})};
auto env2 = cuda::std::execution::env{cuda::execution::require(cuda::execution::determinism::not_guaranteed),
cuda::execution::__tune(custom_policy_selector<2, 256>{})};

REQUIRE(
cudaSuccess == cub::DeviceReduce::Reduce(d_input.begin(), d_output_p1.begin(), num_items, min_op, init, env1));
Expand Down
35 changes: 14 additions & 21 deletions cub/test/catch2_test_device_scan_env.cu
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev
REQUIRE(d_out[0] == value_t{0});
}

// TODO(bgruber): convert to the new tuning API
template <int BlockThreads>
struct scan_tuning : cub::detail::scan::tuning<scan_tuning<BlockThreads>>
{
Expand Down Expand Up @@ -143,24 +144,16 @@ struct scan_tuning : cub::detail::scan::tuning<scan_tuning<BlockThreads>>
};
};

struct get_reduce_tuning_query_t
struct unrelated_policy
{};

struct reduce_tuning
struct unrelated_tuning
{
[[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_reduce_tuning_query_t&) const noexcept
// should never be called
auto operator()(cuda::arch_id /*arch*/) const -> unrelated_policy
{
return *this;
throw 1337;
}

// Make sure this is not used
template <class /* InputValueT */,
class /* OutputValueT */,
class /* AccumT */,
class /* Offset */,
class /* ScanOpT */>
struct fn
{};
};

using block_sizes = c2h::type_list<cuda::std::integral_constant<int, 32>, cuda::std::integral_constant<int, 64>>;
Expand All @@ -175,8 +168,8 @@ C2H_TEST("Device scan exclusive-scan can be tuned", "[scan][device]", block_size
auto d_in = cuda::constant_iterator(1);
auto d_out = thrust::device_vector<int>(num_items);

// We are expecting that `reduce_tuning` is ignored
auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
// We are expecting that `unrelated_tuning` is ignored
auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});

REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveScan(d_in, d_out.begin(), block_size_check, 0, num_items, env));

Expand All @@ -195,8 +188,8 @@ C2H_TEST("Device scan exclusive-sum can be tuned", "[scan][device]", block_sizes
auto d_in = cuda::constant_iterator(1);
auto d_out = thrust::device_vector<int>(num_items);

// We are expecting that `reduce_tuning` is ignored
auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
// We are expecting that `unrelated_tuning` is ignored
auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});

REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveSum(d_in, d_out.begin(), num_items, env));

Expand Down Expand Up @@ -248,8 +241,8 @@ C2H_TEST("Device scan inclusive-scan can be tuned", "[scan][device]", block_size
auto d_in = cuda::constant_iterator(1);
auto d_out = thrust::device_vector<int>(num_items);

// We are expecting that `reduce_tuning` is ignored
auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
// We are expecting that `unrelated_tuning` is ignored
auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});

REQUIRE(cudaSuccess == cub::DeviceScan::InclusiveScan(d_in, d_out.begin(), block_size_check, num_items, env));

Expand Down Expand Up @@ -291,8 +284,8 @@ C2H_TEST("Device scan inclusive-scan-init can be tuned", "[scan][device]", block

int init{10};

// We are expecting that `reduce_tuning` is ignored
auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
// We are expecting that `unrelated_tuning` is ignored
auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});

REQUIRE(
cudaSuccess == cub::DeviceScan::InclusiveScanInit(d_in, d_out.begin(), block_size_check, init, num_items, env));
Expand Down
Loading
Loading