From f7fc306624f8ac0864a9c69db5cac1dbddd30dd6 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 27 Feb 2026 23:17:31 +0100 Subject: [PATCH 1/5] Use tuning policy as tag for tuning environment --- cub/benchmarks/bench/transform/common.h | 3 +-- cub/cub/device/device_transform.cuh | 24 ++------------------ cub/test/catch2_test_device_transform_env.cu | 3 +-- libcudacxx/include/cuda/__execution/tune.h | 13 +++++++---- 4 files changed, 12 insertions(+), 31 deletions(-) diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h index 90547fc2dd6..08cb5df557b 100644 --- a/cub/benchmarks/bench/transform/common.h +++ b/cub/benchmarks/bench/transform/common.h @@ -26,8 +26,7 @@ #include #if !TUNE_BASE -// TODO(bgruber): can we get by without the base class? -struct policy_selector : cub::detail::transform::tuning +struct policy_selector { _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy { diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 74bd0ecdfac..13051f03d1f 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -46,27 +46,6 @@ struct ::cuda::proclaims_copyable_arguments -// TODO(bgruber): we cannot check the concept here because PolicySelector is usually an incomplete type still -// #if _CCCL_HAS_CONCEPTS() -// requires transform_policy_selector -// #endif // _CCCL_HAS_CONCEPTS() -struct tuning -{ - [[nodiscard]] _CCCL_TRIVIAL_API constexpr auto query(const get_tuning_query_t&) const noexcept -> PolicySelector - { - return static_cast(*this); - } -}; -} // namespace detail::transform - //! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input //! sequences into an output sequence. struct DeviceTransform @@ -105,8 +84,9 @@ private: ::cuda::std::is_same_v, ::cuda::std::tuple, RandomAccessIteratorOut>; + using policy_selector = ::cuda::std::execution:: - __query_result_or_t; + __query_result_or_t; #if _CCCL_HAS_CONCEPTS() static_assert(detail::transform::transform_policy_selector); diff --git a/cub/test/catch2_test_device_transform_env.cu b/cub/test/catch2_test_device_transform_env.cu index f9e3b8c3fb8..642392ea113 100644 --- a/cub/test/catch2_test_device_transform_env.cu +++ b/cub/test/catch2_test_device_transform_env.cu @@ -224,8 +224,7 @@ C2H_TEST("DeviceTransform::TransformStableArgumentAddresses custom stream", "[de } // use a policy selector that prescribes to run with exactly 8 threads per block and 3 items per thread -// TODO(bgruber): can we get by without the base class? -struct my_policy_selector : cub::detail::transform::tuning +struct my_policy_selector { _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy { diff --git a/libcudacxx/include/cuda/__execution/tune.h b/libcudacxx/include/cuda/__execution/tune.h index b273930dd87..0890e094177 100644 --- a/libcudacxx/include/cuda/__execution/tune.h +++ b/libcudacxx/include/cuda/__execution/tune.h @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -51,14 +52,16 @@ struct __get_tuning_t _CCCL_GLOBAL_CONSTANT auto __get_tuning = __get_tuning_t{}; -template -[[nodiscard]] _CCCL_NODEBUG_API auto __tune(_Tunings...) +template +[[nodiscard]] _CCCL_NODEBUG_API auto __tune(_PolicySelectors...) { - static_assert((::cuda::std::is_empty_v<_Tunings> && ...), "Stateful tunings are not implemented"); + static_assert((::cuda::std::is_empty_v<_PolicySelectors> && ...), "Stateful policy selectors are not implemented"); - // clang < 19 doesn't like this code // since all the tunings are stateless, let's ignore incoming parameters - ::cuda::std::execution::env<_Tunings...> __env{}; + // we use the return type of the policy_selector as tag + ::cuda::std::execution::env< + ::cuda::std::execution::prop...> + __env; return ::cuda::std::execution::prop{__get_tuning_t{}, __env}; } From 77a1bc36f3347bb2fcb77eb8179ebccbb27531f2 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 2 Mar 2026 14:03:14 +0100 Subject: [PATCH 2/5] Update libcudacxx/include/cuda/__execution/tune.h Co-authored-by: Michael Schellenberger Costa --- libcudacxx/include/cuda/__execution/tune.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libcudacxx/include/cuda/__execution/tune.h b/libcudacxx/include/cuda/__execution/tune.h index 0890e094177..6d96e077bda 100644 --- a/libcudacxx/include/cuda/__execution/tune.h +++ b/libcudacxx/include/cuda/__execution/tune.h @@ -58,12 +58,12 @@ template static_assert((::cuda::std::is_empty_v<_PolicySelectors> && ...), "Stateful policy selectors are not implemented"); // since all the tunings are stateless, let's ignore incoming parameters + // we use the return type of the policy_selector as tag - ::cuda::std::execution::env< - ::cuda::std::execution::prop...> - __env; - - return ::cuda::std::execution::prop{__get_tuning_t{}, __env}; + using tuning_env = ::cuda::std::execution::env< + ::cuda::std::execution::prop...>; + + return ::cuda::std::execution::prop{__get_tuning_t{}, tuning_env{}}; } _CCCL_END_NAMESPACE_CUDA_EXECUTION From cf4bcc2bac2e4ff6364b3bf70068625625ede8fa Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 2 Mar 2026 15:12:39 +0100 Subject: [PATCH 3/5] Format --- libcudacxx/include/cuda/__execution/tune.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/__execution/tune.h b/libcudacxx/include/cuda/__execution/tune.h index 6d96e077bda..fb009470c86 100644 --- a/libcudacxx/include/cuda/__execution/tune.h +++ b/libcudacxx/include/cuda/__execution/tune.h @@ -58,11 +58,11 @@ template static_assert((::cuda::std::is_empty_v<_PolicySelectors> && ...), "Stateful policy selectors are not implemented"); // since all the tunings are stateless, let's ignore incoming parameters - + // we use the return type of the policy_selector as tag using tuning_env = ::cuda::std::execution::env< ::cuda::std::execution::prop...>; - + return ::cuda::std::execution::prop{__get_tuning_t{}, tuning_env{}}; } From 1c9f70f6301571f6355370dcfef2cccfae86fc26 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 4 Mar 2026 22:05:44 +0100 Subject: [PATCH 4/5] Rewrite more policies --- cub/cub/device/device_reduce.cuh | 63 +++++++------------ cub/test/catch2_test_device_reduce_env.cu | 24 +++---- ...ch2_test_device_reduce_nondeterministic.cu | 30 +++------ cub/test/catch2_test_device_scan_env.cu | 35 +++++------ ...catch2_test_device_segmented_reduce_env.cu | 47 ++++---------- .../libcudacxx/cuda/execution/tune.pass.cpp | 61 ++++++------------ 6 files changed, 88 insertions(+), 172 deletions(-) diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 628601149eb..6dac9dfaa9d 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -59,24 +59,6 @@ inline constexpr bool is_non_deterministic_v = namespace reduce { -struct get_tuning_query_t -{}; - -template -struct tuning -{ - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived - { - return static_cast(*this); - } -}; - -struct default_rfa_tuning : tuning -{ - template - using fn = detail::rfa::policy_selector_from_types; -}; - template struct unzip_and_write_arg_extremum_op { @@ -145,10 +127,9 @@ private: using offset_t = detail::choose_offset_t; using accum_t = ::cuda::std:: __accumulator_t>, T>; - using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t< - TuningEnvT, - detail::reduce::get_tuning_query_t, - detail::reduce::policy_selector_from_types>; + using default_policy_selector = detail::reduce::policy_selector_from_types; + using policy_selector = + ::cuda::std::execution::__query_result_or_t; return detail::reduce::dispatch( d_temp_storage, @@ -160,7 +141,7 @@ private: init, stream, transform_op, - reduce_tuning_t{}); + policy_selector{}); } template ; - - using reduce_tuning_t = ::cuda::std::execution:: - __query_result_or_t; - - using accum_t = ::cuda::std:: + using accum_t = ::cuda::std:: __accumulator_t>, T>; - using policy_t = typename reduce_tuning_t::template fn; + using default_policy_selector = detail::rfa::policy_selector_from_types; + using policy_selector = + ::cuda::std::execution::__query_result_or_t; - return detail::rfa::dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), init, stream, transform_op); + return detail::rfa::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + static_cast(num_items), + init, + stream, + transform_op, + policy_selector{}); } template ; - using accum_t = ::cuda::std::__accumulator_t, T>; - - using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t< - TuningEnvT, - detail::reduce::get_tuning_query_t, - detail::reduce::policy_selector_from_types>; + using offset_t = detail::choose_offset_t; + using accum_t = ::cuda::std::__accumulator_t, T>; + using default_policy_selector = detail::reduce::policy_selector_from_types; + using policy_selector = + ::cuda::std::execution::__query_result_or_t; return detail::reduce::dispatch_nondeterministic( d_temp_storage, @@ -232,7 +217,7 @@ private: init, stream, transform_op, - reduce_tuning_t{}); + policy_selector{}); } public: diff --git a/cub/test/catch2_test_device_reduce_env.cu b/cub/test/catch2_test_device_reduce_env.cu index 13625e85807..954f7b8454d 100644 --- a/cub/test/catch2_test_device_reduce_env.cu +++ b/cub/test/catch2_test_device_reduce_env.cu @@ -105,7 +105,7 @@ TEST_CASE("Device sum works with default environment", "[reduce][device]") } template -struct reduce_tuning : cub::detail::reduce::tuning> +struct reduce_tuning { _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::reduce::reduce_policy { @@ -115,20 +115,16 @@ struct reduce_tuning : cub::detail::reduce::tuning> } }; -struct get_scan_tuning_query_t +struct unrelated_policy {}; -struct scan_tuning +struct unrelated_tuning { - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_scan_tuning_query_t&) const noexcept + // should never be called + auto operator()(cuda::arch_id /*arch*/) const -> unrelated_policy { - return *this; + throw 1337; } - - // Make sure this is not used - template - struct fn - {}; }; using block_sizes = c2h::type_list, cuda::std::integral_constant>; @@ -143,8 +139,8 @@ C2H_TEST("Device reduce can be tuned", "[reduce][device]", block_sizes) auto d_in = cuda::constant_iterator(1); auto d_out = thrust::device_vector(1); - // We are expecting that `scan_tuning` is ignored - auto env = cuda::execution::__tune(reduce_tuning{}, scan_tuning{}); + // We are expecting that `unrelated_tuning` is ignored + auto env = cuda::execution::__tune(reduce_tuning{}, unrelated_tuning{}); REQUIRE(cudaSuccess == cub::DeviceReduce::Reduce(d_in, d_out.begin(), num_items, block_size_check, 0, env)); REQUIRE(d_out[0] == num_items); @@ -159,8 +155,8 @@ C2H_TEST("Device sum can be tuned", "[reduce][device]", block_sizes) auto d_in = cuda::constant_iterator(1); auto d_out = thrust::device_vector(1); - // We are expecting that `scan_tuning` is ignored - auto env = cuda::execution::__tune(reduce_tuning{}, scan_tuning{}); + // We are expecting that `unrelated_tuning` is ignored + auto env = cuda::execution::__tune(reduce_tuning{}, unrelated_tuning{}); REQUIRE(cudaSuccess == cub::DeviceReduce::Sum(d_in, d_out.begin(), num_items, env)); REQUIRE(d_out[0] == num_items); diff --git a/cub/test/catch2_test_device_reduce_nondeterministic.cu b/cub/test/catch2_test_device_reduce_nondeterministic.cu index 9d3c00ea423..72cd5d9160e 100644 --- a/cub/test/catch2_test_device_reduce_nondeterministic.cu +++ b/cub/test/catch2_test_device_reduce_nondeterministic.cu @@ -46,25 +46,13 @@ struct AgentReducePolicy }; template -struct hub_t +struct custom_policy_selector { - struct Policy : cub::ChainedPolicy<300, Policy, Policy> + _CCCL_API constexpr auto operator()(::cuda::arch_id) const -> cub::detail::reduce::reduce_policy { - constexpr static int ITEMS_PER_THREAD = ItemsPerThread; - - using ReducePolicy = AgentReducePolicy; - - // SingleTilePolicy - using SingleTilePolicy = ReducePolicy; - - // SegmentedReducePolicy - using SegmentedReducePolicy = ReducePolicy; - - // ReduceNondeterministicPolicy - using ReduceNondeterministicPolicy = ReducePolicy; - }; - - using MaxPolicy = Policy; + auto rp = cub::detail::reduce::agent_reduce_policy{BlockSize, ItemsPerThread}; + return {rp, rp, rp}; + } }; C2H_TEST("Nondeterministic Device reduce works with float and double on gpu", @@ -157,11 +145,11 @@ C2H_TEST("Nondeterministic Device reduce works with float and double on gpu with c2h::device_vector d_output_p1(1); c2h::device_vector d_output_p2(1); - auto env1 = cuda::std::execution::env{ - cuda::execution::require(cuda::execution::determinism::not_guaranteed), cuda::execution::__tune(hub_t<1, 128>{})}; + auto env1 = cuda::std::execution::env{cuda::execution::require(cuda::execution::determinism::not_guaranteed), + cuda::execution::__tune(custom_policy_selector<1, 128>{})}; - auto env2 = cuda::std::execution::env{ - cuda::execution::require(cuda::execution::determinism::not_guaranteed), cuda::execution::__tune(hub_t<2, 256>{})}; + auto env2 = cuda::std::execution::env{cuda::execution::require(cuda::execution::determinism::not_guaranteed), + cuda::execution::__tune(custom_policy_selector<2, 256>{})}; REQUIRE( cudaSuccess == cub::DeviceReduce::Reduce(d_input.begin(), d_output_p1.begin(), num_items, min_op, init, env1)); diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu index e8a692b6051..d490142bd79 100644 --- a/cub/test/catch2_test_device_scan_env.cu +++ b/cub/test/catch2_test_device_scan_env.cu @@ -113,6 +113,7 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev REQUIRE(d_out[0] == value_t{0}); } +// TODO(bgruber): convert to the new tuning API template struct scan_tuning : cub::detail::scan::tuning> { @@ -143,24 +144,16 @@ struct scan_tuning : cub::detail::scan::tuning> }; }; -struct get_reduce_tuning_query_t +struct unrelated_policy {}; -struct reduce_tuning +struct unrelated_tuning { - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_reduce_tuning_query_t&) const noexcept + // should never be called + auto operator()(cuda::arch_id /*arch*/) const -> unrelated_policy { - return *this; + throw 1337; } - - // Make sure this is not used - template - struct fn - {}; }; using block_sizes = c2h::type_list, cuda::std::integral_constant>; @@ -175,8 +168,8 @@ C2H_TEST("Device scan exclusive-scan can be tuned", "[scan][device]", block_size auto d_in = cuda::constant_iterator(1); auto d_out = thrust::device_vector(num_items); - // We are expecting that `reduce_tuning` is ignored - auto env = cuda::execution::__tune(scan_tuning{}, reduce_tuning{}); + // We are expecting that `unrelated_tuning` is ignored + auto env = cuda::execution::__tune(scan_tuning{}, unrelated_tuning{}); REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveScan(d_in, d_out.begin(), block_size_check, 0, num_items, env)); @@ -195,8 +188,8 @@ C2H_TEST("Device scan exclusive-sum can be tuned", "[scan][device]", block_sizes auto d_in = cuda::constant_iterator(1); auto d_out = thrust::device_vector(num_items); - // We are expecting that `reduce_tuning` is ignored - auto env = cuda::execution::__tune(scan_tuning{}, reduce_tuning{}); + // We are expecting that `unrelated_tuning` is ignored + auto env = cuda::execution::__tune(scan_tuning{}, unrelated_tuning{}); REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveSum(d_in, d_out.begin(), num_items, env)); @@ -248,8 +241,8 @@ C2H_TEST("Device scan inclusive-scan can be tuned", "[scan][device]", block_size auto d_in = cuda::constant_iterator(1); auto d_out = thrust::device_vector(num_items); - // We are expecting that `reduce_tuning` is ignored - auto env = cuda::execution::__tune(scan_tuning{}, reduce_tuning{}); + // We are expecting that `unrelated_tuning` is ignored + auto env = cuda::execution::__tune(scan_tuning{}, unrelated_tuning{}); REQUIRE(cudaSuccess == cub::DeviceScan::InclusiveScan(d_in, d_out.begin(), block_size_check, num_items, env)); @@ -291,8 +284,8 @@ C2H_TEST("Device scan inclusive-scan-init can be tuned", "[scan][device]", block int init{10}; - // We are expecting that `reduce_tuning` is ignored - auto env = cuda::execution::__tune(scan_tuning{}, reduce_tuning{}); + // We are expecting that `unrelated_tuning` is ignored + auto env = cuda::execution::__tune(scan_tuning{}, unrelated_tuning{}); REQUIRE( cudaSuccess == cub::DeviceScan::InclusiveScanInit(d_in, d_out.begin(), block_size_check, init, num_items, env)); diff --git a/cub/test/catch2_test_device_segmented_reduce_env.cu b/cub/test/catch2_test_device_segmented_reduce_env.cu index 2f9f7ee30f3..72df98e3ba9 100644 --- a/cub/test/catch2_test_device_segmented_reduce_env.cu +++ b/cub/test/catch2_test_device_segmented_reduce_env.cu @@ -10,47 +10,26 @@ #include template -struct reduce_tuning : cub::detail::reduce::tuning> +struct reduce_tuning { - template - struct fn + _CCCL_API constexpr auto operator()(::cuda::arch_id) const -> cub::detail::reduce::reduce_policy { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> - { - struct ReducePolicy - { - static constexpr int VECTOR_LOAD_LENGTH = 1; - - static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM = cub::BLOCK_REDUCE_WARP_REDUCTIONS; - - static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_DEFAULT; - - static constexpr int ITEMS_PER_THREAD = 1; - static constexpr int BLOCK_THREADS = BlockThreads; - }; - - using SingleTilePolicy = ReducePolicy; - using SegmentedReducePolicy = ReducePolicy; - }; - - using MaxPolicy = Policy500; - }; + auto rp = cub::detail::reduce::agent_reduce_policy{ + BlockThreads, 1, 1, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_DEFAULT}; + return {rp, rp, rp}; + } }; -struct get_scan_tuning_query_t +struct unrelated_policy {}; -struct scan_tuning +struct unrelated_tuning { - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_scan_tuning_query_t&) const noexcept + // should never be called + auto operator()(cuda::arch_id /*arch*/) const -> unrelated_policy { - return *this; + throw 1337; } - - // Make sure this is not used - template - struct fn - {}; }; using block_sizes = c2h::type_list, cuda::std::integral_constant>; @@ -65,8 +44,8 @@ C2H_TEST("Device segmented sum can be tuned", "[reduce][device]", block_sizes) thrust::device_vector d_in{8, 6, 7, 5, 3, 0, 9}; thrust::device_vector d_out(3); - // We are expecting that `scan_tuning` is ignored - auto env = cuda::execution::__tune(reduce_tuning{}, scan_tuning{}); + // We are expecting that `unrelated_tuning` is ignored + auto env = cuda::execution::__tune(reduce_tuning{}, unrelated_tuning{}); auto error = cub::DeviceSegmentedReduce::Sum(d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, env); diff --git a/libcudacxx/test/libcudacxx/cuda/execution/tune.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/tune.pass.cpp index 5944599ed2c..088670a862d 100644 --- a/libcudacxx/test/libcudacxx/cuda/execution/tune.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/execution/tune.pass.cpp @@ -10,54 +10,31 @@ #include -struct get_reduce_tuning_query_t -{}; +struct reduce_policy +{ + int block_threads; +}; -template -struct reduce_tuning +template +struct reduce_policy_selector { - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_reduce_tuning_query_t&) const noexcept -> Derived + _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> reduce_policy { - return static_cast(*this); + return {BlockThreads / sizeof(T)}; } }; -template -struct reduce : reduce_tuning> +struct scan_policy { - template - struct type - { - struct max_policy - { - struct reduce_policy - { - static constexpr int block_threads = BlockThreads / sizeof(T); - }; - }; - }; + int block_threads = 1; }; -struct get_scan_tuning_query_t -{}; - -struct scan_tuning +struct scan_policy_selector { - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_scan_tuning_query_t&) const noexcept + _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> scan_policy { - return *this; + return {}; } - - struct type - { - struct max_policy - { - struct reduce_policy - { - static constexpr int block_threads = 1; - }; - }; - }; }; __host__ __device__ void test() @@ -65,15 +42,13 @@ __host__ __device__ void test() constexpr int nominal_block_threads = 256; constexpr int block_threads = nominal_block_threads / sizeof(int); - using env_t = decltype(cuda::execution::__tune(reduce{}, scan_tuning{})); + using env_t = decltype(cuda::execution::__tune(reduce{}, scan_tuning{})); using tuning_t = cuda::std::execution::__query_result_t; - using reduce_tuning_t = cuda::std::execution::__query_result_t; - using scan_tuning_t = cuda::std::execution::__query_result_t; - using reduce_policy_t = reduce_tuning_t::type; - using scan_policy_t = scan_tuning_t::type; + using reduce_policy_t = cuda::std::execution::__query_result_t; + using scan_policy_t = cuda::std::execution::__query_result_t; - static_assert(reduce_policy_t::max_policy::reduce_policy::block_threads == block_threads); - static_assert(scan_policy_t::max_policy::reduce_policy::block_threads == 1); + static_assert(reduce_policy_t{}(cuda::arch_id::sm_75). : block_threads == block_threads); + static_assert(scan_policy_t{}(cuda::arch_id::sm_75). : block_threads == 1); } int main(int, char**) From 7b0534bc4403eca4e3318e82c557cf9f2d34b89e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 9 Mar 2026 21:50:13 +0100 Subject: [PATCH 5/5] Fix --- ...ch2_test_device_reduce_nondeterministic.cu | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/cub/test/catch2_test_device_reduce_nondeterministic.cu b/cub/test/catch2_test_device_reduce_nondeterministic.cu index 72cd5d9160e..a85c794fa7b 100644 --- a/cub/test/catch2_test_device_reduce_nondeterministic.cu +++ b/cub/test/catch2_test_device_reduce_nondeterministic.cu @@ -29,28 +29,17 @@ using float_type_list = #endif >; -template -struct AgentReducePolicy -{ - /// Number of items per vectorized load - static constexpr int VECTOR_LOAD_LENGTH = 4; - - /// Cooperative block-wide reduction algorithm to use - static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM = - cub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC; - - /// Cache load modifier for reading input elements - static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::CacheLoadModifier::LOAD_DEFAULT; - constexpr static int ITEMS_PER_THREAD = NOMINAL_ITEMS_PER_THREAD_4B; - constexpr static int BLOCK_THREADS = NOMINAL_BLOCK_THREADS_4B; -}; - template struct custom_policy_selector { _CCCL_API constexpr auto operator()(::cuda::arch_id) const -> cub::detail::reduce::reduce_policy { - auto rp = cub::detail::reduce::agent_reduce_policy{BlockSize, ItemsPerThread}; + auto rp = cub::detail::reduce::agent_reduce_policy{ + BlockSize, + ItemsPerThread, + 4, + cub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC, + cub::CacheLoadModifier::LOAD_DEFAULT}; return {rp, rp, rp}; } };