NVIDIA · bernhardmgruber · Feb 27, 2026 · Mar 2, 2026 · Mar 2, 2026 · Mar 4, 2026
@@ -26,8 +26,7 @@
 #include <nvbench_helper.cuh>
 
 #if !TUNE_BASE
-// TODO(bgruber): can we get by without the base class?
-struct policy_selector : cub::detail::transform::tuning<policy_selector>
+struct policy_selector
 {
   _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy
   {

@@ -59,24 +59,6 @@ inline constexpr bool is_non_deterministic_v =
 
 namespace reduce
 {
-struct get_tuning_query_t
-{};
-
-template <class Derived>
-struct tuning
-{
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived
-  {
-    return static_cast<const Derived&>(*this);
-  }
-};
-
-struct default_rfa_tuning : tuning<default_rfa_tuning>
-{
-  template <class AccumT, class Offset, class OpT>
-  using fn = detail::rfa::policy_selector_from_types<AccumT>;
-};
-
 template <typename ExtremumOutIteratorT, typename IndexOutIteratorT>
 struct unzip_and_write_arg_extremum_op
 {
@@ -145,10 +127,9 @@ private:
     using offset_t = detail::choose_offset_t<NumItemsT>;
     using accum_t  = ::cuda::std::
       __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
-    using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
-      TuningEnvT,
-      detail::reduce::get_tuning_query_t,
-      detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;
+    using default_policy_selector = detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>;
+    using policy_selector =
+      ::cuda::std::execution::__query_result_or_t<TuningEnvT, detail::reduce::reduce_policy, default_policy_selector>;
 
     return detail::reduce::dispatch<accum_t>(
       d_temp_storage,
@@ -160,7 +141,7 @@ private:
       init,
       stream,
       transform_op,
-      reduce_tuning_t{});
+      policy_selector{});
   }
 
   template <typename TuningEnvT,
@@ -183,16 +164,22 @@ private:
     cudaStream_t stream)
   {
     using offset_t = detail::choose_offset_t<NumItemsT>;
-
-    using reduce_tuning_t = ::cuda::std::execution::
-      __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_rfa_tuning>;
-
-    using accum_t = ::cuda::std::
+    using accum_t  = ::cuda::std::
       __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
-    using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
+    using default_policy_selector = detail::rfa::policy_selector_from_types<accum_t>;
+    using policy_selector =
+      ::cuda::std::execution::__query_result_or_t<TuningEnvT, detail::rfa::rfa_policy, default_policy_selector>;
 
-    return detail::rfa::dispatch<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t, policy_t>(
-      d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), init, stream, transform_op);
+    return detail::rfa::dispatch<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t>(
-    return detail::rfa::dispatch<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t>(
+    return detail::rfa::dispatch<accum_t>(
-    return detail::rfa::dispatch<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t>(
+    return detail::rfa::dispatch<accum_t>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      static_cast<offset_t>(num_items),
+      init,
+      stream,
+      transform_op,
+      policy_selector{});
   }
 
   template <typename TuningEnvT,
@@ -214,13 +201,11 @@ private:
     ::cuda::execution::determinism::not_guaranteed_t,
     cudaStream_t stream)
   {
-    using offset_t = detail::choose_offset_t<NumItemsT>;
-    using accum_t  = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
-
-    using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
-      TuningEnvT,
-      detail::reduce::get_tuning_query_t,
-      detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;
+    using offset_t                = detail::choose_offset_t<NumItemsT>;
+    using accum_t                 = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
+    using default_policy_selector = detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>;
+    using policy_selector =
+      ::cuda::std::execution::__query_result_or_t<TuningEnvT, detail::reduce::reduce_policy, default_policy_selector>;
 
     return detail::reduce::dispatch_nondeterministic<accum_t>(
       d_temp_storage,
@@ -232,7 +217,7 @@ private:
       init,
       stream,
       transform_op,
-      reduce_tuning_t{});
+      policy_selector{});
   }
 
 public:

@@ -46,27 +46,6 @@ struct ::cuda::proclaims_copyable_arguments<CUB_NS_QUALIFIER::detail::__return_c
 {};
 
 CUB_NAMESPACE_BEGIN
-namespace detail::transform
-{
-// TODO(bgruber): can we get by without the tuning base class? Since we have transform_policy_selector, could we enrich
-// get_tuning_query_t to just check if the environment has a type that fulfills transform_policy_selector?
-struct get_tuning_query_t
-{};
-
-template <class PolicySelector>
-// TODO(bgruber): we cannot check the concept here because PolicySelector is usually an incomplete type still
-// #if _CCCL_HAS_CONCEPTS()
-//   requires transform_policy_selector<PolicySelector>
-// #endif // _CCCL_HAS_CONCEPTS()
-struct tuning
-{
-  [[nodiscard]] _CCCL_TRIVIAL_API constexpr auto query(const get_tuning_query_t&) const noexcept -> PolicySelector
-  {
-    return static_cast<const PolicySelector&>(*this);
-  }
-};
-} // namespace detail::transform
-
 //! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input
 //! sequences into an output sequence.
 struct DeviceTransform
@@ -105,8 +84,9 @@ private:
       ::cuda::std::is_same_v<Predicate, detail::transform::always_true_predicate>,
       ::cuda::std::tuple<RandomAccessIteratorsIn...>,
       RandomAccessIteratorOut>;
+
     using policy_selector = ::cuda::std::execution::
-      __query_result_or_t<tuning_env, detail::transform::get_tuning_query_t, default_policy_selector>;
+      __query_result_or_t<tuning_env, detail::transform::transform_policy, default_policy_selector>;
 
 #if _CCCL_HAS_CONCEPTS()
     static_assert(detail::transform::transform_policy_selector<policy_selector>);

@@ -105,7 +105,7 @@ TEST_CASE("Device sum works with default environment", "[reduce][device]")
 }
 
 template <int BlockThreads>
-struct reduce_tuning : cub::detail::reduce::tuning<reduce_tuning<BlockThreads>>
+struct reduce_tuning
 {
   _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::reduce::reduce_policy
   {
@@ -115,20 +115,16 @@ struct reduce_tuning : cub::detail::reduce::tuning<reduce_tuning<BlockThreads>>
   }
 };
 
-struct get_scan_tuning_query_t
+struct unrelated_policy
 {};
 
-struct scan_tuning
+struct unrelated_tuning
 {
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_scan_tuning_query_t&) const noexcept
+  // should never be called
+  auto operator()(cuda::arch_id /*arch*/) const -> unrelated_policy
   {
-    return *this;
+    throw 1337;
   }
-
-  // Make sure this is not used
-  template <class /* AccumT */, class /* Offset */, class /* OpT */>
-  struct fn
-  {};
 };
 
 using block_sizes = c2h::type_list<cuda::std::integral_constant<int, 32>, cuda::std::integral_constant<int, 64>>;
@@ -143,8 +139,8 @@ C2H_TEST("Device reduce can be tuned", "[reduce][device]", block_sizes)
   auto d_in      = cuda::constant_iterator(1);
   auto d_out     = thrust::device_vector<int>(1);
 
-  // We are expecting that `scan_tuning` is ignored
-  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, scan_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(cudaSuccess == cub::DeviceReduce::Reduce(d_in, d_out.begin(), num_items, block_size_check, 0, env));
   REQUIRE(d_out[0] == num_items);
@@ -159,8 +155,8 @@ C2H_TEST("Device sum can be tuned", "[reduce][device]", block_sizes)
   auto d_in      = cuda::constant_iterator(1);
   auto d_out     = thrust::device_vector<int>(1);
 
-  // We are expecting that `scan_tuning` is ignored
-  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, scan_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(cudaSuccess == cub::DeviceReduce::Sum(d_in, d_out.begin(), num_items, env));
   REQUIRE(d_out[0] == num_items);

@@ -29,42 +29,19 @@ using float_type_list =
 #endif
                  >;
 
-template <int NOMINAL_BLOCK_THREADS_4B, int NOMINAL_ITEMS_PER_THREAD_4B>
-struct AgentReducePolicy
-{
-  /// Number of items per vectorized load
-  static constexpr int VECTOR_LOAD_LENGTH = 4;
-
-  /// Cooperative block-wide reduction algorithm to use
-  static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM =
-    cub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC;
-
-  /// Cache load modifier for reading input elements
-  static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::CacheLoadModifier::LOAD_DEFAULT;
-  constexpr static int ITEMS_PER_THREAD                 = NOMINAL_ITEMS_PER_THREAD_4B;
-  constexpr static int BLOCK_THREADS                    = NOMINAL_BLOCK_THREADS_4B;
-};
-
 template <int ItemsPerThread, int BlockSize>
-struct hub_t
+struct custom_policy_selector
 {
-  struct Policy : cub::ChainedPolicy<300, Policy, Policy>
+  _CCCL_API constexpr auto operator()(::cuda::arch_id) const -> cub::detail::reduce::reduce_policy
   {
-    constexpr static int ITEMS_PER_THREAD = ItemsPerThread;
-
-    using ReducePolicy = AgentReducePolicy<BlockSize, ItemsPerThread>;
-
-    // SingleTilePolicy
-    using SingleTilePolicy = ReducePolicy;
-
-    // SegmentedReducePolicy
-    using SegmentedReducePolicy = ReducePolicy;
-
-    // ReduceNondeterministicPolicy
-    using ReduceNondeterministicPolicy = ReducePolicy;
-  };
-
-  using MaxPolicy = Policy;
+    auto rp = cub::detail::reduce::agent_reduce_policy{
+      BlockSize,
+      ItemsPerThread,
+      4,
+      cub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC,
+      cub::CacheLoadModifier::LOAD_DEFAULT};
+    return {rp, rp, rp};
+  }
 };
 
 C2H_TEST("Nondeterministic Device reduce works with float and double on gpu",
@@ -157,11 +134,11 @@ C2H_TEST("Nondeterministic Device reduce works with float and double on gpu with
   c2h::device_vector<type> d_output_p1(1);
   c2h::device_vector<type> d_output_p2(1);
 
-  auto env1 = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed), cuda::execution::__tune(hub_t<1, 128>{})};
+  auto env1 = cuda::std::execution::env{cuda::execution::require(cuda::execution::determinism::not_guaranteed),
+                                        cuda::execution::__tune(custom_policy_selector<1, 128>{})};
 
-  auto env2 = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed), cuda::execution::__tune(hub_t<2, 256>{})};
+  auto env2 = cuda::std::execution::env{cuda::execution::require(cuda::execution::determinism::not_guaranteed),
+                                        cuda::execution::__tune(custom_policy_selector<2, 256>{})};
 
   REQUIRE(
     cudaSuccess == cub::DeviceReduce::Reduce(d_input.begin(), d_output_p1.begin(), num_items, min_op, init, env1));

@@ -113,6 +113,7 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev
   REQUIRE(d_out[0] == value_t{0});
 }
 
+// TODO(bgruber): convert to the new tuning API
 template <int BlockThreads>
 struct scan_tuning : cub::detail::scan::tuning<scan_tuning<BlockThreads>>
 {
@@ -143,24 +144,16 @@ struct scan_tuning : cub::detail::scan::tuning<scan_tuning<BlockThreads>>
   };
 };
 
-struct get_reduce_tuning_query_t
+struct unrelated_policy
 {};
 
-struct reduce_tuning
+struct unrelated_tuning
 {
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_reduce_tuning_query_t&) const noexcept
+  // should never be called
+  auto operator()(cuda::arch_id /*arch*/) const -> unrelated_policy
   {
-    return *this;
+    throw 1337;
   }
-
-  // Make sure this is not used
-  template <class /* InputValueT */,
-            class /* OutputValueT */,
-            class /* AccumT */,
-            class /* Offset */,
-            class /* ScanOpT */>
-  struct fn
-  {};
 };
 
 using block_sizes = c2h::type_list<cuda::std::integral_constant<int, 32>, cuda::std::integral_constant<int, 64>>;
@@ -175,8 +168,8 @@ C2H_TEST("Device scan exclusive-scan can be tuned", "[scan][device]", block_size
   auto d_in      = cuda::constant_iterator(1);
   auto d_out     = thrust::device_vector<int>(num_items);
 
-  // We are expecting that `reduce_tuning` is ignored
-  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveScan(d_in, d_out.begin(), block_size_check, 0, num_items, env));
 
@@ -195,8 +188,8 @@ C2H_TEST("Device scan exclusive-sum can be tuned", "[scan][device]", block_sizes
   auto d_in      = cuda::constant_iterator(1);
   auto d_out     = thrust::device_vector<int>(num_items);
 
-  // We are expecting that `reduce_tuning` is ignored
-  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveSum(d_in, d_out.begin(), num_items, env));
 
@@ -248,8 +241,8 @@ C2H_TEST("Device scan inclusive-scan can be tuned", "[scan][device]", block_size
   auto d_in      = cuda::constant_iterator(1);
   auto d_out     = thrust::device_vector<int>(num_items);
 
-  // We are expecting that `reduce_tuning` is ignored
-  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(cudaSuccess == cub::DeviceScan::InclusiveScan(d_in, d_out.begin(), block_size_check, num_items, env));
 
@@ -291,8 +284,8 @@ C2H_TEST("Device scan inclusive-scan-init can be tuned", "[scan][device]", block
 
   int init{10};
 
-  // We are expecting that `reduce_tuning` is ignored
-  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(
     cudaSuccess == cub::DeviceScan::InclusiveScanInit(d_in, d_out.begin(), block_size_check, init, num_items, env));