From f7fc306624f8ac0864a9c69db5cac1dbddd30dd6 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Fri, 27 Feb 2026 23:17:31 +0100
Subject: [PATCH 1/5] Use tuning policy as tag for tuning environment

---
 cub/benchmarks/bench/transform/common.h      |  3 +--
 cub/cub/device/device_transform.cuh          | 24 ++------------------
 cub/test/catch2_test_device_transform_env.cu |  3 +--
 libcudacxx/include/cuda/__execution/tune.h   | 13 +++++++----
 4 files changed, 12 insertions(+), 31 deletions(-)
diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h
index 90547fc2dd6..08cb5df557b 100644
--- a/cub/benchmarks/bench/transform/common.h
+++ b/cub/benchmarks/bench/transform/common.h
@@ -26,8 +26,7 @@
 #include <nvbench_helper.cuh>
 
 #if !TUNE_BASE
-// TODO(bgruber): can we get by without the base class?
-struct policy_selector : cub::detail::transform::tuning<policy_selector>
+struct policy_selector
 {
   _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy
   {
diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
index 74bd0ecdfac..13051f03d1f 100644
--- a/cub/cub/device/device_transform.cuh
+++ b/cub/cub/device/device_transform.cuh
@@ -46,27 +46,6 @@ struct ::cuda::proclaims_copyable_arguments<CUB_NS_QUALIFIER::detail::__return_c
 {};
 
 CUB_NAMESPACE_BEGIN
-namespace detail::transform
-{
-// TODO(bgruber): can we get by without the tuning base class? Since we have transform_policy_selector, could we enrich
-// get_tuning_query_t to just check if the environment has a type that fulfills transform_policy_selector?
-struct get_tuning_query_t
-{};
-
-template <class PolicySelector>
-// TODO(bgruber): we cannot check the concept here because PolicySelector is usually an incomplete type still
-// #if _CCCL_HAS_CONCEPTS()
-//   requires transform_policy_selector<PolicySelector>
-// #endif // _CCCL_HAS_CONCEPTS()
-struct tuning
-{
-  [[nodiscard]] _CCCL_TRIVIAL_API constexpr auto query(const get_tuning_query_t&) const noexcept -> PolicySelector
-  {
-    return static_cast<const PolicySelector&>(*this);
-  }
-};
-} // namespace detail::transform
-
 //! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input
 //! sequences into an output sequence.
 struct DeviceTransform
@@ -105,8 +84,9 @@ private:
       ::cuda::std::is_same_v<Predicate, detail::transform::always_true_predicate>,
       ::cuda::std::tuple<RandomAccessIteratorsIn...>,
       RandomAccessIteratorOut>;
+
     using policy_selector = ::cuda::std::execution::
-      __query_result_or_t<tuning_env, detail::transform::get_tuning_query_t, default_policy_selector>;
+      __query_result_or_t<tuning_env, detail::transform::transform_policy, default_policy_selector>;
 
 #if _CCCL_HAS_CONCEPTS()
     static_assert(detail::transform::transform_policy_selector<policy_selector>);
diff --git a/cub/test/catch2_test_device_transform_env.cu b/cub/test/catch2_test_device_transform_env.cu
index f9e3b8c3fb8..642392ea113 100644
--- a/cub/test/catch2_test_device_transform_env.cu
+++ b/cub/test/catch2_test_device_transform_env.cu
@@ -224,8 +224,7 @@ C2H_TEST("DeviceTransform::TransformStableArgumentAddresses custom stream", "[de
 }
 
 // use a policy selector that prescribes to run with exactly 8 threads per block and 3 items per thread
-// TODO(bgruber): can we get by without the base class?
-struct my_policy_selector : cub::detail::transform::tuning<my_policy_selector>
+struct my_policy_selector
 {
   _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy
   {
diff --git a/libcudacxx/include/cuda/__execution/tune.h b/libcudacxx/include/cuda/__execution/tune.h
index b273930dd87..0890e094177 100644
--- a/libcudacxx/include/cuda/__execution/tune.h
+++ b/libcudacxx/include/cuda/__execution/tune.h
@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__device/arch_id.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/__type_traits/conjunction.h>
@@ -51,14 +52,16 @@ struct __get_tuning_t
 
 _CCCL_GLOBAL_CONSTANT auto __get_tuning = __get_tuning_t{};
 
-template <class... _Tunings>
-[[nodiscard]] _CCCL_NODEBUG_API auto __tune(_Tunings...)
+template <class... _PolicySelectors>
+[[nodiscard]] _CCCL_NODEBUG_API auto __tune(_PolicySelectors...)
 {
-  static_assert((::cuda::std::is_empty_v<_Tunings> && ...), "Stateful tunings are not implemented");
+  static_assert((::cuda::std::is_empty_v<_PolicySelectors> && ...), "Stateful policy selectors are not implemented");
 
-  // clang < 19 doesn't like this code
   // since all the tunings are stateless, let's ignore incoming parameters
-  ::cuda::std::execution::env<_Tunings...> __env{};
+  // we use the return type of the policy_selector as tag
+  ::cuda::std::execution::env<
+    ::cuda::std::execution::prop<decltype(_PolicySelectors{}(::cuda::arch_id{})), _PolicySelectors>...>
+    __env;
 
   return ::cuda::std::execution::prop{__get_tuning_t{}, __env};
 }

From 77a1bc36f3347bb2fcb77eb8179ebccbb27531f2 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Mon, 2 Mar 2026 14:03:14 +0100
Subject: [PATCH 2/5] Update libcudacxx/include/cuda/__execution/tune.h

Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com>
---
 libcudacxx/include/cuda/__execution/tune.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libcudacxx/include/cuda/__execution/tune.h b/libcudacxx/include/cuda/__execution/tune.h
index 0890e094177..6d96e077bda 100644
--- a/libcudacxx/include/cuda/__execution/tune.h
+++ b/libcudacxx/include/cuda/__execution/tune.h
@@ -58,12 +58,12 @@ template <class... _PolicySelectors>
   static_assert((::cuda::std::is_empty_v<_PolicySelectors> && ...), "Stateful policy selectors are not implemented");
 
   // since all the tunings are stateless, let's ignore incoming parameters
+  
   // we use the return type of the policy_selector as tag
-  ::cuda::std::execution::env<
-    ::cuda::std::execution::prop<decltype(_PolicySelectors{}(::cuda::arch_id{})), _PolicySelectors>...>
-    __env;
-
-  return ::cuda::std::execution::prop{__get_tuning_t{}, __env};
+  using tuning_env = ::cuda::std::execution::env<
+    ::cuda::std::execution::prop<decltype(_PolicySelectors{}(::cuda::arch_id{})), _PolicySelectors>...>;
+  
+  return ::cuda::std::execution::prop{__get_tuning_t{}, tuning_env{}};
 }
 
 _CCCL_END_NAMESPACE_CUDA_EXECUTION

From cf4bcc2bac2e4ff6364b3bf70068625625ede8fa Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Mon, 2 Mar 2026 15:12:39 +0100
Subject: [PATCH 3/5] Format

---
 libcudacxx/include/cuda/__execution/tune.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/include/cuda/__execution/tune.h b/libcudacxx/include/cuda/__execution/tune.h
index 6d96e077bda..fb009470c86 100644
--- a/libcudacxx/include/cuda/__execution/tune.h
+++ b/libcudacxx/include/cuda/__execution/tune.h
@@ -58,11 +58,11 @@ template <class... _PolicySelectors>
   static_assert((::cuda::std::is_empty_v<_PolicySelectors> && ...), "Stateful policy selectors are not implemented");
 
   // since all the tunings are stateless, let's ignore incoming parameters
-  
+
   // we use the return type of the policy_selector as tag
   using tuning_env = ::cuda::std::execution::env<
     ::cuda::std::execution::prop<decltype(_PolicySelectors{}(::cuda::arch_id{})), _PolicySelectors>...>;
-  
+
   return ::cuda::std::execution::prop{__get_tuning_t{}, tuning_env{}};
 }
 

From 1c9f70f6301571f6355370dcfef2cccfae86fc26 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 4 Mar 2026 22:05:44 +0100
Subject: [PATCH 4/5] Rewrite more policies

---
 cub/cub/device/device_reduce.cuh              | 63 +++++++------------
 cub/test/catch2_test_device_reduce_env.cu     | 24 +++----
 ...ch2_test_device_reduce_nondeterministic.cu | 30 +++------
 cub/test/catch2_test_device_scan_env.cu       | 35 +++++------
 ...catch2_test_device_segmented_reduce_env.cu | 47 ++++----------
 .../libcudacxx/cuda/execution/tune.pass.cpp   | 61 ++++++------------
 6 files changed, 88 insertions(+), 172 deletions(-)

diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index 628601149eb..6dac9dfaa9d 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -59,24 +59,6 @@ inline constexpr bool is_non_deterministic_v =
 
 namespace reduce
 {
-struct get_tuning_query_t
-{};
-
-template <class Derived>
-struct tuning
-{
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived
-  {
-    return static_cast<const Derived&>(*this);
-  }
-};
-
-struct default_rfa_tuning : tuning<default_rfa_tuning>
-{
-  template <class AccumT, class Offset, class OpT>
-  using fn = detail::rfa::policy_selector_from_types<AccumT>;
-};
-
 template <typename ExtremumOutIteratorT, typename IndexOutIteratorT>
 struct unzip_and_write_arg_extremum_op
 {
@@ -145,10 +127,9 @@ private:
     using offset_t = detail::choose_offset_t<NumItemsT>;
     using accum_t  = ::cuda::std::
       __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
-    using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
-      TuningEnvT,
-      detail::reduce::get_tuning_query_t,
-      detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;
+    using default_policy_selector = detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>;
+    using policy_selector =
+      ::cuda::std::execution::__query_result_or_t<TuningEnvT, detail::reduce::reduce_policy, default_policy_selector>;
 
     return detail::reduce::dispatch<accum_t>(
       d_temp_storage,
@@ -160,7 +141,7 @@ private:
       init,
       stream,
       transform_op,
-      reduce_tuning_t{});
+      policy_selector{});
   }
 
   template <typename TuningEnvT,
@@ -183,16 +164,22 @@ private:
     cudaStream_t stream)
   {
     using offset_t = detail::choose_offset_t<NumItemsT>;
-
-    using reduce_tuning_t = ::cuda::std::execution::
-      __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_rfa_tuning>;
-
-    using accum_t = ::cuda::std::
+    using accum_t  = ::cuda::std::
       __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
-    using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
+    using default_policy_selector = detail::rfa::policy_selector_from_types<accum_t>;
+    using policy_selector =
+      ::cuda::std::execution::__query_result_or_t<TuningEnvT, detail::rfa::rfa_policy, default_policy_selector>;
 
-    return detail::rfa::dispatch<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t, policy_t>(
-      d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), init, stream, transform_op);
+    return detail::rfa::dispatch<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      static_cast<offset_t>(num_items),
+      init,
+      stream,
+      transform_op,
+      policy_selector{});
   }
 
   template <typename TuningEnvT,
@@ -214,13 +201,11 @@ private:
     ::cuda::execution::determinism::not_guaranteed_t,
     cudaStream_t stream)
   {
-    using offset_t = detail::choose_offset_t<NumItemsT>;
-    using accum_t  = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
-
-    using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t<
-      TuningEnvT,
-      detail::reduce::get_tuning_query_t,
-      detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>>;
+    using offset_t                = detail::choose_offset_t<NumItemsT>;
+    using accum_t                 = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
+    using default_policy_selector = detail::reduce::policy_selector_from_types<accum_t, offset_t, ReductionOpT>;
+    using policy_selector =
+      ::cuda::std::execution::__query_result_or_t<TuningEnvT, detail::reduce::reduce_policy, default_policy_selector>;
 
     return detail::reduce::dispatch_nondeterministic<accum_t>(
       d_temp_storage,
@@ -232,7 +217,7 @@ private:
       init,
       stream,
       transform_op,
-      reduce_tuning_t{});
+      policy_selector{});
   }
 
 public:
diff --git a/cub/test/catch2_test_device_reduce_env.cu b/cub/test/catch2_test_device_reduce_env.cu
index 13625e85807..954f7b8454d 100644
--- a/cub/test/catch2_test_device_reduce_env.cu
+++ b/cub/test/catch2_test_device_reduce_env.cu
@@ -105,7 +105,7 @@ TEST_CASE("Device sum works with default environment", "[reduce][device]")
 }
 
 template <int BlockThreads>
-struct reduce_tuning : cub::detail::reduce::tuning<reduce_tuning<BlockThreads>>
+struct reduce_tuning
 {
   _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::reduce::reduce_policy
   {
@@ -115,20 +115,16 @@ struct reduce_tuning : cub::detail::reduce::tuning<reduce_tuning<BlockThreads>>
   }
 };
 
-struct get_scan_tuning_query_t
+struct unrelated_policy
 {};
 
-struct scan_tuning
+struct unrelated_tuning
 {
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_scan_tuning_query_t&) const noexcept
+  // should never be called
+  auto operator()(cuda::arch_id /*arch*/) const -> unrelated_policy
   {
-    return *this;
+    throw 1337;
   }
-
-  // Make sure this is not used
-  template <class /* AccumT */, class /* Offset */, class /* OpT */>
-  struct fn
-  {};
 };
 
 using block_sizes = c2h::type_list<cuda::std::integral_constant<int, 32>, cuda::std::integral_constant<int, 64>>;
@@ -143,8 +139,8 @@ C2H_TEST("Device reduce can be tuned", "[reduce][device]", block_sizes)
   auto d_in      = cuda::constant_iterator(1);
   auto d_out     = thrust::device_vector<int>(1);
 
-  // We are expecting that `scan_tuning` is ignored
-  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, scan_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(cudaSuccess == cub::DeviceReduce::Reduce(d_in, d_out.begin(), num_items, block_size_check, 0, env));
   REQUIRE(d_out[0] == num_items);
@@ -159,8 +155,8 @@ C2H_TEST("Device sum can be tuned", "[reduce][device]", block_sizes)
   auto d_in      = cuda::constant_iterator(1);
   auto d_out     = thrust::device_vector<int>(1);
 
-  // We are expecting that `scan_tuning` is ignored
-  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, scan_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(cudaSuccess == cub::DeviceReduce::Sum(d_in, d_out.begin(), num_items, env));
   REQUIRE(d_out[0] == num_items);
diff --git a/cub/test/catch2_test_device_reduce_nondeterministic.cu b/cub/test/catch2_test_device_reduce_nondeterministic.cu
index 9d3c00ea423..72cd5d9160e 100644
--- a/cub/test/catch2_test_device_reduce_nondeterministic.cu
+++ b/cub/test/catch2_test_device_reduce_nondeterministic.cu
@@ -46,25 +46,13 @@ struct AgentReducePolicy
 };
 
 template <int ItemsPerThread, int BlockSize>
-struct hub_t
+struct custom_policy_selector
 {
-  struct Policy : cub::ChainedPolicy<300, Policy, Policy>
+  _CCCL_API constexpr auto operator()(::cuda::arch_id) const -> cub::detail::reduce::reduce_policy
   {
-    constexpr static int ITEMS_PER_THREAD = ItemsPerThread;
-
-    using ReducePolicy = AgentReducePolicy<BlockSize, ItemsPerThread>;
-
-    // SingleTilePolicy
-    using SingleTilePolicy = ReducePolicy;
-
-    // SegmentedReducePolicy
-    using SegmentedReducePolicy = ReducePolicy;
-
-    // ReduceNondeterministicPolicy
-    using ReduceNondeterministicPolicy = ReducePolicy;
-  };
-
-  using MaxPolicy = Policy;
+    auto rp = cub::detail::reduce::agent_reduce_policy{BlockSize, ItemsPerThread};
+    return {rp, rp, rp};
+  }
 };
 
 C2H_TEST("Nondeterministic Device reduce works with float and double on gpu",
@@ -157,11 +145,11 @@ C2H_TEST("Nondeterministic Device reduce works with float and double on gpu with
   c2h::device_vector<type> d_output_p1(1);
   c2h::device_vector<type> d_output_p2(1);
 
-  auto env1 = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed), cuda::execution::__tune(hub_t<1, 128>{})};
+  auto env1 = cuda::std::execution::env{cuda::execution::require(cuda::execution::determinism::not_guaranteed),
+                                        cuda::execution::__tune(custom_policy_selector<1, 128>{})};
 
-  auto env2 = cuda::std::execution::env{
-    cuda::execution::require(cuda::execution::determinism::not_guaranteed), cuda::execution::__tune(hub_t<2, 256>{})};
+  auto env2 = cuda::std::execution::env{cuda::execution::require(cuda::execution::determinism::not_guaranteed),
+                                        cuda::execution::__tune(custom_policy_selector<2, 256>{})};
 
   REQUIRE(
     cudaSuccess == cub::DeviceReduce::Reduce(d_input.begin(), d_output_p1.begin(), num_items, min_op, init, env1));
diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu
index e8a692b6051..d490142bd79 100644
--- a/cub/test/catch2_test_device_scan_env.cu
+++ b/cub/test/catch2_test_device_scan_env.cu
@@ -113,6 +113,7 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev
   REQUIRE(d_out[0] == value_t{0});
 }
 
+// TODO(bgruber): convert to the new tuning API
 template <int BlockThreads>
 struct scan_tuning : cub::detail::scan::tuning<scan_tuning<BlockThreads>>
 {
@@ -143,24 +144,16 @@ struct scan_tuning : cub::detail::scan::tuning<scan_tuning<BlockThreads>>
   };
 };
 
-struct get_reduce_tuning_query_t
+struct unrelated_policy
 {};
 
-struct reduce_tuning
+struct unrelated_tuning
 {
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_reduce_tuning_query_t&) const noexcept
+  // should never be called
+  auto operator()(cuda::arch_id /*arch*/) const -> unrelated_policy
   {
-    return *this;
+    throw 1337;
   }
-
-  // Make sure this is not used
-  template <class /* InputValueT */,
-            class /* OutputValueT */,
-            class /* AccumT */,
-            class /* Offset */,
-            class /* ScanOpT */>
-  struct fn
-  {};
 };
 
 using block_sizes = c2h::type_list<cuda::std::integral_constant<int, 32>, cuda::std::integral_constant<int, 64>>;
@@ -175,8 +168,8 @@ C2H_TEST("Device scan exclusive-scan can be tuned", "[scan][device]", block_size
   auto d_in      = cuda::constant_iterator(1);
   auto d_out     = thrust::device_vector<int>(num_items);
 
-  // We are expecting that `reduce_tuning` is ignored
-  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveScan(d_in, d_out.begin(), block_size_check, 0, num_items, env));
 
@@ -195,8 +188,8 @@ C2H_TEST("Device scan exclusive-sum can be tuned", "[scan][device]", block_sizes
   auto d_in      = cuda::constant_iterator(1);
   auto d_out     = thrust::device_vector<int>(num_items);
 
-  // We are expecting that `reduce_tuning` is ignored
-  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveSum(d_in, d_out.begin(), num_items, env));
 
@@ -248,8 +241,8 @@ C2H_TEST("Device scan inclusive-scan can be tuned", "[scan][device]", block_size
   auto d_in      = cuda::constant_iterator(1);
   auto d_out     = thrust::device_vector<int>(num_items);
 
-  // We are expecting that `reduce_tuning` is ignored
-  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(cudaSuccess == cub::DeviceScan::InclusiveScan(d_in, d_out.begin(), block_size_check, num_items, env));
 
@@ -291,8 +284,8 @@ C2H_TEST("Device scan inclusive-scan-init can be tuned", "[scan][device]", block
 
   int init{10};
 
-  // We are expecting that `reduce_tuning` is ignored
-  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, reduce_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});
 
   REQUIRE(
     cudaSuccess == cub::DeviceScan::InclusiveScanInit(d_in, d_out.begin(), block_size_check, init, num_items, env));
diff --git a/cub/test/catch2_test_device_segmented_reduce_env.cu b/cub/test/catch2_test_device_segmented_reduce_env.cu
index 2f9f7ee30f3..72df98e3ba9 100644
--- a/cub/test/catch2_test_device_segmented_reduce_env.cu
+++ b/cub/test/catch2_test_device_segmented_reduce_env.cu
@@ -10,47 +10,26 @@
 #include <c2h/catch2_test_helper.h>
 
 template <int BlockThreads>
-struct reduce_tuning : cub::detail::reduce::tuning<reduce_tuning<BlockThreads>>
+struct reduce_tuning
 {
-  template <class /* AccumT */, class /* Offset */, class /* OpT */>
-  struct fn
+  _CCCL_API constexpr auto operator()(::cuda::arch_id) const -> cub::detail::reduce::reduce_policy
   {
-    struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
-    {
-      struct ReducePolicy
-      {
-        static constexpr int VECTOR_LOAD_LENGTH = 1;
-
-        static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM = cub::BLOCK_REDUCE_WARP_REDUCTIONS;
-
-        static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_DEFAULT;
-
-        static constexpr int ITEMS_PER_THREAD = 1;
-        static constexpr int BLOCK_THREADS    = BlockThreads;
-      };
-
-      using SingleTilePolicy      = ReducePolicy;
-      using SegmentedReducePolicy = ReducePolicy;
-    };
-
-    using MaxPolicy = Policy500;
-  };
+    auto rp = cub::detail::reduce::agent_reduce_policy{
+      BlockThreads, 1, 1, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_DEFAULT};
+    return {rp, rp, rp};
+  }
 };
 
-struct get_scan_tuning_query_t
+struct unrelated_policy
 {};
 
-struct scan_tuning
+struct unrelated_tuning
 {
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_scan_tuning_query_t&) const noexcept
+  // should never be called
+  auto operator()(cuda::arch_id /*arch*/) const -> unrelated_policy
   {
-    return *this;
+    throw 1337;
   }
-
-  // Make sure this is not used
-  template <class /* AccumT */, class /* Offset */, class /* OpT */>
-  struct fn
-  {};
 };
 
 using block_sizes = c2h::type_list<cuda::std::integral_constant<int, 32>, cuda::std::integral_constant<int, 64>>;
@@ -65,8 +44,8 @@ C2H_TEST("Device segmented sum can be tuned", "[reduce][device]", block_sizes)
   thrust::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9};
   thrust::device_vector<int> d_out(3);
 
-  // We are expecting that `scan_tuning` is ignored
-  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, scan_tuning{});
+  // We are expecting that `unrelated_tuning` is ignored
+  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, unrelated_tuning{});
 
   auto error =
     cub::DeviceSegmentedReduce::Sum(d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, env);
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/tune.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/tune.pass.cpp
index 5944599ed2c..088670a862d 100644
--- a/libcudacxx/test/libcudacxx/cuda/execution/tune.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/execution/tune.pass.cpp
@@ -10,54 +10,31 @@
 
 #include <cuda/__execution/tune.h>
 
-struct get_reduce_tuning_query_t
-{};
+struct reduce_policy
+{
+  int block_threads;
+};
 
-template <class Derived>
-struct reduce_tuning
+template <int BlockThreads, class T>
+struct reduce_policy_selector
 {
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_reduce_tuning_query_t&) const noexcept -> Derived
+  _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> reduce_policy
   {
-    return static_cast<const Derived&>(*this);
+    return {BlockThreads / sizeof(T)};
   }
 };
 
-template <int BlockThreads>
-struct reduce : reduce_tuning<reduce<BlockThreads>>
+struct scan_policy
 {
-  template <class T>
-  struct type
-  {
-    struct max_policy
-    {
-      struct reduce_policy
-      {
-        static constexpr int block_threads = BlockThreads / sizeof(T);
-      };
-    };
-  };
+  int block_threads = 1;
 };
 
-struct get_scan_tuning_query_t
-{};
-
-struct scan_tuning
+struct scan_policy_selector
 {
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_scan_tuning_query_t&) const noexcept
+  _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> scan_policy
   {
-    return *this;
+    return {};
   }
-
-  struct type
-  {
-    struct max_policy
-    {
-      struct reduce_policy
-      {
-        static constexpr int block_threads = 1;
-      };
-    };
-  };
 };
 
 __host__ __device__ void test()
@@ -65,15 +42,13 @@ __host__ __device__ void test()
   constexpr int nominal_block_threads = 256;
   constexpr int block_threads         = nominal_block_threads / sizeof(int);
 
-  using env_t           = decltype(cuda::execution::__tune(reduce<nominal_block_threads>{}, scan_tuning{}));
+  using env_t           = decltype(cuda::execution::__tune(reduce<nominal_block_threads, int>{}, scan_tuning{}));
   using tuning_t        = cuda::std::execution::__query_result_t<env_t, cuda::execution::__get_tuning_t>;
-  using reduce_tuning_t = cuda::std::execution::__query_result_t<tuning_t, get_reduce_tuning_query_t>;
-  using scan_tuning_t   = cuda::std::execution::__query_result_t<tuning_t, get_scan_tuning_query_t>;
-  using reduce_policy_t = reduce_tuning_t::type<int>;
-  using scan_policy_t   = scan_tuning_t::type;
+  using reduce_policy_t = cuda::std::execution::__query_result_t<tuning_t, reduce_policy>;
+  using scan_policy_t   = cuda::std::execution::__query_result_t<tuning_t, scan_policy>;
 
-  static_assert(reduce_policy_t::max_policy::reduce_policy::block_threads == block_threads);
-  static_assert(scan_policy_t::max_policy::reduce_policy::block_threads == 1);
+  static_assert(reduce_policy_t{}(cuda::arch_id::sm_75). : block_threads == block_threads);
+  static_assert(scan_policy_t{}(cuda::arch_id::sm_75). : block_threads == 1);
 }
 
 int main(int, char**)

From 7b0534bc4403eca4e3318e82c557cf9f2d34b89e Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Mon, 9 Mar 2026 21:50:13 +0100
Subject: [PATCH 5/5] Fix

---
 ...ch2_test_device_reduce_nondeterministic.cu | 23 +++++--------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/cub/test/catch2_test_device_reduce_nondeterministic.cu b/cub/test/catch2_test_device_reduce_nondeterministic.cu
index 72cd5d9160e..a85c794fa7b 100644
--- a/cub/test/catch2_test_device_reduce_nondeterministic.cu
+++ b/cub/test/catch2_test_device_reduce_nondeterministic.cu
@@ -29,28 +29,17 @@ using float_type_list =
 #endif
                  >;
 
-template <int NOMINAL_BLOCK_THREADS_4B, int NOMINAL_ITEMS_PER_THREAD_4B>
-struct AgentReducePolicy
-{
-  /// Number of items per vectorized load
-  static constexpr int VECTOR_LOAD_LENGTH = 4;
-
-  /// Cooperative block-wide reduction algorithm to use
-  static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM =
-    cub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC;
-
-  /// Cache load modifier for reading input elements
-  static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::CacheLoadModifier::LOAD_DEFAULT;
-  constexpr static int ITEMS_PER_THREAD                 = NOMINAL_ITEMS_PER_THREAD_4B;
-  constexpr static int BLOCK_THREADS                    = NOMINAL_BLOCK_THREADS_4B;
-};
-
 template <int ItemsPerThread, int BlockSize>
 struct custom_policy_selector
 {
   _CCCL_API constexpr auto operator()(::cuda::arch_id) const -> cub::detail::reduce::reduce_policy
   {
-    auto rp = cub::detail::reduce::agent_reduce_policy{BlockSize, ItemsPerThread};
+    auto rp = cub::detail::reduce::agent_reduce_policy{
+      BlockSize,
+      ItemsPerThread,
+      4,
+      cub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC,
+      cub::CacheLoadModifier::LOAD_DEFAULT};
     return {rp, rp, rp};
   }
 };