diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu
index 54d72ae4bb9..e02945740cc 100644
--- a/cub/benchmarks/bench/reduce/by_key.cu
+++ b/cub/benchmarks/bench/reduce/by_key.cu
@@ -15,72 +15,29 @@
 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5
 
 #if !TUNE_BASE
-#  if TUNE_TRANSPOSE == 0
-#    define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
-#  else // TUNE_TRANSPOSE == 1
-#    define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
-#  endif // TUNE_TRANSPOSE
-
-#  if TUNE_LOAD == 0
-#    define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
-#  else // TUNE_LOAD == 1
-#    define TUNE_LOAD_MODIFIER cub::LOAD_CA
-#  endif // TUNE_LOAD
-
-struct reduce_by_key_policy_hub
+struct bench_reduce_by_key_policy_selector
 {
-  struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const
+    -> cub::detail::reduce_by_key::reduce_by_key_policy
   {
-    using ReduceByKeyPolicyT =
-      cub::AgentReduceByKeyPolicy<TUNE_THREADS,
-                                  TUNE_ITEMS,
-                                  TUNE_LOAD_ALGORITHM,
-                                  TUNE_LOAD_MODIFIER,
-                                  cub::BLOCK_SCAN_WARP_SCANS,
-                                  delay_constructor_t>;
-  };
-
-  using MaxPolicy = Policy500;
+    return {
+      TUNE_THREADS,
+      TUNE_ITEMS,
+      TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE,
+      TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA,
+      cub::BLOCK_SCAN_WARP_SCANS,
+      delay_constructor_policy,
+    };
+  }
 };
 #endif // !TUNE_BASE
 
 template <class KeyT, class ValueT, class OffsetT>
 static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT>)
 {
-  using keys_input_it_t            = const KeyT*;
-  using unique_output_it_t         = KeyT*;
-  using vals_input_it_t            = const ValueT*;
-  using aggregate_output_it_t      = ValueT*;
-  using num_runs_output_iterator_t = OffsetT*;
-  using equality_op_t              = ::cuda::std::equal_to<>;
-  using reduction_op_t             = ::cuda::std::plus<>;
-  using accum_t                    = ValueT;
-  using offset_t                   = OffsetT;
-
-#if !TUNE_BASE
-  using dispatch_t = cub::DispatchReduceByKey<
-    keys_input_it_t,
-    unique_output_it_t,
-    vals_input_it_t,
-    aggregate_output_it_t,
-    num_runs_output_iterator_t,
-    equality_op_t,
-    reduction_op_t,
-    offset_t,
-    accum_t,
-    reduce_by_key_policy_hub>;
-#else
-  using dispatch_t = cub::DispatchReduceByKey<
-    keys_input_it_t,
-    unique_output_it_t,
-    vals_input_it_t,
-    aggregate_output_it_t,
-    num_runs_output_iterator_t,
-    equality_op_t,
-    reduction_op_t,
-    offset_t,
-    accum_t>;
-#endif
+  using equality_op_t  = ::cuda::std::equal_to<>;
+  using reduction_op_t = ::cuda::std::plus<>;
+  using offset_t       = OffsetT;
 
   const auto elements                    = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   constexpr std::size_t min_segment_size = 1;
@@ -92,43 +49,42 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
   thrust::device_vector<KeyT> out_keys(elements);
   thrust::device_vector<KeyT> in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size);
 
-  KeyT* d_in_keys         = thrust::raw_pointer_cast(in_keys.data());
+  const KeyT* d_in_keys   = thrust::raw_pointer_cast(in_keys.data());
   KeyT* d_out_keys        = thrust::raw_pointer_cast(out_keys.data());
-  ValueT* d_in_vals       = thrust::raw_pointer_cast(in_vals.data());
+  const ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data());
   ValueT* d_out_vals      = thrust::raw_pointer_cast(out_vals.data());
   OffsetT* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data());
 
   std::uint8_t* d_temp_storage{};
   std::size_t temp_storage_bytes{};
+  const offset_t num_items = static_cast<offset_t>(elements);
 
-  dispatch_t::Dispatch(
-    d_temp_storage,
-    temp_storage_bytes,
-    d_in_keys,
-    d_out_keys,
-    d_in_vals,
-    d_out_vals,
-    d_num_runs_out,
-    equality_op_t{},
-    reduction_op_t{},
-    elements,
-    0);
+  auto dispatch_on_stream = [&](cudaStream_t stream) {
+    return cub::detail::reduce_by_key::dispatch</* OverrideAccumT */ ValueT>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in_keys,
+      d_out_keys,
+      d_in_vals,
+      d_out_vals,
+      d_num_runs_out,
+      equality_op_t{},
+      reduction_op_t{},
+      num_items,
+      stream
+#if !TUNE_BASE
+      ,
+      bench_reduce_by_key_policy_selector{}
+#endif
+    );
+  };
+
+  dispatch_on_stream(cudaStream_t{0});
 
   thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
 
-  dispatch_t::Dispatch(
-    d_temp_storage,
-    temp_storage_bytes,
-    d_in_keys,
-    d_out_keys,
-    d_in_vals,
-    d_out_vals,
-    d_num_runs_out,
-    equality_op_t{},
-    reduction_op_t{},
-    elements,
-    0);
+  dispatch_on_stream(cudaStream_t{0});
   cudaDeviceSynchronize();
   const OffsetT num_runs = num_runs_out[0];
 
@@ -140,18 +96,7 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
   state.add_global_memory_writes<OffsetT>(1);
 
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in_keys,
-      d_out_keys,
-      d_in_vals,
-      d_out_vals,
-      d_num_runs_out,
-      equality_op_t{},
-      reduction_op_t{},
-      elements,
-      launch.get_stream());
+    dispatch_on_stream(launch.get_stream());
   });
 }
 
diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu
index 51f23a424f3..37227aa7649 100644
--- a/cub/benchmarks/bench/run_length_encode/encode.cu
+++ b/cub/benchmarks/bench/run_length_encode/encode.cu
@@ -17,32 +17,20 @@
 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5
 
 #if !TUNE_BASE
-#  if TUNE_TRANSPOSE == 0
-#    define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
-#  else // TUNE_TRANSPOSE == 1
-#    define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
-#  endif // TUNE_TRANSPOSE
-
-#  if TUNE_LOAD == 0
-#    define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
-#  else // TUNE_LOAD == 1
-#    define TUNE_LOAD_MODIFIER cub::LOAD_CA
-#  endif // TUNE_LOAD
-
-struct reduce_by_key_policy_hub
+struct bench_encode_policy_selector
 {
-  struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const
+    -> cub::detail::reduce_by_key::reduce_by_key_policy
   {
-    using ReduceByKeyPolicyT =
-      cub::AgentReduceByKeyPolicy<TUNE_THREADS,
-                                  TUNE_ITEMS,
-                                  TUNE_LOAD_ALGORITHM,
-                                  TUNE_LOAD_MODIFIER,
-                                  cub::BLOCK_SCAN_WARP_SCANS,
-                                  delay_constructor_t>;
-  };
-
-  using MaxPolicy = Policy500;
+    return {
+      TUNE_THREADS,
+      TUNE_ITEMS,
+      TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE,
+      TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA,
+      cub::BLOCK_SCAN_WARP_SCANS,
+      delay_constructor_policy,
+    };
+  }
 };
 #endif // !TUNE_BASE
 
@@ -56,41 +44,10 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
   // Offset type large enough to represent the total number of runs in the sequence
   using num_runs_t = offset_t;
 
-  using keys_input_it_t            = const T*;
-  using unique_output_it_t         = T*;
-  using run_length_input_it_t      = thrust::constant_iterator<run_length_t, offset_t>;
-  using run_length_output_it_t     = run_length_t*;
-  using num_runs_output_iterator_t = num_runs_t*;
-  using equality_op_t              = ::cuda::std::equal_to<>;
-  using reduction_op_t             = ::cuda::std::plus<>;
-  using accum_t                    = run_length_t;
-
-#if !TUNE_BASE
-  using dispatch_t = cub::detail::reduce::DispatchStreamingReduceByKey<
-    keys_input_it_t,
-    unique_output_it_t,
-    run_length_input_it_t,
-    run_length_output_it_t,
-    num_runs_output_iterator_t,
-    equality_op_t,
-    reduction_op_t,
-    offset_t,
-    accum_t,
-    reduce_by_key_policy_hub>;
-#else
-  using policy_t   = cub::detail::rle::encode::policy_hub<accum_t, T>;
-  using dispatch_t = cub::detail::reduce::DispatchStreamingReduceByKey<
-    keys_input_it_t,
-    unique_output_it_t,
-    run_length_input_it_t,
-    run_length_output_it_t,
-    num_runs_output_iterator_t,
-    equality_op_t,
-    reduction_op_t,
-    offset_t,
-    accum_t,
-    policy_t>;
-#endif
+  using run_length_input_it_t = thrust::constant_iterator<run_length_t, offset_t>;
+  using equality_op_t         = ::cuda::std::equal_to<>;
+  using reduction_op_t        = ::cuda::std::plus<>;
+  using accum_t               = run_length_t;
 
   const auto elements                    = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   constexpr std::size_t min_segment_size = 1;
@@ -101,7 +58,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
   thrust::device_vector<T> out_keys(elements);
   thrust::device_vector<T> in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size);
 
-  T* d_in_keys        = thrust::raw_pointer_cast(in_keys.data());
+  const T* d_in_keys  = thrust::raw_pointer_cast(in_keys.data());
   T* d_out_keys       = thrust::raw_pointer_cast(out_keys.data());
   auto d_out_vals     = thrust::raw_pointer_cast(out_vals.data());
   auto d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data());
@@ -109,35 +66,35 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
 
   std::uint8_t* d_temp_storage{};
   std::size_t temp_storage_bytes{};
+  const offset_t num_items = static_cast<offset_t>(elements);
+
+  auto dispatch_on_stream = [&](cudaStream_t stream) {
+    return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key</* OverrideAccumT */ accum_t>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in_keys,
+      d_out_keys,
+      d_in_vals,
+      d_out_vals,
+      d_num_runs_out,
+      equality_op_t{},
+      reduction_op_t{},
+      num_items,
+      stream,
+#if TUNE_BASE
+      cub::detail::rle::encode::policy_selector_from_types<accum_t, T> {}
+#else // TUNE_BASE
+      bench_encode_policy_selector{}
+#endif // TUNE_BASE
+    );
+  };
 
-  dispatch_t::Dispatch(
-    d_temp_storage,
-    temp_storage_bytes,
-    d_in_keys,
-    d_out_keys,
-    d_in_vals,
-    d_out_vals,
-    d_num_runs_out,
-    equality_op_t{},
-    reduction_op_t{},
-    elements,
-    0);
+  dispatch_on_stream(cudaStream_t{0});
 
   thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
 
-  dispatch_t::Dispatch(
-    d_temp_storage,
-    temp_storage_bytes,
-    d_in_keys,
-    d_out_keys,
-    d_in_vals,
-    d_out_vals,
-    d_num_runs_out,
-    equality_op_t{},
-    reduction_op_t{},
-    elements,
-    0);
+  dispatch_on_stream(cudaStream_t{0});
   cudaDeviceSynchronize();
   const num_runs_t num_runs = num_runs_out[0];
 
@@ -148,18 +105,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
   state.add_global_memory_writes<num_runs_t>(1);
 
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in_keys,
-      d_out_keys,
-      d_in_vals,
-      d_out_vals,
-      d_num_runs_out,
-      equality_op_t{},
-      reduction_op_t{},
-      elements,
-      launch.get_stream());
+    dispatch_on_stream(launch.get_stream());
   });
 }
 
diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh
index 43f74df5f35..9566cb705e4 100644
--- a/cub/cub/agent/agent_reduce_by_key.cuh
+++ b/cub/cub/agent/agent_reduce_by_key.cuh
@@ -2,10 +2,9 @@
 // SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
-/**
- * @file cub::AgentReduceByKey implements a stateful abstraction of CUDA thread
- *       blocks for participating in device-wide reduce-value-by-key.
- */
+//! @file
+//! cub::detail::reduce_by_key::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for
+//! participating in device-wide reduce-value-by-key.
 
 #pragma once
 
@@ -92,7 +91,7 @@ struct AgentReduceByKeyPolicy
  * Thread block abstractions
  ******************************************************************************/
 
-namespace detail::reduce
+namespace detail::reduce_by_key
 {
 /**
  * @brief AgentReduceByKey implements a stateful abstraction of CUDA thread
@@ -771,6 +770,6 @@ struct AgentReduceByKey
     }
   }
 };
-} // namespace detail::reduce
+} // namespace detail::reduce_by_key
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh
index 23d0c6d571e..9d939d4563f 100644
--- a/cub/cub/detail/delay_constructor.cuh
+++ b/cub/cub/detail/delay_constructor.cuh
@@ -193,15 +193,27 @@ struct delay_constructor_for<delay_constructor_kind::reduce_by_key, Delay, L2Wri
 template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
 using delay_constructor_t = typename delay_constructor_for<Kind, Delay, L2WriteLatency>::type;
 
-_CCCL_API constexpr auto
-default_reduce_by_key_delay_constructor_policy(int key_size, int value_size, bool value_is_primitive)
+_CCCL_API constexpr auto default_delay_constructor_policy(bool is_primitive_or_trivially_copyable)
 {
-  if (value_is_primitive && (value_size + key_size < 16))
+  if (is_primitive_or_trivially_copyable)
   {
-    return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450};
+    return delay_constructor_policy{delay_constructor_kind::fixed_delay, 350, 450};
   }
   return delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450};
 }
+
+_CCCL_API constexpr auto default_reduce_by_key_delay_constructor_policy(
+  int key_size,
+  int value_size,
+  bool key_is_primitive_or_trivially_copyable,
+  bool value_is_primitive_or_trivially_copyable)
+{
+  if (value_is_primitive_or_trivially_copyable && (value_size + key_size < 16))
+  {
+    return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450};
+  }
+  return default_delay_constructor_policy(key_is_primitive_or_trivially_copyable);
+}
 } // namespace detail
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index 50101c16e0f..afbbe219bf9 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -2308,35 +2308,21 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey");
 
-    // Signed integer type for global offsets
-    using OffsetT = detail::choose_offset_t<NumItemsT>;
-
-    // FlagT iterator type (not used)
-
-    // Selection op (not used)
-
-    // Default == operator
+    using OffsetT    = detail::choose_offset_t<NumItemsT>;
     using EqualityOp = ::cuda::std::equal_to<>;
 
-    return DispatchReduceByKey<
-      KeysInputIteratorT,
-      UniqueOutputIteratorT,
-      ValuesInputIteratorT,
-      AggregatesOutputIteratorT,
-      NumRunsOutputIteratorT,
-      EqualityOp,
-      ReductionOpT,
-      OffsetT>::Dispatch(d_temp_storage,
-                         temp_storage_bytes,
-                         d_keys_in,
-                         d_unique_out,
-                         d_values_in,
-                         d_aggregates_out,
-                         d_num_runs_out,
-                         EqualityOp(),
-                         reduction_op,
-                         static_cast<OffsetT>(num_items),
-                         stream);
+    return detail::reduce_by_key::dispatch(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_keys_in,
+      d_unique_out,
+      d_values_in,
+      d_aggregates_out,
+      d_num_runs_out,
+      EqualityOp{},
+      reduction_op,
+      static_cast<OffsetT>(num_items),
+      stream);
   }
 };
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh
index 44fac1b1cd9..8b1373afdc5 100644
--- a/cub/cub/device/device_run_length_encode.cuh
+++ b/cub/cub/device/device_run_length_encode.cuh
@@ -186,29 +186,21 @@ struct DeviceRunLengthEncode
 
     using key_t = cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
 
-    using policy_t = detail::rle::encode::policy_hub<accum_t, key_t>;
+    using policy_selector_t = detail::rle::encode::policy_selector_from_types<accum_t, key_t>;
 
-    return detail::reduce::DispatchStreamingReduceByKey<
-      InputIteratorT,
-      UniqueOutputIteratorT,
-      lengths_input_iterator_t,
-      LengthsOutputIteratorT,
-      NumRunsOutputIteratorT,
-      equality_op,
-      reduction_op,
-      offset_t,
-      accum_t,
-      policy_t>::Dispatch(d_temp_storage,
-                          temp_storage_bytes,
-                          d_in,
-                          d_unique_out,
-                          lengths_input_iterator_t((length_t) 1),
-                          d_counts_out,
-                          d_num_runs_out,
-                          equality_op(),
-                          reduction_op(),
-                          num_items,
-                          stream);
+    return detail::reduce_by_key::dispatch_streaming_reduce_by_key</* OverrideAccumT */ accum_t>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_unique_out,
+      lengths_input_iterator_t(length_t{1}),
+      d_counts_out,
+      d_num_runs_out,
+      equality_op{},
+      reduction_op{},
+      static_cast<offset_t>(num_items),
+      stream,
+      policy_selector_t{});
   }
 
   //! @rst
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index cda8857f460..14fbf3ae15c 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -20,6 +20,8 @@
 #endif // no system header
 
 #include <cub/agent/agent_reduce_by_key.cuh>
+#include <cub/detail/arch_dispatch.cuh>
+#include <cub/device/dispatch/dispatch_common.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce_by_key.cuh>
 #include <cub/thread/thread_operators.cuh>
@@ -29,13 +31,17 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+#  include <sstream>
+#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+
 CUB_NAMESPACE_BEGIN
 
 /******************************************************************************
  * Kernel entry points
  *****************************************************************************/
 
-namespace detail::reduce
+namespace detail::reduce_by_key
 {
 template <typename PrecedingKeyItT, typename AccumT, typename GlobalOffsetT>
 struct streaming_context
@@ -106,8 +112,8 @@ struct streaming_context
 /**
  * @brief Multi-block reduce-by-key sweep kernel entry point
  *
- * @tparam AgentReduceByKeyPolicyT
- *   Parameterized AgentReduceByKeyPolicyT tuning policy type
+ * @tparam PolicySelector
+ *   Selects the tuning policy
  *
  * @tparam KeysInputIteratorT
  *   Random-access input iterator type for keys
@@ -167,7 +173,7 @@ struct streaming_context
  * @param num_items
  *   Total number of items to select from
  */
-template <typename ChainedPolicyT,
+template <typename PolicySelector,
           typename KeysInputIteratorT,
           typename UniqueOutputIteratorT,
           typename ValuesInputIteratorT,
@@ -179,7 +185,10 @@ template <typename ChainedPolicyT,
           typename OffsetT,
           typename AccumT,
           typename StreamingContextT>
-__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_THREADS))
+#if _CCCL_HAS_CONCEPTS()
+  requires reduce_by_key_policy_selector<PolicySelector>
+#endif
+__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel(
     KeysInputIteratorT d_keys_in,
     UniqueOutputIteratorT d_unique_out,
@@ -194,8 +203,19 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
     vsmem_t vsmem)
 {
+  static constexpr reduce_by_key_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
+  using AgentReduceByKeyPolicyT                = AgentReduceByKeyPolicy<
+                   policy.block_threads,
+                   policy.items_per_thread,
+                   policy.load_algorithm,
+                   policy.load_modifier,
+                   policy.scan_algorithm,
+                   delay_constructor_t<policy.delay_constructor.kind,
+                                       policy.delay_constructor.delay,
+                                       policy.delay_constructor.l2_write_latency>>;
+
   using vsmem_helper_t = vsmem_helper_default_fallback_policy_t<
-    typename ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT,
+    AgentReduceByKeyPolicyT,
     AgentReduceByKey,
     KeysInputIteratorT,
     UniqueOutputIteratorT,
@@ -234,7 +254,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
   // If applicable, hints to discard modified cache lines for vsmem
   vsmem_helper_t::discard_temp_storage(temp_storage);
 }
-} // namespace detail::reduce
+} // namespace detail::reduce_by_key
 
 /******************************************************************************
  * Dispatch
@@ -272,6 +292,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
  *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
+// TODO(bgruber): deprecate when we make the tuning API public and remove in CCCL 4.0
 template <typename KeysInputIteratorT,
           typename UniqueOutputIteratorT,
           typename ValuesInputIteratorT,
@@ -351,7 +372,7 @@ struct DispatchReduceByKey
   {
     using vsmem_helper_t = detail::vsmem_helper_default_fallback_policy_t<
       typename ActivePolicyT::ReduceByKeyPolicyT,
-      detail::reduce::AgentReduceByKey,
+      detail::reduce_by_key::AgentReduceByKey,
       KeysInputIteratorT,
       UniqueOutputIteratorT,
       ValuesInputIteratorT,
@@ -518,8 +539,8 @@ struct DispatchReduceByKey
   {
     return Invoke<ActivePolicyT>(
       detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
-      detail::reduce::DeviceReduceByKeyKernel<
-        typename PolicyHub::MaxPolicy,
+      detail::reduce_by_key::DeviceReduceByKeyKernel<
+        detail::reduce_by_key::policy_selector_from_hub<PolicyHub>,
         KeysInputIteratorT,
         UniqueOutputIteratorT,
         ValuesInputIteratorT,
@@ -622,4 +643,216 @@ struct DispatchReduceByKey
   }
 };
 
+namespace detail::reduce_by_key
+{
+// we move the conversion of the policy to the agent policy and its use out of the lambda below, so MSVC does not ICE
+template <typename PolicyGetter, typename... Args>
+_CCCL_API auto determine_threads_items_vsmem(PolicyGetter policy_getter)
+{
+  // TODO(bgruber): refactor this in the future
+  constexpr reduce_by_key_policy policy = policy_getter();
+  using Policy                          = AgentReduceByKeyPolicy<
+                             policy.block_threads,
+                             policy.items_per_thread,
+                             policy.load_algorithm,
+                             policy.load_modifier,
+                             policy.scan_algorithm,
+                             delay_constructor_t<policy.delay_constructor.kind,
+                                                 policy.delay_constructor.delay,
+                                                 policy.delay_constructor.l2_write_latency>>;
+  using vsmem_helper_t = vsmem_helper_default_fallback_policy_t<Policy, AgentReduceByKey, Args...>;
+  return ::cuda::std::tuple{vsmem_helper_t::agent_policy_t::BLOCK_THREADS,
+                            vsmem_helper_t::agent_policy_t::ITEMS_PER_THREAD,
+                            vsmem_helper_t::vsmem_per_block};
+}
+
+template <
+  typename OverrideAccumT = use_default,
+  typename KeysInputIteratorT,
+  typename UniqueOutputIteratorT,
+  typename ValuesInputIteratorT,
+  typename AggregatesOutputIteratorT,
+  typename NumRunsOutputIteratorT,
+  typename EqualityOpT,
+  typename ReductionOpT,
+  typename OffsetT,
+  typename AccumT = ::cuda::std::conditional_t<
+    !::cuda::std::is_same_v<OverrideAccumT, use_default>,
+    OverrideAccumT,
+    ::cuda::std::__accumulator_t<ReductionOpT, it_value_t<ValuesInputIteratorT>, it_value_t<ValuesInputIteratorT>>>,
+  typename KeyT           = non_void_value_t<UniqueOutputIteratorT, it_value_t<KeysInputIteratorT>>,
+  typename PolicySelector = policy_selector_from_types<ReductionOpT, AccumT, KeyT>>
+#if _CCCL_HAS_CONCEPTS()
+  requires reduce_by_key::reduce_by_key_policy_selector<PolicySelector>
+#endif // _CCCL_HAS_CONCEPTS()
+CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
+  void* d_temp_storage,
+  size_t& temp_storage_bytes,
+  KeysInputIteratorT d_keys_in,
+  UniqueOutputIteratorT d_unique_out,
+  ValuesInputIteratorT d_values_in,
+  AggregatesOutputIteratorT d_aggregates_out,
+  NumRunsOutputIteratorT d_num_runs_out,
+  EqualityOpT equality_op,
+  ReductionOpT reduction_op,
+  OffsetT num_items,
+  cudaStream_t stream,
+  PolicySelector policy_selector = {})
+{
+  using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet
+  using ScanTileStateT      = ReduceByKeyScanTileState<AccumT, OffsetT>;
+  [[maybe_unused]] static constexpr int init_kernel_threads = 128;
+
+  ::cuda::arch_id arch_id{};
+  if (const auto error = CubDebug(ptx_arch_id(arch_id)))
+  {
+    return error;
+  }
+
+  return detail::dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) {
+#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+    NV_IF_TARGET(
+      NV_IS_HOST,
+      (::std::stringstream ss; ss << policy_getter(); _CubLog(
+         "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast<int>(arch_id), ss.str().c_str());))
+#endif
+
+    const auto [block_threads, items_per_thread, vsmem_per_block] = determine_threads_items_vsmem<
+      decltype(policy_getter),
+      KeysInputIteratorT,
+      UniqueOutputIteratorT,
+      ValuesInputIteratorT,
+      AggregatesOutputIteratorT,
+      NumRunsOutputIteratorT,
+      EqualityOpT,
+      ReductionOpT,
+      OffsetT,
+      AccumT,
+      streaming_context_t>(policy_getter);
+
+    // Number of input tiles
+    const int tile_size = block_threads * items_per_thread;
+    const int num_tiles = static_cast<int>(::cuda::ceil_div(num_items, tile_size));
+
+    // The amount of virtual shared memory to allocate
+    const auto vsmem_size = num_tiles * vsmem_per_block;
+
+    size_t tile_descriptor_memory{};
+    if (const auto error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, tile_descriptor_memory)))
+    {
+      return error;
+    }
+    size_t allocation_sizes[2] = {tile_descriptor_memory, vsmem_size};
+    void* allocations[2]       = {};
+
+    if (const auto error =
+          CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
+    {
+      return error;
+    }
+
+    if (d_temp_storage == nullptr)
+    {
+      return cudaSuccess;
+    }
+
+    ScanTileStateT tile_state;
+    if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])))
+    {
+      return error;
+    }
+
+    const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_tiles, init_kernel_threads));
+#ifdef CUB_DEBUG_LOG
+    _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
+#endif
+    if (const auto error = CubDebug(
+          THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream)
+            .doit(detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
+                  tile_state,
+                  num_tiles,
+                  d_num_runs_out)))
+    {
+      return error;
+    }
+    if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
+    {
+      return error;
+    }
+    if (num_items == 0)
+    {
+      return cudaSuccess;
+    }
+
+    auto reduce_by_key_kernel = &DeviceReduceByKeyKernel<
+      PolicySelector,
+      KeysInputIteratorT,
+      UniqueOutputIteratorT,
+      ValuesInputIteratorT,
+      AggregatesOutputIteratorT,
+      NumRunsOutputIteratorT,
+      ScanTileStateT,
+      EqualityOpT,
+      ReductionOpT,
+      OffsetT,
+      AccumT,
+      streaming_context_t>;
+
+    int reduce_by_key_sm_occupancy{};
+    if (const auto error = CubDebug(MaxSmOccupancy(reduce_by_key_sm_occupancy, reduce_by_key_kernel, block_threads)))
+    {
+      return error;
+    }
+
+    int device_ordinal{};
+    if (const auto error = CubDebug(cudaGetDevice(&device_ordinal)))
+    {
+      return error;
+    }
+    int max_dim_x{};
+    if (const auto error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)))
+    {
+      return error;
+    }
+
+    const int scan_grid_size = ::cuda::std::min(num_tiles, max_dim_x);
+    for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
+    {
+#ifdef CUB_DEBUG_LOG
+      _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+              start_tile,
+              scan_grid_size,
+              block_threads,
+              (long long) stream,
+              items_per_thread,
+              reduce_by_key_sm_occupancy);
+#endif
+      if (const auto error = CubDebug(
+            THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream)
+              .doit(reduce_by_key_kernel,
+                    d_keys_in,
+                    d_unique_out,
+                    d_values_in,
+                    d_aggregates_out,
+                    d_num_runs_out,
+                    tile_state,
+                    start_tile,
+                    equality_op,
+                    reduction_op,
+                    num_items,
+                    streaming_context_t{},
+                    cub::detail::vsmem_t{allocations[1]})))
+      {
+        return error;
+      }
+      if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
+      {
+        return error;
+      }
+    }
+    return cudaSuccess;
+  });
+}
+} // namespace detail::reduce_by_key
+
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
index 57d34ff4e7e..5d959df2c4c 100644
--- a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
@@ -14,442 +14,246 @@
 #endif // no system header
 
 #include <cub/agent/agent_reduce_by_key.cuh>
+#include <cub/device/dispatch/dispatch_common.cuh>
 #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce_by_key.cuh>
-#include <cub/thread/thread_operators.cuh>
 #include <cub/util_device.cuh>
-#include <cub/util_math.cuh>
 #include <cub/util_type.cuh>
 
 #include <thrust/iterator/offset_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
-#if !_CCCL_COMPILER(NVRTC)
-#  include <iostream>
-#endif // !_CCCL_COMPILER(NVRTC)
+#include <cuda/std/__functional/invoke.h>
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_same.h>
+
+#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+#  include <sstream>
+#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
 
 CUB_NAMESPACE_BEGIN
 
-namespace detail::reduce
+namespace detail::reduce_by_key
 {
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * @brief Utility class for dispatching the appropriately-tuned kernels for
- *        DeviceReduceByKey
- *
- * @tparam KeysInputIteratorT
- *   Random-access input iterator type for keys
- *
- * @tparam UniqueOutputIteratorT
- *   Random-access output iterator type for keys
- *
- * @tparam ValuesInputIteratorT
- *   Random-access input iterator type for values
- *
- * @tparam AggregatesOutputIteratorT
- *   Random-access output iterator type for values
- *
- * @tparam NumRunsOutputIteratorT
- *   Output iterator type for recording number of segments encountered
- *
- * @tparam EqualityOpT
- *   KeyT equality operator type
- *
- * @tparam ReductionOpT
- *   ValueT reduction operator type
- *
- * @tparam PolicyHub
- *   Implementation detail, do not specify directly, requirements on the
- *   content of this type are subject to breaking change.
- */
-template <typename KeysInputIteratorT,
-          typename UniqueOutputIteratorT,
-          typename ValuesInputIteratorT,
-          typename AggregatesOutputIteratorT,
-          typename NumRunsOutputIteratorT,
-          typename EqualityOpT,
-          typename ReductionOpT,
-          typename OffsetT,
-          typename AccumT    = ::cuda::std::__accumulator_t<ReductionOpT,
-                                                            cub::detail::it_value_t<ValuesInputIteratorT>,
-                                                            cub::detail::it_value_t<ValuesInputIteratorT>>,
-          typename PolicyHub = detail::reduce_by_key::policy_hub<
-            ReductionOpT,
-            AccumT,
-            cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::it_value_t<KeysInputIteratorT>>>>
-struct DispatchStreamingReduceByKey
+template <
+  typename OverrideAccumT = use_default,
+  typename KeysInputIteratorT,
+  typename UniqueOutputIteratorT,
+  typename ValuesInputIteratorT,
+  typename AggregatesOutputIteratorT,
+  typename NumRunsOutputIteratorT,
+  typename EqualityOpT,
+  typename ReductionOpT,
+  typename OffsetT,
+  typename AccumT = ::cuda::std::conditional_t<
+    !::cuda::std::is_same_v<OverrideAccumT, use_default>,
+    OverrideAccumT,
+    ::cuda::std::__accumulator_t<ReductionOpT, it_value_t<ValuesInputIteratorT>, it_value_t<ValuesInputIteratorT>>>,
+  typename PolicySelector =
+    policy_selector_from_types<ReductionOpT,
+                               AccumT,
+                               non_void_value_t<UniqueOutputIteratorT, it_value_t<KeysInputIteratorT>>>>
+#if _CCCL_HAS_CONCEPTS()
+  requires reduce_by_key_policy_selector<PolicySelector>
+#endif
+CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_reduce_by_key(
+  void* d_temp_storage,
+  size_t& temp_storage_bytes,
+  KeysInputIteratorT d_keys_in,
+  UniqueOutputIteratorT d_unique_out,
+  ValuesInputIteratorT d_values_in,
+  AggregatesOutputIteratorT d_aggregates_out,
+  NumRunsOutputIteratorT d_num_runs_out,
+  EqualityOpT equality_op,
+  ReductionOpT reduction_op,
+  OffsetT num_items,
+  cudaStream_t stream,
+  PolicySelector policy_selector = {})
 {
-  //-------------------------------------------------------------------------
-  // Types and constants
-  //-------------------------------------------------------------------------
-  // Offsets to index items within one partition (i.e., a single kernel invocation)
-  using local_offset_t = ::cuda::std::int32_t;
-
-  // If the number of items provided by the user may exceed the maximum number of items processed by a single kernel
-  // invocation, we may require multiple kernel invocations
-  static constexpr bool use_streaming_invocation = ::cuda::std::numeric_limits<OffsetT>::max()
-                                                 > ::cuda::std::numeric_limits<local_offset_t>::max();
-
-  // Offsets to index any item within the entire input (large enough to cover num_items)
-  using global_offset_t = OffsetT;
-
-  // Type used to provide context about the current partition during a streaming invocation
-  using streaming_context_t =
-    ::cuda::std::conditional_t<use_streaming_invocation,
-                               detail::reduce::streaming_context<KeysInputIteratorT, AccumT, global_offset_t>,
-                               NullType>;
-
-  // The input values type
-  using ValueInputT = cub::detail::it_value_t<ValuesInputIteratorT>;
+  ::cuda::arch_id arch_id{};
+  if (const auto error = CubDebug(ptx_arch_id(arch_id)))
+  {
+    return error;
+  }
 
-  static constexpr int init_kernel_threads = 128;
+  const reduce_by_key_policy policy = policy_selector(arch_id);
 
-  // Tile status descriptor interface type
-  using ScanTileStateT = ReduceByKeyScanTileState<AccumT, local_offset_t>;
+#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+  NV_IF_TARGET(NV_IS_HOST,
+               (::std::stringstream ss; ss << policy;
+                _CubLog("Dispatching streaming reduce by key to arch %d with tuning: %s\n",
+                        static_cast<int>(arch_id),
+                        ss.str().c_str());))
+#endif
 
-  void* d_temp_storage;
-  size_t& temp_storage_bytes;
-  KeysInputIteratorT d_keys_in;
-  UniqueOutputIteratorT d_unique_out;
-  ValuesInputIteratorT d_values_in;
-  AggregatesOutputIteratorT d_aggregates_out;
-  NumRunsOutputIteratorT d_num_runs_out;
-  EqualityOpT equality_op;
-  ReductionOpT reduction_op;
-  global_offset_t num_items;
-  cudaStream_t stream;
+  using local_offset_t  = ::cuda::std::int32_t;
+  using global_offset_t = OffsetT;
+  static constexpr bool use_streaming_invocation =
+    ::cuda::std::numeric_limits<OffsetT>::max() > ::cuda::std::numeric_limits<local_offset_t>::max();
+  using streaming_context_t = ::cuda::std::
+    conditional_t<use_streaming_invocation, streaming_context<KeysInputIteratorT, AccumT, global_offset_t>, NullType>;
+  using ScanTileStateT                                      = ReduceByKeyScanTileState<AccumT, local_offset_t>;
+  [[maybe_unused]] static constexpr int init_kernel_threads = 128;
+
+  const int block_threads    = policy.block_threads;
+  const int items_per_thread = policy.items_per_thread;
+  const auto tile_size       = static_cast<global_offset_t>(block_threads * items_per_thread);
+
+  auto capped_num_items_per_invocation = num_items;
+  if constexpr (use_streaming_invocation)
+  {
+    capped_num_items_per_invocation = static_cast<global_offset_t>(::cuda::std::numeric_limits<local_offset_t>::max());
+    capped_num_items_per_invocation -= (capped_num_items_per_invocation % tile_size);
+  }
 
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchStreamingReduceByKey(
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    KeysInputIteratorT d_keys_in,
-    UniqueOutputIteratorT d_unique_out,
-    ValuesInputIteratorT d_values_in,
-    AggregatesOutputIteratorT d_aggregates_out,
-    NumRunsOutputIteratorT d_num_runs_out,
-    EqualityOpT equality_op,
-    ReductionOpT reduction_op,
-    global_offset_t num_items,
-    cudaStream_t stream)
-      : d_temp_storage(d_temp_storage)
-      , temp_storage_bytes(temp_storage_bytes)
-      , d_keys_in(d_keys_in)
-      , d_unique_out(d_unique_out)
-      , d_values_in(d_values_in)
-      , d_aggregates_out(d_aggregates_out)
-      , d_num_runs_out(d_num_runs_out)
-      , equality_op(equality_op)
-      , reduction_op(reduction_op)
-      , num_items(num_items)
-      , stream(stream)
-  {}
+  const auto max_num_items_per_invocation =
+    use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items;
+  auto const num_partitions =
+    (num_items == 0) ? global_offset_t{1} : ::cuda::ceil_div(num_items, capped_num_items_per_invocation);
 
-  //---------------------------------------------------------------------
-  // Dispatch entrypoints
-  //---------------------------------------------------------------------
+  const auto max_num_tiles = static_cast<int>(::cuda::ceil_div(max_num_items_per_invocation, tile_size));
 
-  template <typename ActivePolicyT, typename ScanInitKernelT, typename ReduceByKeyKernelT>
-  CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
-  Invoke(ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel)
+  size_t allocation_sizes[3];
+  if (const auto error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0])))
   {
-    using AgentReduceByKeyPolicyT  = typename ActivePolicyT::ReduceByKeyPolicyT;
-    constexpr int block_threads    = AgentReduceByKeyPolicyT::BLOCK_THREADS;
-    constexpr int items_per_thread = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD;
-
-    // The upper bound of for the number of items that a single kernel invocation will ever process
-    auto capped_num_items_per_invocation = num_items;
-    if constexpr (use_streaming_invocation)
-    {
-      capped_num_items_per_invocation =
-        static_cast<global_offset_t>(::cuda::std::numeric_limits<local_offset_t>::max());
-      // Make sure that the number of items is a multiple of tile size
-      capped_num_items_per_invocation -= (capped_num_items_per_invocation % (block_threads * items_per_thread));
-    }
-
-    // Across invocations, the maximum number of items that a single kernel invocation will ever process
-    const auto max_num_items_per_invocation =
-      use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items;
+    return error;
+  }
+  allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0};
+  allocation_sizes[2] = num_partitions > 1 ? sizeof(AccumT) * 2 : size_t{0};
 
-    // Number of invocations required to "iterate" over the total input (at least one iteration to process zero items)
-    auto const num_partitions =
-      (num_items == 0) ? global_offset_t{1} : ::cuda::ceil_div(num_items, capped_num_items_per_invocation);
+  void* allocations[3] = {};
+  if (const auto error =
+        CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
+  {
+    return error;
+  }
 
-    cudaError error = cudaSuccess;
+  if (d_temp_storage == nullptr)
+  {
+    return cudaSuccess;
+  }
 
-    // Number of input tiles
-    const auto tile_size = static_cast<global_offset_t>(block_threads * items_per_thread);
-    int max_num_tiles    = static_cast<int>(::cuda::ceil_div(max_num_items_per_invocation, tile_size));
+  for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++)
+  {
+    global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation;
+    global_offset_t current_num_items =
+      (partition_idx + 1 == num_partitions) ? (num_items - current_partition_offset) : capped_num_items_per_invocation;
 
-    // Specify temporary storage allocation requirements
-    size_t allocation_sizes[3];
-    error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0]));
-    if (cudaSuccess != error)
+    const auto num_current_tiles = static_cast<int>(::cuda::ceil_div(current_num_items, tile_size));
+    ScanTileStateT tile_state;
+    if (const auto error = CubDebug(tile_state.Init(num_current_tiles, allocations[0], allocation_sizes[0])))
     {
       return error;
     }
-    allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0};
-    allocation_sizes[2] = num_partitions > 1 ? sizeof(AccumT) * 2 : size_t{0};
 
-    // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-    void* allocations[3] = {};
-
-    error = CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
-    if (cudaSuccess != error)
+    const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads));
+#ifdef CUB_DEBUG_LOG
+    _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
+#endif
+    if (const auto error = CubDebug(
+          THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream)
+            .doit(&detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
+                  tile_state,
+                  num_current_tiles,
+                  d_num_runs_out)))
     {
       return error;
     }
 
-    if (d_temp_storage == nullptr)
+    if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
     {
-      // Return if the caller is simply requesting the size of the storage allocation
       return error;
     }
 
-    // Iterate over the partitions until all input is processed
-    for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++)
+    if (num_items == 0)
     {
-      global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation;
-      global_offset_t current_num_items =
-        (partition_idx + 1 == num_partitions)
-          ? (num_items - current_partition_offset)
-          : capped_num_items_per_invocation;
-
-      // Construct the tile status interface
-      const auto num_current_tiles = static_cast<int>(::cuda::ceil_div(current_num_items, tile_size));
-
-      // Construct the tile status interface
-      ScanTileStateT tile_state;
-      error = CubDebug(tile_state.Init(num_current_tiles, allocations[0], allocation_sizes[0]));
-      if (cudaSuccess != error)
-      {
-        return error;
-      }
-
-      // Log init_kernel configuration
-      int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads));
+      return cudaSuccess;
+    }
 
 #ifdef CUB_DEBUG_LOG
-      _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
-#endif // CUB_DEBUG_LOG
-
-      // Invoke init_kernel to initialize tile descriptors
-      error = CubDebug(
-        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream)
-          .doit(init_kernel, tile_state, num_current_tiles, d_num_runs_out));
-      if (cudaSuccess != error)
-      {
-        return error;
-      }
-
-      // Sync the stream if specified to flush runtime errors
-      error = CubDebug(detail::DebugSyncStream(stream));
-      if (cudaSuccess != error)
-      {
-        return error;
-      }
+    _CubLog("Invoking reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+            num_current_tiles,
+            block_threads,
+            (long long) stream,
+            items_per_thread);
+#endif
+    auto reduce_by_key_kernel = DeviceReduceByKeyKernel<
+      PolicySelector,
+      KeysInputIteratorT,
+      UniqueOutputIteratorT,
+      ValuesInputIteratorT,
+      AggregatesOutputIteratorT,
+      NumRunsOutputIteratorT,
+      ScanTileStateT,
+      EqualityOpT,
+      ReductionOpT,
+      local_offset_t,
+      AccumT,
+      streaming_context_t>;
 
-      // For empty problems we can skip the reduce_by_key_kernel
-      if (num_items == 0)
+    if constexpr (use_streaming_invocation)
+    {
+      auto tmp_num_uniques          = static_cast<global_offset_t*>(allocations[1]);
+      auto tmp_prefix               = static_cast<AccumT*>(allocations[2]);
+      const bool is_first_partition = (partition_idx == 0);
+      const bool is_last_partition  = (partition_idx + 1 == num_partitions);
+      const int buffer_selector     = partition_idx % 2;
+      streaming_context_t streaming_context{
+        is_first_partition,
+        is_last_partition,
+        is_first_partition ? d_keys_in : d_keys_in + current_partition_offset - 1,
+        &tmp_prefix[buffer_selector],
+        &tmp_prefix[buffer_selector ^ 0x01],
+        &tmp_num_uniques[buffer_selector],
+        &tmp_num_uniques[buffer_selector ^ 0x01]};
+      if (const auto error = CubDebug(
+            THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream)
+              .doit(reduce_by_key_kernel,
+                    d_keys_in + current_partition_offset,
+                    d_unique_out,
+                    d_values_in + current_partition_offset,
+                    d_aggregates_out,
+                    d_num_runs_out,
+                    tile_state,
+                    0,
+                    equality_op,
+                    reduction_op,
+                    static_cast<local_offset_t>(current_num_items),
+                    streaming_context,
+                    detail::vsmem_t{nullptr})))
       {
         return error;
       }
-
-// Log reduce_by_key_kernel configuration
-#ifdef CUB_DEBUG_LOG
-      _CubLog("Invoking reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d "
-              "items per thread\n",
-              num_current_tiles,
-              block_threads,
-              (long long) stream,
-              items_per_thread);
-#endif // CUB_DEBUG_LOG
-
-      // Invoke reduce_by_key_kernel
-
-      if constexpr (use_streaming_invocation)
-      {
-        auto tmp_num_uniques = static_cast<global_offset_t*>(allocations[1]);
-        auto tmp_prefix      = static_cast<AccumT*>(allocations[2]);
-
-        const bool is_first_partition = (partition_idx == 0);
-        const bool is_last_partition  = (partition_idx + 1 == num_partitions);
-        const int buffer_selector     = partition_idx % 2;
-
-        streaming_context_t streaming_context{
-          is_first_partition,
-          is_last_partition,
-          is_first_partition ? d_keys_in : d_keys_in + current_partition_offset - 1,
-          &tmp_prefix[buffer_selector],
-          &tmp_prefix[buffer_selector ^ 0x01],
-          &tmp_num_uniques[buffer_selector],
-          &tmp_num_uniques[buffer_selector ^ 0x01]};
-
-        error = CubDebug(
-          THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream)
-            .doit(reduce_by_key_kernel,
-                  d_keys_in + current_partition_offset,
-                  d_unique_out,
-                  d_values_in + current_partition_offset,
-                  d_aggregates_out,
-                  d_num_runs_out,
-                  tile_state,
-                  0,
-                  equality_op,
-                  reduction_op,
-                  static_cast<local_offset_t>(current_num_items),
-                  streaming_context,
-                  cub::detail::vsmem_t{nullptr}));
-        if (cudaSuccess != error)
-        {
-          return error;
-        }
-      }
-      else
-      {
-        error = CubDebug(
-          THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream)
-            .doit(reduce_by_key_kernel,
-                  d_keys_in + current_partition_offset,
-                  d_unique_out,
-                  d_values_in + current_partition_offset,
-                  d_aggregates_out,
-                  d_num_runs_out,
-                  tile_state,
-                  0,
-                  equality_op,
-                  reduction_op,
-                  static_cast<local_offset_t>(current_num_items),
-                  NullType{},
-                  cub::detail::vsmem_t{nullptr}));
-        if (cudaSuccess != error)
-        {
-          return error;
-        }
-      }
-
-      // Sync the stream if specified to flush runtime errors
-      error = CubDebug(detail::DebugSyncStream(stream));
-      if (cudaSuccess != error)
+    }
+    else
+    {
+      if (const auto error = CubDebug(
+            THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream)
+              .doit(reduce_by_key_kernel,
+                    d_keys_in + current_partition_offset,
+                    d_unique_out,
+                    d_values_in + current_partition_offset,
+                    d_aggregates_out,
+                    d_num_runs_out,
+                    tile_state,
+                    0,
+                    equality_op,
+                    reduction_op,
+                    static_cast<local_offset_t>(current_num_items),
+                    NullType{},
+                    detail::vsmem_t{nullptr})))
       {
         return error;
       }
     }
-
-    return error;
-  }
-
-  template <typename ActivePolicyT>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke()
-  {
-    return Invoke<ActivePolicyT>(
-      detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
-      detail::reduce::DeviceReduceByKeyKernel<
-        typename PolicyHub::MaxPolicy,
-        KeysInputIteratorT,
-        UniqueOutputIteratorT,
-        ValuesInputIteratorT,
-        AggregatesOutputIteratorT,
-        NumRunsOutputIteratorT,
-        ScanTileStateT,
-        EqualityOpT,
-        ReductionOpT,
-        local_offset_t,
-        AccumT,
-        streaming_context_t>);
-  }
-
-  /**
-   * Internal dispatch routine
-   * @param[in] d_temp_storage
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the
-   *   required allocation size is written to `temp_storage_bytes` and no
-   *   work is done.
-   *
-   * @param[in,out] temp_storage_bytes
-   *   Reference to size in bytes of `d_temp_storage` allocation
-   *
-   * @param[in] d_keys_in
-   *   Pointer to the input sequence of keys
-   *
-   * @param[out] d_unique_out
-   *   Pointer to the output sequence of unique keys (one key per run)
-   *
-   * @param[in] d_values_in
-   *   Pointer to the input sequence of corresponding values
-   *
-   * @param[out] d_aggregates_out
-   *   Pointer to the output sequence of value aggregates
-   *   (one aggregate per run)
-   *
-   * @param[out] d_num_runs_out
-   *   Pointer to total number of runs encountered
-   *   (i.e., the length of d_unique_out)
-   *
-   * @param[in] equality_op
-   *   KeyT equality operator
-   *
-   * @param[in] reduction_op
-   *   ValueT reduction operator
-   *
-   * @param[in] num_items
-   *   Total number of items to select from
-   *
-   * @param[in] stream
-   *   CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
-   */
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch(
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    KeysInputIteratorT d_keys_in,
-    UniqueOutputIteratorT d_unique_out,
-    ValuesInputIteratorT d_values_in,
-    AggregatesOutputIteratorT d_aggregates_out,
-    NumRunsOutputIteratorT d_num_runs_out,
-    EqualityOpT equality_op,
-    ReductionOpT reduction_op,
-    global_offset_t num_items,
-    cudaStream_t stream)
-  {
-    cudaError error = cudaSuccess;
-
-    do
+    if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
     {
-      // Get PTX version
-      int ptx_version = 0;
-      error           = CubDebug(PtxVersion(ptx_version));
-      if (cudaSuccess != error)
-      {
-        break;
-      }
-
-      DispatchStreamingReduceByKey dispatch(
-        d_temp_storage,
-        temp_storage_bytes,
-        d_keys_in,
-        d_unique_out,
-        d_values_in,
-        d_aggregates_out,
-        d_num_runs_out,
-        equality_op,
-        reduction_op,
-        num_items,
-        stream);
-
-      // Dispatch
-      error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch));
-      if (cudaSuccess != error)
-      {
-        break;
-      }
-    } while (0);
-
-    return error;
+      return error;
+    }
   }
-};
-} // namespace detail::reduce
+
+  return cudaSuccess;
+}
+} // namespace detail::reduce_by_key
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index cfaad7185da..64c96bc77d6 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -17,32 +17,46 @@
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
-#include <cub/block/block_store.cuh>
+#include <cub/detail/delay_constructor.cuh>
+#include <cub/device/dispatch/tuning/common.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/__device/arch_id.h>
 #include <cuda/cmath>
 #include <cuda/std/__algorithm/clamp.h>
+#include <cuda/std/concepts>
+
+#if !_CCCL_COMPILER(NVRTC)
+#  include <ostream>
+#endif
 
 CUB_NAMESPACE_BEGIN
 
 namespace detail::reduce_by_key
 {
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 enum class primitive_key
 {
   no,
   yes
 };
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 enum class primitive_accum
 {
   no,
   yes
 };
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 enum class primitive_op
 {
   no,
   yes
 };
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 enum class key_size
 {
   _1,
@@ -52,6 +66,8 @@ enum class key_size
   _16,
   unknown
 };
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 enum class accum_size
 {
   _1,
@@ -62,26 +78,30 @@ enum class accum_size
   unknown
 };
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class T>
-constexpr primitive_key is_primitive_key()
+_CCCL_API constexpr primitive_key is_primitive_key()
 {
   return detail::is_primitive<T>::value ? primitive_key::yes : primitive_key::no;
 }
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class T>
-constexpr primitive_accum is_primitive_accum()
+_CCCL_API constexpr primitive_accum is_primitive_accum()
 {
   return detail::is_primitive<T>::value ? primitive_accum::yes : primitive_accum::no;
 }
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class ReductionOpT>
-constexpr primitive_op is_primitive_op()
+_CCCL_API constexpr primitive_op is_primitive_op()
 {
   return basic_binary_op_t<ReductionOpT>::value ? primitive_op::yes : primitive_op::no;
 }
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class KeyT>
-constexpr key_size classify_key_size()
+_CCCL_API constexpr key_size classify_key_size()
 {
   return sizeof(KeyT) == 1 ? key_size::_1
        : sizeof(KeyT) == 2 ? key_size::_2
@@ -92,8 +112,9 @@ constexpr key_size classify_key_size()
          : key_size::unknown;
 }
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class AccumT>
-constexpr accum_size classify_accum_size()
+_CCCL_API constexpr accum_size classify_accum_size()
 {
   return sizeof(AccumT) == 1 ? accum_size::_1
        : sizeof(AccumT) == 2 ? accum_size::_2
@@ -104,6 +125,7 @@ constexpr accum_size classify_accum_size()
          : accum_size::unknown;
 }
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class KeyT,
           class AccumT,
           primitive_op PrimitiveOp,
@@ -343,6 +365,7 @@ struct sm80_tuning<KeyT, AccumT, primitive_op::yes, primitive_key::no, primitive
   using delay_constructor                            = detail::no_delay_constructor_t<1090>;
 };
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class KeyT,
           class AccumT,
           primitive_op PrimitiveOp,
@@ -582,6 +605,7 @@ struct sm90_tuning<KeyT, AccumT, primitive_op::yes, primitive_key::no, primitive
   using delay_constructor                            = detail::no_delay_constructor_t<1150>;
 };
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class KeyT,
           class AccumT,
           primitive_op PrimitiveOp,
@@ -735,7 +759,6 @@ template <class KeyT, class AccumT>
 struct sm100_tuning<KeyT, AccumT, primitive_op::yes, primitive_key::yes, primitive_accum::yes, key_size::_4, accum_size::_4>
 {
   // ipt_14.tpb_224.trp_1.ld_0.ns_476.dcid_5.l2w_1005 1.187378  1.119705  1.185397  1.258420
-
   static constexpr int items                         = 14;
   static constexpr int threads                       = 224;
   static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
@@ -833,6 +856,7 @@ struct sm100_tuning<KeyT, AccumT, primitive_op::yes, primitive_key::yes, primiti
 //   using delay_constructor                            = detail::no_delay_constructor_t<1125>;
 // };
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class ReductionOpT, class AccumT, class KeyT>
 struct policy_hub
 {
@@ -895,7 +919,7 @@ struct policy_hub
 
   struct Policy1000 : ChainedPolicy<1000, Policy1000, Policy900>
   {
-    // Use values from tuning if a specialization exists, otherwise pick the default
+    // Use values from tuning if a specialization exists, otherwise fall back to SM90
     template <typename Tuning>
     static auto select_agent_policy(int)
       -> AgentReduceByKeyPolicy<Tuning::threads,
@@ -913,6 +937,727 @@ struct policy_hub
   };
   using MaxPolicy = Policy1000;
 };
+
+struct reduce_by_key_policy
+{
+  int block_threads;
+  int items_per_thread;
+  BlockLoadAlgorithm load_algorithm;
+  CacheLoadModifier load_modifier;
+  BlockScanAlgorithm scan_algorithm;
+  delay_constructor_policy delay_constructor;
+
+  [[nodiscard]] _CCCL_API constexpr friend bool
+  operator==(const reduce_by_key_policy& lhs, const reduce_by_key_policy& rhs)
+  {
+    return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread
+        && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier
+        && lhs.scan_algorithm == rhs.scan_algorithm && lhs.delay_constructor == rhs.delay_constructor;
+  }
+
+  [[nodiscard]] _CCCL_API constexpr friend bool
+  operator!=(const reduce_by_key_policy& lhs, const reduce_by_key_policy& rhs)
+  {
+    return !(lhs == rhs);
+  }
+
+#if !_CCCL_COMPILER(NVRTC)
+  friend ::std::ostream& operator<<(::std::ostream& os, const reduce_by_key_policy& p)
+  {
+    return os
+        << "reduce_by_key_policy { .block_threads = " << p.block_threads << ", .items_per_thread = "
+        << p.items_per_thread << ", .load_algorithm = " << p.load_algorithm << ", .load_modifier = " << p.load_modifier
+        << ", .scan_algorithm = " << p.scan_algorithm << ", .delay_constructor = " << p.delay_constructor << " }";
+  }
+#endif // !_CCCL_COMPILER(NVRTC)
+};
+
+#if _CCCL_HAS_CONCEPTS()
+template <typename T>
+concept reduce_by_key_policy_selector = detail::policy_selector<T, reduce_by_key_policy>;
+#endif // _CCCL_HAS_CONCEPTS()
+
+struct policy_selector
+{
+  int key_size;
+  int accum_size;
+  type_t accum_t;
+
+  // TODO(bgruber): we want to get rid of the following four and just assume by default that types behave "primitive".
+  // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because
+  // then users can opt-out of tunings again
+  bool key_is_primitive;
+  bool key_is_trivially_copyable;
+  bool accum_is_primitive;
+  bool op_is_primitive;
+
+  _CCCL_API constexpr auto __make_default_policy(CacheLoadModifier load_mod) const -> reduce_by_key_policy
+  {
+    constexpr int nominal_4B_items_per_thread = 6;
+    const int combined_input_bytes            = key_size + accum_size;
+    const int max_input_bytes                 = (::cuda::std::max) (key_size, accum_size);
+    const int items_per_thread =
+      (max_input_bytes <= 8)
+        ? 6
+        : ::cuda::std::clamp(
+            ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
+    return reduce_by_key_policy{
+      128,
+      items_per_thread,
+      BLOCK_LOAD_DIRECT,
+      load_mod,
+      BLOCK_SCAN_WARP_SCANS,
+      default_reduce_by_key_delay_constructor_policy(
+        accum_size, sizeof(int), key_is_primitive || key_is_trivially_copyable, true)};
+  }
+
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy
+  {
+    // bail out if we don't know the operation. TODO(bgruber): drop this check when we make the tuning API public
+    if (!op_is_primitive)
+    {
+      return __make_default_policy(LOAD_LDG);
+    }
+
+    const bool use_tuning = (key_is_primitive || key_size == 16) && (accum_is_primitive || accum_size == 16);
+
+    if (arch >= ::cuda::arch_id::sm_100 && use_tuning) // if we don't have a tuning, fall back to SM90
+    {
+      if (key_size == 1 && accum_size == 1)
+      {
+        // ipt_13.tpb_576.trp_0.ld_1.ns_2044.dcid_5.l2w_240 1.161888  0.848558  1.134941  1.299109
+        return {576,
+                13,
+                BLOCK_LOAD_DIRECT,
+                LOAD_CA,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backon_jitter_window, 2044, 240}};
+      }
+      if (key_size == 1 && accum_size == 2)
+      {
+        // ipt_10.tpb_224.trp_0.ld_0.ns_244.dcid_4.l2w_390 1.313932  1.260540  1.319588  1.427374
+        return {224,
+                10,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff_jitter_window, 224, 390}};
+      }
+      if (key_size == 1 && accum_size == 4)
+      {
+        // ipt_14.tpb_128.trp_0.ld_0.ns_248.dcid_2.l2w_285  1.118109  1.051534  1.134336  1.326788
+        return {128,
+                14,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 248, 285}};
+      }
+      if (key_size == 1 && accum_size == 8)
+      {
+        // ipt_19.tpb_128.trp_1.ld_0.ns_132.dcid_1.l2w_540 1.113820  1.002404  1.105014  1.202296
+        return {128,
+                19,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 132, 540}};
+      }
+      if (key_size == 2 && accum_size == 1)
+      {
+        // ipt_14.tpb_128.trp_1.ld_0.ns_164.dcid_2.l2w_290 1.239579  1.119705  1.239111  1.313112
+        return {128,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 164, 290}};
+      }
+      if (key_size == 2 && accum_size == 2)
+      {
+        // ipt_14.tpb_256.trp_1.ld_0.ns_180.dcid_2.l2w_975 1.145635  1.012658  1.139956  1.251546
+        return {256,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 180, 975}};
+      }
+      if (key_size == 2 && accum_size == 4 && accum_t != type_t::float32) // I16, F32, I32 regressed, fall back to SM90
+      {
+        // ipt_11.tpb_256.trp_0.ld_0.ns_224.dcid_2.l2w_550 1.066293  1.000109  1.073092  1.181818
+        return {256,
+                11,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 224, 550}};
+      }
+      if (key_size == 2 && accum_size == 8)
+      {
+        // ipt_10.tpb_160.trp_1.ld_0.ns_156.dcid_1.l2w_725 1.045007  1.002105  1.049690  1.141827
+        return {160,
+                10,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 156, 725}};
+      }
+      if (key_size == 4 && accum_size == 1)
+      {
+        // ipt_10.tpb_224.trp_0.ld_0.ns_324.dcid_2.l2w_285 1.157217  1.073724  1.166510  1.356940
+        return {224,
+                10,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 324, 285}};
+      }
+      if (key_size == 4 && accum_size == 2)
+      {
+        // ipt_11.tpb_256.trp_0.ld_0.ns_1984.dcid_5.l2w_115 1.214155  1.128842  1.214093  1.364476
+        return {256,
+                11,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backon_jitter_window, 1984, 115}};
+      }
+      if (key_size == 4 && accum_size == 4)
+      {
+        // ipt_14.tpb_224.trp_1.ld_0.ns_476.dcid_5.l2w_1005 1.187378  1.119705  1.185397  1.258420
+        return {224,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backon_jitter_window, 476, 1005}};
+      }
+      if (key_size == 4 && accum_size == 8)
+      {
+        // ipt_10.tpb_256.trp_1.ld_0.ns_1868.dcid_7.l2w_145 1.142915  1.020581  1.137459  1.237913
+        return {256,
+                10,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backon, 1868, 145}};
+      }
+      if (key_size == 8 && accum_size == 1)
+      {
+        // ipt_9.tpb_224.trp_1.ld_0.ns_1940.dcid_5.l2w_460 1.157294  1.075650  1.153566  1.250729
+        return {224,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backon_jitter_window, 1940, 460}};
+      }
+      if (key_size == 8 && accum_size == 2)
+      {
+        // ipt_11.tpb_224.trp_1.ld_1.ns_392.dcid_2.l2w_550 1.104034  1.007212  1.099543  1.220401
+        return {224,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_CA,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 392, 550}};
+      }
+      if (key_size == 8 && accum_size == 4)
+      {
+        // ipt_9.tpb_224.trp_1.ld_0.ns_244.dcid_2.l2w_475 1.130098  1.000000  1.130661  1.215722
+        return {224,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 244, 475}};
+      }
+      if (key_size == 8 && accum_size == 8)
+      {
+        // ipt_9.tpb_224.trp_1.ld_0.ns_196.dcid_2.l2w_340 1.272056  1.142857  1.262499  1.352941
+        return {224,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 196, 340}};
+      }
+    }
+
+    if (arch >= ::cuda::arch_id::sm_90)
+    {
+      if (use_tuning)
+      {
+        if (key_size == 1 && accum_size == 1)
+        {
+          return {
+            256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 720}};
+        }
+        if (key_size == 1 && accum_size == 2)
+        {
+          return {
+            320, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 865}};
+        }
+        if (key_size == 1 && accum_size == 4)
+        {
+          return {192,
+                  14,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 735}};
+        }
+        if (key_size == 1 && accum_size == 8)
+        {
+          return {128,
+                  13,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 580}};
+        }
+        if (key_size == 1 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1100}};
+        }
+        if (key_size == 2 && accum_size == 1)
+        {
+          return {
+            128, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 985}};
+        }
+        if (key_size == 2 && accum_size == 2)
+        {
+          return {256,
+                  11,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 276, 650}};
+        }
+        if (key_size == 2 && accum_size == 4)
+        {
+          return {256,
+                  14,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 240, 765}};
+        }
+        if (key_size == 2 && accum_size == 8)
+        {
+          return {128,
+                  19,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1190}};
+        }
+        if (key_size == 2 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1175}};
+        }
+        if (key_size == 4 && accum_size == 1)
+        {
+          return {256,
+                  13,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 404, 645}};
+        }
+        if (key_size == 4 && accum_size == 2)
+        {
+          return {256,
+                  18,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1160}};
+        }
+        if (key_size == 4 && accum_size == 4)
+        {
+          return {256,
+                  18,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1170}};
+        }
+        if (key_size == 4 && accum_size == 8)
+        {
+          return {128,
+                  13,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1055}};
+        }
+        if (key_size == 4 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1195}};
+        }
+        if (key_size == 8 && accum_size == 1)
+        {
+          return {256,
+                  10,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1170}};
+        }
+        if (key_size == 8 && accum_size == 2)
+        {
+          return {256,
+                  9,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 236, 1030}};
+        }
+        if (key_size == 8 && accum_size == 4)
+        {
+          return {128,
+                  13,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 152, 560}};
+        }
+        if (key_size == 8 && accum_size == 8)
+        {
+          return {128,
+                  23,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1030}};
+        }
+        if (key_size == 8 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1125}};
+        }
+        if (key_size == 16 && accum_size == 1)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1080}};
+        }
+        if (key_size == 16 && accum_size == 2)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 320, 1005}};
+        }
+        if (key_size == 16 && accum_size == 4)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 232, 1100}};
+        }
+        if (key_size == 16 && accum_size == 8)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1195}};
+        }
+        if (key_size == 16 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1150}};
+        }
+      }
+
+      // no tuning, use a default one
+      return __make_default_policy(LOAD_DEFAULT);
+    }
+
+    if (arch >= ::cuda::arch_id::sm_86)
+    {
+      return __make_default_policy(LOAD_LDG);
+    }
+
+    if (arch >= ::cuda::arch_id::sm_80)
+    {
+      if (use_tuning)
+      {
+        if (key_size == 1 && accum_size == 1)
+        {
+          return {
+            256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 975}};
+        }
+        if (key_size == 1 && accum_size == 2)
+        {
+          return {
+            224, 12, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 840}};
+        }
+        if (key_size == 1 && accum_size == 4)
+        {
+          return {256,
+                  15,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 760}};
+        }
+        if (key_size == 1 && accum_size == 8)
+        {
+          return {
+            224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1070}};
+        }
+        if (key_size == 1 && accum_size == 16)
+        {
+          return {128,
+                  9,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1175}};
+        }
+        if (key_size == 2 && accum_size == 1)
+        {
+          return {
+            256, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}};
+        }
+        if (key_size == 2 && accum_size == 2)
+        {
+          return {224,
+                  14,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 640}};
+        }
+        if (key_size == 2 && accum_size == 4)
+        {
+          return {256,
+                  14,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 905}};
+        }
+        if (key_size == 2 && accum_size == 8)
+        {
+          return {224,
+                  9,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 810}};
+        }
+        if (key_size == 2 && accum_size == 16)
+        {
+          return {160,
+                  9,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1115}};
+        }
+        if (key_size == 4 && accum_size == 1)
+        {
+          return {288,
+                  11,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1110}};
+        }
+        if (key_size == 4 && accum_size == 2)
+        {
+          return {192,
+                  15,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1200}};
+        }
+        if (key_size == 4 && accum_size == 4)
+        {
+          return {256,
+                  15,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1110}};
+        }
+        if (key_size == 4 && accum_size == 8)
+        {
+          return {224,
+                  9,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1165}};
+        }
+        if (key_size == 4 && accum_size == 16)
+        {
+          return {160,
+                  9,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1100}};
+        }
+        if (key_size == 8 && accum_size == 1)
+        {
+          return {192,
+                  10,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1175}};
+        }
+        if (key_size == 8 && accum_size == 2)
+        {
+          return {
+            224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1075}};
+        }
+        if (key_size == 8 && accum_size == 4)
+        {
+          return {
+            384, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1040}};
+        }
+        if (key_size == 8 && accum_size == 8)
+        {
+          return {128,
+                  14,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1080}};
+        }
+        if (key_size == 8 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 430}};
+        }
+        if (key_size == 16 && accum_size == 1)
+        {
+          return {
+            192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1105}};
+        }
+        if (key_size == 16 && accum_size == 2)
+        {
+          return {192,
+                  7,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 755}};
+        }
+        if (key_size == 16 && accum_size == 4)
+        {
+          return {192,
+                  7,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 535}};
+        }
+        if (key_size == 16 && accum_size == 8)
+        {
+          return {
+            192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1035}};
+        }
+        if (key_size == 16 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1090}};
+        }
+      }
+
+      // no tuning, use a default one
+      return __make_default_policy(LOAD_DEFAULT);
+    }
+
+    // for SM50
+    return __make_default_policy(LOAD_LDG);
+  }
+};
+
+#if _CCCL_HAS_CONCEPTS()
+static_assert(reduce_by_key_policy_selector<policy_selector>);
+#endif // _CCCL_HAS_CONCEPTS()
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
+template <typename PolicyHub>
+struct policy_selector_from_hub
+{
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_by_key_policy
+  {
+    using ReduceByKeyPolicyT = typename PolicyHub::MaxPolicy::ReduceByKeyPolicyT;
+    return reduce_by_key_policy{
+      ReduceByKeyPolicyT::BLOCK_THREADS,
+      ReduceByKeyPolicyT::ITEMS_PER_THREAD,
+      ReduceByKeyPolicyT::LOAD_ALGORITHM,
+      ReduceByKeyPolicyT::LOAD_MODIFIER,
+      ReduceByKeyPolicyT::SCAN_ALGORITHM,
+      delay_constructor_policy_from_type<typename ReduceByKeyPolicyT::detail::delay_constructor_t>,
+    };
+  }
+};
+
+template <class ReductionOpT, class AccumT, class KeyT>
+struct policy_selector_from_types
+{
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy
+  {
+    return policy_selector{
+      int{sizeof(KeyT)},
+      int{sizeof(AccumT)},
+      classify_type<AccumT>,
+      is_primitive_v<KeyT>,
+      ::cuda::std::is_trivially_copyable_v<KeyT>,
+      is_primitive_v<AccumT>,
+      basic_binary_op_t<ReductionOpT>::value}(arch);
+  }
+};
 } // namespace detail::reduce_by_key
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
index 77c393d7643..5083ed52b85 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
@@ -18,17 +18,26 @@
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
+#include <cub/detail/delay_constructor.cuh>
 #include <cub/device/dispatch/tuning/common.cuh>
+#include <cub/device/dispatch/tuning/tuning_reduce_by_key.cuh>
 #include <cub/util_device.cuh>
 
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/__device/arch_id.h>
 #include <cuda/std/__algorithm/clamp.h>
 #include <cuda/std/__algorithm/max.h>
+#include <cuda/std/concepts>
+
+#if !_CCCL_COMPILER(NVRTC)
+#  include <ostream>
+#endif
 
 CUB_NAMESPACE_BEGIN
 
 namespace detail::rle::encode
 {
+// TODO(bgruber): remove in CCCL 4.0 when we drop the CUB dispatchers
 template <class LengthT,
           class KeyT,
           primitive_length PrimitiveLength = is_primitive_length<LengthT>(),
@@ -89,6 +98,7 @@ struct sm80_tuning<LengthT, __uint128_t, primitive_length::yes, primitive_key::n
 {};
 #endif
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the CUB dispatchers
 template <class LengthT,
           class KeyT,
           primitive_length PrimitiveLength = is_primitive_length<LengthT>(),
@@ -149,6 +159,7 @@ struct sm90_tuning<LengthT, __uint128_t, primitive_length::yes, primitive_key::n
 {};
 #endif
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the CUB dispatchers
 template <class LengthT,
           class KeyT,
           primitive_length PrimitiveLength = is_primitive_length<LengthT>(),
@@ -218,7 +229,8 @@ struct sm100_tuning<LengthT, KeyT, primitive_length::yes, primitive_key::yes, le
 // {};
 #endif
 
-// this policy is passed to DispatchReduceByKey
+// TODO(bgruber): remove in CCCL 4.0 when we drop all CUB dispatchers
+// this policy is passed to DispatchStreamingReduceByKey and then to DeviceReduceByKeyKernel
 template <class LengthT, class KeyT>
 struct policy_hub
 {
@@ -294,6 +306,227 @@ struct policy_hub
 
   using MaxPolicy = Policy1000;
 };
+
+// DeviceRunLengthEncode::Encode delegates to reduce by key
+using rle_encode_policy = reduce_by_key::reduce_by_key_policy;
+
+#if _CCCL_HAS_CONCEPTS()
+template <typename T>
+concept rle_encode_policy_selector = reduce_by_key::reduce_by_key_policy_selector<T>;
+#endif // _CCCL_HAS_CONCEPTS()
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the RLE dispatchers
+using reduce_by_key::policy_selector_from_hub;
+
+struct policy_selector
+{
+  int length_size;
+  int key_size;
+  type_t key_t;
+
+  // TODO(bgruber): we want to get rid of the following three and just assume by default that types behave "primitive".
+  // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because
+  // then users can opt-out of tunings again
+  bool length_is_primitive;
+  bool length_is_trivially_copyable;
+  bool key_is_primitive;
+
+  _CCCL_API constexpr auto __make_default_policy(CacheLoadModifier load_mod) const -> rle_encode_policy
+  {
+    constexpr int nominal_4B_items_per_thread = 6;
+    const int combined_input_bytes            = length_size + key_size;
+    const int max_input_bytes                 = (::cuda::std::max) (length_size, key_size);
+    const int items_per_thread =
+      (max_input_bytes <= 8)
+        ? 6
+        : ::cuda::std::clamp(
+            ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
+    return rle_encode_policy{
+      128,
+      items_per_thread,
+      BLOCK_LOAD_DIRECT,
+      load_mod,
+      BLOCK_SCAN_WARP_SCANS,
+      default_reduce_by_key_delay_constructor_policy(
+        length_size, int{sizeof(int)}, length_is_primitive || length_is_trivially_copyable, true)};
+  }
+
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy
+  {
+    // if we don't have a tuning for SM100, fall back to SM90
+    if (arch >= ::cuda::arch_id::sm_100 && length_is_primitive && length_size == 4 && key_is_primitive)
+    {
+      if (key_size == 1)
+      {
+        return rle_encode_policy{
+          256,
+          14,
+          BLOCK_LOAD_DIRECT,
+          LOAD_CA,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::exponential_backon, 468, 300}};
+      }
+      if (key_size == 2)
+      {
+        return rle_encode_policy{
+          224,
+          14,
+          BLOCK_LOAD_DIRECT,
+          LOAD_DEFAULT,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::exponential_backon, 376, 420}};
+      }
+      if (key_size == 4)
+      {
+        return rle_encode_policy{
+          256,
+          14,
+          BLOCK_LOAD_DIRECT,
+          LOAD_CA,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::exponential_backon, 956, 70}};
+      }
+      if (key_size == 8)
+      {
+        return rle_encode_policy{
+          224,
+          9,
+          BLOCK_LOAD_WARP_TRANSPOSE,
+          LOAD_DEFAULT,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::exponential_backoff, 188, 765}};
+      }
+    }
+
+    if (arch >= ::cuda::arch_id::sm_90)
+    {
+      if (length_is_primitive && length_size == 4)
+      {
+        if (key_is_primitive && key_size == 1)
+        {
+          return rle_encode_policy{
+            256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}};
+        }
+        if (key_is_primitive && key_size == 2)
+        {
+          return rle_encode_policy{
+            128, 22, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 775}};
+        }
+        if (key_is_primitive && key_size == 4)
+        {
+          return rle_encode_policy{
+            192,
+            14,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::fixed_delay, 284, 480}};
+        }
+        if (key_is_primitive && key_size == 8)
+        {
+          return rle_encode_policy{
+            128,
+            19,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::no_delay, 0, 515}};
+        }
+        if (key_t == type_t::int128 || key_t == type_t::uint128)
+        {
+          return rle_encode_policy{
+            128,
+            11,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::fixed_delay, 428, 930}};
+        }
+      }
+
+      // no tuning, use a default one
+      return __make_default_policy(LOAD_DEFAULT);
+    }
+
+    if (arch >= ::cuda::arch_id::sm_86)
+    {
+      return __make_default_policy(LOAD_LDG);
+    }
+
+    if (arch >= ::cuda::arch_id::sm_80)
+    {
+      if (length_is_primitive && length_size == 4)
+      {
+        if (key_is_primitive && key_size == 1)
+        {
+          return rle_encode_policy{
+            256, 14, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 640}};
+        }
+        if (key_is_primitive && key_size == 2)
+        {
+          return rle_encode_policy{
+            256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 900}};
+        }
+        if (key_is_primitive && key_size == 4)
+        {
+          return rle_encode_policy{
+            256,
+            13,
+            BLOCK_LOAD_DIRECT,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::no_delay, 0, 1080}};
+        }
+        if (key_is_primitive && key_size == 8)
+        {
+          return rle_encode_policy{
+            224,
+            9,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::no_delay, 0, 1075}};
+        }
+        if (key_t == type_t::int128 || key_t == type_t::uint128)
+        {
+          return rle_encode_policy{
+            128,
+            7,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::no_delay, 0, 630}};
+        }
+      }
+
+      // no tuning, use a default one
+      return __make_default_policy(LOAD_DEFAULT);
+    }
+
+    // for SM50
+    return __make_default_policy(LOAD_LDG);
+  }
+};
+
+#if _CCCL_HAS_CONCEPTS()
+static_assert(rle_encode_policy_selector<policy_selector>);
+#endif // _CCCL_HAS_CONCEPTS()
+
+template <class LengthT, class KeyT>
+struct policy_selector_from_types
+{
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy
+  {
+    constexpr policy_selector selector{
+      int{sizeof(LengthT)},
+      int{sizeof(KeyT)},
+      classify_type<KeyT>,
+      is_primitive_v<LengthT>,
+      ::cuda::std::is_trivially_copyable_v<LengthT>,
+      is_primitive_v<KeyT>};
+    return selector(arch);
+  }
+};
 } // namespace detail::rle::encode
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh
index 3ff351a2bd4..97f1d9a03e6 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh
@@ -374,6 +374,7 @@ struct policy_selector
   type_t key_type;
   bool length_is_primitive;
   bool key_is_primitive; // TODO(bgruber): can probably be derived from key_type
+  bool key_is_trivially_copyable;
 
   _CCCL_API constexpr auto
   make_default_policy(BlockLoadAlgorithm block_load_alg, int delay_ctor_key_size, CacheLoadModifier load_mod) const
@@ -388,7 +389,8 @@ struct policy_selector
       load_mod,
       true,
       BLOCK_SCAN_WARP_SCANS,
-      default_reduce_by_key_delay_constructor_policy(delay_ctor_key_size, sizeof(int), true)};
+      default_reduce_by_key_delay_constructor_policy(
+        delay_ctor_key_size, sizeof(int), key_is_primitive || key_is_trivially_copyable, true)};
   }
 
   [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy
@@ -600,7 +602,12 @@ struct policy_selector_from_types
   [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy
   {
     constexpr policy_selector selector{
-      sizeof(LengthT), int{sizeof(KeyT)}, classify_type<KeyT>, is_primitive_v<LengthT>, is_primitive_v<KeyT>};
+      sizeof(LengthT),
+      int{sizeof(KeyT)},
+      classify_type<KeyT>,
+      is_primitive_v<LengthT>,
+      is_primitive_v<KeyT>,
+      ::cuda::std::is_trivially_copyable_v<KeyT>};
     return selector(arch);
   }
 };
diff --git a/nvbench_helper/nvbench_helper/look_back_helper.cuh b/nvbench_helper/nvbench_helper/look_back_helper.cuh
index f7a97937dcb..5d31dde94dd 100644
--- a/nvbench_helper/nvbench_helper/look_back_helper.cuh
+++ b/nvbench_helper/nvbench_helper/look_back_helper.cuh
@@ -16,4 +16,8 @@ using delay_constructor_t =
   cub::detail::delay_constructor_t<static_cast<cub::detail::delay_constructor_kind>(TUNE_DELAY_CONSTRUCTOR_ID),
                                    TUNE_MAGIC_NS,
                                    TUNE_L2_WRITE_LATENCY_NS>;
+
+inline constexpr auto delay_constructor_policy = cub::detail::delay_constructor_policy{
+  static_cast<cub::detail::delay_constructor_kind>(TUNE_DELAY_CONSTRUCTOR_ID), TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS};
+
 #endif // !TUNE_BASE