diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu index 4abf201cbb8..fc902393c9d 100644 --- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -16,33 +16,21 @@ // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# else // TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD - -struct device_rle_policy_hub +struct bench_rle_policy_selector { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> + [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> cub::detail::rle::non_trivial_runs::rle_non_trivial_runs_policy { - using RleSweepPolicyT = - cub::AgentRlePolicy; - }; - - using MaxPolicy = Policy500; + return { + TUNE_THREADS, + TUNE_ITEMS, + TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE, + TUNE_LOAD == 0 ? 
cub::LOAD_DEFAULT : cub::LOAD_CA, + static_cast(TUNE_TIME_SLICING), + cub::BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy, + }; + } }; #endif // !TUNE_BASE @@ -60,25 +48,6 @@ static void rle(nvbench::state& state, nvbench::type_list; -#if !TUNE_BASE - using dispatch_t = - cub::DeviceRleDispatch; -#else - using dispatch_t = - cub::DeviceRleDispatch; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); @@ -88,38 +57,39 @@ static void rle(nvbench::state& state, nvbench::type_list out_lengths(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); - T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); + const T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); offset_t* d_out_offsets = thrust::raw_pointer_cast(out_offsets.data()); run_length_t* d_out_lengths = thrust::raw_pointer_cast(out_lengths.data()); offset_t* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; + const offset_t num_items = static_cast(elements); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_offsets, - d_out_lengths, - d_num_runs_out, - equality_op_t{}, - elements, - 0); + auto dispatch_on_stream = [&](cudaStream_t stream) { + cub::detail::rle::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in_keys, + d_out_offsets, + d_out_lengths, + d_num_runs_out, + equality_op_t{}, + num_items, + stream +#if !TUNE_BASE + , + bench_rle_policy_selector{} +#endif + ); + }; + + dispatch_on_stream(cudaStream_t{0}); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_offsets, - d_out_lengths, - d_num_runs_out, - 
equality_op_t{}, - elements, - 0); + dispatch_on_stream(cudaStream_t{0}); cudaDeviceSynchronize(); const OffsetT num_runs = num_runs_out[0]; @@ -130,16 +100,7 @@ static void rle(nvbench::state& state, nvbench::type_list(1); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in_keys, - d_out_offsets, - d_out_lengths, - d_num_runs_out, - equality_op_t{}, - elements, - launch.get_stream()); + dispatch_on_stream(launch.get_stream().get_stream()); }); } diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh index 44a409f497a..23d0c6d571e 100644 --- a/cub/cub/detail/delay_constructor.cuh +++ b/cub/cub/detail/delay_constructor.cuh @@ -32,7 +32,8 @@ enum class delay_constructor_kind exponential_backoff_jitter_window, exponential_backon_jitter_window, exponential_backon_jitter, - exponential_backon + exponential_backon, + reduce_by_key }; #if !_CCCL_COMPILER(NVRTC) @@ -56,6 +57,8 @@ inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kin return os << "delay_constructor_kind::exponential_backon_jitter"; case delay_constructor_kind::exponential_backon: return os << "delay_constructor_kind::exponential_backon"; + case delay_constructor_kind::reduce_by_key: + return os << "delay_constructor_kind::reduce_by_key"; default: return os << "(kind) << ">"; } @@ -125,6 +128,11 @@ template inline constexpr auto delay_constructor_policy_from_type> = delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency}; +template +inline constexpr auto + delay_constructor_policy_from_type> = + delay_constructor_policy{delay_constructor_kind::reduce_by_key, Delay, L2WriteLatency}; + template struct delay_constructor_for; @@ -176,8 +184,24 @@ struct delay_constructor_for; }; +template +struct delay_constructor_for +{ + using type = reduce_by_key_delay_constructor_t; +}; + template using 
delay_constructor_t = typename delay_constructor_for::type; + +_CCCL_API constexpr auto +default_reduce_by_key_delay_constructor_policy(int key_size, int value_size, bool value_is_primitive) +{ + if (value_is_primitive && (value_size + key_size < 16)) + { + return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450}; + } + return delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450}; +} } // namespace detail CUB_NAMESPACE_END diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 691e0a28c30..44fac1b1cd9 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -327,25 +327,18 @@ struct DeviceRunLengthEncode { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceRunLengthEncode::NonTrivialRuns"); - // Offset type used for global offsets - using offset_t = detail::choose_signed_offset_t; - using equality_op = ::cuda::std::equal_to<>; - - return DeviceRleDispatch< - InputIteratorT, - OffsetsOutputIteratorT, - LengthsOutputIteratorT, - NumRunsOutputIteratorT, - equality_op, - offset_t>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - equality_op{}, - num_items, - stream); + using global_offset_t = detail::choose_signed_offset_t; + using equality_op = ::cuda::std::equal_to<>; + return detail::rle::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + equality_op{}, + static_cast(num_items), + stream); } }; diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index a5b490287ab..2854b88833c 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -21,6 +21,7 @@ #endif // no system header #include +#include #include #include #include @@ -29,6 +30,10 @@ #include +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include 
+#endif + #include #include #include @@ -110,8 +115,8 @@ struct streaming_context * Otherwise performs flag-based selection if FlagIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) * - * @tparam AgentRlePolicyT - * Parameterized AgentRlePolicyT tuning policy type + * @tparam PolicySelector + * Selects the tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator @@ -158,7 +163,7 @@ struct streaming_context * @param num_tiles * Total number of tiles for the entire problem */ -template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREADS)) +#if _CCCL_HAS_CONCEPTS() + requires non_trivial_runs::rle_non_trivial_runs_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRleSweepKernel( InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, @@ -180,9 +188,19 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context) { - using AgentRlePolicyT = typename ChainedPolicyT::ActivePolicy::RleSweepPolicyT; + static constexpr non_trivial_runs::rle_non_trivial_runs_policy policy = + PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + using AgentRlePolicyT = + AgentRlePolicy>; - // Thread block type for selecting data from input tiles using AgentRleT = AgentRle +struct policy_selector_from_hub +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> non_trivial_runs::rle_non_trivial_runs_policy + { + using RleSweepPolicyT = typename PolicyHub::MaxPolicy::RleSweepPolicyT; + return non_trivial_runs::rle_non_trivial_runs_policy{ + RleSweepPolicyT::BLOCK_THREADS, + RleSweepPolicyT::ITEMS_PER_THREAD, + RleSweepPolicyT::LOAD_ALGORITHM, + RleSweepPolicyT::LOAD_MODIFIER, + 
RleSweepPolicyT::STORE_WARP_TIME_SLICING, + RleSweepPolicyT::SCAN_ALGORITHM, + delay_constructor_policy_from_type, + }; + } +}; } // namespace detail::rle /****************************************************************************** @@ -231,6 +269,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ +// TODO(bgruber): deprecate when we make the tuning API public and remove in CCCL 4.0 template ( detail::scan::DeviceCompactInitKernel, detail::rle::DeviceRleSweepKernel< - typename PolicyHub::MaxPolicy, + detail::rle::policy_selector_from_hub, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, @@ -601,4 +640,195 @@ struct DeviceRleDispatch } }; +namespace detail::rle +{ +template , + typename key_t = it_value_t, + typename PolicySelector = non_trivial_runs::policy_selector_from_types> +#if _CCCL_HAS_CONCEPTS() + requires non_trivial_runs::rle_non_trivial_runs_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OffsetsOutputIteratorT d_offsets_out, + LengthsOutputIteratorT d_lengths_out, + NumRunsOutputIteratorT d_num_runs_out, + EqualityOpT equality_op, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}) +{ + using local_offset_t = ::cuda::std::int32_t; + using global_offset_t = OffsetT; + static constexpr bool use_streaming_invocation = + ::cuda::std::numeric_limits::max() > ::cuda::std::numeric_limits::max(); + using streaming_context_t = ::cuda::std:: + conditional_t, NullType>; + using ScanTileStateT = ReduceByKeyScanTileState; + static constexpr int init_kernel_threads = 128; + + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(ptx_arch_id(arch_id))) + { + return error; + } + + const 
non_trivial_runs::rle_non_trivial_runs_policy active_policy = policy_selector(arch_id); +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET( + NV_IS_HOST, + (::std::stringstream ss; ss << active_policy; + _CubLog("Dispatching DeviceRle to arch %d with tuning: %s\n", static_cast<int>(arch_id), ss.str().c_str());)) +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + + const int block_threads = active_policy.block_threads; + const int items_per_thread = active_policy.items_per_thread; + const auto tile_size = static_cast<global_offset_t>(block_threads * items_per_thread); + + auto capped_num_items_per_invocation = num_items; + if constexpr (use_streaming_invocation) + { + capped_num_items_per_invocation = static_cast<global_offset_t>(::cuda::std::numeric_limits<local_offset_t>::max()); + capped_num_items_per_invocation -= (capped_num_items_per_invocation % tile_size); + } + + const auto max_num_items_per_invocation = + use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items; + const auto num_partitions = + (capped_num_items_per_invocation == 0) + ? global_offset_t{1} + : ::cuda::ceil_div(num_items, capped_num_items_per_invocation); + + const int max_num_tiles = static_cast<int>(::cuda::ceil_div(max_num_items_per_invocation, tile_size)); + + size_t allocation_sizes[3]; + if (const auto error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0]))) + { + return error; + } + allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0}; + allocation_sizes[2] = num_partitions > 1 ? 
sizeof(length_t) * 2 : size_t{0}; + + void* allocations[3] = {}; + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + { + return error; + } + + if (d_temp_storage == nullptr) + { + return cudaSuccess; + } + + for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) + { + global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation; + global_offset_t current_num_items = + (partition_idx + 1 == num_partitions) ? (num_items - current_partition_offset) : capped_num_items_per_invocation; + + const auto num_current_tiles = static_cast<int>(::cuda::ceil_div(current_num_items, tile_size)); + ScanTileStateT tile_status; + if (const auto error = CubDebug(tile_status.Init(num_current_tiles, allocations[0], allocation_sizes[0]))) + { + return error; + } + + const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads)); +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", + init_grid_size, + init_kernel_threads, + (long long) stream); +#endif + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream) + .doit(&detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>, + tile_status, + num_current_tiles, + d_num_runs_out))) + { + return error; + } + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + if (num_items <= 1) + { + return cudaSuccess; + } +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking device_rle_sweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + num_current_tiles, + block_threads, + (long long) stream, + items_per_thread); +#endif + + auto streaming_context = [&] { + if constexpr (use_streaming_invocation) + { + auto tmp_num_uniques = static_cast<global_offset_t*>(allocations[1]); + auto tmp_prefix = static_cast<length_t*>(allocations[2]); + const bool is_first_partition = 
(partition_idx == 0); + const bool is_last_partition = (partition_idx + 1 == num_partitions); + const int buffer_selector = partition_idx % 2; + return streaming_context_t{ + is_first_partition, + is_last_partition, + current_partition_offset, + &tmp_prefix[buffer_selector], + &tmp_prefix[buffer_selector ^ 0x01], + &tmp_num_uniques[buffer_selector], + &tmp_num_uniques[buffer_selector ^ 0x01]}; + } + else + { + return NullType{}; + } + }(); + + if (const auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream) + .doit(&detail::rle::DeviceRleSweepKernel< + PolicySelector, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + NumRunsOutputIteratorT, + ScanTileStateT, + EqualityOpT, + local_offset_t, + global_offset_t, + streaming_context_t>, + d_in + current_partition_offset, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + tile_status, + equality_op, + static_cast(current_num_items), + num_current_tiles, + streaming_context))) + { + return error; + } + + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + } + return cudaSuccess; +} +} // namespace detail::rle CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index 6d75fe750ce..87d8aadf084 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -18,15 +18,27 @@ #include #include #include +#include #include +#include #include +#include #include +#if _CCCL_HAS_CONCEPTS() +# include +#endif // _CCCL_HAS_CONCEPTS() + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::rle::non_trivial_runs { +// TODO(bgruber): remove in CCCL 4.0 when we drop all CUB dispatchers template (), @@ -92,6 +104,7 @@ struct sm80_tuning(), @@ -157,6 +170,7 @@ struct sm90_tuning(), @@ 
-238,6 +252,7 @@ struct sm100_tuning struct policy_hub { @@ -311,6 +326,287 @@ struct policy_hub using MaxPolicy = Policy1000; }; + +struct rle_non_trivial_runs_policy +{ + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_algorithm; + CacheLoadModifier load_modifier; + bool store_with_time_slicing; + BlockScanAlgorithm scan_algorithm; + delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}; + + [[nodiscard]] _CCCL_API constexpr friend bool + operator==(const rle_non_trivial_runs_policy& lhs, const rle_non_trivial_runs_policy& rhs) + { + return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread + && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier + && lhs.store_with_time_slicing == rhs.store_with_time_slicing && lhs.scan_algorithm == rhs.scan_algorithm + && lhs.delay_constructor == rhs.delay_constructor; + } + + [[nodiscard]] _CCCL_API constexpr friend bool + operator!=(const rle_non_trivial_runs_policy& lhs, const rle_non_trivial_runs_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const rle_non_trivial_runs_policy& p) + { + return os + << "rle_non_trivial_runs_policy { .block_threads = " << p.block_threads + << ", .items_per_thread = " << p.items_per_thread << ", .load_algorithm = " << p.load_algorithm + << ", .load_modifier = " << p.load_modifier << ", .store_with_time_slicing = " << p.store_with_time_slicing + << ", .scan_algorithm = " << p.scan_algorithm << ", .delay_constructor = " << p.delay_constructor << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) +}; + +#if _CCCL_HAS_CONCEPTS() +template +concept rle_non_trivial_runs_policy_selector = detail::policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + +struct policy_selector +{ + // TODO(bgruber): refactor the key information + int length_size; + int key_size; + type_t key_type; + bool length_is_primitive; 
+ bool key_is_primitive; // TODO(bgruber): can probably be derived from key_type + + _CCCL_API constexpr auto + make_default_policy(BlockLoadAlgorithm block_load_alg, int delay_ctor_key_size, CacheLoadModifier load_mod) const + { + const int nominal_4B_items_per_thread = 15; + const int items_per_thread = + ::cuda::std::clamp(nominal_4B_items_per_thread * 4 / key_size, 1, nominal_4B_items_per_thread); + return rle_non_trivial_runs_policy{ + 96, + items_per_thread, + block_load_alg, + load_mod, + true, + BLOCK_SCAN_WARP_SCANS, + default_reduce_by_key_delay_constructor_policy(delay_ctor_key_size, sizeof(int), true)}; + } + + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy + { + if (arch >= ::cuda::arch_id::sm_100) + { + if (length_is_primitive && key_is_primitive && length_size == 4) + { + if (key_size == 1) + { + // ipt_20.tpb_224.trp_1.ts_0.ld_1.ns_64.dcid_2.l2w_315 1.119878 1.003690 1.130067 1.338983 + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 64, 315}}; + } + if (key_size == 2) + { + // ipt_20.tpb_224.trp_1.ts_0.ld_0.ns_116.dcid_7.l2w_340 1.146528 1.072769 1.152390 1.333333 + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backon, 116, 340}}; + } + if (key_size == 4) + { + // ipt_13.tpb_224.trp_0.ts_0.ld_0.ns_252.dcid_2.l2w_470 1.113202 1.003690 1.133114 1.349296 + return rle_non_trivial_runs_policy{ + 224, + 13, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 252, 470}}; + } + if (key_size == 8 && key_type != type_t::float64) // fall back to SM90 for double + { + // ipt_15.tpb_256.trp_1.ts_0.ld_0.ns_28.dcid_2.l2w_520 1.114944 1.033189 1.122360 1.252083 + return rle_non_trivial_runs_policy{ 
+ 256, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::exponential_backoff, 28, 520}}; + } + } + + // no tuning for SM100, fall-through to SM90 + } + + if (arch >= ::cuda::arch_id::sm_90) + { + if (length_is_primitive && length_size == 4) + { + if (key_is_primitive && key_size == 1) + { + return rle_non_trivial_runs_policy{ + 256, + 18, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 385}}; + } + if (key_is_primitive && key_size == 2) + { + return rle_non_trivial_runs_policy{ + 224, + 20, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 675}}; + } + if (key_is_primitive && key_size == 4) + { + return rle_non_trivial_runs_policy{ + 256, + 18, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 695}}; + } + if (key_is_primitive && key_size == 8) + { + return rle_non_trivial_runs_policy{ + 224, + 14, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 840}}; + } + if (key_type == type_t::int128 || key_type == type_t::uint128) + { + return rle_non_trivial_runs_policy{ + 288, + 9, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::fixed_delay, 484, 1150}}; + } + } + + // no tuning for SM90, use a default policy + return make_default_policy(BLOCK_LOAD_WARP_TRANSPOSE, length_size, LOAD_DEFAULT); + } + + if (arch >= ::cuda::arch_id::sm_86) + { + // TODO(bgruber): I think we want `LengthT` instead of `int` + return make_default_policy(BLOCK_LOAD_DIRECT, sizeof(int), LOAD_LDG); + } + + if (arch >= ::cuda::arch_id::sm_80) + { + if (length_is_primitive && length_size == 4) + { + if (key_is_primitive && key_size == 1) + { + return rle_non_trivial_runs_policy{ + 192, + 20, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + 
false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 630}}; + } + if (key_is_primitive && key_size == 2) + { + return rle_non_trivial_runs_policy{ + 192, + 20, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1015}}; + } + if (key_is_primitive && key_size == 4) + { + return rle_non_trivial_runs_policy{ + 224, + 15, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 915}}; + } + if (key_is_primitive && key_size == 8) + { + return rle_non_trivial_runs_policy{ + 256, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1065}}; + } + if (key_type == type_t::int128 || key_type == type_t::uint128) + { + return rle_non_trivial_runs_policy{ + 192, + 13, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS, + {delay_constructor_kind::no_delay, 0, 1050}}; + } + } + // no tuning for SM80, use a default policy + return make_default_policy(BLOCK_LOAD_WARP_TRANSPOSE, length_size, LOAD_DEFAULT); + } + + // default is from SM50 + return make_default_policy(BLOCK_LOAD_DIRECT, int{sizeof(int)}, LOAD_LDG); + } +}; + +#if _CCCL_HAS_CONCEPTS() +static_assert(rle_non_trivial_runs_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() + +template +struct policy_selector_from_types +{ + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy + { + constexpr policy_selector selector{ + sizeof(LengthT), int{sizeof(KeyT)}, classify_type, is_primitive_v, is_primitive_v}; + return selector(arch); + } +}; } // namespace detail::rle::non_trivial_runs CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu index a84fc5315a1..ffb9c6fc783 100644 --- 
a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu +++ b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu @@ -247,18 +247,16 @@ C2H_TEST("DeviceRunLengthEncode::NonTrivialRuns can handle pointers", "[device][ // Guard against #293 template -struct device_rle_policy_hub +struct device_rle_policy_selector { static constexpr int threads = 96; static constexpr int items = 15; - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> + _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const + -> cub::detail::rle::non_trivial_runs::rle_non_trivial_runs_policy { - using RleSweepPolicyT = cub:: - AgentRlePolicy; - }; - - using MaxPolicy = Policy500; + return {threads, items, cub::BLOCK_LOAD_DIRECT, cub::LOAD_DEFAULT, TimeSlicing, cub::BLOCK_SCAN_WARP_SCANS}; + } }; struct CustomDeviceRunLengthEncode @@ -275,28 +273,20 @@ struct CustomDeviceRunLengthEncode OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, - int num_items, + int num_items, // Signed integer type for global offsets cudaStream_t stream = 0) { - using OffsetT = int; // Signed integer type for global offsets - using EqualityOp = cuda::std::equal_to<>; // Default == operator - - return cub::DeviceRleDispatch>:: - Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - EqualityOp(), - num_items, - stream); + return cub::detail::rle::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + cuda::std::equal_to<>{}, // Default == operator + num_items, + stream, + device_rle_policy_selector{}); } }; @@ -308,9 +298,9 @@ using time_slicing = c2h::type_list; C2H_TEST("DeviceRunLengthEncode::NonTrivialRuns does not run out of memory", "[device][run_length_encode]", time_slicing) { using type = typename c2h::get<0, TestType>; - using policy_hub_t = device_rle_policy_hub; + using policy_sel_t = 
device_rle_policy_selector; - constexpr int tile_size = policy_hub_t::threads * policy_hub_t::items; + constexpr int tile_size = policy_sel_t::threads * policy_sel_t::items; constexpr int num_items = 2 * tile_size; constexpr int magic_number = num_items + 1;