From 03cb980c32842d9d1c1f6d281d30576fed4493d1 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Fri, 13 Feb 2026 13:45:01 +0100
Subject: [PATCH 01/23] Implement the new tuning API for
 Dispatch[Streaming]ReduceByKey

Fixes: #7531, #7533
---
 cub/benchmarks/bench/reduce/by_key.cu         | 122 +--
 .../bench/run_length_encode/encode.cu         | 123 +--
 cub/cub/agent/agent_reduce_by_key.cuh         |  11 +-
 cub/cub/device/device_reduce.cuh              |  40 +-
 cub/cub/device/device_run_length_encode.cuh   |  27 +-
 .../dispatch/dispatch_reduce_by_key.cuh       | 244 +++++-
 .../dispatch_streaming_reduce_by_key.cuh      | 570 +++++---------
 .../dispatch/tuning/tuning_reduce_by_key.cuh  | 719 +++++++++++++++++-
 .../dispatch/tuning/tuning_rle_encode.cuh     | 206 ++++-
 .../nvbench_helper/look_back_helper.cuh       |   4 +
 10 files changed, 1447 insertions(+), 619 deletions(-)

diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu
index 54d72ae4bb9..41b486aa752 100644
--- a/cub/benchmarks/bench/reduce/by_key.cu
+++ b/cub/benchmarks/bench/reduce/by_key.cu
@@ -15,32 +15,21 @@
 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5
 
 #if !TUNE_BASE
-#  if TUNE_TRANSPOSE == 0
-#    define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
-#  else // TUNE_TRANSPOSE == 1
-#    define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
-#  endif // TUNE_TRANSPOSE
-
-#  if TUNE_LOAD == 0
-#    define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
-#  else // TUNE_LOAD == 1
-#    define TUNE_LOAD_MODIFIER cub::LOAD_CA
-#  endif // TUNE_LOAD
-
-struct reduce_by_key_policy_hub
+struct bench_reduce_by_key_policy_selector
 {
-  struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
+  [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const
+    -> cub::detail::reduce_by_key::reduce_by_key_policy
   {
-    using ReduceByKeyPolicyT =
-      cub::AgentReduceByKeyPolicy<TUNE_THREADS,
-                                  TUNE_ITEMS,
-                                  TUNE_LOAD_ALGORITHM,
-                                  TUNE_LOAD_MODIFIER,
-                                  cub::BLOCK_SCAN_WARP_SCANS,
-                                  delay_constructor_t>;
-  };
-
-  using MaxPolicy = Policy500;
+    return {
+      TUNE_THREADS,
+      TUNE_ITEMS,
+      TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE,
+      ,
+      TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA,
+      cub::BLOCK_SCAN_WARP_SCANS,
+      delay_constructor_policy,
+    };
+  }
 };
 #endif // !TUNE_BASE
 
@@ -57,31 +46,6 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
   using accum_t                    = ValueT;
   using offset_t                   = OffsetT;
 
-#if !TUNE_BASE
-  using dispatch_t = cub::DispatchReduceByKey<
-    keys_input_it_t,
-    unique_output_it_t,
-    vals_input_it_t,
-    aggregate_output_it_t,
-    num_runs_output_iterator_t,
-    equality_op_t,
-    reduction_op_t,
-    offset_t,
-    accum_t,
-    reduce_by_key_policy_hub>;
-#else
-  using dispatch_t = cub::DispatchReduceByKey<
-    keys_input_it_t,
-    unique_output_it_t,
-    vals_input_it_t,
-    aggregate_output_it_t,
-    num_runs_output_iterator_t,
-    equality_op_t,
-    reduction_op_t,
-    offset_t,
-    accum_t>;
-#endif
-
   const auto elements                    = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   constexpr std::size_t min_segment_size = 1;
   const std::size_t max_segment_size     = static_cast<std::size_t>(state.get_int64("MaxSegSize"));
@@ -100,35 +64,34 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
 
   std::uint8_t* d_temp_storage{};
   std::size_t temp_storage_bytes{};
+  const offset_t num_items = static_cast<offset_t>(elements);
 
-  dispatch_t::Dispatch(
-    d_temp_storage,
-    temp_storage_bytes,
-    d_in_keys,
-    d_out_keys,
-    d_in_vals,
-    d_out_vals,
-    d_num_runs_out,
-    equality_op_t{},
-    reduction_op_t{},
-    elements,
-    0);
+  auto dispatch_on_stream = [&](cudaStream_t stream) {
+    return cub::detail::reduce_by_key::dispatch(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in_keys,
+      d_out_keys,
+      d_in_vals,
+      d_out_vals,
+      d_num_runs_out,
+      equality_op_t{},
+      reduction_op_t{},
+      num_items,
+      stream
+#if !TUNE_BASE
+      ,
+      bench_reduce_by_key_policy_selector{}
+#endif
+    );
+  };
+
+  dispatch_on_stream(cudaStream_t{0});
 
   thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
 
-  dispatch_t::Dispatch(
-    d_temp_storage,
-    temp_storage_bytes,
-    d_in_keys,
-    d_out_keys,
-    d_in_vals,
-    d_out_vals,
-    d_num_runs_out,
-    equality_op_t{},
-    reduction_op_t{},
-    elements,
-    0);
+  dispatch_on_stream(cudaStream_t{0});
   cudaDeviceSynchronize();
   const OffsetT num_runs = num_runs_out[0];
 
@@ -140,18 +103,7 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
   state.add_global_memory_writes<OffsetT>(1);
 
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in_keys,
-      d_out_keys,
-      d_in_vals,
-      d_out_vals,
-      d_num_runs_out,
-      equality_op_t{},
-      reduction_op_t{},
-      elements,
-      launch.get_stream());
+    dispatch_on_stream(launch.get_stream());
   });
 }
 
diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu
index 51f23a424f3..da115685082 100644
--- a/cub/benchmarks/bench/run_length_encode/encode.cu
+++ b/cub/benchmarks/bench/run_length_encode/encode.cu
@@ -17,32 +17,20 @@
 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5
 
 #if !TUNE_BASE
-#  if TUNE_TRANSPOSE == 0
-#    define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
-#  else // TUNE_TRANSPOSE == 1
-#    define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
-#  endif // TUNE_TRANSPOSE
-
-#  if TUNE_LOAD == 0
-#    define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
-#  else // TUNE_LOAD == 1
-#    define TUNE_LOAD_MODIFIER cub::LOAD_CA
-#  endif // TUNE_LOAD
-
-struct reduce_by_key_policy_hub
+struct bench_encode_policy_selector
 {
-  struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
+  [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const
+    -> cub::detail::reduce_by_key::reduce_by_key_policy
   {
-    using ReduceByKeyPolicyT =
-      cub::AgentReduceByKeyPolicy<TUNE_THREADS,
-                                  TUNE_ITEMS,
-                                  TUNE_LOAD_ALGORITHM,
-                                  TUNE_LOAD_MODIFIER,
-                                  cub::BLOCK_SCAN_WARP_SCANS,
-                                  delay_constructor_t>;
-  };
-
-  using MaxPolicy = Policy500;
+    return {
+      TUNE_THREADS,
+      TUNE_ITEMS,
+      TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE,
+      TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA,
+      cub::BLOCK_SCAN_WARP_SCANS,
+      delay_constructor_policy,
+    };
+  }
 };
 #endif // !TUNE_BASE
 
@@ -65,33 +53,6 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
   using reduction_op_t             = ::cuda::std::plus<>;
   using accum_t                    = run_length_t;
 
-#if !TUNE_BASE
-  using dispatch_t = cub::detail::reduce::DispatchStreamingReduceByKey<
-    keys_input_it_t,
-    unique_output_it_t,
-    run_length_input_it_t,
-    run_length_output_it_t,
-    num_runs_output_iterator_t,
-    equality_op_t,
-    reduction_op_t,
-    offset_t,
-    accum_t,
-    reduce_by_key_policy_hub>;
-#else
-  using policy_t   = cub::detail::rle::encode::policy_hub<accum_t, T>;
-  using dispatch_t = cub::detail::reduce::DispatchStreamingReduceByKey<
-    keys_input_it_t,
-    unique_output_it_t,
-    run_length_input_it_t,
-    run_length_output_it_t,
-    num_runs_output_iterator_t,
-    equality_op_t,
-    reduction_op_t,
-    offset_t,
-    accum_t,
-    policy_t>;
-#endif
-
   const auto elements                    = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   constexpr std::size_t min_segment_size = 1;
   const std::size_t max_segment_size     = static_cast<std::size_t>(state.get_int64("MaxSegSize"));
@@ -109,35 +70,34 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
 
   std::uint8_t* d_temp_storage{};
   std::size_t temp_storage_bytes{};
+  const offset_t num_items = static_cast<offset_t>(elements);
 
-  dispatch_t::Dispatch(
-    d_temp_storage,
-    temp_storage_bytes,
-    d_in_keys,
-    d_out_keys,
-    d_in_vals,
-    d_out_vals,
-    d_num_runs_out,
-    equality_op_t{},
-    reduction_op_t{},
-    elements,
-    0);
+  auto dispatch_on_stream = [&](cudaStream_t stream) {
+    return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in_keys,
+      d_out_keys,
+      d_in_vals,
+      d_out_vals,
+      d_num_runs_out,
+      equality_op_t{},
+      reduction_op_t{},
+      num_items,
+      stream
+#if !TUNE_BASE
+      ,
+      bench_encode_policy_selector{}
+#endif
+    );
+  };
+
+  dispatch_on_stream(cudaStream_t{0});
 
   thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
 
-  dispatch_t::Dispatch(
-    d_temp_storage,
-    temp_storage_bytes,
-    d_in_keys,
-    d_out_keys,
-    d_in_vals,
-    d_out_vals,
-    d_num_runs_out,
-    equality_op_t{},
-    reduction_op_t{},
-    elements,
-    0);
+  dispatch_on_stream(cudaStream_t{0});
   cudaDeviceSynchronize();
   const num_runs_t num_runs = num_runs_out[0];
 
@@ -148,18 +108,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
   state.add_global_memory_writes<num_runs_t>(1);
 
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in_keys,
-      d_out_keys,
-      d_in_vals,
-      d_out_vals,
-      d_num_runs_out,
-      equality_op_t{},
-      reduction_op_t{},
-      elements,
-      launch.get_stream());
+    dispatch_on_stream(launch.get_stream());
   });
 }
 
diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh
index 43f74df5f35..9566cb705e4 100644
--- a/cub/cub/agent/agent_reduce_by_key.cuh
+++ b/cub/cub/agent/agent_reduce_by_key.cuh
@@ -2,10 +2,9 @@
 // SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
-/**
- * @file cub::AgentReduceByKey implements a stateful abstraction of CUDA thread
- *       blocks for participating in device-wide reduce-value-by-key.
- */
+//! @file
+//! cub::detail::reduce_by_key::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for
+//! participating in device-wide reduce-value-by-key.
 
 #pragma once
 
@@ -92,7 +91,7 @@ struct AgentReduceByKeyPolicy
  * Thread block abstractions
  ******************************************************************************/
 
-namespace detail::reduce
+namespace detail::reduce_by_key
 {
 /**
  * @brief AgentReduceByKey implements a stateful abstraction of CUDA thread
@@ -771,6 +770,6 @@ struct AgentReduceByKey
     }
   }
 };
-} // namespace detail::reduce
+} // namespace detail::reduce_by_key
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index 50101c16e0f..afbbe219bf9 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -2308,35 +2308,21 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey");
 
-    // Signed integer type for global offsets
-    using OffsetT = detail::choose_offset_t<NumItemsT>;
-
-    // FlagT iterator type (not used)
-
-    // Selection op (not used)
-
-    // Default == operator
+    using OffsetT    = detail::choose_offset_t<NumItemsT>;
     using EqualityOp = ::cuda::std::equal_to<>;
 
-    return DispatchReduceByKey<
-      KeysInputIteratorT,
-      UniqueOutputIteratorT,
-      ValuesInputIteratorT,
-      AggregatesOutputIteratorT,
-      NumRunsOutputIteratorT,
-      EqualityOp,
-      ReductionOpT,
-      OffsetT>::Dispatch(d_temp_storage,
-                         temp_storage_bytes,
-                         d_keys_in,
-                         d_unique_out,
-                         d_values_in,
-                         d_aggregates_out,
-                         d_num_runs_out,
-                         EqualityOp(),
-                         reduction_op,
-                         static_cast<OffsetT>(num_items),
-                         stream);
+    return detail::reduce_by_key::dispatch(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_keys_in,
+      d_unique_out,
+      d_values_in,
+      d_aggregates_out,
+      d_num_runs_out,
+      EqualityOp{},
+      reduction_op,
+      static_cast<OffsetT>(num_items),
+      stream);
   }
 };
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh
index 44fac1b1cd9..d0c5d05e0fa 100644
--- a/cub/cub/device/device_run_length_encode.cuh
+++ b/cub/cub/device/device_run_length_encode.cuh
@@ -186,9 +186,9 @@ struct DeviceRunLengthEncode
 
     using key_t = cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
 
-    using policy_t = detail::rle::encode::policy_hub<accum_t, key_t>;
+    using policy_selector_t = detail::rle::encode::policy_selector_from_types<accum_t, key_t>;
 
-    return detail::reduce::DispatchStreamingReduceByKey<
+    return detail::reduce_by_key::dispatch_streaming_reduce_by_key<
       InputIteratorT,
       UniqueOutputIteratorT,
       lengths_input_iterator_t,
@@ -198,17 +198,18 @@ struct DeviceRunLengthEncode
       reduction_op,
       offset_t,
       accum_t,
-      policy_t>::Dispatch(d_temp_storage,
-                          temp_storage_bytes,
-                          d_in,
-                          d_unique_out,
-                          lengths_input_iterator_t((length_t) 1),
-                          d_counts_out,
-                          d_num_runs_out,
-                          equality_op(),
-                          reduction_op(),
-                          num_items,
-                          stream);
+      policy_selector_t>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_unique_out,
+      lengths_input_iterator_t((length_t) 1),
+      d_counts_out,
+      d_num_runs_out,
+      equality_op(),
+      reduction_op(),
+      num_items,
+      stream);
   }
 
   //! @rst
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index cda8857f460..72564332c0e 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -20,6 +20,7 @@
 #endif // no system header
 
 #include <cub/agent/agent_reduce_by_key.cuh>
+#include <cub/detail/arch_dispatch.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce_by_key.cuh>
 #include <cub/thread/thread_operators.cuh>
@@ -29,13 +30,17 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+#  include <sstream>
+#endif
+
 CUB_NAMESPACE_BEGIN
 
 /******************************************************************************
  * Kernel entry points
  *****************************************************************************/
 
-namespace detail::reduce
+namespace detail::reduce_by_key
 {
 template <typename PrecedingKeyItT, typename AccumT, typename GlobalOffsetT>
 struct streaming_context
@@ -106,8 +111,8 @@ struct streaming_context
 /**
  * @brief Multi-block reduce-by-key sweep kernel entry point
  *
- * @tparam AgentReduceByKeyPolicyT
- *   Parameterized AgentReduceByKeyPolicyT tuning policy type
+ * @tparam PolicySelector
+ *   Selects the tuning policy
  *
  * @tparam KeysInputIteratorT
  *   Random-access input iterator type for keys
@@ -167,7 +172,7 @@ struct streaming_context
  * @param num_items
  *   Total number of items to select from
  */
-template <typename ChainedPolicyT,
+template <typename PolicySelector,
           typename KeysInputIteratorT,
           typename UniqueOutputIteratorT,
           typename ValuesInputIteratorT,
@@ -179,7 +184,10 @@ template <typename ChainedPolicyT,
           typename OffsetT,
           typename AccumT,
           typename StreamingContextT>
-__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_THREADS))
+#if _CCCL_HAS_CONCEPTS()
+  requires reduce_by_key_policy_selector<PolicySelector>
+#endif
+__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel(
     KeysInputIteratorT d_keys_in,
     UniqueOutputIteratorT d_unique_out,
@@ -194,8 +202,19 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
     vsmem_t vsmem)
 {
+  static constexpr reduce_by_key_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
+  using AgentReduceByKeyPolicyT                = AgentReduceByKeyPolicy<
+                   policy.block_threads,
+                   policy.items_per_thread,
+                   policy.load_algorithm,
+                   policy.load_modifier,
+                   policy.scan_algorithm,
+                   delay_constructor_t<policy.delay_constructor.kind,
+                                       policy.delay_constructor.delay,
+                                       policy.delay_constructor.l2_write_latency>>;
+
   using vsmem_helper_t = vsmem_helper_default_fallback_policy_t<
-    typename ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT,
+    AgentReduceByKeyPolicyT,
     AgentReduceByKey,
     KeysInputIteratorT,
     UniqueOutputIteratorT,
@@ -234,7 +253,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
   // If applicable, hints to discard modified cache lines for vsmem
   vsmem_helper_t::discard_temp_storage(temp_storage);
 }
-} // namespace detail::reduce
+} // namespace detail::reduce_by_key
 
 /******************************************************************************
  * Dispatch
@@ -272,6 +291,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
  *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
+// TODO(bgruber): deprecate when we make the tuning API public and remove in CCCL 4.0
 template <typename KeysInputIteratorT,
           typename UniqueOutputIteratorT,
           typename ValuesInputIteratorT,
@@ -351,7 +371,7 @@ struct DispatchReduceByKey
   {
     using vsmem_helper_t = detail::vsmem_helper_default_fallback_policy_t<
       typename ActivePolicyT::ReduceByKeyPolicyT,
-      detail::reduce::AgentReduceByKey,
+      detail::reduce_by_key::AgentReduceByKey,
       KeysInputIteratorT,
       UniqueOutputIteratorT,
       ValuesInputIteratorT,
@@ -518,8 +538,8 @@ struct DispatchReduceByKey
   {
     return Invoke<ActivePolicyT>(
       detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
-      detail::reduce::DeviceReduceByKeyKernel<
-        typename PolicyHub::MaxPolicy,
+      detail::reduce_by_key::DeviceReduceByKeyKernel<
+        detail::reduce_by_key::policy_selector_from_hub<PolicyHub>,
         KeysInputIteratorT,
         UniqueOutputIteratorT,
         ValuesInputIteratorT,
@@ -622,4 +642,208 @@ struct DispatchReduceByKey
   }
 };
 
+namespace detail::reduce_by_key
+{
+template <
+  typename KeysInputIteratorT,
+  typename UniqueOutputIteratorT,
+  typename ValuesInputIteratorT,
+  typename AggregatesOutputIteratorT,
+  typename NumRunsOutputIteratorT,
+  typename EqualityOpT,
+  typename ReductionOpT,
+  typename OffsetT,
+  typename AccumT =
+    ::cuda::std::__accumulator_t<ReductionOpT, it_value_t<ValuesInputIteratorT>, it_value_t<ValuesInputIteratorT>>,
+  typename KeyT           = non_void_value_t<UniqueOutputIteratorT, it_value_t<KeysInputIteratorT>>,
+  typename PolicySelector = policy_selector_from_types<ReductionOpT, AccumT, KeyT>>
+#if _CCCL_HAS_CONCEPTS()
+  requires reduce_by_key::reduce_by_key_policy_selector<PolicySelector>
+#endif // _CCCL_HAS_CONCEPTS()
+CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
+  void* d_temp_storage,
+  size_t& temp_storage_bytes,
+  KeysInputIteratorT d_keys_in,
+  UniqueOutputIteratorT d_unique_out,
+  ValuesInputIteratorT d_values_in,
+  AggregatesOutputIteratorT d_aggregates_out,
+  NumRunsOutputIteratorT d_num_runs_out,
+  EqualityOpT equality_op,
+  ReductionOpT reduction_op,
+  OffsetT num_items,
+  cudaStream_t stream,
+  PolicySelector policy_selector = {})
+{
+  using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet
+  using ScanTileStateT      = ReduceByKeyScanTileState<AccumT, OffsetT>;
+  [[maybe_unused]] static constexpr int INIT_KERNEL_THREADS = 128;
+
+  ::cuda::arch_id arch_id{};
+  if (const auto error = CubDebug(ptx_arch_id(arch_id)))
+  {
+    return error;
+  }
+
+  return detail::dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) {
+    constexpr reduce_by_key_policy policy = policy_getter(); // need the constexpr of vsmem_helper
+
+#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+    NV_IF_TARGET(
+      NV_IS_HOST,
+      (::std::stringstream ss; ss << policy; _CubLog(
+         "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast<int>(arch_id), ss.str().c_str());))
+#endif
+
+    // convert the policy into a legacy agent policy for vsmem_helper. TODO(bgruber): refactor this in the future
+    using AgentReduceByKeyPolicy = AgentReduceByKeyPolicy<
+      policy.block_threads,
+      policy.items_per_thread,
+      policy.load_algorithm,
+      policy.load_modifier,
+      policy.scan_algorithm,
+      delay_constructor_t<policy.delay_constructor.kind,
+                          policy.delay_constructor.delay,
+                          policy.delay_constructor.l2_write_latency>>;
+
+    using vsmem_helper_t = vsmem_helper_default_fallback_policy_t<
+      AgentReduceByKeyPolicy,
+      AgentReduceByKey,
+      KeysInputIteratorT,
+      UniqueOutputIteratorT,
+      ValuesInputIteratorT,
+      AggregatesOutputIteratorT,
+      NumRunsOutputIteratorT,
+      EqualityOpT,
+      ReductionOpT,
+      OffsetT,
+      AccumT,
+      streaming_context_t>;
+
+    constexpr int block_threads    = vsmem_helper_t::agent_policy_t::BLOCK_THREADS;
+    constexpr int items_per_thread = vsmem_helper_t::agent_policy_t::ITEMS_PER_THREAD;
+
+    // Number of input tiles
+    const int tile_size = block_threads * items_per_thread;
+    const int num_tiles = static_cast<int>(::cuda::ceil_div(num_items, tile_size));
+
+    // The amount of virtual shared memory to allocate
+    const auto vsmem_size = num_tiles * vsmem_helper_t::vsmem_per_block;
+
+    size_t tile_descriptor_memory{};
+    if (const auto error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, tile_descriptor_memory)))
+    {
+      return error;
+    }
+    size_t allocation_sizes[2] = {tile_descriptor_memory, vsmem_size};
+    void* allocations[2]       = {};
+
+    if (const auto error =
+          CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
+    {
+      return error;
+    }
+
+    if (d_temp_storage == nullptr)
+    {
+      return cudaSuccess;
+    }
+
+    ScanTileStateT tile_state;
+    if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])))
+    {
+      return error;
+    }
+
+    const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS));
+#ifdef CUB_DEBUG_LOG
+    _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+#endif
+    if (const auto error = CubDebug(
+          THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+            .doit(detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
+                  tile_state,
+                  num_tiles,
+                  d_num_runs_out)))
+    {
+      return error;
+    }
+    if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
+    {
+      return error;
+    }
+    if (num_items == 0)
+    {
+      return cudaSuccess;
+    }
+
+    auto reduce_by_key_kernel = &DeviceReduceByKeyKernel<
+      PolicySelector,
+      KeysInputIteratorT,
+      UniqueOutputIteratorT,
+      ValuesInputIteratorT,
+      AggregatesOutputIteratorT,
+      NumRunsOutputIteratorT,
+      ScanTileStateT,
+      EqualityOpT,
+      ReductionOpT,
+      OffsetT,
+      AccumT,
+      streaming_context_t>;
+    int reduce_by_key_sm_occupancy{};
+    if (const auto error = CubDebug(MaxSmOccupancy(reduce_by_key_sm_occupancy, reduce_by_key_kernel, block_threads)))
+    {
+      return error;
+    }
+
+    int device_ordinal{};
+    if (const auto error = CubDebug(cudaGetDevice(&device_ordinal)))
+    {
+      return error;
+    }
+    int max_dim_x{};
+    if (const auto error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)))
+    {
+      return error;
+    }
+
+    const int scan_grid_size = ::cuda::std::min(num_tiles, max_dim_x);
+    for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
+    {
+#ifdef CUB_DEBUG_LOG
+      _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+              start_tile,
+              scan_grid_size,
+              block_threads,
+              (long long) stream,
+              items_per_thread,
+              reduce_by_key_sm_occupancy);
+#endif
+      if (const auto error = CubDebug(
+            THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream)
+              .doit(reduce_by_key_kernel,
+                    d_keys_in,
+                    d_unique_out,
+                    d_values_in,
+                    d_aggregates_out,
+                    d_num_runs_out,
+                    tile_state,
+                    start_tile,
+                    equality_op,
+                    reduction_op,
+                    num_items,
+                    streaming_context_t{},
+                    cub::detail::vsmem_t{allocations[1]})))
+      {
+        return error;
+      }
+      if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
+      {
+        return error;
+      }
+    }
+    return cudaSuccess;
+  });
+}
+} // namespace detail::reduce_by_key
+
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
index 57d34ff4e7e..4fec39263a6 100644
--- a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
@@ -14,6 +14,7 @@
 #endif // no system header
 
 #include <cub/agent/agent_reduce_by_key.cuh>
+#include <cub/detail/arch_dispatch.cuh>
 #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce_by_key.cuh>
 #include <cub/thread/thread_operators.cuh>
@@ -25,431 +26,230 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
-#if !_CCCL_COMPILER(NVRTC)
-#  include <iostream>
-#endif // !_CCCL_COMPILER(NVRTC)
+#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+#  include <sstream>
+#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
 
 CUB_NAMESPACE_BEGIN
 
-namespace detail::reduce
+namespace detail::reduce_by_key
 {
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * @brief Utility class for dispatching the appropriately-tuned kernels for
- *        DeviceReduceByKey
- *
- * @tparam KeysInputIteratorT
- *   Random-access input iterator type for keys
- *
- * @tparam UniqueOutputIteratorT
- *   Random-access output iterator type for keys
- *
- * @tparam ValuesInputIteratorT
- *   Random-access input iterator type for values
- *
- * @tparam AggregatesOutputIteratorT
- *   Random-access output iterator type for values
- *
- * @tparam NumRunsOutputIteratorT
- *   Output iterator type for recording number of segments encountered
- *
- * @tparam EqualityOpT
- *   KeyT equality operator type
- *
- * @tparam ReductionOpT
- *   ValueT reduction operator type
- *
- * @tparam PolicyHub
- *   Implementation detail, do not specify directly, requirements on the
- *   content of this type are subject to breaking change.
- */
-template <typename KeysInputIteratorT,
-          typename UniqueOutputIteratorT,
-          typename ValuesInputIteratorT,
-          typename AggregatesOutputIteratorT,
-          typename NumRunsOutputIteratorT,
-          typename EqualityOpT,
-          typename ReductionOpT,
-          typename OffsetT,
-          typename AccumT    = ::cuda::std::__accumulator_t<ReductionOpT,
-                                                            cub::detail::it_value_t<ValuesInputIteratorT>,
-                                                            cub::detail::it_value_t<ValuesInputIteratorT>>,
-          typename PolicyHub = detail::reduce_by_key::policy_hub<
-            ReductionOpT,
-            AccumT,
-            cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::it_value_t<KeysInputIteratorT>>>>
-struct DispatchStreamingReduceByKey
+template <
+  typename KeysInputIteratorT,
+  typename UniqueOutputIteratorT,
+  typename ValuesInputIteratorT,
+  typename AggregatesOutputIteratorT,
+  typename NumRunsOutputIteratorT,
+  typename EqualityOpT,
+  typename ReductionOpT,
+  typename OffsetT,
+  typename AccumT =
+    ::cuda::std::__accumulator_t<ReductionOpT, it_value_t<ValuesInputIteratorT>, it_value_t<ValuesInputIteratorT>>,
+  typename PolicySelector =
+    policy_selector_from_types<ReductionOpT,
+                               AccumT,
+                               non_void_value_t<UniqueOutputIteratorT, it_value_t<KeysInputIteratorT>>>>
+#if _CCCL_HAS_CONCEPTS()
+  requires reduce_by_key_policy_selector<PolicySelector>
+#endif
+CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_reduce_by_key(
+  void* d_temp_storage,
+  size_t& temp_storage_bytes,
+  KeysInputIteratorT d_keys_in,
+  UniqueOutputIteratorT d_unique_out,
+  ValuesInputIteratorT d_values_in,
+  AggregatesOutputIteratorT d_aggregates_out,
+  NumRunsOutputIteratorT d_num_runs_out,
+  EqualityOpT equality_op,
+  ReductionOpT reduction_op,
+  OffsetT num_items,
+  cudaStream_t stream,
+  PolicySelector policy_selector = {})
 {
-  //-------------------------------------------------------------------------
-  // Types and constants
-  //-------------------------------------------------------------------------
-  // Offsets to index items within one partition (i.e., a single kernel invocation)
-  using local_offset_t = ::cuda::std::int32_t;
-
-  // If the number of items provided by the user may exceed the maximum number of items processed by a single kernel
-  // invocation, we may require multiple kernel invocations
-  static constexpr bool use_streaming_invocation = ::cuda::std::numeric_limits<OffsetT>::max()
-                                                 > ::cuda::std::numeric_limits<local_offset_t>::max();
-
-  // Offsets to index any item within the entire input (large enough to cover num_items)
-  using global_offset_t = OffsetT;
-
-  // Type used to provide context about the current partition during a streaming invocation
-  using streaming_context_t =
-    ::cuda::std::conditional_t<use_streaming_invocation,
-                               detail::reduce::streaming_context<KeysInputIteratorT, AccumT, global_offset_t>,
-                               NullType>;
-
-  // The input values type
-  using ValueInputT = cub::detail::it_value_t<ValuesInputIteratorT>;
+  ::cuda::arch_id arch_id{};
+  if (const auto error = CubDebug(ptx_arch_id(arch_id)))
+  {
+    return error;
+  }
 
-  static constexpr int init_kernel_threads = 128;
+  const reduce_by_key_policy policy = policy_selector(arch_id);
 
-  // Tile status descriptor interface type
-  using ScanTileStateT = ReduceByKeyScanTileState<AccumT, local_offset_t>;
+#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+  NV_IF_TARGET(NV_IS_HOST,
+               (::std::stringstream ss; ss << policy;
+                _CubLog("Dispatching streaming reduce by key to arch %d with tuning: %s\n",
+                        static_cast<int>(arch_id),
+                        ss.str().c_str());))
+#endif
 
-  void* d_temp_storage;
-  size_t& temp_storage_bytes;
-  KeysInputIteratorT d_keys_in;
-  UniqueOutputIteratorT d_unique_out;
-  ValuesInputIteratorT d_values_in;
-  AggregatesOutputIteratorT d_aggregates_out;
-  NumRunsOutputIteratorT d_num_runs_out;
-  EqualityOpT equality_op;
-  ReductionOpT reduction_op;
-  global_offset_t num_items;
-  cudaStream_t stream;
+  using local_offset_t  = ::cuda::std::int32_t;
+  using global_offset_t = OffsetT;
+  static constexpr bool use_streaming_invocation =
+    ::cuda::std::numeric_limits<OffsetT>::max() > ::cuda::std::numeric_limits<local_offset_t>::max();
+  using streaming_context_t = ::cuda::std::
+    conditional_t<use_streaming_invocation, streaming_context<KeysInputIteratorT, AccumT, global_offset_t>, NullType>;
+  using ScanTileStateT                                      = ReduceByKeyScanTileState<AccumT, local_offset_t>;
+  [[maybe_unused]] static constexpr int init_kernel_threads = 128;
+
+  const int block_threads    = policy.block_threads;
+  const int items_per_thread = policy.items_per_thread;
+  const auto tile_size       = static_cast<global_offset_t>(block_threads * items_per_thread);
+
+  auto capped_num_items_per_invocation = num_items;
+  if constexpr (use_streaming_invocation)
+  {
+    capped_num_items_per_invocation = static_cast<global_offset_t>(::cuda::std::numeric_limits<local_offset_t>::max());
+    capped_num_items_per_invocation -= (capped_num_items_per_invocation % tile_size);
+  }
 
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchStreamingReduceByKey(
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    KeysInputIteratorT d_keys_in,
-    UniqueOutputIteratorT d_unique_out,
-    ValuesInputIteratorT d_values_in,
-    AggregatesOutputIteratorT d_aggregates_out,
-    NumRunsOutputIteratorT d_num_runs_out,
-    EqualityOpT equality_op,
-    ReductionOpT reduction_op,
-    global_offset_t num_items,
-    cudaStream_t stream)
-      : d_temp_storage(d_temp_storage)
-      , temp_storage_bytes(temp_storage_bytes)
-      , d_keys_in(d_keys_in)
-      , d_unique_out(d_unique_out)
-      , d_values_in(d_values_in)
-      , d_aggregates_out(d_aggregates_out)
-      , d_num_runs_out(d_num_runs_out)
-      , equality_op(equality_op)
-      , reduction_op(reduction_op)
-      , num_items(num_items)
-      , stream(stream)
-  {}
+  const auto max_num_items_per_invocation =
+    use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items;
+  auto const num_partitions =
+    (num_items == 0) ? global_offset_t{1} : ::cuda::ceil_div(num_items, capped_num_items_per_invocation);
 
-  //---------------------------------------------------------------------
-  // Dispatch entrypoints
-  //---------------------------------------------------------------------
+  const auto max_num_tiles = static_cast<int>(::cuda::ceil_div(max_num_items_per_invocation, tile_size));
 
-  template <typename ActivePolicyT, typename ScanInitKernelT, typename ReduceByKeyKernelT>
-  CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
-  Invoke(ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel)
+  size_t allocation_sizes[3];
+  if (const auto error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0])))
   {
-    using AgentReduceByKeyPolicyT  = typename ActivePolicyT::ReduceByKeyPolicyT;
-    constexpr int block_threads    = AgentReduceByKeyPolicyT::BLOCK_THREADS;
-    constexpr int items_per_thread = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD;
-
-    // The upper bound of for the number of items that a single kernel invocation will ever process
-    auto capped_num_items_per_invocation = num_items;
-    if constexpr (use_streaming_invocation)
-    {
-      capped_num_items_per_invocation =
-        static_cast<global_offset_t>(::cuda::std::numeric_limits<local_offset_t>::max());
-      // Make sure that the number of items is a multiple of tile size
-      capped_num_items_per_invocation -= (capped_num_items_per_invocation % (block_threads * items_per_thread));
-    }
-
-    // Across invocations, the maximum number of items that a single kernel invocation will ever process
-    const auto max_num_items_per_invocation =
-      use_streaming_invocation ? ::cuda::std::min(capped_num_items_per_invocation, num_items) : num_items;
+    return error;
+  }
+  allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0};
+  allocation_sizes[2] = num_partitions > 1 ? sizeof(AccumT) * 2 : size_t{0};
 
-    // Number of invocations required to "iterate" over the total input (at least one iteration to process zero items)
-    auto const num_partitions =
-      (num_items == 0) ? global_offset_t{1} : ::cuda::ceil_div(num_items, capped_num_items_per_invocation);
+  void* allocations[3] = {};
+  if (const auto error =
+        CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
+  {
+    return error;
+  }
 
-    cudaError error = cudaSuccess;
+  if (d_temp_storage == nullptr)
+  {
+    return cudaSuccess;
+  }
 
-    // Number of input tiles
-    const auto tile_size = static_cast<global_offset_t>(block_threads * items_per_thread);
-    int max_num_tiles    = static_cast<int>(::cuda::ceil_div(max_num_items_per_invocation, tile_size));
+  for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++)
+  {
+    global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation;
+    global_offset_t current_num_items =
+      (partition_idx + 1 == num_partitions) ? (num_items - current_partition_offset) : capped_num_items_per_invocation;
 
-    // Specify temporary storage allocation requirements
-    size_t allocation_sizes[3];
-    error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles, allocation_sizes[0]));
-    if (cudaSuccess != error)
+    const auto num_current_tiles = static_cast<int>(::cuda::ceil_div(current_num_items, tile_size));
+    ScanTileStateT tile_state;
+    if (const auto error = CubDebug(tile_state.Init(num_current_tiles, allocations[0], allocation_sizes[0])))
     {
       return error;
     }
-    allocation_sizes[1] = num_partitions > 1 ? sizeof(global_offset_t) * 2 : size_t{0};
-    allocation_sizes[2] = num_partitions > 1 ? sizeof(AccumT) * 2 : size_t{0};
-
-    // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-    void* allocations[3] = {};
 
-    error = CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
-    if (cudaSuccess != error)
+    const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads));
+#ifdef CUB_DEBUG_LOG
+    _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
+#endif
+    if (const auto error = CubDebug(
+          THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream)
+            .doit(&detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
+                  tile_state,
+                  num_current_tiles,
+                  d_num_runs_out)))
     {
       return error;
     }
 
-    if (d_temp_storage == nullptr)
+    if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
     {
-      // Return if the caller is simply requesting the size of the storage allocation
       return error;
     }
 
-    // Iterate over the partitions until all input is processed
-    for (global_offset_t partition_idx = 0; partition_idx < num_partitions; partition_idx++)
+    if (num_items == 0)
     {
-      global_offset_t current_partition_offset = partition_idx * capped_num_items_per_invocation;
-      global_offset_t current_num_items =
-        (partition_idx + 1 == num_partitions)
-          ? (num_items - current_partition_offset)
-          : capped_num_items_per_invocation;
-
-      // Construct the tile status interface
-      const auto num_current_tiles = static_cast<int>(::cuda::ceil_div(current_num_items, tile_size));
-
-      // Construct the tile status interface
-      ScanTileStateT tile_state;
-      error = CubDebug(tile_state.Init(num_current_tiles, allocations[0], allocation_sizes[0]));
-      if (cudaSuccess != error)
-      {
-        return error;
-      }
-
-      // Log init_kernel configuration
-      int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_current_tiles, init_kernel_threads));
+      return cudaSuccess;
+    }
 
 #ifdef CUB_DEBUG_LOG
-      _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
-#endif // CUB_DEBUG_LOG
-
-      // Invoke init_kernel to initialize tile descriptors
-      error = CubDebug(
-        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream)
-          .doit(init_kernel, tile_state, num_current_tiles, d_num_runs_out));
-      if (cudaSuccess != error)
-      {
-        return error;
-      }
+    _CubLog("Invoking reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+            num_current_tiles,
+            block_threads,
+            (long long) stream,
+            items_per_thread);
+#endif
+    auto reduce_by_key_kernel = DeviceReduceByKeyKernel<
+      PolicySelector,
+      KeysInputIteratorT,
+      UniqueOutputIteratorT,
+      ValuesInputIteratorT,
+      AggregatesOutputIteratorT,
+      NumRunsOutputIteratorT,
+      ScanTileStateT,
+      EqualityOpT,
+      ReductionOpT,
+      local_offset_t,
+      AccumT,
+      streaming_context_t>;
 
-      // Sync the stream if specified to flush runtime errors
-      error = CubDebug(detail::DebugSyncStream(stream));
-      if (cudaSuccess != error)
-      {
-        return error;
-      }
-
-      // For empty problems we can skip the reduce_by_key_kernel
-      if (num_items == 0)
+    if constexpr (use_streaming_invocation)
+    {
+      auto tmp_num_uniques          = static_cast<global_offset_t*>(allocations[1]);
+      auto tmp_prefix               = static_cast<AccumT*>(allocations[2]);
+      const bool is_first_partition = (partition_idx == 0);
+      const bool is_last_partition  = (partition_idx + 1 == num_partitions);
+      const int buffer_selector     = partition_idx % 2;
+      streaming_context_t streaming_context{
+        is_first_partition,
+        is_last_partition,
+        is_first_partition ? d_keys_in : d_keys_in + current_partition_offset - 1,
+        &tmp_prefix[buffer_selector],
+        &tmp_prefix[buffer_selector ^ 0x01],
+        &tmp_num_uniques[buffer_selector],
+        &tmp_num_uniques[buffer_selector ^ 0x01]};
+      if (const auto error = CubDebug(
+            THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream)
+              .doit(reduce_by_key_kernel,
+                    d_keys_in + current_partition_offset,
+                    d_unique_out,
+                    d_values_in + current_partition_offset,
+                    d_aggregates_out,
+                    d_num_runs_out,
+                    tile_state,
+                    0,
+                    equality_op,
+                    reduction_op,
+                    static_cast<local_offset_t>(current_num_items),
+                    streaming_context,
+                    detail::vsmem_t{nullptr})))
       {
         return error;
       }
-
-// Log reduce_by_key_kernel configuration
-#ifdef CUB_DEBUG_LOG
-      _CubLog("Invoking reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d "
-              "items per thread\n",
-              num_current_tiles,
-              block_threads,
-              (long long) stream,
-              items_per_thread);
-#endif // CUB_DEBUG_LOG
-
-      // Invoke reduce_by_key_kernel
-
-      if constexpr (use_streaming_invocation)
-      {
-        auto tmp_num_uniques = static_cast<global_offset_t*>(allocations[1]);
-        auto tmp_prefix      = static_cast<AccumT*>(allocations[2]);
-
-        const bool is_first_partition = (partition_idx == 0);
-        const bool is_last_partition  = (partition_idx + 1 == num_partitions);
-        const int buffer_selector     = partition_idx % 2;
-
-        streaming_context_t streaming_context{
-          is_first_partition,
-          is_last_partition,
-          is_first_partition ? d_keys_in : d_keys_in + current_partition_offset - 1,
-          &tmp_prefix[buffer_selector],
-          &tmp_prefix[buffer_selector ^ 0x01],
-          &tmp_num_uniques[buffer_selector],
-          &tmp_num_uniques[buffer_selector ^ 0x01]};
-
-        error = CubDebug(
-          THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream)
-            .doit(reduce_by_key_kernel,
-                  d_keys_in + current_partition_offset,
-                  d_unique_out,
-                  d_values_in + current_partition_offset,
-                  d_aggregates_out,
-                  d_num_runs_out,
-                  tile_state,
-                  0,
-                  equality_op,
-                  reduction_op,
-                  static_cast<local_offset_t>(current_num_items),
-                  streaming_context,
-                  cub::detail::vsmem_t{nullptr}));
-        if (cudaSuccess != error)
-        {
-          return error;
-        }
-      }
-      else
-      {
-        error = CubDebug(
-          THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream)
-            .doit(reduce_by_key_kernel,
-                  d_keys_in + current_partition_offset,
-                  d_unique_out,
-                  d_values_in + current_partition_offset,
-                  d_aggregates_out,
-                  d_num_runs_out,
-                  tile_state,
-                  0,
-                  equality_op,
-                  reduction_op,
-                  static_cast<local_offset_t>(current_num_items),
-                  NullType{},
-                  cub::detail::vsmem_t{nullptr}));
-        if (cudaSuccess != error)
-        {
-          return error;
-        }
-      }
-
-      // Sync the stream if specified to flush runtime errors
-      error = CubDebug(detail::DebugSyncStream(stream));
-      if (cudaSuccess != error)
+    }
+    else
+    {
+      if (const auto error = CubDebug(
+            THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_current_tiles, block_threads, 0, stream)
+              .doit(reduce_by_key_kernel,
+                    d_keys_in + current_partition_offset,
+                    d_unique_out,
+                    d_values_in + current_partition_offset,
+                    d_aggregates_out,
+                    d_num_runs_out,
+                    tile_state,
+                    0,
+                    equality_op,
+                    reduction_op,
+                    static_cast<local_offset_t>(current_num_items),
+                    NullType{},
+                    detail::vsmem_t{nullptr})))
       {
         return error;
       }
     }
-
-    return error;
-  }
-
-  template <typename ActivePolicyT>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke()
-  {
-    return Invoke<ActivePolicyT>(
-      detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
-      detail::reduce::DeviceReduceByKeyKernel<
-        typename PolicyHub::MaxPolicy,
-        KeysInputIteratorT,
-        UniqueOutputIteratorT,
-        ValuesInputIteratorT,
-        AggregatesOutputIteratorT,
-        NumRunsOutputIteratorT,
-        ScanTileStateT,
-        EqualityOpT,
-        ReductionOpT,
-        local_offset_t,
-        AccumT,
-        streaming_context_t>);
-  }
-
-  /**
-   * Internal dispatch routine
-   * @param[in] d_temp_storage
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the
-   *   required allocation size is written to `temp_storage_bytes` and no
-   *   work is done.
-   *
-   * @param[in,out] temp_storage_bytes
-   *   Reference to size in bytes of `d_temp_storage` allocation
-   *
-   * @param[in] d_keys_in
-   *   Pointer to the input sequence of keys
-   *
-   * @param[out] d_unique_out
-   *   Pointer to the output sequence of unique keys (one key per run)
-   *
-   * @param[in] d_values_in
-   *   Pointer to the input sequence of corresponding values
-   *
-   * @param[out] d_aggregates_out
-   *   Pointer to the output sequence of value aggregates
-   *   (one aggregate per run)
-   *
-   * @param[out] d_num_runs_out
-   *   Pointer to total number of runs encountered
-   *   (i.e., the length of d_unique_out)
-   *
-   * @param[in] equality_op
-   *   KeyT equality operator
-   *
-   * @param[in] reduction_op
-   *   ValueT reduction operator
-   *
-   * @param[in] num_items
-   *   Total number of items to select from
-   *
-   * @param[in] stream
-   *   CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
-   */
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch(
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    KeysInputIteratorT d_keys_in,
-    UniqueOutputIteratorT d_unique_out,
-    ValuesInputIteratorT d_values_in,
-    AggregatesOutputIteratorT d_aggregates_out,
-    NumRunsOutputIteratorT d_num_runs_out,
-    EqualityOpT equality_op,
-    ReductionOpT reduction_op,
-    global_offset_t num_items,
-    cudaStream_t stream)
-  {
-    cudaError error = cudaSuccess;
-
-    do
+    if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
     {
-      // Get PTX version
-      int ptx_version = 0;
-      error           = CubDebug(PtxVersion(ptx_version));
-      if (cudaSuccess != error)
-      {
-        break;
-      }
-
-      DispatchStreamingReduceByKey dispatch(
-        d_temp_storage,
-        temp_storage_bytes,
-        d_keys_in,
-        d_unique_out,
-        d_values_in,
-        d_aggregates_out,
-        d_num_runs_out,
-        equality_op,
-        reduction_op,
-        num_items,
-        stream);
-
-      // Dispatch
-      error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch));
-      if (cudaSuccess != error)
-      {
-        break;
-      }
-    } while (0);
-
-    return error;
+      return error;
+    }
   }
-};
-} // namespace detail::reduce
+
+  return cudaSuccess;
+}
+} // namespace detail::reduce_by_key
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index cfaad7185da..1dd6e433206 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -18,12 +18,22 @@
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
+#include <cub/detail/delay_constructor.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/__device/arch_id.h>
 #include <cuda/cmath>
 #include <cuda/std/__algorithm/clamp.h>
 
+#if _CCCL_HAS_CONCEPTS()
+#  include <cuda/std/concepts>
+#endif // _CCCL_HAS_CONCEPTS()
+
+#if !_CCCL_COMPILER(NVRTC)
+#  include <ostream>
+#endif
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail::reduce_by_key
@@ -63,25 +73,25 @@ enum class accum_size
 };
 
 template <class T>
-constexpr primitive_key is_primitive_key()
+_CCCL_API constexpr primitive_key is_primitive_key()
 {
   return detail::is_primitive<T>::value ? primitive_key::yes : primitive_key::no;
 }
 
 template <class T>
-constexpr primitive_accum is_primitive_accum()
+_CCCL_API constexpr primitive_accum is_primitive_accum()
 {
   return detail::is_primitive<T>::value ? primitive_accum::yes : primitive_accum::no;
 }
 
 template <class ReductionOpT>
-constexpr primitive_op is_primitive_op()
+_CCCL_API constexpr primitive_op is_primitive_op()
 {
   return basic_binary_op_t<ReductionOpT>::value ? primitive_op::yes : primitive_op::no;
 }
 
 template <class KeyT>
-constexpr key_size classify_key_size()
+_CCCL_API constexpr key_size classify_key_size()
 {
   return sizeof(KeyT) == 1 ? key_size::_1
        : sizeof(KeyT) == 2 ? key_size::_2
@@ -93,7 +103,7 @@ constexpr key_size classify_key_size()
 }
 
 template <class AccumT>
-constexpr accum_size classify_accum_size()
+_CCCL_API constexpr accum_size classify_accum_size()
 {
   return sizeof(AccumT) == 1 ? accum_size::_1
        : sizeof(AccumT) == 2 ? accum_size::_2
@@ -104,6 +114,28 @@ constexpr accum_size classify_accum_size()
          : accum_size::unknown;
 }
 
+_CCCL_API constexpr int size_of(key_size sz)
+{
+  return sz == key_size::_1 ? 1
+       : sz == key_size::_2 ? 2
+       : sz == key_size::_4 ? 4
+       : sz == key_size::_8 ? 8
+       : sz == key_size::_16
+         ? 16
+         : 4;
+}
+
+_CCCL_API constexpr int size_of(accum_size sz)
+{
+  return sz == accum_size::_1 ? 1
+       : sz == accum_size::_2 ? 2
+       : sz == accum_size::_4 ? 4
+       : sz == accum_size::_8 ? 8
+       : sz == accum_size::_16
+         ? 16
+         : 4;
+}
+
 template <class KeyT,
           class AccumT,
           primitive_op PrimitiveOp,
@@ -913,6 +945,683 @@ struct policy_hub
   };
   using MaxPolicy = Policy1000;
 };
+
+struct reduce_by_key_policy
+{
+  int block_threads;
+  int items_per_thread;
+  BlockLoadAlgorithm load_algorithm;
+  CacheLoadModifier load_modifier;
+  BlockScanAlgorithm scan_algorithm;
+  delay_constructor_policy delay_constructor;
+
+  [[nodiscard]] _CCCL_API constexpr friend bool
+  operator==(const reduce_by_key_policy& lhs, const reduce_by_key_policy& rhs)
+  {
+    return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread
+        && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier
+        && lhs.scan_algorithm == rhs.scan_algorithm && lhs.delay_constructor == rhs.delay_constructor;
+  }
+
+  [[nodiscard]] _CCCL_API constexpr friend bool
+  operator!=(const reduce_by_key_policy& lhs, const reduce_by_key_policy& rhs)
+  {
+    return !(lhs == rhs);
+  }
+
+#if !_CCCL_COMPILER(NVRTC)
+  friend ::std::ostream& operator<<(::std::ostream& os, const reduce_by_key_policy& p)
+  {
+    return os
+        << "reduce_by_key_policy { .block_threads = " << p.block_threads << ", .items_per_thread = "
+        << p.items_per_thread << ", .load_algorithm = " << p.load_algorithm << ", .load_modifier = " << p.load_modifier
+        << ", .scan_algorithm = " << p.scan_algorithm << ", .delay_constructor = " << p.delay_constructor << " }";
+  }
+#endif // !_CCCL_COMPILER(NVRTC)
+};
+
+_CCCL_HOST_DEVICE constexpr reduce_by_key_policy
+make_default_reduce_by_key_policy(int combined_input_bytes, int max_input_bytes, CacheLoadModifier load_mod)
+{
+  constexpr int nominal_4B_items_per_thread = 6;
+  const int items_per_thread =
+    (max_input_bytes <= 8)
+      ? 6
+      : ::cuda::std::clamp(static_cast<int>(::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes)),
+                           1,
+                           nominal_4B_items_per_thread);
+  return reduce_by_key_policy{
+    128,
+    items_per_thread,
+    BLOCK_LOAD_DIRECT,
+    load_mod,
+    BLOCK_SCAN_WARP_SCANS,
+    {delay_constructor_kind::fixed_delay, 350, 450}};
+}
+
+struct policy_selector
+{
+  int key_size;
+  int accum_size;
+  bool is_primitive_key_t;
+  bool is_primitive_accum_t;
+  bool is_primitive_op;
+
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy
+  {
+    const int combined_input_bytes = key_size + accum_size;
+    const int max_input_bytes      = (::cuda::std::max) (key_size, accum_size);
+    const auto default_ldg         = [&] {
+      return make_default_reduce_by_key_policy(combined_input_bytes, max_input_bytes, LOAD_LDG);
+    };
+    const auto default_load_default = [&] {
+      return make_default_reduce_by_key_policy(combined_input_bytes, max_input_bytes, LOAD_DEFAULT);
+    };
+
+    const bool tuned_prim = (is_primitive_key_t && is_primitive_accum_t);
+
+    if (!is_primitive_op)
+    {
+      return default_ldg();
+    }
+
+    if (arch >= ::cuda::arch_id::sm_100 && tuned_prim)
+    {
+      if (key_size == 1 && accum_size == 1)
+      {
+        return {576,
+                13,
+                BLOCK_LOAD_DIRECT,
+                LOAD_CA,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backon_jitter_window, 2044, 240}};
+      }
+      if (key_size == 1 && accum_size == 2)
+      {
+        return {224,
+                10,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff_jitter_window, 224, 390}};
+      }
+      if (key_size == 1 && accum_size == 4)
+      {
+        return {128,
+                14,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 248, 285}};
+      }
+      if (key_size == 1 && accum_size == 8)
+      {
+        return {128,
+                19,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 132, 540}};
+      }
+      if (key_size == 2 && accum_size == 1)
+      {
+        return {128,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 164, 290}};
+      }
+      if (key_size == 2 && accum_size == 2)
+      {
+        return {256,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 180, 975}};
+      }
+      if (key_size == 2 && accum_size == 4)
+      {
+        return {256,
+                11,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 224, 550}};
+      }
+      if (key_size == 2 && accum_size == 8)
+      {
+        return {160,
+                10,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 156, 725}};
+      }
+      if (key_size == 4 && accum_size == 1)
+      {
+        return {224,
+                10,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 324, 285}};
+      }
+      if (key_size == 4 && accum_size == 2)
+      {
+        return {256,
+                11,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backon_jitter_window, 1984, 115}};
+      }
+      if (key_size == 4 && accum_size == 4)
+      {
+        return {224,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backon_jitter_window, 476, 1005}};
+      }
+      if (key_size == 4 && accum_size == 8)
+      {
+        return {256,
+                10,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backon, 1868, 145}};
+      }
+      if (key_size == 8 && accum_size == 1)
+      {
+        return {224,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backon_jitter_window, 1940, 460}};
+      }
+      if (key_size == 8 && accum_size == 2)
+      {
+        return {224,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_CA,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 392, 550}};
+      }
+      if (key_size == 8 && accum_size == 4)
+      {
+        return {224,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 244, 475}};
+      }
+      if (key_size == 8 && accum_size == 8)
+      {
+        return {224,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::exponential_backoff, 196, 340}};
+      }
+    }
+
+    if (arch >= ::cuda::arch_id::sm_90 && tuned_prim)
+    {
+      if (key_size == 1 && accum_size == 1)
+      {
+        return {
+          256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 720}};
+      }
+      if (key_size == 1 && accum_size == 2)
+      {
+        return {
+          320, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 865}};
+      }
+      if (key_size == 1 && accum_size == 4)
+      {
+        return {192,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 735}};
+      }
+      if (key_size == 1 && accum_size == 8)
+      {
+        return {128,
+                13,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 580}};
+      }
+      if (key_size == 1 && accum_size == 16)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1100}};
+      }
+      if (key_size == 2 && accum_size == 1)
+      {
+        return {
+          128, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 985}};
+      }
+      if (key_size == 2 && accum_size == 2)
+      {
+        return {256,
+                11,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 276, 650}};
+      }
+      if (key_size == 2 && accum_size == 4)
+      {
+        return {256,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 240, 765}};
+      }
+      if (key_size == 2 && accum_size == 8)
+      {
+        return {128,
+                19,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1190}};
+      }
+      if (key_size == 2 && accum_size == 16)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1175}};
+      }
+      if (key_size == 4 && accum_size == 1)
+      {
+        return {256,
+                13,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 404, 645}};
+      }
+      if (key_size == 4 && accum_size == 2)
+      {
+        return {256,
+                18,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1160}};
+      }
+      if (key_size == 4 && accum_size == 4)
+      {
+        return {256,
+                18,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1170}};
+      }
+      if (key_size == 4 && accum_size == 8)
+      {
+        return {128,
+                13,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1055}};
+      }
+      if (key_size == 4 && accum_size == 16)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1195}};
+      }
+      if (key_size == 8 && accum_size == 1)
+      {
+        return {
+          256, 10, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1170}};
+      }
+      if (key_size == 8 && accum_size == 2)
+      {
+        return {256,
+                9,
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 236, 1030}};
+      }
+      if (key_size == 8 && accum_size == 4)
+      {
+        return {128,
+                13,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 152, 560}};
+      }
+      if (key_size == 8 && accum_size == 8)
+      {
+        return {128,
+                23,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1030}};
+      }
+      if (key_size == 8 && accum_size == 16)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1125}};
+      }
+      if (key_size == 16 && !is_primitive_key_t && accum_size == 1)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1080}};
+      }
+      if (key_size == 16 && !is_primitive_key_t && accum_size == 2)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 320, 1005}};
+      }
+      if (key_size == 16 && !is_primitive_key_t && accum_size == 4)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::fixed_delay, 232, 1100}};
+      }
+      if (key_size == 16 && !is_primitive_key_t && accum_size == 8)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1195}};
+      }
+      if (key_size == 16 && !is_primitive_key_t && accum_size == 16)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1150}};
+      }
+      return default_load_default();
+    }
+
+    if (arch >= ::cuda::arch_id::sm_86)
+    {
+      return default_ldg();
+    }
+
+    if (arch >= ::cuda::arch_id::sm_80 && tuned_prim)
+    {
+      if (key_size == 1 && accum_size == 1)
+      {
+        return {
+          256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 975}};
+      }
+      if (key_size == 1 && accum_size == 2)
+      {
+        return {
+          224, 12, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 840}};
+      }
+      if (key_size == 1 && accum_size == 4)
+      {
+        return {256,
+                15,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 760}};
+      }
+      if (key_size == 1 && accum_size == 8)
+      {
+        return {
+          224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1070}};
+      }
+      if (key_size == 1 && accum_size == 16)
+      {
+        return {128,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1175}};
+      }
+      if (key_size == 2 && accum_size == 1)
+      {
+        return {
+          256, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}};
+      }
+      if (key_size == 2 && accum_size == 2)
+      {
+        return {224,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 640}};
+      }
+      if (key_size == 2 && accum_size == 4)
+      {
+        return {256,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 905}};
+      }
+      if (key_size == 2 && accum_size == 8)
+      {
+        return {224,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 810}};
+      }
+      if (key_size == 2 && accum_size == 16)
+      {
+        return {160,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1115}};
+      }
+      if (key_size == 4 && accum_size == 1)
+      {
+        return {
+          288, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1110}};
+      }
+      if (key_size == 4 && accum_size == 2)
+      {
+        return {192,
+                15,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1200}};
+      }
+      if (key_size == 4 && accum_size == 4)
+      {
+        return {
+          256, 15, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1110}};
+      }
+      if (key_size == 4 && accum_size == 8)
+      {
+        return {224,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1165}};
+      }
+      if (key_size == 4 && accum_size == 16)
+      {
+        return {160,
+                9,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1100}};
+      }
+      if (key_size == 8 && accum_size == 1)
+      {
+        return {192,
+                10,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1175}};
+      }
+      if (key_size == 8 && accum_size == 2)
+      {
+        return {
+          224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1075}};
+      }
+      if (key_size == 8 && accum_size == 4)
+      {
+        return {
+          384, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1040}};
+      }
+      if (key_size == 8 && accum_size == 8)
+      {
+        return {128,
+                14,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1080}};
+      }
+      if (key_size == 8 && accum_size == 16)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 430}};
+      }
+      if (key_size == 16 && !is_primitive_key_t && accum_size == 1)
+      {
+        return {
+          192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1105}};
+      }
+      if (key_size == 16 && !is_primitive_key_t && accum_size == 2)
+      {
+        return {192,
+                7,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 755}};
+      }
+      if (key_size == 16 && !is_primitive_key_t && accum_size == 4)
+      {
+        return {192,
+                7,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 535}};
+      }
+      if (key_size == 16 && !is_primitive_key_t && accum_size == 8)
+      {
+        return {
+          192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1035}};
+      }
+      if (key_size == 16 && !is_primitive_key_t && accum_size == 16)
+      {
+        return {128,
+                11,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS,
+                {delay_constructor_kind::no_delay, 0, 1090}};
+      }
+      return default_load_default();
+    }
+
+    return default_ldg();
+  }
+};
+
+#if _CCCL_HAS_CONCEPTS()
+template <typename T>
+concept reduce_by_key_policy_selector = detail::policy_selector<T, reduce_by_key_policy>;
+#endif // _CCCL_HAS_CONCEPTS()
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
+template <typename PolicyHub>
+struct policy_selector_from_hub
+{
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> reduce_by_key_policy
+  {
+    using ReduceByKeyPolicyT = typename PolicyHub::MaxPolicy::ReduceByKeyPolicyT;
+    return reduce_by_key_policy{
+      ReduceByKeyPolicyT::BLOCK_THREADS,
+      ReduceByKeyPolicyT::ITEMS_PER_THREAD,
+      ReduceByKeyPolicyT::LOAD_ALGORITHM,
+      ReduceByKeyPolicyT::LOAD_MODIFIER,
+      ReduceByKeyPolicyT::SCAN_ALGORITHM,
+      delay_constructor_policy_from_type<typename ReduceByKeyPolicyT::detail::delay_constructor_t>,
+    };
+  }
+};
+
+template <class ReductionOpT, class AccumT, class KeyT>
+struct policy_selector_from_types
+{
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy
+  {
+    return policy_selector{
+      int{sizeof(KeyT)},
+      int{sizeof(AccumT)},
+      (is_primitive_key<KeyT>() == primitive_key::yes),
+      (is_primitive_accum<AccumT>() == primitive_accum::yes),
+      (is_primitive_op<ReductionOpT>() == primitive_op::yes)}(arch);
+  }
+};
 } // namespace detail::reduce_by_key
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
index 77c393d7643..309a295bf42 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
@@ -18,17 +18,29 @@
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
+#include <cub/detail/delay_constructor.cuh>
 #include <cub/device/dispatch/tuning/common.cuh>
+#include <cub/device/dispatch/tuning/tuning_reduce_by_key.cuh>
 #include <cub/util_device.cuh>
 
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/__device/arch_id.h>
 #include <cuda/std/__algorithm/clamp.h>
 #include <cuda/std/__algorithm/max.h>
 
+#if _CCCL_HAS_CONCEPTS()
+#  include <cuda/std/concepts>
+#endif // _CCCL_HAS_CONCEPTS()
+
+#if !_CCCL_COMPILER(NVRTC)
+#  include <ostream>
+#endif
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail::rle::encode
 {
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce dispatchers
 template <class LengthT,
           class KeyT,
           primitive_length PrimitiveLength = is_primitive_length<LengthT>(),
@@ -89,6 +101,7 @@ struct sm80_tuning<LengthT, __uint128_t, primitive_length::yes, primitive_key::n
 {};
 #endif
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce dispatchers
 template <class LengthT,
           class KeyT,
           primitive_length PrimitiveLength = is_primitive_length<LengthT>(),
@@ -218,7 +231,8 @@ struct sm100_tuning<LengthT, KeyT, primitive_length::yes, primitive_key::yes, le
 // {};
 #endif
 
-// this policy is passed to DispatchReduceByKey
+// TODO(bgruber): remove in CCCL 4.0 when we drop all CUB dispatchers
+// this policy is passed to DispatchStreamingReduceByKey and then to DeviceReduceByKeyKernel
 template <class LengthT, class KeyT>
 struct policy_hub
 {
@@ -294,6 +308,196 @@ struct policy_hub
 
   using MaxPolicy = Policy1000;
 };
+
+// DeviceRunLengthEncode::Encode delegates to reduce by key
+using rle_encode_policy = reduce_by_key::reduce_by_key_policy;
+
+#if _CCCL_HAS_CONCEPTS()
+template <typename T>
+concept rle_encode_policy_selector = reduce_by_key::reduce_by_key_policy_selector<T>;
+#endif // _CCCL_HAS_CONCEPTS()
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the RLE dispatchers
+using reduce_by_key::policy_selector_from_hub;
+
+struct policy_selector
+{
+  length_size length_sz;
+  key_size key_sz;
+  primitive_length prim_len;
+  primitive_key prim_key;
+  int max_input_bytes;
+  int combined_input_bytes;
+
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy
+  {
+    const bool tuned_prim = (prim_len == primitive_length::yes && prim_key == primitive_key::yes);
+    const bool length_4   = (length_sz == length_size::_4);
+
+    if (arch >= ::cuda::arch_id::sm_100 && tuned_prim && length_4)
+    {
+      if (key_sz == key_size::_1)
+      {
+        return rle_encode_policy{
+          256,
+          14,
+          BLOCK_LOAD_DIRECT,
+          LOAD_CA,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::exponential_backon, 468, 300}};
+      }
+      if (key_sz == key_size::_2)
+      {
+        return rle_encode_policy{
+          224,
+          14,
+          BLOCK_LOAD_DIRECT,
+          LOAD_DEFAULT,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::exponential_backon, 376, 420}};
+      }
+      if (key_sz == key_size::_4)
+      {
+        return rle_encode_policy{
+          256,
+          14,
+          BLOCK_LOAD_DIRECT,
+          LOAD_CA,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::exponential_backon, 956, 70}};
+      }
+      if (key_sz == key_size::_8)
+      {
+        return rle_encode_policy{
+          224,
+          9,
+          BLOCK_LOAD_WARP_TRANSPOSE,
+          LOAD_DEFAULT,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::exponential_backoff, 188, 765}};
+      }
+    }
+    if (arch >= ::cuda::arch_id::sm_90 && tuned_prim && length_4)
+    {
+      if (key_sz == key_size::_1)
+      {
+        return rle_encode_policy{
+          256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}};
+      }
+      if (key_sz == key_size::_2)
+      {
+        return rle_encode_policy{
+          128, 22, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 775}};
+      }
+      if (key_sz == key_size::_4)
+      {
+        return rle_encode_policy{
+          192,
+          14,
+          BLOCK_LOAD_WARP_TRANSPOSE,
+          LOAD_DEFAULT,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::fixed_delay, 284, 480}};
+      }
+      if (key_sz == key_size::_8)
+      {
+        return rle_encode_policy{
+          128,
+          19,
+          BLOCK_LOAD_WARP_TRANSPOSE,
+          LOAD_DEFAULT,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::no_delay, 0, 515}};
+      }
+#if _CCCL_HAS_INT128()
+      if (key_sz == key_size::_16)
+      {
+        return rle_encode_policy{
+          128,
+          11,
+          BLOCK_LOAD_WARP_TRANSPOSE,
+          LOAD_DEFAULT,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::fixed_delay, 428, 930}};
+      }
+#endif
+    }
+    if (arch >= ::cuda::arch_id::sm_80 && tuned_prim && length_4)
+    {
+      if (key_sz == key_size::_1)
+      {
+        return rle_encode_policy{
+          256, 14, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 640}};
+      }
+      if (key_sz == key_size::_2)
+      {
+        return rle_encode_policy{
+          256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 900}};
+      }
+      if (key_sz == key_size::_4)
+      {
+        return rle_encode_policy{
+          256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1080}};
+      }
+      if (key_sz == key_size::_8)
+      {
+        return rle_encode_policy{
+          224,
+          9,
+          BLOCK_LOAD_WARP_TRANSPOSE,
+          LOAD_DEFAULT,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::no_delay, 0, 1075}};
+      }
+#if _CCCL_HAS_INT128()
+      if (key_sz == key_size::_16)
+      {
+        return rle_encode_policy{
+          128,
+          7,
+          BLOCK_LOAD_WARP_TRANSPOSE,
+          LOAD_DEFAULT,
+          BLOCK_SCAN_WARP_SCANS,
+          {delay_constructor_kind::no_delay, 0, 630}};
+      }
+#endif
+    }
+    constexpr int nominal_4B_items_per_thread = 6;
+    const int items =
+      (max_input_bytes <= 8)
+        ? 6
+        : ::cuda::std::clamp(
+            ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
+    return rle_encode_policy{
+      128,
+      items,
+      BLOCK_LOAD_DIRECT,
+      LOAD_LDG,
+      BLOCK_SCAN_WARP_SCANS,
+      {delay_constructor_kind::fixed_delay, 350, 450},
+    };
+  }
+};
+
+template <class LengthT, class KeyT>
+struct policy_selector_from_types
+{
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy
+  {
+    constexpr policy_selector selector{
+      classify_length_size<LengthT>(),
+      classify_key_size<KeyT>(),
+      is_primitive_length<LengthT>(),
+      is_primitive_key<KeyT>(),
+      static_cast<int>((::cuda::std::max) (sizeof(LengthT), sizeof(KeyT))),
+      static_cast<int>(sizeof(LengthT) + sizeof(KeyT))};
+    return selector(arch);
+  }
+};
+
+#if _CCCL_HAS_CONCEPTS()
+static_assert(rle_encode_policy_selector<policy_selector>);
+#endif // _CCCL_HAS_CONCEPTS()
 } // namespace detail::rle::encode
 
 CUB_NAMESPACE_END
diff --git a/nvbench_helper/nvbench_helper/look_back_helper.cuh b/nvbench_helper/nvbench_helper/look_back_helper.cuh
index f7a97937dcb..5d31dde94dd 100644
--- a/nvbench_helper/nvbench_helper/look_back_helper.cuh
+++ b/nvbench_helper/nvbench_helper/look_back_helper.cuh
@@ -16,4 +16,8 @@ using delay_constructor_t =
   cub::detail::delay_constructor_t<static_cast<cub::detail::delay_constructor_kind>(TUNE_DELAY_CONSTRUCTOR_ID),
                                    TUNE_MAGIC_NS,
                                    TUNE_L2_WRITE_LATENCY_NS>;
+
+inline constexpr auto delay_constructor_policy = cub::detail::delay_constructor_policy{
+  static_cast<cub::detail::delay_constructor_kind>(TUNE_DELAY_CONSTRUCTOR_ID), TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS};
+
 #endif // !TUNE_BASE

From f045884883ebe6047f48b82803c84b95d5e272c3 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Tue, 24 Feb 2026 23:45:05 +0100
Subject: [PATCH 02/23] annot

---
 cub/benchmarks/bench/reduce/by_key.cu            | 2 +-
 cub/benchmarks/bench/run_length_encode/encode.cu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu
index 41b486aa752..f3603525bf7 100644
--- a/cub/benchmarks/bench/reduce/by_key.cu
+++ b/cub/benchmarks/bench/reduce/by_key.cu
@@ -17,7 +17,7 @@
 #if !TUNE_BASE
 struct bench_reduce_by_key_policy_selector
 {
-  [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const
     -> cub::detail::reduce_by_key::reduce_by_key_policy
   {
     return {
diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu
index da115685082..d00e16acf6e 100644
--- a/cub/benchmarks/bench/run_length_encode/encode.cu
+++ b/cub/benchmarks/bench/run_length_encode/encode.cu
@@ -19,7 +19,7 @@
 #if !TUNE_BASE
 struct bench_encode_policy_selector
 {
-  [[nodiscard]] constexpr auto operator()(::cuda::arch_id /*arch*/) const
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const
     -> cub::detail::reduce_by_key::reduce_by_key_policy
   {
     return {

From 2add7e60dbbf155911c5b7cfd25db955b34e8825 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 13:51:54 +0100
Subject: [PATCH 03/23] Comments

---
 .../dispatch/tuning/tuning_reduce_by_key.cuh  | 48 +++++++++----------
 .../dispatch/tuning/tuning_rle_encode.cuh     | 13 ++---
 2 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index 1dd6e433206..357cba58181 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -38,21 +38,28 @@ CUB_NAMESPACE_BEGIN
 
 namespace detail::reduce_by_key
 {
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 enum class primitive_key
 {
   no,
   yes
 };
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 enum class primitive_accum
 {
   no,
   yes
 };
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 enum class primitive_op
 {
   no,
   yes
 };
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 enum class key_size
 {
   _1,
@@ -62,6 +69,8 @@ enum class key_size
   _16,
   unknown
 };
+
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 enum class accum_size
 {
   _1,
@@ -72,24 +81,28 @@ enum class accum_size
   unknown
 };
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class T>
 _CCCL_API constexpr primitive_key is_primitive_key()
 {
   return detail::is_primitive<T>::value ? primitive_key::yes : primitive_key::no;
 }
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class T>
 _CCCL_API constexpr primitive_accum is_primitive_accum()
 {
   return detail::is_primitive<T>::value ? primitive_accum::yes : primitive_accum::no;
 }
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class ReductionOpT>
 _CCCL_API constexpr primitive_op is_primitive_op()
 {
   return basic_binary_op_t<ReductionOpT>::value ? primitive_op::yes : primitive_op::no;
 }
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class KeyT>
 _CCCL_API constexpr key_size classify_key_size()
 {
@@ -102,6 +115,7 @@ _CCCL_API constexpr key_size classify_key_size()
          : key_size::unknown;
 }
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class AccumT>
 _CCCL_API constexpr accum_size classify_accum_size()
 {
@@ -114,28 +128,7 @@ _CCCL_API constexpr accum_size classify_accum_size()
          : accum_size::unknown;
 }
 
-_CCCL_API constexpr int size_of(key_size sz)
-{
-  return sz == key_size::_1 ? 1
-       : sz == key_size::_2 ? 2
-       : sz == key_size::_4 ? 4
-       : sz == key_size::_8 ? 8
-       : sz == key_size::_16
-         ? 16
-         : 4;
-}
-
-_CCCL_API constexpr int size_of(accum_size sz)
-{
-  return sz == accum_size::_1 ? 1
-       : sz == accum_size::_2 ? 2
-       : sz == accum_size::_4 ? 4
-       : sz == accum_size::_8 ? 8
-       : sz == accum_size::_16
-         ? 16
-         : 4;
-}
-
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class KeyT,
           class AccumT,
           primitive_op PrimitiveOp,
@@ -375,6 +368,7 @@ struct sm80_tuning<KeyT, AccumT, primitive_op::yes, primitive_key::no, primitive
   using delay_constructor                            = detail::no_delay_constructor_t<1090>;
 };
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class KeyT,
           class AccumT,
           primitive_op PrimitiveOp,
@@ -614,6 +608,7 @@ struct sm90_tuning<KeyT, AccumT, primitive_op::yes, primitive_key::no, primitive
   using delay_constructor                            = detail::no_delay_constructor_t<1150>;
 };
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class KeyT,
           class AccumT,
           primitive_op PrimitiveOp,
@@ -865,6 +860,7 @@ struct sm100_tuning<KeyT, AccumT, primitive_op::yes, primitive_key::yes, primiti
 //   using delay_constructor                            = detail::no_delay_constructor_t<1125>;
 // };
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
 template <class ReductionOpT, class AccumT, class KeyT>
 struct policy_hub
 {
@@ -980,6 +976,11 @@ struct reduce_by_key_policy
 #endif // !_CCCL_COMPILER(NVRTC)
 };
 
+#if _CCCL_HAS_CONCEPTS()
+template <typename T>
+concept reduce_by_key_policy_selector = detail::policy_selector<T, reduce_by_key_policy>;
+#endif // _CCCL_HAS_CONCEPTS()
+
 _CCCL_HOST_DEVICE constexpr reduce_by_key_policy
 make_default_reduce_by_key_policy(int combined_input_bytes, int max_input_bytes, CacheLoadModifier load_mod)
 {
@@ -1587,8 +1588,7 @@ struct policy_selector
 };
 
 #if _CCCL_HAS_CONCEPTS()
-template <typename T>
-concept reduce_by_key_policy_selector = detail::policy_selector<T, reduce_by_key_policy>;
+static_assert(reduce_by_key_policy_selector<policy_selector>);
 #endif // _CCCL_HAS_CONCEPTS()
 
 // TODO(bgruber): remove in CCCL 4.0 when we drop the reduce-by-key dispatchers
diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
index 309a295bf42..2fdd3bbbe90 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
@@ -40,7 +40,7 @@ CUB_NAMESPACE_BEGIN
 
 namespace detail::rle::encode
 {
-// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce dispatchers
+// TODO(bgruber): remove in CCCL 4.0 when we drop the CUB dispatchers
 template <class LengthT,
           class KeyT,
           primitive_length PrimitiveLength = is_primitive_length<LengthT>(),
@@ -101,7 +101,7 @@ struct sm80_tuning<LengthT, __uint128_t, primitive_length::yes, primitive_key::n
 {};
 #endif
 
-// TODO(bgruber): remove in CCCL 4.0 when we drop the reduce dispatchers
+// TODO(bgruber): remove in CCCL 4.0 when we drop the CUB dispatchers
 template <class LengthT,
           class KeyT,
           primitive_length PrimitiveLength = is_primitive_length<LengthT>(),
@@ -162,6 +162,7 @@ struct sm90_tuning<LengthT, __uint128_t, primitive_length::yes, primitive_key::n
 {};
 #endif
 
+// TODO(bgruber): remove in CCCL 4.0 when we drop the CUB dispatchers
 template <class LengthT,
           class KeyT,
           primitive_length PrimitiveLength = is_primitive_length<LengthT>(),
@@ -479,6 +480,10 @@ struct policy_selector
   }
 };
 
+#if _CCCL_HAS_CONCEPTS()
+static_assert(rle_encode_policy_selector<policy_selector>);
+#endif // _CCCL_HAS_CONCEPTS()
+
 template <class LengthT, class KeyT>
 struct policy_selector_from_types
 {
@@ -494,10 +499,6 @@ struct policy_selector_from_types
     return selector(arch);
   }
 };
-
-#if _CCCL_HAS_CONCEPTS()
-static_assert(rle_encode_policy_selector<policy_selector>);
-#endif // _CCCL_HAS_CONCEPTS()
 } // namespace detail::rle::encode
 
 CUB_NAMESPACE_END

From 2f3240b7ebe3e3bf785b9d3de79bde773798f035 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 13:55:45 +0100
Subject: [PATCH 04/23] Fix

---
 cub/benchmarks/bench/reduce/by_key.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu
index f3603525bf7..dfe1d9549c4 100644
--- a/cub/benchmarks/bench/reduce/by_key.cu
+++ b/cub/benchmarks/bench/reduce/by_key.cu
@@ -24,7 +24,6 @@ struct bench_reduce_by_key_policy_selector
       TUNE_THREADS,
       TUNE_ITEMS,
       TUNE_TRANSPOSE == 0 ? cub::BLOCK_LOAD_DIRECT : cub::BLOCK_LOAD_WARP_TRANSPOSE,
-      ,
       TUNE_LOAD == 0 ? cub::LOAD_DEFAULT : cub::LOAD_CA,
       cub::BLOCK_SCAN_WARP_SCANS,
       delay_constructor_policy,

From d7b0edc85f4f822d27d1ad072587dcd54506605b Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 14:52:39 +0100
Subject: [PATCH 05/23] MSVC?

---
 cub/cub/device/dispatch/dispatch_reduce_by_key.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 72564332c0e..820d03a6d7b 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -685,7 +685,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
   }
 
   return detail::dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) {
-    constexpr reduce_by_key_policy policy = policy_getter(); // need the constexpr of vsmem_helper
+    static constexpr reduce_by_key_policy policy = policy_getter(); // need the constexpr of vsmem_helper
 
 #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
     NV_IF_TARGET(

From 132e464d8a02ffbc8feb224c4c24c3d9594b882d Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 15:14:11 +0100
Subject: [PATCH 06/23] SASS fixes

---
 .../dispatch/tuning/tuning_reduce_by_key.cuh  | 61 +++++++++----------
 1 file changed, 28 insertions(+), 33 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index 357cba58181..2b3a95b49ff 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -981,49 +981,44 @@ template <typename T>
 concept reduce_by_key_policy_selector = detail::policy_selector<T, reduce_by_key_policy>;
 #endif // _CCCL_HAS_CONCEPTS()
 
-_CCCL_HOST_DEVICE constexpr reduce_by_key_policy
-make_default_reduce_by_key_policy(int combined_input_bytes, int max_input_bytes, CacheLoadModifier load_mod)
-{
-  constexpr int nominal_4B_items_per_thread = 6;
-  const int items_per_thread =
-    (max_input_bytes <= 8)
-      ? 6
-      : ::cuda::std::clamp(static_cast<int>(::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes)),
-                           1,
-                           nominal_4B_items_per_thread);
-  return reduce_by_key_policy{
-    128,
-    items_per_thread,
-    BLOCK_LOAD_DIRECT,
-    load_mod,
-    BLOCK_SCAN_WARP_SCANS,
-    {delay_constructor_kind::fixed_delay, 350, 450}};
-}
-
 struct policy_selector
 {
   int key_size;
   int accum_size;
+
+  // TODO(bgruber): we want to get rid of the following three and just assume by default that types behave "primitive".
+  // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because
+  // then users can opt-out of tunings again
   bool is_primitive_key_t;
   bool is_primitive_accum_t;
   bool is_primitive_op;
 
-  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy
+  _CCCL_API constexpr auto __make_default_reduce_by_key_policy(CacheLoadModifier load_mod) const -> reduce_by_key_policy
   {
-    const int combined_input_bytes = key_size + accum_size;
-    const int max_input_bytes      = (::cuda::std::max) (key_size, accum_size);
-    const auto default_ldg         = [&] {
-      return make_default_reduce_by_key_policy(combined_input_bytes, max_input_bytes, LOAD_LDG);
-    };
-    const auto default_load_default = [&] {
-      return make_default_reduce_by_key_policy(combined_input_bytes, max_input_bytes, LOAD_DEFAULT);
-    };
+    constexpr int nominal_4B_items_per_thread = 6;
+    const int combined_input_bytes            = key_size + accum_size;
+    const int max_input_bytes                 = (::cuda::std::max) (key_size, accum_size);
+    const int items_per_thread =
+      (max_input_bytes <= 8)
+        ? 6
+        : ::cuda::std::clamp(
+            ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
+    return reduce_by_key_policy{
+      128,
+      items_per_thread,
+      BLOCK_LOAD_DIRECT,
+      load_mod,
+      BLOCK_SCAN_WARP_SCANS,
+      default_reduce_by_key_delay_constructor_policy(accum_size, sizeof(int), true)};
+  }
 
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy
+  {
     const bool tuned_prim = (is_primitive_key_t && is_primitive_accum_t);
 
     if (!is_primitive_op)
     {
-      return default_ldg();
+      return __make_default_reduce_by_key_policy(LOAD_LDG);
     }
 
     if (arch >= ::cuda::arch_id::sm_100 && tuned_prim)
@@ -1385,12 +1380,12 @@ struct policy_selector
                 BLOCK_SCAN_WARP_SCANS,
                 {delay_constructor_kind::no_delay, 0, 1150}};
       }
-      return default_load_default();
+      return __make_default_reduce_by_key_policy(LOAD_DEFAULT);
     }
 
     if (arch >= ::cuda::arch_id::sm_86)
     {
-      return default_ldg();
+      return __make_default_reduce_by_key_policy(LOAD_LDG);
     }
 
     if (arch >= ::cuda::arch_id::sm_80 && tuned_prim)
@@ -1580,10 +1575,10 @@ struct policy_selector
                 BLOCK_SCAN_WARP_SCANS,
                 {delay_constructor_kind::no_delay, 0, 1090}};
       }
-      return default_load_default();
+      return __make_default_reduce_by_key_policy(LOAD_DEFAULT);
     }
 
-    return default_ldg();
+    return __make_default_reduce_by_key_policy(LOAD_LDG);
   }
 };
 

From 641f3563a0f65f3963855a5f87da6e1bd7513215 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 15:20:11 +0100
Subject: [PATCH 07/23] const fix

---
 cub/benchmarks/bench/reduce/by_key.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu
index dfe1d9549c4..0c92a820788 100644
--- a/cub/benchmarks/bench/reduce/by_key.cu
+++ b/cub/benchmarks/bench/reduce/by_key.cu
@@ -55,9 +55,9 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
   thrust::device_vector<KeyT> out_keys(elements);
   thrust::device_vector<KeyT> in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size);
 
-  KeyT* d_in_keys         = thrust::raw_pointer_cast(in_keys.data());
+  const KeyT* d_in_keys   = thrust::raw_pointer_cast(in_keys.data());
   KeyT* d_out_keys        = thrust::raw_pointer_cast(out_keys.data());
-  ValueT* d_in_vals       = thrust::raw_pointer_cast(in_vals.data());
+  const ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data());
   ValueT* d_out_vals      = thrust::raw_pointer_cast(out_vals.data());
   OffsetT* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data());
 

From e213ef2a48f29936f255c442df80999ea9abd17d Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 15:47:12 +0100
Subject: [PATCH 08/23] Fix is primitive conditions

---
 cub/benchmarks/bench/reduce/by_key.cu              | 14 ++++----------
 cub/cub/device/dispatch/dispatch_reduce_by_key.cuh |  8 ++++++--
 .../dispatch/tuning/tuning_reduce_by_key.cuh       | 11 ++++++-----
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu
index 0c92a820788..e02945740cc 100644
--- a/cub/benchmarks/bench/reduce/by_key.cu
+++ b/cub/benchmarks/bench/reduce/by_key.cu
@@ -35,15 +35,9 @@ struct bench_reduce_by_key_policy_selector
 template <class KeyT, class ValueT, class OffsetT>
 static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT>)
 {
-  using keys_input_it_t            = const KeyT*;
-  using unique_output_it_t         = KeyT*;
-  using vals_input_it_t            = const ValueT*;
-  using aggregate_output_it_t      = ValueT*;
-  using num_runs_output_iterator_t = OffsetT*;
-  using equality_op_t              = ::cuda::std::equal_to<>;
-  using reduction_op_t             = ::cuda::std::plus<>;
-  using accum_t                    = ValueT;
-  using offset_t                   = OffsetT;
+  using equality_op_t  = ::cuda::std::equal_to<>;
+  using reduction_op_t = ::cuda::std::plus<>;
+  using offset_t       = OffsetT;
 
   const auto elements                    = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   constexpr std::size_t min_segment_size = 1;
@@ -66,7 +60,7 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
   const offset_t num_items = static_cast<offset_t>(elements);
 
   auto dispatch_on_stream = [&](cudaStream_t stream) {
-    return cub::detail::reduce_by_key::dispatch(
+    return cub::detail::reduce_by_key::dispatch</* OverrideAccumT */ ValueT>(
       d_temp_storage,
       temp_storage_bytes,
       d_in_keys,
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 820d03a6d7b..e10d7d24eec 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -21,6 +21,7 @@
 
 #include <cub/agent/agent_reduce_by_key.cuh>
 #include <cub/detail/arch_dispatch.cuh>
+#include <cub/device/dispatch/dispatch_common.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce_by_key.cuh>
 #include <cub/thread/thread_operators.cuh>
@@ -645,6 +646,7 @@ struct DispatchReduceByKey
 namespace detail::reduce_by_key
 {
 template <
+  typename OverrideAccumT = use_default,
   typename KeysInputIteratorT,
   typename UniqueOutputIteratorT,
   typename ValuesInputIteratorT,
@@ -653,8 +655,10 @@ template <
   typename EqualityOpT,
   typename ReductionOpT,
   typename OffsetT,
-  typename AccumT =
-    ::cuda::std::__accumulator_t<ReductionOpT, it_value_t<ValuesInputIteratorT>, it_value_t<ValuesInputIteratorT>>,
+  typename AccumT = ::cuda::std::conditional_t<
+    !::cuda::std::is_same_v<OverrideAccumT, use_default>,
+    OverrideAccumT,
+    ::cuda::std::__accumulator_t<ReductionOpT, it_value_t<ValuesInputIteratorT>, it_value_t<ValuesInputIteratorT>>>,
   typename KeyT           = non_void_value_t<UniqueOutputIteratorT, it_value_t<KeysInputIteratorT>>,
   typename PolicySelector = policy_selector_from_types<ReductionOpT, AccumT, KeyT>>
 #if _CCCL_HAS_CONCEPTS()
diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index 2b3a95b49ff..d31810c9158 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -1014,14 +1014,15 @@ struct policy_selector
 
   [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy
   {
-    const bool tuned_prim = (is_primitive_key_t && is_primitive_accum_t);
-
+    // bail out if we don't know the operation. TODO(bgruber): drop this check when we make the tuning API public
     if (!is_primitive_op)
     {
       return __make_default_reduce_by_key_policy(LOAD_LDG);
     }
 
-    if (arch >= ::cuda::arch_id::sm_100 && tuned_prim)
+    const bool use_tuning = (is_primitive_key_t || key_size == 16) && (is_primitive_accum_t || accum_size == 16);
+
+    if (arch >= ::cuda::arch_id::sm_100 && use_tuning)
     {
       if (key_size == 1 && accum_size == 1)
       {
@@ -1169,7 +1170,7 @@ struct policy_selector
       }
     }
 
-    if (arch >= ::cuda::arch_id::sm_90 && tuned_prim)
+    if (arch >= ::cuda::arch_id::sm_90 && use_tuning)
     {
       if (key_size == 1 && accum_size == 1)
       {
@@ -1388,7 +1389,7 @@ struct policy_selector
       return __make_default_reduce_by_key_policy(LOAD_LDG);
     }
 
-    if (arch >= ::cuda::arch_id::sm_80 && tuned_prim)
+    if (arch >= ::cuda::arch_id::sm_80 && use_tuning)
     {
       if (key_size == 1 && accum_size == 1)
       {

From 8bd1b347efee4326f1576d6c3f557fbb194e34c3 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 16:24:44 +0100
Subject: [PATCH 09/23] Refactor

---
 cub/cub/device/dispatch/dispatch_reduce_by_key.cuh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index e10d7d24eec..321cd2536f5 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -680,7 +680,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
 {
   using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet
   using ScanTileStateT      = ReduceByKeyScanTileState<AccumT, OffsetT>;
-  [[maybe_unused]] static constexpr int INIT_KERNEL_THREADS = 128;
+  [[maybe_unused]] static constexpr int init_kernel_threads = 128;
 
   ::cuda::arch_id arch_id{};
   if (const auto error = CubDebug(ptx_arch_id(arch_id)))
@@ -758,12 +758,12 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
       return error;
     }
 
-    const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS));
+    const int init_grid_size = ::cuda::std::max(1, ::cuda::ceil_div(num_tiles, init_kernel_threads));
 #ifdef CUB_DEBUG_LOG
-    _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+    _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
 #endif
     if (const auto error = CubDebug(
-          THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+          THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_kernel_threads, 0, stream)
             .doit(detail::scan::DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
                   tile_state,
                   num_tiles,
@@ -793,6 +793,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
       OffsetT,
       AccumT,
       streaming_context_t>;
+
     int reduce_by_key_sm_occupancy{};
     if (const auto error = CubDebug(MaxSmOccupancy(reduce_by_key_sm_occupancy, reduce_by_key_kernel, block_threads)))
     {

From ef27cd4d32f97bffcf3a17fc862131d2e9d7add1 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 18:33:51 +0100
Subject: [PATCH 10/23] More SASS fixes

---
 cub/cub/detail/delay_constructor.cuh          |  20 +-
 .../dispatch/tuning/tuning_reduce_by_key.cuh  | 839 +++++++++---------
 .../tuning/tuning_rle_non_trivial_runs.cuh    |  11 +-
 3 files changed, 457 insertions(+), 413 deletions(-)

diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh
index 23d0c6d571e..9d939d4563f 100644
--- a/cub/cub/detail/delay_constructor.cuh
+++ b/cub/cub/detail/delay_constructor.cuh
@@ -193,15 +193,27 @@ struct delay_constructor_for<delay_constructor_kind::reduce_by_key, Delay, L2Wri
 template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
 using delay_constructor_t = typename delay_constructor_for<Kind, Delay, L2WriteLatency>::type;
 
-_CCCL_API constexpr auto
-default_reduce_by_key_delay_constructor_policy(int key_size, int value_size, bool value_is_primitive)
+_CCCL_API constexpr auto default_delay_constructor_policy(bool is_primitive_or_trivially_copyable)
 {
-  if (value_is_primitive && (value_size + key_size < 16))
+  if (is_primitive_or_trivially_copyable)
   {
-    return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450};
+    return delay_constructor_policy{delay_constructor_kind::fixed_delay, 350, 450};
   }
   return delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450};
 }
+
+_CCCL_API constexpr auto default_reduce_by_key_delay_constructor_policy(
+  int key_size,
+  int value_size,
+  bool key_is_primitive_or_trivially_copyable,
+  bool value_is_primitive_or_trivially_copyable)
+{
+  if (value_is_primitive_or_trivially_copyable && (value_size + key_size < 16))
+  {
+    return delay_constructor_policy{delay_constructor_kind::reduce_by_key, 350, 450};
+  }
+  return default_delay_constructor_policy(key_is_primitive_or_trivially_copyable);
+}
 } // namespace detail
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index d31810c9158..9ec45f56906 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -986,12 +986,13 @@ struct policy_selector
   int key_size;
   int accum_size;
 
-  // TODO(bgruber): we want to get rid of the following three and just assume by default that types behave "primitive".
+  // TODO(bgruber): we want to get rid of the following four and just assume by default that types behave "primitive".
   // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because
   // then users can opt-out of tunings again
-  bool is_primitive_key_t;
-  bool is_primitive_accum_t;
-  bool is_primitive_op;
+  bool key_is_primitive;
+  bool key_is_trivially_copyable;
+  bool accum_is_primitive;
+  bool op_is_primitive;
 
   _CCCL_API constexpr auto __make_default_reduce_by_key_policy(CacheLoadModifier load_mod) const -> reduce_by_key_policy
   {
@@ -1009,20 +1010,21 @@ struct policy_selector
       BLOCK_LOAD_DIRECT,
       load_mod,
       BLOCK_SCAN_WARP_SCANS,
-      default_reduce_by_key_delay_constructor_policy(accum_size, sizeof(int), true)};
+      default_reduce_by_key_delay_constructor_policy(
+        accum_size, sizeof(int), key_is_primitive || key_is_trivially_copyable, true)};
   }
 
   [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> reduce_by_key_policy
   {
     // bail out if we don't know the operation. TODO(bgruber): drop this check when we make the tuning API public
-    if (!is_primitive_op)
+    if (!op_is_primitive)
     {
       return __make_default_reduce_by_key_policy(LOAD_LDG);
     }
 
-    const bool use_tuning = (is_primitive_key_t || key_size == 16) && (is_primitive_accum_t || accum_size == 16);
+    const bool use_tuning = (key_is_primitive || key_size == 16) && (accum_is_primitive || accum_size == 16);
 
-    if (arch >= ::cuda::arch_id::sm_100 && use_tuning)
+    if (arch >= ::cuda::arch_id::sm_100 && use_tuning) // if we don't have a tuning, fall back to SM90
     {
       if (key_size == 1 && accum_size == 1)
       {
@@ -1170,217 +1172,226 @@ struct policy_selector
       }
     }
 
-    if (arch >= ::cuda::arch_id::sm_90 && use_tuning)
+    if (arch >= ::cuda::arch_id::sm_90)
     {
-      if (key_size == 1 && accum_size == 1)
-      {
-        return {
-          256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 720}};
-      }
-      if (key_size == 1 && accum_size == 2)
-      {
-        return {
-          320, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 865}};
-      }
-      if (key_size == 1 && accum_size == 4)
-      {
-        return {192,
-                14,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 735}};
-      }
-      if (key_size == 1 && accum_size == 8)
-      {
-        return {128,
-                13,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 580}};
-      }
-      if (key_size == 1 && accum_size == 16)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1100}};
-      }
-      if (key_size == 2 && accum_size == 1)
-      {
-        return {
-          128, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 985}};
-      }
-      if (key_size == 2 && accum_size == 2)
-      {
-        return {256,
-                11,
-                BLOCK_LOAD_DIRECT,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::fixed_delay, 276, 650}};
-      }
-      if (key_size == 2 && accum_size == 4)
-      {
-        return {256,
-                14,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::fixed_delay, 240, 765}};
-      }
-      if (key_size == 2 && accum_size == 8)
-      {
-        return {128,
-                19,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1190}};
-      }
-      if (key_size == 2 && accum_size == 16)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1175}};
-      }
-      if (key_size == 4 && accum_size == 1)
-      {
-        return {256,
-                13,
-                BLOCK_LOAD_DIRECT,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::fixed_delay, 404, 645}};
-      }
-      if (key_size == 4 && accum_size == 2)
-      {
-        return {256,
-                18,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1160}};
-      }
-      if (key_size == 4 && accum_size == 4)
-      {
-        return {256,
-                18,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1170}};
-      }
-      if (key_size == 4 && accum_size == 8)
-      {
-        return {128,
-                13,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1055}};
-      }
-      if (key_size == 4 && accum_size == 16)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1195}};
-      }
-      if (key_size == 8 && accum_size == 1)
-      {
-        return {
-          256, 10, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1170}};
-      }
-      if (key_size == 8 && accum_size == 2)
-      {
-        return {256,
-                9,
-                BLOCK_LOAD_DIRECT,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::fixed_delay, 236, 1030}};
-      }
-      if (key_size == 8 && accum_size == 4)
-      {
-        return {128,
-                13,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::fixed_delay, 152, 560}};
-      }
-      if (key_size == 8 && accum_size == 8)
-      {
-        return {128,
-                23,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1030}};
-      }
-      if (key_size == 8 && accum_size == 16)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1125}};
-      }
-      if (key_size == 16 && !is_primitive_key_t && accum_size == 1)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1080}};
-      }
-      if (key_size == 16 && !is_primitive_key_t && accum_size == 2)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::fixed_delay, 320, 1005}};
-      }
-      if (key_size == 16 && !is_primitive_key_t && accum_size == 4)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::fixed_delay, 232, 1100}};
-      }
-      if (key_size == 16 && !is_primitive_key_t && accum_size == 8)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1195}};
-      }
-      if (key_size == 16 && !is_primitive_key_t && accum_size == 16)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1150}};
-      }
+      if (use_tuning)
+      {
+        if (key_size == 1 && accum_size == 1)
+        {
+          return {
+            256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 720}};
+        }
+        if (key_size == 1 && accum_size == 2)
+        {
+          return {
+            320, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 865}};
+        }
+        if (key_size == 1 && accum_size == 4)
+        {
+          return {192,
+                  14,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 735}};
+        }
+        if (key_size == 1 && accum_size == 8)
+        {
+          return {128,
+                  13,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 580}};
+        }
+        if (key_size == 1 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1100}};
+        }
+        if (key_size == 2 && accum_size == 1)
+        {
+          return {
+            128, 23, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 985}};
+        }
+        if (key_size == 2 && accum_size == 2)
+        {
+          return {256,
+                  11,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 276, 650}};
+        }
+        if (key_size == 2 && accum_size == 4)
+        {
+          return {256,
+                  14,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 240, 765}};
+        }
+        if (key_size == 2 && accum_size == 8)
+        {
+          return {128,
+                  19,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1190}};
+        }
+        if (key_size == 2 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1175}};
+        }
+        if (key_size == 4 && accum_size == 1)
+        {
+          return {256,
+                  13,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 404, 645}};
+        }
+        if (key_size == 4 && accum_size == 2)
+        {
+          return {256,
+                  18,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1160}};
+        }
+        if (key_size == 4 && accum_size == 4)
+        {
+          return {256,
+                  18,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1170}};
+        }
+        if (key_size == 4 && accum_size == 8)
+        {
+          return {128,
+                  13,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1055}};
+        }
+        if (key_size == 4 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1195}};
+        }
+        if (key_size == 8 && accum_size == 1)
+        {
+          return {256,
+                  10,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1170}};
+        }
+        if (key_size == 8 && accum_size == 2)
+        {
+          return {256,
+                  9,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 236, 1030}};
+        }
+        if (key_size == 8 && accum_size == 4)
+        {
+          return {128,
+                  13,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 152, 560}};
+        }
+        if (key_size == 8 && accum_size == 8)
+        {
+          return {128,
+                  23,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1030}};
+        }
+        if (key_size == 8 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1125}};
+        }
+        if (key_size == 16 && accum_size == 1)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1080}};
+        }
+        if (key_size == 16 && accum_size == 2)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 320, 1005}};
+        }
+        if (key_size == 16 && accum_size == 4)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::fixed_delay, 232, 1100}};
+        }
+        if (key_size == 16 && accum_size == 8)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1195}};
+        }
+        if (key_size == 16 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1150}};
+        }
+      }
+
+      // no tuning, use a default one
       return __make_default_reduce_by_key_policy(LOAD_DEFAULT);
     }
 
@@ -1389,193 +1400,206 @@ struct policy_selector
       return __make_default_reduce_by_key_policy(LOAD_LDG);
     }
 
-    if (arch >= ::cuda::arch_id::sm_80 && use_tuning)
+    if (arch >= ::cuda::arch_id::sm_80)
     {
-      if (key_size == 1 && accum_size == 1)
-      {
-        return {
-          256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 975}};
-      }
-      if (key_size == 1 && accum_size == 2)
-      {
-        return {
-          224, 12, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 840}};
-      }
-      if (key_size == 1 && accum_size == 4)
-      {
-        return {256,
-                15,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 760}};
-      }
-      if (key_size == 1 && accum_size == 8)
-      {
-        return {
-          224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1070}};
-      }
-      if (key_size == 1 && accum_size == 16)
-      {
-        return {128,
-                9,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1175}};
-      }
-      if (key_size == 2 && accum_size == 1)
-      {
-        return {
-          256, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}};
-      }
-      if (key_size == 2 && accum_size == 2)
-      {
-        return {224,
-                14,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 640}};
-      }
-      if (key_size == 2 && accum_size == 4)
-      {
-        return {256,
-                14,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 905}};
-      }
-      if (key_size == 2 && accum_size == 8)
-      {
-        return {224,
-                9,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 810}};
-      }
-      if (key_size == 2 && accum_size == 16)
-      {
-        return {160,
-                9,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1115}};
-      }
-      if (key_size == 4 && accum_size == 1)
-      {
-        return {
-          288, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1110}};
-      }
-      if (key_size == 4 && accum_size == 2)
-      {
-        return {192,
-                15,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1200}};
-      }
-      if (key_size == 4 && accum_size == 4)
-      {
-        return {
-          256, 15, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1110}};
-      }
-      if (key_size == 4 && accum_size == 8)
-      {
-        return {224,
-                9,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1165}};
-      }
-      if (key_size == 4 && accum_size == 16)
-      {
-        return {160,
-                9,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1100}};
-      }
-      if (key_size == 8 && accum_size == 1)
-      {
-        return {192,
-                10,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1175}};
-      }
-      if (key_size == 8 && accum_size == 2)
-      {
-        return {
-          224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1075}};
-      }
-      if (key_size == 8 && accum_size == 4)
-      {
-        return {
-          384, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1040}};
-      }
-      if (key_size == 8 && accum_size == 8)
-      {
-        return {128,
-                14,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1080}};
-      }
-      if (key_size == 8 && accum_size == 16)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 430}};
-      }
-      if (key_size == 16 && !is_primitive_key_t && accum_size == 1)
-      {
-        return {
-          192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1105}};
-      }
-      if (key_size == 16 && !is_primitive_key_t && accum_size == 2)
-      {
-        return {192,
-                7,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 755}};
-      }
-      if (key_size == 16 && !is_primitive_key_t && accum_size == 4)
-      {
-        return {192,
-                7,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 535}};
-      }
-      if (key_size == 16 && !is_primitive_key_t && accum_size == 8)
-      {
-        return {
-          192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1035}};
-      }
-      if (key_size == 16 && !is_primitive_key_t && accum_size == 16)
-      {
-        return {128,
-                11,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS,
-                {delay_constructor_kind::no_delay, 0, 1090}};
-      }
+      if (use_tuning)
+      {
+        if (key_size == 1 && accum_size == 1)
+        {
+          return {
+            256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 975}};
+        }
+        if (key_size == 1 && accum_size == 2)
+        {
+          return {
+            224, 12, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 840}};
+        }
+        if (key_size == 1 && accum_size == 4)
+        {
+          return {256,
+                  15,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 760}};
+        }
+        if (key_size == 1 && accum_size == 8)
+        {
+          return {
+            224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1070}};
+        }
+        if (key_size == 1 && accum_size == 16)
+        {
+          return {128,
+                  9,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1175}};
+        }
+        if (key_size == 2 && accum_size == 1)
+        {
+          return {
+            256, 11, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}};
+        }
+        if (key_size == 2 && accum_size == 2)
+        {
+          return {224,
+                  14,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 640}};
+        }
+        if (key_size == 2 && accum_size == 4)
+        {
+          return {256,
+                  14,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 905}};
+        }
+        if (key_size == 2 && accum_size == 8)
+        {
+          return {224,
+                  9,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 810}};
+        }
+        if (key_size == 2 && accum_size == 16)
+        {
+          return {160,
+                  9,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1115}};
+        }
+        if (key_size == 4 && accum_size == 1)
+        {
+          return {288,
+                  11,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1110}};
+        }
+        if (key_size == 4 && accum_size == 2)
+        {
+          return {192,
+                  15,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1200}};
+        }
+        if (key_size == 4 && accum_size == 4)
+        {
+          return {256,
+                  15,
+                  BLOCK_LOAD_DIRECT,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1110}};
+        }
+        if (key_size == 4 && accum_size == 8)
+        {
+          return {224,
+                  9,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1165}};
+        }
+        if (key_size == 4 && accum_size == 16)
+        {
+          return {160,
+                  9,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1100}};
+        }
+        if (key_size == 8 && accum_size == 1)
+        {
+          return {192,
+                  10,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1175}};
+        }
+        if (key_size == 8 && accum_size == 2)
+        {
+          return {
+            224, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1075}};
+        }
+        if (key_size == 8 && accum_size == 4)
+        {
+          return {
+            384, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1040}};
+        }
+        if (key_size == 8 && accum_size == 8)
+        {
+          return {128,
+                  14,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1080}};
+        }
+        if (key_size == 8 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 430}};
+        }
+        if (key_size == 16 && accum_size == 1)
+        {
+          return {
+            192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1105}};
+        }
+        if (key_size == 16 && accum_size == 2)
+        {
+          return {192,
+                  7,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 755}};
+        }
+        if (key_size == 16 && accum_size == 4)
+        {
+          return {192,
+                  7,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 535}};
+        }
+        if (key_size == 16 && accum_size == 8)
+        {
+          return {
+            192, 7, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1035}};
+        }
+        if (key_size == 16 && accum_size == 16)
+        {
+          return {128,
+                  11,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_SCAN_WARP_SCANS,
+                  {delay_constructor_kind::no_delay, 0, 1090}};
+        }
+      }
+
+      // no tuning, use a default one
       return __make_default_reduce_by_key_policy(LOAD_DEFAULT);
     }
 
@@ -1613,9 +1637,10 @@ struct policy_selector_from_types
     return policy_selector{
       int{sizeof(KeyT)},
       int{sizeof(AccumT)},
-      (is_primitive_key<KeyT>() == primitive_key::yes),
-      (is_primitive_accum<AccumT>() == primitive_accum::yes),
-      (is_primitive_op<ReductionOpT>() == primitive_op::yes)}(arch);
+      is_primitive_v<KeyT>,
+      ::cuda::std::is_trivially_copyable_v<KeyT>,
+      is_primitive_v<AccumT>,
+      basic_binary_op_t<ReductionOpT>::value}(arch);
   }
 };
 } // namespace detail::reduce_by_key
diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh
index 3ff351a2bd4..97f1d9a03e6 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh
@@ -374,6 +374,7 @@ struct policy_selector
   type_t key_type;
   bool length_is_primitive;
   bool key_is_primitive; // TODO(bgruber): can probably be derived from key_type
+  bool key_is_trivially_copyable;
 
   _CCCL_API constexpr auto
   make_default_policy(BlockLoadAlgorithm block_load_alg, int delay_ctor_key_size, CacheLoadModifier load_mod) const
@@ -388,7 +389,8 @@ struct policy_selector
       load_mod,
       true,
       BLOCK_SCAN_WARP_SCANS,
-      default_reduce_by_key_delay_constructor_policy(delay_ctor_key_size, sizeof(int), true)};
+      default_reduce_by_key_delay_constructor_policy(
+        delay_ctor_key_size, sizeof(int), key_is_primitive || key_is_trivially_copyable, true)};
   }
 
   [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy
@@ -600,7 +602,12 @@ struct policy_selector_from_types
   [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_non_trivial_runs_policy
   {
     constexpr policy_selector selector{
-      sizeof(LengthT), int{sizeof(KeyT)}, classify_type<KeyT>, is_primitive_v<LengthT>, is_primitive_v<KeyT>};
+      sizeof(LengthT),
+      int{sizeof(KeyT)},
+      classify_type<KeyT>,
+      is_primitive_v<LengthT>,
+      is_primitive_v<KeyT>,
+      ::cuda::std::is_trivially_copyable_v<KeyT>};
     return selector(arch);
   }
 };

From 74f0d052887956f3e36278d5708074fea6e008c1 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 21:57:16 +0100
Subject: [PATCH 11/23] comments and fix

---
 .../dispatch/tuning/tuning_reduce_by_key.cuh  | 25 ++++++++++++++++---
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index 9ec45f56906..c1baddd6f7f 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -17,8 +17,8 @@
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
-#include <cub/block/block_store.cuh>
 #include <cub/detail/delay_constructor.cuh>
+#include <cub/device/dispatch/tuning/common.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
@@ -762,7 +762,6 @@ template <class KeyT, class AccumT>
 struct sm100_tuning<KeyT, AccumT, primitive_op::yes, primitive_key::yes, primitive_accum::yes, key_size::_4, accum_size::_4>
 {
   // ipt_14.tpb_224.trp_1.ld_0.ns_476.dcid_5.l2w_1005 1.187378  1.119705  1.185397  1.258420
-
   static constexpr int items                         = 14;
   static constexpr int threads                       = 224;
   static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
@@ -923,7 +922,7 @@ struct policy_hub
 
   struct Policy1000 : ChainedPolicy<1000, Policy1000, Policy900>
   {
-    // Use values from tuning if a specialization exists, otherwise pick the default
+    // Use values from tuning if a specialization exists, otherwise fall back to SM90
     template <typename Tuning>
     static auto select_agent_policy(int)
       -> AgentReduceByKeyPolicy<Tuning::threads,
@@ -985,6 +984,7 @@ struct policy_selector
 {
   int key_size;
   int accum_size;
+  type_t accum_t;
 
   // TODO(bgruber): we want to get rid of the following four and just assume by default that types behave "primitive".
   // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because
@@ -1028,6 +1028,7 @@ struct policy_selector
     {
       if (key_size == 1 && accum_size == 1)
       {
+        // ipt_13.tpb_576.trp_0.ld_1.ns_2044.dcid_5.l2w_240 1.161888  0.848558  1.134941  1.299109
         return {576,
                 13,
                 BLOCK_LOAD_DIRECT,
@@ -1037,6 +1038,7 @@ struct policy_selector
       }
       if (key_size == 1 && accum_size == 2)
       {
+        // ipt_10.tpb_224.trp_0.ld_0.ns_244.dcid_4.l2w_390 1.313932  1.260540  1.319588  1.427374
         return {224,
                 10,
                 BLOCK_LOAD_DIRECT,
@@ -1046,6 +1048,7 @@ struct policy_selector
       }
       if (key_size == 1 && accum_size == 4)
       {
+        // ipt_14.tpb_128.trp_0.ld_0.ns_248.dcid_2.l2w_285  1.118109  1.051534  1.134336  1.326788
         return {128,
                 14,
                 BLOCK_LOAD_DIRECT,
@@ -1055,6 +1058,7 @@ struct policy_selector
       }
       if (key_size == 1 && accum_size == 8)
       {
+        // ipt_19.tpb_128.trp_1.ld_0.ns_132.dcid_1.l2w_540 1.113820  1.002404  1.105014  1.202296
         return {128,
                 19,
                 BLOCK_LOAD_WARP_TRANSPOSE,
@@ -1064,6 +1068,7 @@ struct policy_selector
       }
       if (key_size == 2 && accum_size == 1)
       {
+        // ipt_14.tpb_128.trp_1.ld_0.ns_164.dcid_2.l2w_290 1.239579  1.119705  1.239111  1.313112
         return {128,
                 14,
                 BLOCK_LOAD_WARP_TRANSPOSE,
@@ -1073,6 +1078,7 @@ struct policy_selector
       }
       if (key_size == 2 && accum_size == 2)
       {
+        // ipt_14.tpb_256.trp_1.ld_0.ns_180.dcid_2.l2w_975 1.145635  1.012658  1.139956  1.251546
         return {256,
                 14,
                 BLOCK_LOAD_WARP_TRANSPOSE,
@@ -1080,8 +1086,9 @@ struct policy_selector
                 BLOCK_SCAN_WARP_SCANS,
                 {delay_constructor_kind::exponential_backoff, 180, 975}};
       }
-      if (key_size == 2 && accum_size == 4)
+      if (key_size == 2 && accum_size == 4 && accum_t != type_t::float32) // I16, F32, I32 regressed, fall back to SM90
       {
+        // ipt_11.tpb_256.trp_0.ld_0.ns_224.dcid_2.l2w_550 1.066293  1.000109  1.073092  1.181818
         return {256,
                 11,
                 BLOCK_LOAD_DIRECT,
@@ -1091,6 +1098,7 @@ struct policy_selector
       }
       if (key_size == 2 && accum_size == 8)
       {
+        // ipt_10.tpb_160.trp_1.ld_0.ns_156.dcid_1.l2w_725 1.045007  1.002105  1.049690  1.141827
         return {160,
                 10,
                 BLOCK_LOAD_WARP_TRANSPOSE,
@@ -1100,6 +1108,7 @@ struct policy_selector
       }
       if (key_size == 4 && accum_size == 1)
       {
+        // ipt_10.tpb_224.trp_0.ld_0.ns_324.dcid_2.l2w_285 1.157217  1.073724  1.166510  1.356940
         return {224,
                 10,
                 BLOCK_LOAD_DIRECT,
@@ -1109,6 +1118,7 @@ struct policy_selector
       }
       if (key_size == 4 && accum_size == 2)
       {
+        // ipt_11.tpb_256.trp_0.ld_0.ns_1984.dcid_5.l2w_115 1.214155  1.128842  1.214093  1.364476
         return {256,
                 11,
                 BLOCK_LOAD_DIRECT,
@@ -1118,6 +1128,7 @@ struct policy_selector
       }
       if (key_size == 4 && accum_size == 4)
       {
+        // ipt_14.tpb_224.trp_1.ld_0.ns_476.dcid_5.l2w_1005 1.187378  1.119705  1.185397  1.258420
         return {224,
                 14,
                 BLOCK_LOAD_WARP_TRANSPOSE,
@@ -1127,6 +1138,7 @@ struct policy_selector
       }
       if (key_size == 4 && accum_size == 8)
       {
+        // ipt_10.tpb_256.trp_1.ld_0.ns_1868.dcid_7.l2w_145 1.142915  1.020581  1.137459  1.237913
         return {256,
                 10,
                 BLOCK_LOAD_WARP_TRANSPOSE,
@@ -1136,6 +1148,7 @@ struct policy_selector
       }
       if (key_size == 8 && accum_size == 1)
       {
+        // ipt_9.tpb_224.trp_1.ld_0.ns_1940.dcid_5.l2w_460 1.157294  1.075650  1.153566  1.250729
         return {224,
                 9,
                 BLOCK_LOAD_WARP_TRANSPOSE,
@@ -1145,6 +1158,7 @@ struct policy_selector
       }
       if (key_size == 8 && accum_size == 2)
       {
+        // ipt_11.tpb_224.trp_1.ld_1.ns_392.dcid_2.l2w_550 1.104034  1.007212  1.099543  1.220401
         return {224,
                 11,
                 BLOCK_LOAD_WARP_TRANSPOSE,
@@ -1154,6 +1168,7 @@ struct policy_selector
       }
       if (key_size == 8 && accum_size == 4)
       {
+        // ipt_9.tpb_224.trp_1.ld_0.ns_244.dcid_2.l2w_475 1.130098  1.000000  1.130661  1.215722
         return {224,
                 9,
                 BLOCK_LOAD_WARP_TRANSPOSE,
@@ -1163,6 +1178,7 @@ struct policy_selector
       }
       if (key_size == 8 && accum_size == 8)
       {
+        // ipt_9.tpb_224.trp_1.ld_0.ns_196.dcid_2.l2w_340 1.272056  1.142857  1.262499  1.352941
         return {224,
                 9,
                 BLOCK_LOAD_WARP_TRANSPOSE,
@@ -1637,6 +1653,7 @@ struct policy_selector_from_types
     return policy_selector{
       int{sizeof(KeyT)},
       int{sizeof(AccumT)},
+      classify_type<AccumT>,
       is_primitive_v<KeyT>,
       ::cuda::std::is_trivially_copyable_v<KeyT>,
       is_primitive_v<AccumT>,

From bfbdc85e9eadf9917afde789259ac606d2d9d2f9 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 23:32:48 +0100
Subject: [PATCH 12/23] RLE fixdes

---
 .../dispatch/tuning/tuning_reduce_by_key.cuh  |  13 +-
 .../dispatch/tuning/tuning_rle_encode.cuh     | 241 ++++++++++--------
 2 files changed, 141 insertions(+), 113 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index c1baddd6f7f..c2230e9ef9b 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -994,7 +994,7 @@ struct policy_selector
   bool accum_is_primitive;
   bool op_is_primitive;
 
-  _CCCL_API constexpr auto __make_default_reduce_by_key_policy(CacheLoadModifier load_mod) const -> reduce_by_key_policy
+  _CCCL_API constexpr auto __make_default_policy(CacheLoadModifier load_mod) const -> reduce_by_key_policy
   {
     constexpr int nominal_4B_items_per_thread = 6;
     const int combined_input_bytes            = key_size + accum_size;
@@ -1019,7 +1019,7 @@ struct policy_selector
     // bail out if we don't know the operation. TODO(bgruber): drop this check when we make the tuning API public
     if (!op_is_primitive)
     {
-      return __make_default_reduce_by_key_policy(LOAD_LDG);
+      return __make_default_policy(LOAD_LDG);
     }
 
     const bool use_tuning = (key_is_primitive || key_size == 16) && (accum_is_primitive || accum_size == 16);
@@ -1408,12 +1408,12 @@ struct policy_selector
       }
 
       // no tuning, use a default one
-      return __make_default_reduce_by_key_policy(LOAD_DEFAULT);
+      return __make_default_policy(LOAD_DEFAULT);
     }
 
     if (arch >= ::cuda::arch_id::sm_86)
     {
-      return __make_default_reduce_by_key_policy(LOAD_LDG);
+      return __make_default_policy(LOAD_LDG);
     }
 
     if (arch >= ::cuda::arch_id::sm_80)
@@ -1616,10 +1616,11 @@ struct policy_selector
       }
 
       // no tuning, use a default one
-      return __make_default_reduce_by_key_policy(LOAD_DEFAULT);
+      return __make_default_policy(LOAD_DEFAULT);
     }
 
-    return __make_default_reduce_by_key_policy(LOAD_LDG);
+    // for SM50
+    return __make_default_policy(LOAD_LDG);
   }
 };
 
diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
index 2fdd3bbbe90..14f9b58dea9 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
@@ -323,21 +323,38 @@ using reduce_by_key::policy_selector_from_hub;
 
 struct policy_selector
 {
-  length_size length_sz;
-  key_size key_sz;
-  primitive_length prim_len;
-  primitive_key prim_key;
+  int length_size;
+  int key_size;
+  type_t key_t;
+  bool length_is_primitive;
+  bool key_is_primitive;
   int max_input_bytes;
   int combined_input_bytes;
 
-  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy
+  _CCCL_API constexpr auto __make_default_policy(CacheLoadModifier load_mod) const -> rle_encode_policy
   {
-    const bool tuned_prim = (prim_len == primitive_length::yes && prim_key == primitive_key::yes);
-    const bool length_4   = (length_sz == length_size::_4);
+    constexpr int nominal_4B_items_per_thread = 6;
+    const int items_per_thread =
+      (max_input_bytes <= 8)
+        ? 6
+        : ::cuda::std::clamp(
+            ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
+    return rle_encode_policy{
+      128,
+      items_per_thread,
+      BLOCK_LOAD_DIRECT,
+      load_mod,
+      BLOCK_SCAN_WARP_SCANS,
+      {delay_constructor_kind::fixed_delay, 350, 450},
+    };
+  }
 
-    if (arch >= ::cuda::arch_id::sm_100 && tuned_prim && length_4)
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy
+  {
+    // if we don't have a tuning for SM100, fall back to SM90
+    if (arch >= ::cuda::arch_id::sm_100 && length_is_primitive && length_size == 4 && key_is_primitive)
     {
-      if (key_sz == key_size::_1)
+      if (key_size == 1)
       {
         return rle_encode_policy{
           256,
@@ -347,7 +364,7 @@ struct policy_selector
           BLOCK_SCAN_WARP_SCANS,
           {delay_constructor_kind::exponential_backon, 468, 300}};
       }
-      if (key_sz == key_size::_2)
+      if (key_size == 2)
       {
         return rle_encode_policy{
           224,
@@ -357,7 +374,7 @@ struct policy_selector
           BLOCK_SCAN_WARP_SCANS,
           {delay_constructor_kind::exponential_backon, 376, 420}};
       }
-      if (key_sz == key_size::_4)
+      if (key_size == 4)
       {
         return rle_encode_policy{
           256,
@@ -367,7 +384,7 @@ struct policy_selector
           BLOCK_SCAN_WARP_SCANS,
           {delay_constructor_kind::exponential_backon, 956, 70}};
       }
-      if (key_sz == key_size::_8)
+      if (key_size == 8)
       {
         return rle_encode_policy{
           224,
@@ -378,105 +395,114 @@ struct policy_selector
           {delay_constructor_kind::exponential_backoff, 188, 765}};
       }
     }
-    if (arch >= ::cuda::arch_id::sm_90 && tuned_prim && length_4)
+
+    if (arch >= ::cuda::arch_id::sm_90)
     {
-      if (key_sz == key_size::_1)
+      if (length_is_primitive && length_size == 4)
       {
-        return rle_encode_policy{
-          256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}};
+        if (key_is_primitive && key_size == 1)
+        {
+          return rle_encode_policy{
+            256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 620}};
+        }
+        if (key_is_primitive && key_size == 2)
+        {
+          return rle_encode_policy{
+            128, 22, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 775}};
+        }
+        if (key_is_primitive && key_size == 4)
+        {
+          return rle_encode_policy{
+            192,
+            14,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::fixed_delay, 284, 480}};
+        }
+        if (key_is_primitive && key_size == 8)
+        {
+          return rle_encode_policy{
+            128,
+            19,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::no_delay, 0, 515}};
+        }
+        if (key_t == type_t::int128 || key_t == type_t::uint128)
+        {
+          return rle_encode_policy{
+            128,
+            11,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::fixed_delay, 428, 930}};
+        }
       }
-      if (key_sz == key_size::_2)
-      {
-        return rle_encode_policy{
-          128, 22, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 775}};
-      }
-      if (key_sz == key_size::_4)
-      {
-        return rle_encode_policy{
-          192,
-          14,
-          BLOCK_LOAD_WARP_TRANSPOSE,
-          LOAD_DEFAULT,
-          BLOCK_SCAN_WARP_SCANS,
-          {delay_constructor_kind::fixed_delay, 284, 480}};
-      }
-      if (key_sz == key_size::_8)
-      {
-        return rle_encode_policy{
-          128,
-          19,
-          BLOCK_LOAD_WARP_TRANSPOSE,
-          LOAD_DEFAULT,
-          BLOCK_SCAN_WARP_SCANS,
-          {delay_constructor_kind::no_delay, 0, 515}};
-      }
-#if _CCCL_HAS_INT128()
-      if (key_sz == key_size::_16)
-      {
-        return rle_encode_policy{
-          128,
-          11,
-          BLOCK_LOAD_WARP_TRANSPOSE,
-          LOAD_DEFAULT,
-          BLOCK_SCAN_WARP_SCANS,
-          {delay_constructor_kind::fixed_delay, 428, 930}};
-      }
-#endif
+
+      // no tuning, use a default one
+      return __make_default_policy(LOAD_DEFAULT);
     }
-    if (arch >= ::cuda::arch_id::sm_80 && tuned_prim && length_4)
+
+    if (arch >= ::cuda::arch_id::sm_86)
     {
-      if (key_sz == key_size::_1)
-      {
-        return rle_encode_policy{
-          256, 14, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 640}};
-      }
-      if (key_sz == key_size::_2)
-      {
-        return rle_encode_policy{
-          256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 900}};
-      }
-      if (key_sz == key_size::_4)
-      {
-        return rle_encode_policy{
-          256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 1080}};
-      }
-      if (key_sz == key_size::_8)
-      {
-        return rle_encode_policy{
-          224,
-          9,
-          BLOCK_LOAD_WARP_TRANSPOSE,
-          LOAD_DEFAULT,
-          BLOCK_SCAN_WARP_SCANS,
-          {delay_constructor_kind::no_delay, 0, 1075}};
-      }
-#if _CCCL_HAS_INT128()
-      if (key_sz == key_size::_16)
+      return __make_default_policy(LOAD_LDG);
+    }
+
+    if (arch >= ::cuda::arch_id::sm_80)
+    {
+      if (length_is_primitive && length_size == 4)
       {
-        return rle_encode_policy{
-          128,
-          7,
-          BLOCK_LOAD_WARP_TRANSPOSE,
-          LOAD_DEFAULT,
-          BLOCK_SCAN_WARP_SCANS,
-          {delay_constructor_kind::no_delay, 0, 630}};
+        if (key_is_primitive && key_size == 1)
+        {
+          return rle_encode_policy{
+            256, 14, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 640}};
+        }
+        if (key_is_primitive && key_size == 2)
+        {
+          return rle_encode_policy{
+            256, 13, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, BLOCK_SCAN_WARP_SCANS, {delay_constructor_kind::no_delay, 0, 900}};
+        }
+        if (key_is_primitive && key_size == 4)
+        {
+          return rle_encode_policy{
+            256,
+            13,
+            BLOCK_LOAD_DIRECT,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::no_delay, 0, 1080}};
+        }
+        if (key_is_primitive && key_size == 8)
+        {
+          return rle_encode_policy{
+            224,
+            9,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::no_delay, 0, 1075}};
+        }
+        if (key_t == type_t::int128 || key_t == type_t::uint128)
+        {
+          return rle_encode_policy{
+            128,
+            7,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_SCAN_WARP_SCANS,
+            {delay_constructor_kind::no_delay, 0, 630}};
+        }
       }
-#endif
+
+      // no tuning, use a default one
+      return __make_default_policy(LOAD_DEFAULT);
     }
-    constexpr int nominal_4B_items_per_thread = 6;
-    const int items =
-      (max_input_bytes <= 8)
-        ? 6
-        : ::cuda::std::clamp(
-            ::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
-    return rle_encode_policy{
-      128,
-      items,
-      BLOCK_LOAD_DIRECT,
-      LOAD_LDG,
-      BLOCK_SCAN_WARP_SCANS,
-      {delay_constructor_kind::fixed_delay, 350, 450},
-    };
+
+    // for SM50
+    return __make_default_policy(LOAD_LDG);
   }
 };
 
@@ -490,10 +516,11 @@ struct policy_selector_from_types
   [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy
   {
     constexpr policy_selector selector{
-      classify_length_size<LengthT>(),
-      classify_key_size<KeyT>(),
-      is_primitive_length<LengthT>(),
-      is_primitive_key<KeyT>(),
+      int{sizeof(LengthT)},
+      int{sizeof(KeyT)},
+      classify_type<KeyT>,
+      is_primitive_v<LengthT>,
+      is_primitive_v<KeyT>,
       static_cast<int>((::cuda::std::max) (sizeof(LengthT), sizeof(KeyT))),
       static_cast<int>(sizeof(LengthT) + sizeof(KeyT))};
     return selector(arch);

From eff865fab9cd4db8f9abe697ff5a22fd89b0eee0 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 23:48:15 +0100
Subject: [PATCH 13/23] const

---
 cub/benchmarks/bench/run_length_encode/encode.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu
index d00e16acf6e..650da0437a2 100644
--- a/cub/benchmarks/bench/run_length_encode/encode.cu
+++ b/cub/benchmarks/bench/run_length_encode/encode.cu
@@ -62,7 +62,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
   thrust::device_vector<T> out_keys(elements);
   thrust::device_vector<T> in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size);
 
-  T* d_in_keys        = thrust::raw_pointer_cast(in_keys.data());
+  const T* d_in_keys  = thrust::raw_pointer_cast(in_keys.data());
   T* d_out_keys       = thrust::raw_pointer_cast(out_keys.data());
   auto d_out_vals     = thrust::raw_pointer_cast(out_vals.data());
   auto d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data());

From 475ba6fa540038d3f7486603b024b9b8a50d279c Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 26 Feb 2026 00:06:21 +0100
Subject: [PATCH 14/23] override accum_t

---
 cub/benchmarks/bench/run_length_encode/encode.cu    | 13 ++++---------
 .../dispatch/dispatch_streaming_reduce_by_key.cuh   |  9 ++++++---
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu
index 650da0437a2..4addc163fbb 100644
--- a/cub/benchmarks/bench/run_length_encode/encode.cu
+++ b/cub/benchmarks/bench/run_length_encode/encode.cu
@@ -44,14 +44,9 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
   // Offset type large enough to represent the total number of runs in the sequence
   using num_runs_t = offset_t;
 
-  using keys_input_it_t            = const T*;
-  using unique_output_it_t         = T*;
-  using run_length_input_it_t      = thrust::constant_iterator<run_length_t, offset_t>;
-  using run_length_output_it_t     = run_length_t*;
-  using num_runs_output_iterator_t = num_runs_t*;
-  using equality_op_t              = ::cuda::std::equal_to<>;
-  using reduction_op_t             = ::cuda::std::plus<>;
-  using accum_t                    = run_length_t;
+  using run_length_input_it_t = thrust::constant_iterator<run_length_t, offset_t>;
+  using equality_op_t         = ::cuda::std::equal_to<>;
+  using reduction_op_t        = ::cuda::std::plus<>;
 
   const auto elements                    = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   constexpr std::size_t min_segment_size = 1;
@@ -73,7 +68,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
   const offset_t num_items = static_cast<offset_t>(elements);
 
   auto dispatch_on_stream = [&](cudaStream_t stream) {
-    return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key(
+    return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key</* OverrideAccumT */ run_length_t>(
       d_temp_storage,
       temp_storage_bytes,
       d_in_keys,
diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
index 4fec39263a6..2adc6651354 100644
--- a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
@@ -14,7 +14,7 @@
 #endif // no system header
 
 #include <cub/agent/agent_reduce_by_key.cuh>
-#include <cub/detail/arch_dispatch.cuh>
+#include <cub/device/dispatch/dispatch_common.cuh>
 #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce_by_key.cuh>
 #include <cub/thread/thread_operators.cuh>
@@ -35,6 +35,7 @@ CUB_NAMESPACE_BEGIN
 namespace detail::reduce_by_key
 {
 template <
+  typename OverrideAccumT = use_default,
   typename KeysInputIteratorT,
   typename UniqueOutputIteratorT,
   typename ValuesInputIteratorT,
@@ -43,8 +44,10 @@ template <
   typename EqualityOpT,
   typename ReductionOpT,
   typename OffsetT,
-  typename AccumT =
-    ::cuda::std::__accumulator_t<ReductionOpT, it_value_t<ValuesInputIteratorT>, it_value_t<ValuesInputIteratorT>>,
+  typename AccumT = ::cuda::std::conditional_t<
+    !::cuda::std::is_same_v<OverrideAccumT, use_default>,
+    OverrideAccumT,
+    ::cuda::std::__accumulator_t<ReductionOpT, it_value_t<ValuesInputIteratorT>, it_value_t<ValuesInputIteratorT>>>,
   typename PolicySelector =
     policy_selector_from_types<ReductionOpT,
                                AccumT,

From 3fb2652c1d176e7819eac9176dbfb05f6a9d3762 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 26 Feb 2026 00:25:36 +0100
Subject: [PATCH 15/23] rle fixes

---
 cub/benchmarks/bench/run_length_encode/encode.cu   | 12 +++++++-----
 .../device/dispatch/tuning/tuning_rle_encode.cuh   | 14 +++++++-------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu
index 4addc163fbb..37227aa7649 100644
--- a/cub/benchmarks/bench/run_length_encode/encode.cu
+++ b/cub/benchmarks/bench/run_length_encode/encode.cu
@@ -47,6 +47,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
   using run_length_input_it_t = thrust::constant_iterator<run_length_t, offset_t>;
   using equality_op_t         = ::cuda::std::equal_to<>;
   using reduction_op_t        = ::cuda::std::plus<>;
+  using accum_t               = run_length_t;
 
   const auto elements                    = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   constexpr std::size_t min_segment_size = 1;
@@ -68,7 +69,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
   const offset_t num_items = static_cast<offset_t>(elements);
 
   auto dispatch_on_stream = [&](cudaStream_t stream) {
-    return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key</* OverrideAccumT */ run_length_t>(
+    return cub::detail::reduce_by_key::dispatch_streaming_reduce_by_key</* OverrideAccumT */ accum_t>(
       d_temp_storage,
       temp_storage_bytes,
       d_in_keys,
@@ -79,11 +80,12 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT, RunLengthT
       equality_op_t{},
       reduction_op_t{},
       num_items,
-      stream
-#if !TUNE_BASE
-      ,
+      stream,
+#if TUNE_BASE
+      cub::detail::rle::encode::policy_selector_from_types<accum_t, T> {}
+#else // TUNE_BASE
       bench_encode_policy_selector{}
-#endif
+#endif // TUNE_BASE
     );
   };
 
diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
index 14f9b58dea9..fe01147de4c 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
@@ -327,13 +327,14 @@ struct policy_selector
   int key_size;
   type_t key_t;
   bool length_is_primitive;
+  bool length_is_trivially_copyable;
   bool key_is_primitive;
-  int max_input_bytes;
-  int combined_input_bytes;
 
   _CCCL_API constexpr auto __make_default_policy(CacheLoadModifier load_mod) const -> rle_encode_policy
   {
     constexpr int nominal_4B_items_per_thread = 6;
+    const int combined_input_bytes            = length_size + key_size;
+    const int max_input_bytes                 = (::cuda::std::max) (length_size, key_size);
     const int items_per_thread =
       (max_input_bytes <= 8)
         ? 6
@@ -345,8 +346,8 @@ struct policy_selector
       BLOCK_LOAD_DIRECT,
       load_mod,
       BLOCK_SCAN_WARP_SCANS,
-      {delay_constructor_kind::fixed_delay, 350, 450},
-    };
+      default_reduce_by_key_delay_constructor_policy(
+        length_size, int{sizeof(int)}, length_is_primitive || length_is_trivially_copyable, true)};
   }
 
   [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rle_encode_policy
@@ -520,9 +521,8 @@ struct policy_selector_from_types
       int{sizeof(KeyT)},
       classify_type<KeyT>,
       is_primitive_v<LengthT>,
-      is_primitive_v<KeyT>,
-      static_cast<int>((::cuda::std::max) (sizeof(LengthT), sizeof(KeyT))),
-      static_cast<int>(sizeof(LengthT) + sizeof(KeyT))};
+      ::cuda::std::is_trivially_copyable_v<LengthT>,
+      is_primitive_v<KeyT>};
     return selector(arch);
   }
 };

From dd5f4bc30fd176fec383bd8c3c03197d62c04ccc Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 26 Feb 2026 00:36:06 +0100
Subject: [PATCH 16/23] Comment

---
 cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
index fe01147de4c..20c52d20f84 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
@@ -326,6 +326,10 @@ struct policy_selector
   int length_size;
   int key_size;
   type_t key_t;
+
+  // TODO(bgruber): we want to get rid of the following three and just assume by default that types behave "primitive".
+  // This opts a lot more types into the tunings we have. We can do this when we publish the public tuning API, because
+  // then users can opt-out of tunings again
   bool length_is_primitive;
   bool length_is_trivially_copyable;
   bool key_is_primitive;

From c9585d49456c29ab0dd8a9efd109abeee1b72144 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 26 Feb 2026 09:27:47 +0100
Subject: [PATCH 17/23] Fix

---
 cub/cub/device/device_run_length_encode.cuh | 23 +++++++--------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh
index d0c5d05e0fa..8b1373afdc5 100644
--- a/cub/cub/device/device_run_length_encode.cuh
+++ b/cub/cub/device/device_run_length_encode.cuh
@@ -188,28 +188,19 @@ struct DeviceRunLengthEncode
 
     using policy_selector_t = detail::rle::encode::policy_selector_from_types<accum_t, key_t>;
 
-    return detail::reduce_by_key::dispatch_streaming_reduce_by_key<
-      InputIteratorT,
-      UniqueOutputIteratorT,
-      lengths_input_iterator_t,
-      LengthsOutputIteratorT,
-      NumRunsOutputIteratorT,
-      equality_op,
-      reduction_op,
-      offset_t,
-      accum_t,
-      policy_selector_t>(
+    return detail::reduce_by_key::dispatch_streaming_reduce_by_key</* OverrideAccumT */ accum_t>(
       d_temp_storage,
       temp_storage_bytes,
       d_in,
       d_unique_out,
-      lengths_input_iterator_t((length_t) 1),
+      lengths_input_iterator_t(length_t{1}),
       d_counts_out,
       d_num_runs_out,
-      equality_op(),
-      reduction_op(),
-      num_items,
-      stream);
+      equality_op{},
+      reduction_op{},
+      static_cast<offset_t>(num_items),
+      stream,
+      policy_selector_t{});
   }
 
   //! @rst

From e2ea871d27124b08e97ff66ad2dd30412ac6d118 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 26 Feb 2026 09:32:54 +0100
Subject: [PATCH 18/23] Review

---
 cub/cub/device/dispatch/dispatch_reduce_by_key.cuh         | 2 +-
 .../device/dispatch/dispatch_streaming_reduce_by_key.cuh   | 7 ++++---
 cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh    | 5 +----
 cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh       | 5 +----
 4 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 321cd2536f5..e1bb6357833 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -33,7 +33,7 @@
 
 #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
 #  include <sstream>
-#endif
+#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
 
 CUB_NAMESPACE_BEGIN
 
diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
index 2adc6651354..5d959df2c4c 100644
--- a/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
@@ -17,15 +17,16 @@
 #include <cub/device/dispatch/dispatch_common.cuh>
 #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce_by_key.cuh>
-#include <cub/thread/thread_operators.cuh>
 #include <cub/util_device.cuh>
-#include <cub/util_math.cuh>
 #include <cub/util_type.cuh>
 
 #include <thrust/iterator/offset_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#include <cuda/std/__functional/invoke.h>
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_same.h>
+
 #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
 #  include <sstream>
 #endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index c2230e9ef9b..64c96bc77d6 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -25,10 +25,7 @@
 #include <cuda/__device/arch_id.h>
 #include <cuda/cmath>
 #include <cuda/std/__algorithm/clamp.h>
-
-#if _CCCL_HAS_CONCEPTS()
-#  include <cuda/std/concepts>
-#endif // _CCCL_HAS_CONCEPTS()
+#include <cuda/std/concepts>
 
 #if !_CCCL_COMPILER(NVRTC)
 #  include <ostream>
diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
index 20c52d20f84..5083ed52b85 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
@@ -27,10 +27,7 @@
 #include <cuda/__device/arch_id.h>
 #include <cuda/std/__algorithm/clamp.h>
 #include <cuda/std/__algorithm/max.h>
-
-#if _CCCL_HAS_CONCEPTS()
-#  include <cuda/std/concepts>
-#endif // _CCCL_HAS_CONCEPTS()
+#include <cuda/std/concepts>
 
 #if !_CCCL_COMPILER(NVRTC)
 #  include <ostream>

From 668966eba5e023aaa259e50be5059dd44eccfb0c Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 26 Feb 2026 18:07:41 +0100
Subject: [PATCH 19/23] MSVC

---
 .../dispatch/dispatch_reduce_by_key.cuh       | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index e1bb6357833..fada316a417 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -698,19 +698,16 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
          "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast<int>(arch_id), ss.str().c_str());))
 #endif
 
-    // convert the policy into a legacy agent policy for vsmem_helper. TODO(bgruber): refactor this in the future
-    using AgentReduceByKeyPolicy = AgentReduceByKeyPolicy<
-      policy.block_threads,
-      policy.items_per_thread,
-      policy.load_algorithm,
-      policy.load_modifier,
-      policy.scan_algorithm,
-      delay_constructor_t<policy.delay_constructor.kind,
-                          policy.delay_constructor.delay,
-                          policy.delay_constructor.l2_write_latency>>;
-
     using vsmem_helper_t = vsmem_helper_default_fallback_policy_t<
-      AgentReduceByKeyPolicy,
+      // convert the policy into a legacy agent policy for vsmem_helper. TODO(bgruber): refactor this in the future
+      AgentReduceByKeyPolicy<policy.block_threads,
+                             policy.items_per_thread,
+                             policy.load_algorithm,
+                             policy.load_modifier,
+                             policy.scan_algorithm,
+                             delay_constructor_t<policy.delay_constructor.kind,
+                                                 policy.delay_constructor.delay,
+                                                 policy.delay_constructor.l2_write_latency>>,
       AgentReduceByKey,
       KeysInputIteratorT,
       UniqueOutputIteratorT,

From 224597a65ec23e6f95073c276b7e2df69a2ae3dd Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 26 Feb 2026 19:26:03 +0100
Subject: [PATCH 20/23] M S V C

---
 .../dispatch/dispatch_reduce_by_key.cuh       | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index fada316a417..638e3056dc2 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -698,16 +698,20 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
          "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast<int>(arch_id), ss.str().c_str());))
 #endif
 
+    // Convert the policy into a legacy agent policy for vsmem_helper. Use a value and decltype it again, to workaround
+    // MSVC bug that just turns the type to `int`. TODO(bgruber): refactor this in the future
+    [[maybe_unused]] AgentReduceByKeyPolicy<
+      policy.block_threads,
+      policy.items_per_thread,
+      policy.load_algorithm,
+      policy.load_modifier,
+      policy.scan_algorithm,
+      delay_constructor_t<policy.delay_constructor.kind,
+                          policy.delay_constructor.delay,
+                          policy.delay_constructor.l2_write_latency>> agent_policy;
+
     using vsmem_helper_t = vsmem_helper_default_fallback_policy_t<
-      // convert the policy into a legacy agent policy for vsmem_helper. TODO(bgruber): refactor this in the future
-      AgentReduceByKeyPolicy<policy.block_threads,
-                             policy.items_per_thread,
-                             policy.load_algorithm,
-                             policy.load_modifier,
-                             policy.scan_algorithm,
-                             delay_constructor_t<policy.delay_constructor.kind,
-                                                 policy.delay_constructor.delay,
-                                                 policy.delay_constructor.l2_write_latency>>,
+      decltype(agent_policy),
       AgentReduceByKey,
       KeysInputIteratorT,
       UniqueOutputIteratorT,

From ab404c69f957ebe2eddce36e4fa68725129c1887 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Fri, 27 Feb 2026 13:02:11 +0100
Subject: [PATCH 21/23] M  S  V  C

---
 .../dispatch/dispatch_reduce_by_key.cuh       | 31 +++++++++++--------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 638e3056dc2..f88c853628b 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -645,6 +645,22 @@ struct DispatchReduceByKey
 
 namespace detail::reduce_by_key
 {
+// we move the conversion out of the lambda below, so MSVC can compile the code
+template <typename PolicyGetter>
+_CCCL_API constexpr auto convert_to_agent_policy(PolicyGetter policy_getter)
+{
+  constexpr reduce_by_key_policy policy = policy_getter();
+  return AgentReduceByKeyPolicy<
+    policy.block_threads,
+    policy.items_per_thread,
+    policy.load_algorithm,
+    policy.load_modifier,
+    policy.scan_algorithm,
+    delay_constructor_t<policy.delay_constructor.kind,
+                        policy.delay_constructor.delay,
+                        policy.delay_constructor.l2_write_latency>>{};
+}
+
 template <
   typename OverrideAccumT = use_default,
   typename KeysInputIteratorT,
@@ -698,20 +714,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
          "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast<int>(arch_id), ss.str().c_str());))
 #endif
 
-    // Convert the policy into a legacy agent policy for vsmem_helper. Use a value and decltype it again, to workaround
-    // MSVC bug that just turns the type to `int`. TODO(bgruber): refactor this in the future
-    [[maybe_unused]] AgentReduceByKeyPolicy<
-      policy.block_threads,
-      policy.items_per_thread,
-      policy.load_algorithm,
-      policy.load_modifier,
-      policy.scan_algorithm,
-      delay_constructor_t<policy.delay_constructor.kind,
-                          policy.delay_constructor.delay,
-                          policy.delay_constructor.l2_write_latency>> agent_policy;
-
+    // TODO(bgruber): refactor this in the future
     using vsmem_helper_t = vsmem_helper_default_fallback_policy_t<
-      decltype(agent_policy),
+      decltype(convert_to_agent_policy(policy_getter)),
       AgentReduceByKey,
       KeysInputIteratorT,
       UniqueOutputIteratorT,

From 186ac061c38cda8c20e5ddef3362508d2debc856 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Fri, 27 Feb 2026 14:00:47 +0100
Subject: [PATCH 22/23] Silence warning

---
 cub/cub/device/dispatch/dispatch_reduce_by_key.cuh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index f88c853628b..84bd03e465c 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -705,12 +705,10 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
   }
 
   return detail::dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) {
-    static constexpr reduce_by_key_policy policy = policy_getter(); // need the constexpr of vsmem_helper
-
 #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
     NV_IF_TARGET(
       NV_IS_HOST,
-      (::std::stringstream ss; ss << policy; _CubLog(
+      (::std::stringstream ss; ss << policy_getter(); _CubLog(
          "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast<int>(arch_id), ss.str().c_str());))
 #endif
 

From 87f00f775b6fa3dc594a12f5ee0484642d6ecaa1 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Fri, 27 Feb 2026 16:02:55 +0100
Subject: [PATCH 23/23] M S V C

---
 .../dispatch/dispatch_reduce_by_key.cuh       | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 84bd03e465c..14fbf3ae15c 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -645,20 +645,25 @@ struct DispatchReduceByKey
 
 namespace detail::reduce_by_key
 {
-// we move the conversion out of the lambda below, so MSVC can compile the code
-template <typename PolicyGetter>
-_CCCL_API constexpr auto convert_to_agent_policy(PolicyGetter policy_getter)
+// we move the conversion of the policy to the agent policy and its use out of the lambda below, so MSVC does not ICE
+template <typename PolicyGetter, typename... Args>
+_CCCL_API auto determine_threads_items_vsmem(PolicyGetter policy_getter)
 {
+  // TODO(bgruber): refactor this in the future
   constexpr reduce_by_key_policy policy = policy_getter();
-  return AgentReduceByKeyPolicy<
-    policy.block_threads,
-    policy.items_per_thread,
-    policy.load_algorithm,
-    policy.load_modifier,
-    policy.scan_algorithm,
-    delay_constructor_t<policy.delay_constructor.kind,
-                        policy.delay_constructor.delay,
-                        policy.delay_constructor.l2_write_latency>>{};
+  using Policy                          = AgentReduceByKeyPolicy<
+                             policy.block_threads,
+                             policy.items_per_thread,
+                             policy.load_algorithm,
+                             policy.load_modifier,
+                             policy.scan_algorithm,
+                             delay_constructor_t<policy.delay_constructor.kind,
+                                                 policy.delay_constructor.delay,
+                                                 policy.delay_constructor.l2_write_latency>>;
+  using vsmem_helper_t = vsmem_helper_default_fallback_policy_t<Policy, AgentReduceByKey, Args...>;
+  return ::cuda::std::tuple{vsmem_helper_t::agent_policy_t::BLOCK_THREADS,
+                            vsmem_helper_t::agent_policy_t::ITEMS_PER_THREAD,
+                            vsmem_helper_t::vsmem_per_block};
 }
 
 template <
@@ -712,10 +717,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
          "Dispatching DeviceReduceByKey to arch %d with tuning: %s\n", static_cast<int>(arch_id), ss.str().c_str());))
 #endif
 
-    // TODO(bgruber): refactor this in the future
-    using vsmem_helper_t = vsmem_helper_default_fallback_policy_t<
-      decltype(convert_to_agent_policy(policy_getter)),
-      AgentReduceByKey,
+    const auto [block_threads, items_per_thread, vsmem_per_block] = determine_threads_items_vsmem<
+      decltype(policy_getter),
       KeysInputIteratorT,
       UniqueOutputIteratorT,
       ValuesInputIteratorT,
@@ -725,17 +728,14 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
       ReductionOpT,
       OffsetT,
       AccumT,
-      streaming_context_t>;
-
-    constexpr int block_threads    = vsmem_helper_t::agent_policy_t::BLOCK_THREADS;
-    constexpr int items_per_thread = vsmem_helper_t::agent_policy_t::ITEMS_PER_THREAD;
+      streaming_context_t>(policy_getter);
 
     // Number of input tiles
     const int tile_size = block_threads * items_per_thread;
     const int num_tiles = static_cast<int>(::cuda::ceil_div(num_items, tile_size));
 
     // The amount of virtual shared memory to allocate
-    const auto vsmem_size = num_tiles * vsmem_helper_t::vsmem_per_block;
+    const auto vsmem_size = num_tiles * vsmem_per_block;
 
     size_t tile_descriptor_memory{};
     if (const auto error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, tile_descriptor_memory)))