From ad1c1df7c77a15427392603d6c69d23e2774dd10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Sat, 7 Feb 2026 20:58:47 -0800
Subject: [PATCH 01/31] Base changes in scan and tests.

---
 c/parallel/src/scan.cu                        | 239 +++++----
 cub/cub/detail/delay_constructor.cuh          | 183 +++++++
 cub/cub/device/dispatch/dispatch_scan.cuh     | 239 ++++++++-
 .../device/dispatch/kernels/kernel_scan.cuh   |  29 +-
 .../dispatch/tuning/tuning_radix_sort.cuh     | 191 +------
 .../device/dispatch/tuning/tuning_scan.cuh    | 479 +++++++++++++++++-
 .../nvbench_helper/look_back_helper.cuh       |  19 +-
 7 files changed, 1086 insertions(+), 293 deletions(-)
 create mode 100644 cub/cub/detail/delay_constructor.cuh
diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 230707e7d64..215b232a270 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -4,13 +4,12 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
 #include <cub/detail/choose_offset.cuh>
 #include <cub/detail/launcher/cuda_driver.cuh>
-#include <cub/detail/ptx-json-parser.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/thread/thread_load.cuh>
 #include <cub/util_arch.cuh>
@@ -21,6 +20,7 @@
 #include <format>
 #include <iostream>
 #include <optional>
+#include <sstream>
 #include <string>
 #include <type_traits>
 #include <vector>
@@ -56,33 +56,6 @@ enum class InitKind
   NoInit,
 };
 
-struct scan_runtime_tuning_policy
-{
-  cub::detail::RuntimeScanAgentPolicy scan;
-
-  auto Scan() const
-  {
-    return scan;
-  }
-
-  void CheckLoadModifier() const
-  {
-    if (scan.LoadModifier() == cub::CacheLoadModifier::LOAD_LDG)
-    {
-      throw std::runtime_error("The memory consistency model does not apply to texture "
-                               "accesses");
-    }
-  }
-
-  using MaxPolicy = scan_runtime_tuning_policy;
-
-  template <typename F>
-  cudaError_t Invoke(int, F& op)
-  {
-    return op.template Invoke<scan_runtime_tuning_policy>(*this);
-  }
-};
-
 static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_type_info init)
 {
   // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet
@@ -120,8 +93,8 @@ std::string get_scan_kernel_name(
   bool force_inclusive,
   cccl_init_kind_t init_kind)
 {
-  std::string chained_policy_t;
-  check(cccl_type_name_from_nvrtc<device_scan_policy>(&chained_policy_t));
+  std::string policy_selector_t;
+  check(cccl_type_name_from_nvrtc<device_scan_policy>(&policy_selector_t));
 
   const cccl_type_info accum_t  = scan::get_accumulator_type(op, input_it, init);
   const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type);
@@ -162,7 +135,7 @@ std::string get_scan_kernel_name(
   auto tile_state_t = std::format("cub::ScanTileState<{0}>", accum_cpp_t);
   return std::format(
     "cub::detail::scan::DeviceScanKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}>",
-    chained_policy_t, // 0
+    policy_selector_t, // 0
     input_iterator_t, // 1
     output_iterator_t, // 2
     tile_state_t, // 3
@@ -174,20 +147,6 @@ std::string get_scan_kernel_name(
     init_t); // 9
 }
 
-template <auto* GetPolicy>
-struct dynamic_scan_policy_t
-{
-  using MaxPolicy = dynamic_scan_policy_t;
-
-  template <typename F>
-  cudaError_t Invoke(int device_ptx_version, F& op)
-  {
-    return op.template Invoke<scan_runtime_tuning_policy>(GetPolicy(device_ptx_version, accumulator_type));
-  }
-
-  cccl_type_info accumulator_type;
-};
-
 struct scan_kernel_source
 {
   cccl_device_scan_build_result_t& build;
@@ -245,8 +204,115 @@ try
 
   const auto output_it_value_t = cccl_type_enum_to_name(output_it.value_type.type);
 
-  std::string policy_hub_expr = std::format(
-    "cub::detail::scan::policy_hub<{}, {}, {}, {}, {}>",
+  const auto policy_sel = [&] {
+    using cub::detail::op_kind_t;
+    using cub::detail::type_t;
+    using cub::detail::scan::policy_selector;
+    using cub::detail::scan::primitive_accum;
+    using cub::detail::scan::primitive_op;
+
+    auto accum_type = type_t::other;
+    switch (accum_t.type)
+    {
+      case CCCL_INT8:
+        accum_type = type_t::int8;
+        break;
+      case CCCL_INT16:
+        accum_type = type_t::int16;
+        break;
+      case CCCL_INT32:
+        accum_type = type_t::int32;
+        break;
+      case CCCL_INT64:
+        accum_type = type_t::int64;
+        break;
+      case CCCL_UINT8:
+        accum_type = type_t::uint8;
+        break;
+      case CCCL_UINT16:
+        accum_type = type_t::uint16;
+        break;
+      case CCCL_UINT32:
+        accum_type = type_t::uint32;
+        break;
+      case CCCL_UINT64:
+        accum_type = type_t::uint64;
+        break;
+      case CCCL_FLOAT32:
+        accum_type = type_t::float32;
+        break;
+      case CCCL_FLOAT64:
+        accum_type = type_t::float64;
+        break;
+      default:
+        break;
+    }
+
+    auto operation_t = op_kind_t::other;
+    switch (op.type)
+    {
+      case CCCL_PLUS:
+        operation_t = op_kind_t::plus;
+        break;
+      case CCCL_MINIMUM:
+        operation_t = op_kind_t::min;
+        break;
+      case CCCL_MAXIMUM:
+        operation_t = op_kind_t::max;
+        break;
+      default:
+        break;
+    }
+
+    auto primitive_accum_t = primitive_accum::no;
+    switch (accum_t.type)
+    {
+      case CCCL_INT8:
+      case CCCL_INT16:
+      case CCCL_INT32:
+      case CCCL_INT64:
+      case CCCL_UINT8:
+      case CCCL_UINT16:
+      case CCCL_UINT32:
+      case CCCL_UINT64:
+      case CCCL_FLOAT16:
+      case CCCL_FLOAT32:
+      case CCCL_FLOAT64:
+      case CCCL_BOOLEAN:
+        primitive_accum_t = primitive_accum::yes;
+        break;
+      default:
+        break;
+    }
+
+    const auto primitive_op_t =
+      (op.type == CCCL_PLUS || op.type == CCCL_MINIMUM || op.type == CCCL_MAXIMUM)
+        ? primitive_op::yes
+        : primitive_op::no;
+
+    const auto input_type      = input_it.value_type.type;
+    const auto output_type     = output_it.value_type.type;
+    const bool types_match     = input_type == output_type && input_type == accum_t.type;
+    const bool benchmark_match = primitive_op_t == primitive_op::yes && types_match && input_type != CCCL_STORAGE;
+
+    return policy_selector{
+      static_cast<int>(input_it.value_type.size),
+      static_cast<int>(output_it.value_type.size),
+      static_cast<int>(accum_t.size),
+      int{sizeof(OffsetT)},
+      accum_type,
+      operation_t,
+      primitive_accum_t,
+      primitive_op_t,
+      benchmark_match};
+  }();
+
+  // TODO(bgruber): drop this if tuning policies become formattable
+  std::stringstream policy_sel_str;
+  policy_sel_str << policy_sel(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}));
+
+  std::string policy_selector_expr = std::format(
+    "cub::detail::scan::policy_selector_from_types<{}, {}, {}, {}, {}>",
     input_it_value_t,
     output_it_value_t,
     accum_cpp,
@@ -265,20 +331,20 @@ struct __align__({1}) storage_t {{
 {2}
 {3}
 {4}
-using device_scan_policy = {5}::MaxPolicy;
-
-#include <cub/detail/ptx-json/json.cuh>
-__device__ consteval auto& policy_generator() {{
-  return ptx_json::id<ptx_json::string("device_scan_policy")>()
-    = cub::detail::scan::ScanPolicyWrapper<device_scan_policy::ActivePolicy>::EncodedPolicy();
-}}
+using device_scan_policy = {5};
+using namespace cub;
+using namespace cub::detail::scan;
+using cub::detail::delay_constructor_policy;
+using cub::detail::delay_constructor_kind;
+static_assert(device_scan_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6}, "Host generated and JIT compiled policy mismatch");
 )XXX",
     input_it.value_type.size, // 0
     input_it.value_type.alignment, // 1
     input_iterator_src, // 2
     output_iterator_src, // 3
     op_src, // 4
-    policy_hub_expr); // 5
+    policy_selector_expr, // 5
+    policy_sel_str.view()); // 6
 
 #if false // CCCL_DEBUGGING_SWITCH
     fflush(stderr);
@@ -302,7 +368,6 @@ __device__ consteval auto& policy_generator() {{
     "-rdc=true",
     "-dlto",
     "-DCUB_DISABLE_CDP",
-    "-DCUB_ENABLE_POLICY_PTX_JSON",
     "-std=c++20"};
 
   cccl::detail::extend_args_with_build_config(args, config);
@@ -337,11 +402,6 @@ __device__ consteval auto& policy_generator() {{
   auto [description_bytes_per_tile,
         payload_bytes_per_tile] = get_tile_state_bytes_per_tile(accum_t, accum_cpp, args.data(), args.size(), arch);
 
-  nlohmann::json runtime_policy = cub::detail::ptx_json::parse("device_scan_policy", {result.data.get(), result.size});
-
-  using cub::detail::RuntimeScanAgentPolicy;
-  auto scan_policy = RuntimeScanAgentPolicy::from_json(runtime_policy, "ScanPolicyT");
-
   build_ptr->cc                         = cc;
   build_ptr->cubin                      = (void*) result.data.release();
   build_ptr->cubin_size                 = result.size;
@@ -350,7 +410,7 @@ __device__ consteval auto& policy_generator() {{
   build_ptr->init_kind                  = init_kind;
   build_ptr->description_bytes_per_tile = description_bytes_per_tile;
   build_ptr->payload_bytes_per_tile     = payload_bytes_per_tile;
-  build_ptr->runtime_policy             = new scan::scan_runtime_tuning_policy{scan_policy};
+  build_ptr->runtime_policy             = new cub::detail::scan::policy_selector{policy_sel};
 
   return CUDA_SUCCESS;
 }
@@ -384,31 +444,39 @@ CUresult cccl_device_scan(
     CUdevice cu_device;
     check(cuCtxGetDevice(&cu_device));
 
-    auto exec_status = cub::DispatchScan<
-      indirect_arg_t,
-      indirect_arg_t,
-      indirect_arg_t,
-      std::conditional_t<std::is_same_v<InitValueT, cub::NullType>, cub::NullType, indirect_arg_t>,
-      cuda::std::size_t,
-      void,
-      EnforceInclusive,
-      scan::scan_runtime_tuning_policy,
-      scan::scan_kernel_source,
-      cub::detail::CudaDriverLauncherFactory>::
-      Dispatch(
+    if constexpr (std::is_same_v<InitValueT, cub::NullType>)
+    {
+      auto exec_status = cub::detail::scan::dispatch_with_accum<void>(
         d_temp_storage,
         *temp_storage_bytes,
-        d_in,
-        d_out,
-        op,
+        indirect_arg_t{d_in},
+        indirect_arg_t{d_out},
+        indirect_arg_t{op},
         init,
-        num_items,
+        static_cast<OffsetT>(num_items),
         stream,
-        {build},
-        cub::detail::CudaDriverLauncherFactory{cu_device, build.cc},
-        *reinterpret_cast<scan::scan_runtime_tuning_policy*>(build.runtime_policy));
-
-    error = static_cast<CUresult>(exec_status);
+        *static_cast<cub::detail::scan::policy_selector*>(build.runtime_policy),
+        scan::scan_kernel_source{build},
+        cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});
+      error = static_cast<CUresult>(exec_status);
+    }
+    else
+    {
+      indirect_arg_t init_arg{init};
+      auto exec_status = cub::detail::scan::dispatch_with_accum<void>(
+        d_temp_storage,
+        *temp_storage_bytes,
+        indirect_arg_t{d_in},
+        indirect_arg_t{d_out},
+        indirect_arg_t{op},
+        init_arg,
+        static_cast<OffsetT>(num_items),
+        stream,
+        *static_cast<cub::detail::scan::policy_selector*>(build.runtime_policy),
+        scan::scan_kernel_source{build},
+        cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});
+      error = static_cast<CUresult>(exec_status);
+    }
   }
   catch (const std::exception& exc)
   {
@@ -549,7 +617,8 @@ try
     return CUDA_ERROR_INVALID_VALUE;
   }
   std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(build_ptr->cubin));
-  std::unique_ptr<char[]> policy(reinterpret_cast<char*>(build_ptr->runtime_policy));
+  std::unique_ptr<cub::detail::scan::policy_selector> policy(
+    static_cast<cub::detail::scan::policy_selector*>(build_ptr->runtime_policy));
   check(cuLibraryUnload(build_ptr->library));
 
   return CUDA_SUCCESS;
diff --git a/cub/cub/detail/delay_constructor.cuh b/cub/cub/detail/delay_constructor.cuh
new file mode 100644
index 00000000000..44a409f497a
--- /dev/null
+++ b/cub/cub/detail/delay_constructor.cuh
@@ -0,0 +1,183 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/agent/single_pass_scan_operators.cuh>
+
+#if !_CCCL_COMPILER(NVRTC)
+#  include <ostream>
+#endif
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail
+{
+enum class delay_constructor_kind
+{
+  no_delay,
+  fixed_delay,
+  exponential_backoff,
+  exponential_backoff_jitter,
+  exponential_backoff_jitter_window,
+  exponential_backon_jitter_window,
+  exponential_backon_jitter,
+  exponential_backon
+};
+
+#if !_CCCL_COMPILER(NVRTC)
+inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kind)
+{
+  switch (kind)
+  {
+    case delay_constructor_kind::no_delay:
+      return os << "delay_constructor_kind::no_delay";
+    case delay_constructor_kind::fixed_delay:
+      return os << "delay_constructor_kind::fixed_delay";
+    case delay_constructor_kind::exponential_backoff:
+      return os << "delay_constructor_kind::exponential_backoff";
+    case delay_constructor_kind::exponential_backoff_jitter:
+      return os << "delay_constructor_kind::exponential_backoff_jitter";
+    case delay_constructor_kind::exponential_backoff_jitter_window:
+      return os << "delay_constructor_kind::exponential_backoff_jitter_window";
+    case delay_constructor_kind::exponential_backon_jitter_window:
+      return os << "delay_constructor_kind::exponential_backon_jitter_window";
+    case delay_constructor_kind::exponential_backon_jitter:
+      return os << "delay_constructor_kind::exponential_backon_jitter";
+    case delay_constructor_kind::exponential_backon:
+      return os << "delay_constructor_kind::exponential_backon";
+    default:
+      return os << "<unknown delay_constructor_kind: " << static_cast<int>(kind) << ">";
+  }
+}
+#endif // !_CCCL_COMPILER(NVRTC)
+
+struct delay_constructor_policy
+{
+  delay_constructor_kind kind;
+  unsigned int delay;
+  unsigned int l2_write_latency;
+
+  _CCCL_API constexpr friend bool operator==(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs)
+  {
+    return lhs.kind == rhs.kind && lhs.delay == rhs.delay && lhs.l2_write_latency == rhs.l2_write_latency;
+  }
+
+  _CCCL_API constexpr friend bool operator!=(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs)
+  {
+    return !(lhs == rhs);
+  }
+
+#if !_CCCL_COMPILER(NVRTC)
+  friend ::std::ostream& operator<<(::std::ostream& os, const delay_constructor_policy& p)
+  {
+    return os << "delay_constructor_policy { .kind = " << p.kind << ", .delay = " << p.delay
+              << ", .l2_write_latency = " << p.l2_write_latency << " }";
+  }
+#endif // !_CCCL_COMPILER(NVRTC)
+};
+
+template <typename DelayConstructor>
+inline constexpr auto delay_constructor_policy_from_type = 0;
+
+template <unsigned int L2WriteLatency>
+inline constexpr auto delay_constructor_policy_from_type<no_delay_constructor_t<L2WriteLatency>> =
+  delay_constructor_policy{delay_constructor_kind::no_delay, 0, L2WriteLatency};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+inline constexpr auto delay_constructor_policy_from_type<fixed_delay_constructor_t<Delay, L2WriteLatency>> =
+  delay_constructor_policy{delay_constructor_kind::fixed_delay, Delay, L2WriteLatency};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+inline constexpr auto delay_constructor_policy_from_type<exponential_backoff_constructor_t<Delay, L2WriteLatency>> =
+  delay_constructor_policy{delay_constructor_kind::exponential_backoff, Delay, L2WriteLatency};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+inline constexpr auto
+  delay_constructor_policy_from_type<exponential_backoff_jitter_constructor_t<Delay, L2WriteLatency>> =
+    delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter, Delay, L2WriteLatency};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+inline constexpr auto
+  delay_constructor_policy_from_type<exponential_backoff_jitter_window_constructor_t<Delay, L2WriteLatency>> =
+    delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter_window, Delay, L2WriteLatency};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+inline constexpr auto
+  delay_constructor_policy_from_type<exponential_backon_jitter_window_constructor_t<Delay, L2WriteLatency>> =
+    delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, Delay, L2WriteLatency};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+inline constexpr auto delay_constructor_policy_from_type<exponential_backon_jitter_constructor_t<Delay, L2WriteLatency>> =
+  delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, Delay, L2WriteLatency};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+inline constexpr auto delay_constructor_policy_from_type<exponential_backon_constructor_t<Delay, L2WriteLatency>> =
+  delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency};
+
+template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
+struct delay_constructor_for;
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+struct delay_constructor_for<delay_constructor_kind::no_delay, Delay, L2WriteLatency>
+{
+  using type = no_delay_constructor_t<L2WriteLatency>;
+};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+struct delay_constructor_for<delay_constructor_kind::fixed_delay, Delay, L2WriteLatency>
+{
+  using type = fixed_delay_constructor_t<Delay, L2WriteLatency>;
+};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+struct delay_constructor_for<delay_constructor_kind::exponential_backoff, Delay, L2WriteLatency>
+{
+  using type = exponential_backoff_constructor_t<Delay, L2WriteLatency>;
+};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+struct delay_constructor_for<delay_constructor_kind::exponential_backoff_jitter, Delay, L2WriteLatency>
+{
+  using type = exponential_backoff_jitter_constructor_t<Delay, L2WriteLatency>;
+};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+struct delay_constructor_for<delay_constructor_kind::exponential_backoff_jitter_window, Delay, L2WriteLatency>
+{
+  using type = exponential_backoff_jitter_window_constructor_t<Delay, L2WriteLatency>;
+};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+struct delay_constructor_for<delay_constructor_kind::exponential_backon_jitter_window, Delay, L2WriteLatency>
+{
+  using type = exponential_backon_jitter_window_constructor_t<Delay, L2WriteLatency>;
+};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+struct delay_constructor_for<delay_constructor_kind::exponential_backon_jitter, Delay, L2WriteLatency>
+{
+  using type = exponential_backon_jitter_constructor_t<Delay, L2WriteLatency>;
+};
+
+template <unsigned int Delay, unsigned int L2WriteLatency>
+struct delay_constructor_for<delay_constructor_kind::exponential_backon, Delay, L2WriteLatency>
+{
+  using type = exponential_backon_constructor_t<Delay, L2WriteLatency>;
+};
+
+template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
+using delay_constructor_t = typename delay_constructor_for<Kind, Delay, L2WriteLatency>::type;
+} // namespace detail
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 65a962d5901..fe3134e60e9 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -1,5 +1,5 @@
 // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
-// SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 /**
@@ -45,7 +45,7 @@ CUB_NAMESPACE_BEGIN
 
 namespace detail::scan
 {
-template <typename MaxPolicyT,
+template <typename PolicySelector,
           typename InputIteratorT,
           typename OutputIteratorT,
           typename ScanOpT,
@@ -61,7 +61,7 @@ struct DeviceScanKernelSource
 
   CUB_DEFINE_KERNEL_GETTER(
     ScanKernel,
-    DeviceScanKernel<MaxPolicyT,
+    DeviceScanKernel<PolicySelector,
                      InputIteratorT,
                      OutputIteratorT,
                      ScanTileStateT,
@@ -81,6 +81,32 @@ struct DeviceScanKernelSource
     return ScanTileStateT();
   }
 };
+
+// TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API
+template <typename LegacyActivePolicy>
+_CCCL_API constexpr auto convert_policy() -> scan_policy
+{
+  using scan_policy_t = typename LegacyActivePolicy::ScanPolicyT;
+  return scan_policy{
+    scan_policy_t::BLOCK_THREADS,
+    scan_policy_t::ITEMS_PER_THREAD,
+    scan_policy_t::LOAD_ALGORITHM,
+    scan_policy_t::LOAD_MODIFIER,
+    scan_policy_t::STORE_ALGORITHM,
+    scan_policy_t::SCAN_ALGORITHM,
+    detail::delay_constructor_policy_from_type<typename scan_policy_t::detail::delay_constructor_t>};
+}
+
+// TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API
+template <typename PolicyHub>
+struct policy_selector_from_hub
+{
+  // this is only called in device code
+  _CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> scan_policy
+  {
+    return convert_policy<typename PolicyHub::MaxPolicy::ActivePolicy>();
+  }
+};
 } // namespace detail::scan
 
 /******************************************************************************
@@ -111,6 +137,7 @@ struct DeviceScanKernelSource
  *   Enum flag to specify whether to enforce inclusive scan.
  *
  */
+// TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0
 template <
   typename InputIteratorT,
   typename OutputIteratorT,
@@ -126,7 +153,7 @@ template <
   typename PolicyHub              = detail::scan::
     policy_hub<detail::it_value_t<InputIteratorT>, detail::it_value_t<OutputIteratorT>, AccumT, OffsetT, ScanOpT>,
   typename KernelSource = detail::scan::DeviceScanKernelSource<
-    typename PolicyHub::MaxPolicy,
+    detail::scan::policy_selector_from_hub<PolicyHub>,
     InputIteratorT,
     OutputIteratorT,
     ScanOpT,
@@ -216,6 +243,7 @@ struct DispatchScan
    * @param[in] max_policy
    *   Struct encoding chain of algorithm tuning policies
    */
+  // TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
@@ -404,6 +432,7 @@ struct DispatchScan
    * @param[in] max_policy
    *   Struct encoding chain of algorithm tuning policies
    */
+  // TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0
   template <typename MaxPolicyT = typename PolicyHub::MaxPolicy>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch(
     void* d_temp_storage,
@@ -444,4 +473,206 @@ struct DispatchScan
   }
 };
 
+namespace detail::scan
+{
+template <
+  typename InputIteratorT,
+  typename OutputIteratorT,
+  typename ScanOpT,
+  typename InitValueT,
+  typename OffsetT,
+  typename AccumT                 = ::cuda::std::__accumulator_t<ScanOpT,
+                                                                 cub::detail::it_value_t<InputIteratorT>,
+                                                                 ::cuda::std::_If<::cuda::std::is_same_v<InitValueT, NullType>,
+                                                                                  cub::detail::it_value_t<InputIteratorT>,
+                                                                                  typename InitValueT::value_type>>,
+  ForceInclusive EnforceInclusive = ForceInclusive::No,
+  typename PolicySelector         = policy_selector_from_types<detail::it_value_t<InputIteratorT>,
+                                                               detail::it_value_t<OutputIteratorT>,
+                                                               AccumT,
+                                                               OffsetT,
+                                                               ScanOpT>,
+  typename KernelSource =
+    DeviceScanKernelSource<PolicySelector, InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT, AccumT, EnforceInclusive>,
+  typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
+#if _CCCL_HAS_CONCEPTS()
+  requires scan_policy_selector<PolicySelector>
+#endif // _CCCL_HAS_CONCEPTS()
+CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
+  void* d_temp_storage,
+  size_t& temp_storage_bytes,
+  InputIteratorT d_in,
+  OutputIteratorT d_out,
+  ScanOpT scan_op,
+  InitValueT init_value,
+  OffsetT num_items,
+  cudaStream_t stream,
+  PolicySelector policy_selector         = {},
+  KernelSource kernel_source             = {},
+  KernelLauncherFactory launcher_factory = {}) -> cudaError_t
+{
+  static_assert(::cuda::std::is_unsigned_v<OffsetT> && sizeof(OffsetT) >= 4,
+                "DispatchScan only supports unsigned offset types of at least 4-bytes");
+
+  ::cuda::arch_id arch_id{};
+  if (const auto error = CubDebug(launcher_factory.PtxArchId(arch_id)))
+  {
+    return error;
+  }
+
+  const scan_policy active_policy = policy_selector(arch_id);
+
+  // Number of input tiles
+  const int tile_size = active_policy.block_threads * active_policy.items_per_thread;
+  const int num_tiles = static_cast<int>(::cuda::ceil_div(num_items, tile_size));
+
+  auto tile_state = kernel_source.TileState();
+
+  // Specify temporary storage allocation requirements
+  size_t allocation_sizes[1];
+  if (const auto error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0])))
+  {
+    return error; // bytes needed for tile status descriptors
+  }
+
+  // Compute allocation pointers into the single storage blob (or compute
+  // the necessary size of the blob)
+  void* allocations[1] = {};
+  if (const auto error =
+        CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
+  {
+    return error;
+  }
+
+  // Return if the caller is simply requesting the size of the storage allocation, or the problem is empty
+  if (d_temp_storage == nullptr || num_items == 0)
+  {
+    return cudaSuccess;
+  }
+
+  // Construct the tile status interface
+  if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])))
+  {
+    return error;
+  }
+
+  // Log init_kernel configuration
+  constexpr int init_kernel_threads = 128;
+  const int init_grid_size          = ::cuda::ceil_div(num_tiles, init_kernel_threads);
+
+#ifdef CUB_DEBUG_LOG
+  _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
+#endif // CUB_DEBUG_LOG
+
+  // Invoke init_kernel to initialize tile descriptors
+  launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true)
+    .doit(kernel_source.InitKernel(), tile_state, num_tiles);
+
+  // Check for failure to launch
+  if (const auto error = CubDebug(cudaPeekAtLastError()))
+  {
+    return error;
+  }
+
+  // Sync the stream if specified to flush runtime errors
+  if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
+  {
+    return error;
+  }
+
+  // Get SM occupancy for scan_kernel
+  int scan_sm_occupancy;
+  if (const auto error = CubDebug(
+        launcher_factory.MaxSmOccupancy(scan_sm_occupancy, kernel_source.ScanKernel(), active_policy.block_threads)))
+  {
+    return error;
+  }
+
+  // Get max x-dimension of grid
+  int max_dim_x;
+  if (const auto error = CubDebug(launcher_factory.MaxGridDimX(max_dim_x)))
+  {
+    return error;
+  }
+
+  // Run grids in epochs (in case number of tiles exceeds max x-dimension
+  const int scan_grid_size = ::cuda::std::min(num_tiles, max_dim_x);
+  for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
+  {
+// Log scan_kernel configuration
+#ifdef CUB_DEBUG_LOG
+    _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items "
+            "per thread, %d SM occupancy\n",
+            start_tile,
+            scan_grid_size,
+            active_policy.block_threads,
+            (long long) stream,
+            active_policy.items_per_thread,
+            scan_sm_occupancy);
+#endif // CUB_DEBUG_LOG
+
+    // Invoke scan_kernel
+    launcher_factory(scan_grid_size, active_policy.block_threads, 0, stream, /* use_pdl */ true)
+      .doit(kernel_source.ScanKernel(), d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items);
+
+    // Check for failure to launch
+    if (const auto error = CubDebug(cudaPeekAtLastError()))
+    {
+      return error;
+    }
+
+    // Sync the stream if specified to flush runtime errors
+    if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
+    {
+      return error;
+    }
+  }
+
+  return cudaSuccess;
+}
+
+template <
+  typename AccumT,
+  typename InputIteratorT,
+  typename OutputIteratorT,
+  typename ScanOpT,
+  typename InitValueT,
+  typename OffsetT,
+  ForceInclusive EnforceInclusive = ForceInclusive::No,
+  typename PolicySelector         = policy_selector_from_types<detail::it_value_t<InputIteratorT>,
+                                                               detail::it_value_t<OutputIteratorT>,
+                                                               AccumT,
+                                                               OffsetT,
+                                                               ScanOpT>,
+  typename KernelSource =
+    DeviceScanKernelSource<PolicySelector, InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT, AccumT, EnforceInclusive>,
+  typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
+CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_with_accum(
+  void* d_temp_storage,
+  size_t& temp_storage_bytes,
+  InputIteratorT d_in,
+  OutputIteratorT d_out,
+  ScanOpT scan_op,
+  InitValueT init_value,
+  OffsetT num_items,
+  cudaStream_t stream,
+  PolicySelector policy_selector         = {},
+  KernelSource kernel_source             = {},
+  KernelLauncherFactory launcher_factory = {}) -> cudaError_t
+{
+  return dispatch<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT, AccumT>(
+    d_temp_storage,
+    temp_storage_bytes,
+    d_in,
+    d_out,
+    scan_op,
+    init_value,
+    num_items,
+    stream,
+    policy_selector,
+    kernel_source,
+    launcher_factory);
+}
+} // namespace detail::scan
+
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index eb4cba0ada8..f0ee2a0a236 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 #pragma once
@@ -14,6 +14,7 @@
 #endif // no system header
 
 #include <cub/agent/agent_scan.cuh>
+#include <cub/device/dispatch/tuning/tuning_scan.cuh>
 #include <cub/util_macro.cuh>
 
 CUB_NAMESPACE_BEGIN
@@ -82,8 +83,8 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte
  * @brief Scan kernel entry point (multi-block)
  *
  *
- * @tparam ChainedPolicyT
- *   Chained tuning policy
+ * @tparam PolicySelector
+ *   Policy selector for tuning
  *
  * @tparam InputIteratorT
  *   Random-access input iterator type for reading scan inputs @iterator
@@ -126,7 +127,7 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte
  * @paramTotal num_items
  *   number of scan items for the entire problem
  */
-template <typename ChainedPolicyT,
+template <typename PolicySelector,
           typename InputIteratorT,
           typename OutputIteratorT,
           typename ScanTileStateT,
@@ -136,7 +137,7 @@ template <typename ChainedPolicyT,
           typename AccumT,
           bool ForceInclusive,
           typename RealInitValueT = typename InitValueT::value_type>
-__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
+__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel(
     InputIteratorT d_in,
     OutputIteratorT d_out,
@@ -146,7 +147,23 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
     InitValueT init_value,
     OffsetT num_items)
 {
-  using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT;
+  static constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
+  static_assert(policy.load_modifier != CacheLoadModifier::LOAD_LDG,
+                "The memory consistency model does not apply to texture "
+                "accesses");
+
+  using ScanPolicyT = AgentScanPolicy<
+    policy.block_threads,
+    policy.items_per_thread,
+    AccumT,
+    policy.load_algorithm,
+    policy.load_modifier,
+    policy.store_algorithm,
+    policy.scan_algorithm,
+    NoScaling<policy.block_threads, policy.items_per_thread>,
+    delay_constructor_t<policy.delay_constructor.kind,
+                        policy.delay_constructor.delay,
+                        policy.delay_constructor.l2_write_latency>>;
 
   // Thread block type for scanning input tiles
   using AgentScanT = detail::scan::AgentScan<
diff --git a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh
index 7ae1e0bde7c..a68a2cb746b 100644
--- a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 #pragma once
@@ -19,6 +19,7 @@
 #include <cub/agent/agent_radix_sort_upsweep.cuh>
 #include <cub/agent/agent_scan.cuh>
 #include <cub/device/dispatch/tuning/common.cuh>
+#include <cub/device/dispatch/tuning/tuning_scan.cuh>
 #include <cub/util_device.cuh>
 
 #include <cuda/__device/arch_id.h>
@@ -31,129 +32,8 @@
 CUB_NAMESPACE_BEGIN
 namespace detail::radix_sort
 {
-enum class delay_constructor_kind
-{
-  no_delay,
-  fixed_delay,
-  exponential_backoff,
-  exponential_backoff_jitter,
-  exponential_backoff_jitter_window,
-  exponential_backon_jitter_window,
-  exponential_backon_jitter,
-  exponential_backon
-};
-
-#if !_CCCL_COMPILER(NVRTC)
-inline ::std::ostream& operator<<(::std::ostream& os, delay_constructor_kind kind)
-{
-  switch (kind)
-  {
-    case delay_constructor_kind::no_delay:
-      return os << "delay_constructor_kind::no_delay";
-    case delay_constructor_kind::fixed_delay:
-      return os << "delay_constructor_kind::fixed_delay";
-    case delay_constructor_kind::exponential_backoff:
-      return os << "delay_constructor_kind::exponential_backoff";
-    case delay_constructor_kind::exponential_backoff_jitter:
-      return os << "delay_constructor_kind::exponential_backoff_jitter";
-    case delay_constructor_kind::exponential_backoff_jitter_window:
-      return os << "delay_constructor_kind::exponential_backoff_jitter_window";
-    case delay_constructor_kind::exponential_backon_jitter_window:
-      return os << "delay_constructor_kind::exponential_backon_jitter_window";
-    case delay_constructor_kind::exponential_backon_jitter:
-      return os << "delay_constructor_kind::exponential_backon_jitter";
-    case delay_constructor_kind::exponential_backon:
-      return os << "delay_constructor_kind::exponential_backon";
-    default:
-      return os << "<unknown delay_constructor_kind: " << static_cast<int>(kind) << ">";
-  }
-}
-#endif // !_CCCL_COMPILER(NVRTC)
-
-struct delay_constructor_policy
-{
-  delay_constructor_kind kind;
-  unsigned int delay;
-  unsigned int l2_write_latency;
-
-  _CCCL_API constexpr friend bool operator==(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs)
-  {
-    return lhs.kind == rhs.kind && lhs.delay == rhs.delay && lhs.l2_write_latency == rhs.l2_write_latency;
-  }
-
-  _CCCL_API constexpr friend bool operator!=(const delay_constructor_policy& lhs, const delay_constructor_policy& rhs)
-  {
-    return !(lhs == rhs);
-  }
-
-#if !_CCCL_COMPILER(NVRTC)
-  friend ::std::ostream& operator<<(::std::ostream& os, const delay_constructor_policy& p)
-  {
-    return os << "delay_constructor_policy { .kind = " << p.kind << ", .delay = " << p.delay
-              << ", .l2_write_latency = " << p.l2_write_latency << " }";
-  }
-#endif // !_CCCL_COMPILER(NVRTC)
-};
-
-template <typename DelayConstructor>
-inline constexpr auto delay_constructor_policy_from_type = 0;
-
-template <unsigned int L2WriteLatency>
-inline constexpr auto delay_constructor_policy_from_type<no_delay_constructor_t<L2WriteLatency>> =
-  delay_constructor_policy{delay_constructor_kind::no_delay, 0, L2WriteLatency};
-
-template <unsigned int Delay, unsigned int L2WriteLatency>
-inline constexpr auto delay_constructor_policy_from_type<fixed_delay_constructor_t<Delay, L2WriteLatency>> =
-  delay_constructor_policy{delay_constructor_kind::fixed_delay, Delay, L2WriteLatency};
-
-template <unsigned int Delay, unsigned int L2WriteLatency>
-inline constexpr auto delay_constructor_policy_from_type<exponential_backoff_constructor_t<Delay, L2WriteLatency>> =
-  delay_constructor_policy{delay_constructor_kind::exponential_backoff, Delay, L2WriteLatency};
-
-template <unsigned int Delay, unsigned int L2WriteLatency>
-inline constexpr auto
-  delay_constructor_policy_from_type<exponential_backoff_jitter_constructor_t<Delay, L2WriteLatency>> =
-    delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter, Delay, L2WriteLatency};
-
-template <unsigned int Delay, unsigned int L2WriteLatency>
-inline constexpr auto
-  delay_constructor_policy_from_type<exponential_backoff_jitter_window_constructor_t<Delay, L2WriteLatency>> =
-    delay_constructor_policy{delay_constructor_kind::exponential_backoff_jitter_window, Delay, L2WriteLatency};
-
-template <unsigned int Delay, unsigned int L2WriteLatency>
-inline constexpr auto
-  delay_constructor_policy_from_type<exponential_backon_jitter_window_constructor_t<Delay, L2WriteLatency>> =
-    delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, Delay, L2WriteLatency};
-
-template <unsigned int Delay, unsigned int L2WriteLatency>
-inline constexpr auto delay_constructor_policy_from_type<exponential_backon_jitter_constructor_t<Delay, L2WriteLatency>> =
-  delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, Delay, L2WriteLatency};
-
-template <unsigned int Delay, unsigned int L2WriteLatency>
-inline constexpr auto delay_constructor_policy_from_type<exponential_backon_constructor_t<Delay, L2WriteLatency>> =
-  delay_constructor_policy{delay_constructor_kind::exponential_backon, Delay, L2WriteLatency};
-
-// TODO(bgruber): this is modeled after <look_back_helper.cuh>, unify this
-template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
-struct __delay_constructor_t_helper
-{
-private:
-  using delay_constructors = ::cuda::std::__type_list<
-    detail::no_delay_constructor_t<L2WriteLatency>,
-    detail::fixed_delay_constructor_t<Delay, L2WriteLatency>,
-    detail::exponential_backoff_constructor_t<Delay, L2WriteLatency>,
-    detail::exponential_backoff_jitter_constructor_t<Delay, L2WriteLatency>,
-    detail::exponential_backoff_jitter_window_constructor_t<Delay, L2WriteLatency>,
-    detail::exponential_backon_jitter_window_constructor_t<Delay, L2WriteLatency>,
-    detail::exponential_backon_jitter_constructor_t<Delay, L2WriteLatency>,
-    detail::exponential_backon_constructor_t<Delay, L2WriteLatency>>;
-
-public:
-  using type = ::cuda::std::__type_at_c<static_cast<int>(Kind), delay_constructors>;
-};
-
-template <delay_constructor_kind Kind, unsigned int Delay, unsigned int L2WriteLatency>
-using delay_constructor_t = typename __delay_constructor_t_helper<Kind, Delay, L2WriteLatency>::type;
+using detail::scan::make_mem_scaled_scan_policy;
+using detail::scan::scan_policy;
 
 struct radix_sort_histogram_policy
 {
@@ -268,63 +148,6 @@ _CCCL_API constexpr auto make_reg_scaled_radix_sort_onesweep_policy(
     store_algorithm};
 }
 
-// TODO(bgruber): move this into the scan tuning header
-struct scan_policy
-{
-  int block_threads;
-  int items_per_thread;
-  BlockLoadAlgorithm load_algorithm;
-  CacheLoadModifier load_modifier;
-  BlockStoreAlgorithm store_algorithm;
-  BlockScanAlgorithm scan_algorithm;
-  delay_constructor_policy delay_constructor;
-
-  _CCCL_API constexpr friend bool operator==(const scan_policy& lhs, const scan_policy& rhs)
-  {
-    return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread
-        && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier
-        && lhs.store_algorithm == rhs.store_algorithm && lhs.scan_algorithm == rhs.scan_algorithm
-        && lhs.delay_constructor == rhs.delay_constructor;
-  }
-
-  _CCCL_API constexpr friend bool operator!=(const scan_policy& lhs, const scan_policy& rhs)
-  {
-    return !(lhs == rhs);
-  }
-
-#if !_CCCL_COMPILER(NVRTC)
-  friend ::std::ostream& operator<<(::std::ostream& os, const scan_policy& p)
-  {
-    return os
-        << "scan_policy { .block_threads = " << p.block_threads << ", .items_per_thread = " << p.items_per_thread
-        << ", .load_algorithm = " << p.load_algorithm << ", .load_modifier = " << p.load_modifier
-        << ", .store_algorithm = " << p.store_algorithm << ", .scan_algorithm = " << p.scan_algorithm
-        << ", .delay_constructor = " << p.delay_constructor << " }";
-  }
-#endif // !_CCCL_COMPILER(NVRTC)
-};
-
-_CCCL_API constexpr auto make_mem_scaled_scan_policy(
-  int nominal_4b_block_threads,
-  int nominal_4b_items_per_thread,
-  int compute_t_size,
-  BlockLoadAlgorithm load_algorithm,
-  CacheLoadModifier load_modifier,
-  BlockStoreAlgorithm store_algorithm,
-  BlockScanAlgorithm scan_algorithm,
-  delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}) -> scan_policy
-{
-  const auto scaled = scale_mem_bound(nominal_4b_block_threads, nominal_4b_items_per_thread, compute_t_size);
-  return scan_policy{
-    scaled.block_threads,
-    scaled.items_per_thread,
-    load_algorithm,
-    load_modifier,
-    store_algorithm,
-    scan_algorithm,
-    delay_constructor};
-}
-
 struct radix_sort_downsweep_policy
 {
   int block_threads;
@@ -1778,7 +1601,7 @@ struct policy_selector
     return ::cuda::std::max(value_size, key_size);
   }
 
-  [[nodiscard]] _CCCL_API constexpr auto make_onsweep_small_key_policy(const small_key_tuning_values& tuning) const
+  [[nodiscard]] _CCCL_API constexpr auto make_onesweep_small_key_policy(const small_key_tuning_values& tuning) const
     -> radix_sort_policy
   {
     const int primary_radix_bits     = (key_size > 1) ? 7 : 5;
@@ -1921,12 +1744,12 @@ struct policy_selector
 
     if (arch >= ::cuda::arch_id::sm_100)
     {
-      return make_onsweep_small_key_policy(get_sm100_tuning(key_size, value_size, offset_size, key_type));
+      return make_onesweep_small_key_policy(get_sm100_tuning(key_size, value_size, offset_size, key_type));
     }
 
     if (arch >= ::cuda::arch_id::sm_90)
     {
-      return make_onsweep_small_key_policy(get_sm90_tuning(key_size, value_size, offset_size));
+      return make_onesweep_small_key_policy(get_sm90_tuning(key_size, value_size, offset_size));
     }
 
     if (arch >= ::cuda::arch_id::sm_80)
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 986cc800872..7db16f81d65 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 #pragma once
@@ -18,16 +18,22 @@
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
+#include <cub/detail/delay_constructor.cuh>
 #include <cub/device/dispatch/tuning/common.cuh>
 #include <cub/thread/thread_load.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/__device/arch_id.h>
 #include <cuda/std/__functional/invoke.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/void_t.h>
 
+#if !_CCCL_COMPILER(NVRTC)
+#  include <ostream>
+#endif
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail::scan
@@ -114,6 +120,62 @@ constexpr _CCCL_HOST_DEVICE offset_size classify_offset_size()
   return sizeof(OffsetT) == 4 ? offset_size::_4 : sizeof(OffsetT) == 8 ? offset_size::_8 : offset_size::unknown;
 }
 
+struct scan_policy
+{
+  int block_threads;
+  int items_per_thread;
+  BlockLoadAlgorithm load_algorithm;
+  CacheLoadModifier load_modifier;
+  BlockStoreAlgorithm store_algorithm;
+  BlockScanAlgorithm scan_algorithm;
+  delay_constructor_policy delay_constructor;
+
+  _CCCL_API constexpr friend bool operator==(const scan_policy& lhs, const scan_policy& rhs)
+  {
+    return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread
+        && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier
+        && lhs.store_algorithm == rhs.store_algorithm && lhs.scan_algorithm == rhs.scan_algorithm
+        && lhs.delay_constructor == rhs.delay_constructor;
+  }
+
+  _CCCL_API constexpr friend bool operator!=(const scan_policy& lhs, const scan_policy& rhs)
+  {
+    return !(lhs == rhs);
+  }
+
+#if !_CCCL_COMPILER(NVRTC)
+  friend ::std::ostream& operator<<(::std::ostream& os, const scan_policy& p)
+  {
+    return os
+        << "scan_policy { .block_threads = " << p.block_threads << ", .items_per_thread = " << p.items_per_thread
+        << ", .load_algorithm = " << p.load_algorithm << ", .load_modifier = " << p.load_modifier
+        << ", .store_algorithm = " << p.store_algorithm << ", .scan_algorithm = " << p.scan_algorithm
+        << ", .delay_constructor = " << p.delay_constructor << " }";
+  }
+#endif // !_CCCL_COMPILER(NVRTC)
+};
+
+_CCCL_API constexpr auto make_mem_scaled_scan_policy(
+  int nominal_4b_block_threads,
+  int nominal_4b_items_per_thread,
+  int compute_t_size,
+  BlockLoadAlgorithm load_algorithm,
+  CacheLoadModifier load_modifier,
+  BlockStoreAlgorithm store_algorithm,
+  BlockScanAlgorithm scan_algorithm,
+  delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}) -> scan_policy
+{
+  const auto scaled = scale_mem_bound(nominal_4b_block_threads, nominal_4b_items_per_thread, compute_t_size);
+  return scan_policy{
+    scaled.block_threads,
+    scaled.items_per_thread,
+    load_algorithm,
+    load_modifier,
+    store_algorithm,
+    scan_algorithm,
+    delay_constructor};
+}
+
 template <class ValueT,
           class AccumT,
           class OffsetT,
@@ -217,6 +279,7 @@ struct sm80_tuning<type_t::uint128, primitive_op::yes, primitive_accum::no, accu
 {};
 #endif
 
+// TODO(griwes): remove for CCCL 4.0 when we drop the public scan dispatcher
 template <class AccumT,
           primitive_op PrimitiveOp,
           primitive_accum PrimitiveAccumulator = is_primitive_accum<AccumT>(),
@@ -255,6 +318,7 @@ struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_si
 #endif
 // clang-format on
 
+// TODO(griwes): remove for CCCL 4.0 when we drop the public scan dispatcher
 template <class ValueT,
           class AccumT,
           class OffsetT,
@@ -436,6 +500,7 @@ _CCCL_HOST_DEVICE ScanPolicyWrapper<PolicyT> MakeScanPolicyWrapper(PolicyT polic
   return ScanPolicyWrapper<PolicyT>{policy};
 }
 
+// TODO(griwes): remove this in CCCL 4.0 when we remove the public scan dispatcher
 template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
 struct policy_hub
 {
@@ -566,6 +631,418 @@ struct policy_hub
 
   using MaxPolicy = Policy1000;
 };
+
+#if _CCCL_HAS_CONCEPTS()
+template <typename T>
+concept scan_policy_selector = policy_selector<T, scan_policy>;
+#endif // _CCCL_HAS_CONCEPTS()
+
+constexpr _CCCL_HOST_DEVICE delay_constructor_policy default_delay_constructor_policy(primitive_accum primitive)
+{
+  return primitive == primitive_accum::yes
+         ? delay_constructor_policy{delay_constructor_kind::fixed_delay, 350, 450}
+         : delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450};
+}
+
+struct policy_selector
+{
+  int input_value_size;
+  int output_value_size;
+  int accum_size;
+  int offset_size;
+  type_t accum_type;
+  op_kind_t operation_t;
+  primitive_accum primitive_accum_t;
+  primitive_op primitive_op_t;
+  bool benchmark_match;
+
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy
+  {
+    const bool large_values = accum_size > 128;
+    const BlockLoadAlgorithm scan_transposed_load =
+      large_values ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE;
+    const BlockStoreAlgorithm scan_transposed_store =
+      large_values ? BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE;
+    const auto default_delay = default_delay_constructor_policy(primitive_accum_t);
+
+    if (arch >= ::cuda::arch_id::sm_100)
+    {
+      if (benchmark_match && operation_t == op_kind_t::plus && primitive_accum_t == primitive_accum::yes)
+      {
+        if (offset_size == 4)
+        {
+          switch (input_value_size)
+          {
+            case 1:
+              // ipt_18.tpb_512.ns_768.dcid_7.l2w_820.trp_1.ld_0 1.188818  1.005682  1.173041  1.305288
+              return make_mem_scaled_scan_policy(
+                512,
+                18,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::exponential_backon, 768, 820});
+            case 2:
+              // ipt_13.tpb_512.ns_1384.dcid_7.l2w_720.trp_1.ld_0 1.128443  1.002841  1.119688  1.307692
+              return make_mem_scaled_scan_policy(
+                512,
+                13,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::exponential_backon, 1384, 720});
+            case 4:
+              // ipt_22.tpb_384.ns_1904.dcid_6.l2w_830.trp_1.ld_0 1.148442  0.997167  1.139902  1.462651
+              return make_mem_scaled_scan_policy(
+                384,
+                22,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, 1904, 830});
+            case 8:
+              // ipt_23.tpb_416.ns_772.dcid_5.l2w_710.trp_1.ld_0 1.089468  1.015581  1.085630  1.264583
+              return make_mem_scaled_scan_policy(
+                416,
+                23,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, 772, 710});
+            default:
+              break;
+          }
+        }
+        else if (offset_size == 8)
+        {
+          switch (input_value_size)
+          {
+            case 1:
+              // ipt_14.tpb_384.ns_228.dcid_7.l2w_775.trp_1.ld_1 1.107210  1.000000  1.100637  1.307692
+              return make_mem_scaled_scan_policy(
+                384,
+                14,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_CA,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::exponential_backon, 228, 775});
+            case 2:
+              // todo(gonidelis): Regresses for large inputs. Find better tuning.
+              // ipt_13.tpb_288.ns_1520.dcid_5.l2w_895.trp_1.ld_1 1.080934  0.983509  1.077724  1.305288
+              break;
+            case 4:
+              // ipt_19.tpb_416.ns_956.dcid_7.l2w_550.trp_1.ld_1 1.146142  0.994350  1.137459  1.455636
+              return make_mem_scaled_scan_policy(
+                416,
+                19,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_CA,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::exponential_backon, 956, 550});
+            case 8:
+              if (accum_type == type_t::float64)
+              {
+                break;
+              }
+              // ipt_22.tpb_320.ns_328.dcid_2.l2w_965.trp_1.ld_0 1.080133  1.000000  1.075577  1.248963
+              return make_mem_scaled_scan_policy(
+                320,
+                22,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::exponential_backoff, 328, 965});
+            default:
+              break;
+          }
+        }
+      }
+    }
+
+    if (arch >= ::cuda::arch_id::sm_90)
+    {
+      if (primitive_op_t == primitive_op::yes)
+      {
+        if (primitive_accum_t == primitive_accum::yes)
+        {
+          switch (accum_size)
+          {
+            case 1:
+              return make_mem_scaled_scan_policy(
+                192,
+                22,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::fixed_delay, 168, 1140});
+            case 2:
+              return make_mem_scaled_scan_policy(
+                512,
+                12,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::fixed_delay, 376, 1125});
+            case 4:
+              if (accum_type == type_t::float32)
+              {
+                return make_mem_scaled_scan_policy(
+                  128,
+                  24,
+                  accum_size,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_STORE_WARP_TRANSPOSE,
+                  BLOCK_SCAN_WARP_SCANS,
+                  delay_constructor_policy{delay_constructor_kind::fixed_delay, 688, 1140});
+              }
+              return make_mem_scaled_scan_policy(
+                128,
+                24,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::fixed_delay, 648, 1245});
+            case 8:
+              if (accum_type == type_t::float64)
+              {
+                return make_mem_scaled_scan_policy(
+                  224,
+                  24,
+                  accum_size,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_STORE_WARP_TRANSPOSE,
+                  BLOCK_SCAN_WARP_SCANS,
+                  delay_constructor_policy{delay_constructor_kind::fixed_delay, 576, 1215});
+              }
+              return make_mem_scaled_scan_policy(
+                224,
+                24,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::fixed_delay, 632, 1290});
+            default:
+              break;
+          }
+        }
+
+#if _CCCL_HAS_INT128()
+        if (primitive_accum_t == primitive_accum::no && accum_size == 16
+            && (accum_type == type_t::int128 || accum_type == type_t::uint128))
+        {
+          return make_mem_scaled_scan_policy(
+            576,
+            21,
+            accum_size,
+            BLOCK_LOAD_WARP_TRANSPOSE,
+            LOAD_DEFAULT,
+            BLOCK_STORE_WARP_TRANSPOSE,
+            BLOCK_SCAN_WARP_SCANS,
+            delay_constructor_policy{delay_constructor_kind::fixed_delay, 860, 630});
+        }
+#endif
+      }
+    }
+
+    if (arch >= ::cuda::arch_id::sm_80)
+    {
+      if (primitive_op_t == primitive_op::yes)
+      {
+        if (primitive_accum_t == primitive_accum::yes)
+        {
+          switch (accum_size)
+          {
+            case 1:
+              return make_mem_scaled_scan_policy(
+                320,
+                14,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::fixed_delay, 368, 725});
+            case 2:
+              return make_mem_scaled_scan_policy(
+                352,
+                16,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::fixed_delay, 488, 1040});
+            case 4:
+              if (accum_type == type_t::float32)
+              {
+                return make_mem_scaled_scan_policy(
+                  288,
+                  8,
+                  accum_size,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_STORE_WARP_TRANSPOSE,
+                  BLOCK_SCAN_WARP_SCANS,
+                  delay_constructor_policy{delay_constructor_kind::fixed_delay, 724, 1050});
+              }
+              return make_mem_scaled_scan_policy(
+                320,
+                12,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::fixed_delay, 268, 1180});
+            case 8:
+              if (accum_type == type_t::float64)
+              {
+                return make_mem_scaled_scan_policy(
+                  384,
+                  12,
+                  accum_size,
+                  BLOCK_LOAD_WARP_TRANSPOSE,
+                  LOAD_DEFAULT,
+                  BLOCK_STORE_WARP_TRANSPOSE,
+                  BLOCK_SCAN_WARP_SCANS,
+                  delay_constructor_policy{delay_constructor_kind::fixed_delay, 388, 1100});
+              }
+              return make_mem_scaled_scan_policy(
+                288,
+                22,
+                accum_size,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS,
+                delay_constructor_policy{delay_constructor_kind::fixed_delay, 716, 785});
+            default:
+              break;
+          }
+        }
+
+#if _CCCL_HAS_INT128()
+        if (primitive_accum_t == primitive_accum::no && accum_size == 16
+            && (accum_type == type_t::int128 || accum_type == type_t::uint128))
+        {
+          return make_mem_scaled_scan_policy(
+            640,
+            24,
+            accum_size,
+            BLOCK_LOAD_DIRECT,
+            LOAD_DEFAULT,
+            BLOCK_STORE_DIRECT,
+            BLOCK_SCAN_WARP_SCANS,
+            delay_constructor_policy{delay_constructor_kind::no_delay, 0, 1200});
+        }
+#endif
+      }
+    }
+
+    if (arch >= ::cuda::arch_id::sm_75)
+    {
+      if (benchmark_match && operation_t == op_kind_t::plus && primitive_accum_t == primitive_accum::yes
+          && offset_size == 8 && input_value_size == 4)
+      {
+        // ipt_7.tpb_128.ns_628.dcid_1.l2w_520.trp_1.ld_0
+        return make_mem_scaled_scan_policy(
+          128,
+          7,
+          accum_size,
+          BLOCK_LOAD_WARP_TRANSPOSE,
+          LOAD_DEFAULT,
+          BLOCK_STORE_WARP_TRANSPOSE,
+          BLOCK_SCAN_WARP_SCANS,
+          delay_constructor_policy{delay_constructor_kind::fixed_delay, 628, 520});
+      }
+
+      return make_mem_scaled_scan_policy(
+        128,
+        15,
+        accum_size,
+        scan_transposed_load,
+        LOAD_DEFAULT,
+        scan_transposed_store,
+        BLOCK_SCAN_WARP_SCANS,
+        default_delay);
+    }
+
+    if (arch >= ::cuda::arch_id::sm_60)
+    {
+      return make_mem_scaled_scan_policy(
+        128,
+        15,
+        accum_size,
+        scan_transposed_load,
+        LOAD_DEFAULT,
+        scan_transposed_store,
+        BLOCK_SCAN_WARP_SCANS,
+        default_delay);
+    }
+
+    return make_mem_scaled_scan_policy(
+      128,
+      12,
+      accum_size,
+      BLOCK_LOAD_DIRECT,
+      LOAD_CA,
+      BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
+      BLOCK_SCAN_RAKING,
+      default_delay);
+  }
+};
+
+#if _CCCL_HAS_CONCEPTS()
+static_assert(scan_policy_selector<policy_selector>);
+#endif // _CCCL_HAS_CONCEPTS()
+
+// stateless version which can be passed to kernels
+template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
+struct policy_selector_from_types
+{
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy
+  {
+    constexpr bool benchmark_match =
+      sizeof(AccumT) == sizeof(::cuda::std::__accumulator_t<ScanOpT, InputValueT, InputValueT>)
+      && sizeof(InputValueT) == sizeof(OutputValueT);
+
+    constexpr auto policies = policy_selector{
+      int{sizeof(InputValueT)},
+      int{sizeof(OutputValueT)},
+      int{sizeof(AccumT)},
+      int{sizeof(OffsetT)},
+      classify_type<AccumT>,
+      classify_op<ScanOpT>,
+      is_primitive_accum<AccumT>(),
+      is_primitive_op<ScanOpT>(),
+      benchmark_match};
+    return policies(arch);
+  }
+};
 } // namespace detail::scan
 
 CUB_NAMESPACE_END
diff --git a/nvbench_helper/nvbench_helper/look_back_helper.cuh b/nvbench_helper/nvbench_helper/look_back_helper.cuh
index bee6708a781..f7a97937dcb 100644
--- a/nvbench_helper/nvbench_helper/look_back_helper.cuh
+++ b/nvbench_helper/nvbench_helper/look_back_helper.cuh
@@ -1,10 +1,10 @@
-// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 #pragma once
 
 #if !TUNE_BASE
-#  include <cub/agent/single_pass_scan_operators.cuh>
+#  include <cub/detail/delay_constructor.cuh>
 
 #  include <nvbench_helper.cuh>
 
@@ -12,15 +12,8 @@
 #    error "TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS, and TUNE_DELAY_CONSTRUCTOR_ID must be defined"
 #  endif
 
-using delay_constructors = nvbench::type_list<
-  cub::detail::no_delay_constructor_t<TUNE_L2_WRITE_LATENCY_NS>,
-  cub::detail::fixed_delay_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
-  cub::detail::exponential_backoff_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
-  cub::detail::exponential_backoff_jitter_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
-  cub::detail::exponential_backoff_jitter_window_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
-  cub::detail::exponential_backon_jitter_window_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
-  cub::detail::exponential_backon_jitter_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
-  cub::detail::exponential_backon_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>>;
-
-using delay_constructor_t = nvbench::tl::get<TUNE_DELAY_CONSTRUCTOR_ID, delay_constructors>;
+using delay_constructor_t =
+  cub::detail::delay_constructor_t<static_cast<cub::detail::delay_constructor_kind>(TUNE_DELAY_CONSTRUCTOR_ID),
+                                   TUNE_MAGIC_NS,
+                                   TUNE_L2_WRITE_LATENCY_NS>;
 #endif // !TUNE_BASE

From 637133997524a88797a9413c6518ab53ca5af8ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Sat, 7 Feb 2026 21:19:20 -0800
Subject: [PATCH 02/31] Update benchmarks.

---
 .../applications/P1/log-cdf-from-log-pdfs.cu  | 60 ++++++++---------
 .../P1/non-commutative-bicyclic-monoid.cu     | 60 ++++++++---------
 .../P1/rabin-karp-second-fingerprinting.cu    | 53 +++++++--------
 .../scan/applications/P1/running-min-max.cu   | 59 ++++++++--------
 .../P1/scan-over-unitriangular-group.cu       | 60 ++++++++---------
 cub/benchmarks/bench/scan/exclusive/base.cuh  | 67 +++++--------------
 cub/benchmarks/bench/scan/exclusive/by_key.cu |  2 +-
 cub/benchmarks/bench/scan/policy_selector.h   | 31 +++++++++
 8 files changed, 192 insertions(+), 200 deletions(-)
 create mode 100644 cub/benchmarks/bench/scan/policy_selector.h

diff --git a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu
index f6acf8ae6c0..92f38e31892 100644
--- a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu
+++ b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu
@@ -34,26 +34,10 @@
 #  elif TUNE_LOAD == 1
 #    define TUNE_LOAD_MODIFIER cub::LOAD_CA
 #  endif // TUNE_LOAD
-
-struct policy_hub_t
-{
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
-  {
-    using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy<
-      TUNE_THREADS,
-      TUNE_ITEMS,
-      // TODO Tune
-      TUNE_LOAD_ALGORITHM,
-      TUNE_LOAD_MODIFIER,
-      cub::BLOCK_SCAN_WARP_SCANS,
-      TUNE_STORE_ALGORITHM,
-      delay_constructor_t>;
-  };
-
-  using MaxPolicy = policy_t;
-};
 #endif // !TUNE_BASE
 
+#include "../../policy_selector.h"
+
 namespace impl
 {
 /*
@@ -125,15 +109,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list<FloatingPoi
   using op_t           = impl::log_add_plus;
   using accum_t        = value_t;
 
-#if !TUNE_BASE
-  using policy_t = policy_hub_t<accum_t>;
-  using dispatch_t =
-    cub::DispatchScan<input_t, output_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No, policy_t>;
-#else
-  using dispatch_t =
-    cub::DispatchScan<input_t, output_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No>;
-#endif
-
   const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   auto mu             = static_cast<value_t>(state.get_float64("Mu{io}"));
 
@@ -161,14 +136,39 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list<FloatingPoi
   state.add_global_memory_writes<value_t>(elements);
 
   size_t tmp_size;
-  dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream);
+  cub::detail::scan::dispatch_with_accum<accum_t>(
+    nullptr,
+    tmp_size,
+    d_input,
+    d_output,
+    op_t{},
+    wrapped_init_t{},
+    input.size(),
+    bench_stream
+#if !TUNE_BASE
+    ,
+    policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+  );
 
   thrust::device_vector<nvbench::uint8_t> tmp(tmp_size, thrust::no_init);
   nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data());
 
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
-      d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream());
+    cub::detail::scan::dispatch_with_accum<accum_t>(
+      d_tmp,
+      tmp_size,
+      d_input,
+      d_output,
+      op_t{},
+      wrapped_init_t{},
+      input.size(),
+      launch.get_stream()
+#if !TUNE_BASE
+        ,
+      policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+    );
   });
 
   // for validation, use
diff --git a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu
index 07eb27be154..8ab3018ddc0 100644
--- a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu
+++ b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu
@@ -30,26 +30,10 @@
 #  elif TUNE_LOAD == 1
 #    define TUNE_LOAD_MODIFIER cub::LOAD_CA
 #  endif // TUNE_LOAD
-
-struct policy_hub_t
-{
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
-  {
-    using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy<
-      TUNE_THREADS,
-      TUNE_ITEMS,
-      // TODO Tune
-      TUNE_LOAD_ALGORITHM,
-      TUNE_LOAD_MODIFIER,
-      cub::BLOCK_SCAN_WARP_SCANS,
-      TUNE_STORE_ALGORITHM,
-      delay_constructor_t>;
-  };
-
-  using MaxPolicy = policy_t;
-};
 #endif // !TUNE_BASE
 
+#include "../../policy_selector.h"
+
 namespace impl
 {
 /* Consider free monoid with two generators, ``q`` and ``p``, modulo defining relationship (``p * q == 1``).
@@ -112,15 +96,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list<T, OffsetT>
   using output_it_t    = pair_t*;
   using offset_t       = cub::detail::choose_offset_t<OffsetT>;
 
-#if !TUNE_BASE
-  using policy_t   = policy_hub_t<accum_t>;
-  using dispatch_t = cub::
-    DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No, policy_t>;
-#else
-  using dispatch_t =
-    cub::DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No>;
-#endif
-
   const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
 
   thrust::device_vector<pair_t> output(elements);
@@ -148,14 +123,39 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list<T, OffsetT>
   cudaStream_t bench_stream = state.get_cuda_stream();
 
   size_t tmp_size;
-  dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream);
+  cub::detail::scan::dispatch_with_accum<accum_t>(
+    nullptr,
+    tmp_size,
+    d_input,
+    d_output,
+    op_t{},
+    wrapped_init_t{},
+    input.size(),
+    bench_stream
+#if !TUNE_BASE
+    ,
+    policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+  );
 
   thrust::device_vector<nvbench::uint8_t> tmp(tmp_size, thrust::no_init);
   nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data());
 
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
-      d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream());
+    cub::detail::scan::dispatch_with_accum<accum_t>(
+      d_tmp,
+      tmp_size,
+      d_input,
+      d_output,
+      op_t{},
+      wrapped_init_t{},
+      input.size(),
+      launch.get_stream()
+#if !TUNE_BASE
+        ,
+      policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+    );
   });
 }
 
diff --git a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu
index 579ef22d211..89421692c6b 100644
--- a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu
+++ b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu
@@ -35,26 +35,10 @@
 #  elif TUNE_LOAD == 1
 #    define TUNE_LOAD_MODIFIER cub::LOAD_CA
 #  endif // TUNE_LOAD
-
-struct policy_hub_t
-{
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
-  {
-    using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy<
-      TUNE_THREADS,
-      TUNE_ITEMS,
-      // TODO Tune
-      TUNE_LOAD_ALGORITHM,
-      TUNE_LOAD_MODIFIER,
-      cub::BLOCK_SCAN_WARP_SCANS,
-      TUNE_STORE_ALGORITHM,
-      delay_constructor_t>;
-  };
-
-  using MaxPolicy = policy_t;
-};
 #endif // !TUNE_BASE
 
+#include "../../policy_selector.h"
+
 namespace impl
 {
 /* Denote epsilon, the identity element, be an empty sequence, and consider
@@ -307,15 +291,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list<BitsetT, Of
 
   using ZpT = impl::ZpT;
 
-#if !TUNE_BASE
-  using policy_t   = policy_hub_t<accum_t>;
-  using dispatch_t = cub::
-    DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No, policy_t>;
-#else
-  using dispatch_t =
-    cub::DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No>;
-#endif
-
   const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
 
   thrust::device_vector<input_t> input = generate(elements);
@@ -337,13 +312,26 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list<BitsetT, Of
   cudaStream_t bench_stream = state.get_cuda_stream().get_stream();
 
   size_t tmp_size;
-  dispatch_t::Dispatch(nullptr, tmp_size, inp_it, out_it, op_t{p}, wrapped_init_t{}, input.size(), bench_stream);
+  cub::detail::scan::dispatch_with_accum<accum_t>(
+    nullptr,
+    tmp_size,
+    inp_it,
+    out_it,
+    op_t{p},
+    wrapped_init_t{},
+    input.size(),
+    bench_stream
+#if !TUNE_BASE
+    ,
+    policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+  );
 
   thrust::device_vector<nvbench::uint8_t> tmp(tmp_size, thrust::no_init);
   nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data());
 
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
+    cub::detail::scan::dispatch_with_accum<accum_t>(
       d_tmp,
       tmp_size,
       inp_it,
@@ -351,7 +339,12 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list<BitsetT, Of
       op_t{p},
       wrapped_init_t{},
       input.size(),
-      launch.get_stream());
+      launch.get_stream()
+#if !TUNE_BASE
+        ,
+      policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+    );
   });
 
   // for validation uncomment these two lines
diff --git a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu
index cf37c7f6312..1ae88d78742 100644
--- a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu
+++ b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu
@@ -37,26 +37,10 @@
 #  elif TUNE_LOAD == 1
 #    define TUNE_LOAD_MODIFIER cub::LOAD_CA
 #  endif // TUNE_LOAD
-
-struct policy_hub_t
-{
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
-  {
-    using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy<
-      TUNE_THREADS,
-      TUNE_ITEMS,
-      // TODO Tune
-      TUNE_LOAD_ALGORITHM,
-      TUNE_LOAD_MODIFIER,
-      cub::BLOCK_SCAN_WARP_SCANS,
-      TUNE_STORE_ALGORITHM,
-      delay_constructor_t>;
-  };
-
-  using MaxPolicy = policy_t;
-};
 #endif // !TUNE_BASE
 
+#include "../../policy_selector.h"
+
 namespace impl
 {
 /* Given input sequence of values, compute sequence of
@@ -215,15 +199,6 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   using output_it_t    = pair_t*;
   using offset_t       = cub::detail::choose_offset_t<OffsetT>;
 
-#if !TUNE_BASE
-  using policy_t   = policy_hub_t<accum_t>;
-  using dispatch_t = cub::
-    DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No, policy_t>;
-#else
-  using dispatch_t =
-    cub::DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No>;
-#endif
-
   const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
 
   thrust::device_vector<pair_t> output(elements);
@@ -241,13 +216,39 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   cudaStream_t bench_stream = state.get_cuda_stream().get_stream();
 
   size_t tmp_size;
-  dispatch_t::Dispatch(nullptr, tmp_size, inp_it, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream);
+  cub::detail::scan::dispatch_with_accum<accum_t>(
+    nullptr,
+    tmp_size,
+    inp_it,
+    d_output,
+    op_t{},
+    wrapped_init_t{},
+    input.size(),
+    bench_stream
+#if !TUNE_BASE
+    ,
+    policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+  );
 
   thrust::device_vector<nvbench::uint8_t> tmp(tmp_size, thrust::no_init);
   nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data());
 
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(d_tmp, tmp_size, inp_it, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream());
+    cub::detail::scan::dispatch_with_accum<accum_t>(
+      d_tmp,
+      tmp_size,
+      inp_it,
+      d_output,
+      op_t{},
+      wrapped_init_t{},
+      input.size(),
+      launch.get_stream()
+#if !TUNE_BASE
+        ,
+      policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+    );
   });
 
   // for verification use
diff --git a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu
index e2afb9ea752..6ebc38a49fc 100644
--- a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu
+++ b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu
@@ -34,26 +34,10 @@
 #  elif TUNE_LOAD == 1
 #    define TUNE_LOAD_MODIFIER cub::LOAD_CA
 #  endif // TUNE_LOAD
-
-struct policy_hub_t
-{
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
-  {
-    using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy<
-      TUNE_THREADS,
-      TUNE_ITEMS,
-      // TODO Tune
-      TUNE_LOAD_ALGORITHM,
-      TUNE_LOAD_MODIFIER,
-      cub::BLOCK_SCAN_WARP_SCANS,
-      TUNE_STORE_ALGORITHM,
-      delay_constructor_t>;
-  };
-
-  using MaxPolicy = policy_t;
-};
 #endif // !TUNE_BASE
 
+#include "../../policy_selector.h"
+
 namespace impl
 {
 template <typename T>
@@ -125,15 +109,6 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   using output_it_t    = tuple_t*;
   using offset_t       = cub::detail::choose_offset_t<OffsetT>;
 
-#if !TUNE_BASE
-  using policy_t   = policy_hub_t<accum_t>;
-  using dispatch_t = cub::
-    DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No, policy_t>;
-#else
-  using dispatch_t =
-    cub::DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No>;
-#endif
-
   const auto elements       = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   cudaStream_t bench_stream = state.get_cuda_stream().get_stream();
 
@@ -158,14 +133,39 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   auto d_output = thrust::raw_pointer_cast(output.data());
 
   size_t tmp_size;
-  dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream);
+  cub::detail::scan::dispatch_with_accum<accum_t>(
+    nullptr,
+    tmp_size,
+    d_input,
+    d_output,
+    op_t{},
+    wrapped_init_t{},
+    input.size(),
+    bench_stream
+#if !TUNE_BASE
+    ,
+    policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+  );
 
   thrust::device_vector<nvbench::uint8_t> tmp(tmp_size, thrust::no_init);
   nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data());
 
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
-      d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream());
+    cub::detail::scan::dispatch_with_accum<accum_t>(
+      d_tmp,
+      tmp_size,
+      d_input,
+      d_output,
+      op_t{},
+      wrapped_init_t{},
+      input.size(),
+      launch.get_stream()
+#if !TUNE_BASE
+        ,
+      policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+    );
   });
 
   // for validation use (recommended for integral types and smallish input sizes)
diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh
index 709f5592678..c2f572c2244 100644
--- a/cub/benchmarks/bench/scan/exclusive/base.cuh
+++ b/cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -24,44 +24,10 @@
 #  elif TUNE_LOAD == 1
 #    define TUNE_LOAD_MODIFIER cub::LOAD_CA
 #  endif // TUNE_LOAD
-
-template <typename AccumT>
-struct policy_hub_t
-{
-  template <int NOMINAL_BLOCK_THREADS_4B,
-            int NOMINAL_ITEMS_PER_THREAD_4B,
-            typename ComputeT,
-            cub::BlockLoadAlgorithm LOAD_ALGORITHM,
-            cub::CacheLoadModifier LOAD_MODIFIER,
-            cub::BlockStoreAlgorithm STORE_ALGORITHM,
-            cub::BlockScanAlgorithm SCAN_ALGORITHM>
-  using agent_policy_t = cub::AgentScanPolicy<
-    NOMINAL_BLOCK_THREADS_4B,
-    NOMINAL_ITEMS_PER_THREAD_4B,
-    ComputeT,
-    LOAD_ALGORITHM,
-    LOAD_MODIFIER,
-    STORE_ALGORITHM,
-    SCAN_ALGORITHM,
-    cub::detail::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
-    delay_constructor_t>;
-
-  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
-  {
-    using ScanPolicyT =
-      agent_policy_t<TUNE_THREADS,
-                     TUNE_ITEMS,
-                     AccumT,
-                     TUNE_LOAD_ALGORITHM,
-                     TUNE_LOAD_MODIFIER,
-                     TUNE_STORE_ALGORITHM,
-                     cub::BLOCK_SCAN_WARP_SCANS>;
-  };
-
-  using MaxPolicy = policy_t;
-};
 #endif // TUNE_BASE
 
+#include "../policy_selector.h"
+
 template <typename T, typename OffsetT>
 static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
 try
@@ -73,15 +39,6 @@ try
   using output_it_t    = T*;
   using offset_t       = cub::detail::choose_offset_t<OffsetT>;
 
-#if !TUNE_BASE
-  using policy_t   = policy_hub_t<accum_t>;
-  using dispatch_t = cub::
-    DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No, policy_t>;
-#else
-  using dispatch_t =
-    cub::DispatchScan<input_it_t, output_it_t, op_t, wrapped_init_t, offset_t, accum_t, cub::ForceInclusive::No>;
-#endif
-
   const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   if (sizeof(offset_t) == 4 && elements > std::numeric_limits<offset_t>::max())
   {
@@ -100,7 +57,7 @@ try
   state.add_global_memory_writes<T>(elements);
 
   size_t tmp_size;
-  dispatch_t::Dispatch(
+  cub::detail::scan::dispatch_with_accum<accum_t>(
     nullptr,
     tmp_size,
     d_input,
@@ -108,11 +65,16 @@ try
     op_t{},
     wrapped_init_t{T{}},
     static_cast<offset_t>(input.size()),
-    0 /* stream */);
+    0 /* stream */
+#if !TUNE_BASE
+    ,
+    policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+  );
 
-  thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
+  thrust::device_vector<nvbench::uint8_t> tmp(tmp_size, thrust::no_init);
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
-    dispatch_t::Dispatch(
+    cub::detail::scan::dispatch_with_accum<accum_t>(
       thrust::raw_pointer_cast(tmp.data()),
       tmp_size,
       d_input,
@@ -120,7 +82,12 @@ try
       op_t{},
       wrapped_init_t{T{}},
       static_cast<offset_t>(input.size()),
-      launch.get_stream());
+      launch.get_stream()
+#if !TUNE_BASE
+        ,
+      policy_selector<accum_t>{}
+#endif // !TUNE_BASE
+    );
   });
 }
 catch (const std::bad_alloc&)
diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu
index 09928667a52..e01363cc712 100644
--- a/cub/benchmarks/bench/scan/exclusive/by_key.cu
+++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu
@@ -106,7 +106,7 @@ static void scan(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT
     static_cast<int>(elements),
     0 /* stream */);
 
-  thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
+  thrust::device_vector<nvbench::uint8_t> tmp(tmp_size, thrust::no_init);
   nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data());
 
   state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h
new file mode 100644
index 00000000000..2d474315534
--- /dev/null
+++ b/cub/benchmarks/bench/scan/policy_selector.h
@@ -0,0 +1,31 @@
+// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3
+
+#include <cub/device/device_scan.cuh>
+
+// %RANGE% TUNE_ITEMS ipt 7:24:1
+// %RANGE% TUNE_THREADS tpb 128:1024:32
+// %RANGE% TUNE_MAGIC_NS ns 0:2048:4
+// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
+// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5
+// %RANGE% TUNE_TRANSPOSE trp 0:1:1
+// %RANGE% TUNE_LOAD ld 0:1:1
+
+#if !TUNE_BASE
+template <typename AccumT>
+struct policy_selector
+{
+  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::scan::scan_policy
+  {
+    return cub::detail::scan::make_mem_scaled_scan_policy(
+      TUNE_THREADS,
+      TUNE_ITEMS,
+      int{sizeof(AccumT)},
+      TUNE_LOAD_ALGORITHM,
+      TUNE_LOAD_MODIFIER,
+      TUNE_STORE_ALGORITHM,
+      cub::BLOCK_SCAN_WARP_SCANS,
+      cub::detail::delay_constructor_policy_from_type<delay_constructor_t>);
+  }
+};
+#endif // !TUNE_BASE

From 9e346b5e44de1d77caa4a78cfc1efafeb6176d38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Sat, 7 Feb 2026 21:22:38 -0800
Subject: [PATCH 03/31] Update copyright years.

---
 .../bench/scan/applications/P1/log-cdf-from-log-pdfs.cu         | 2 +-
 .../scan/applications/P1/non-commutative-bicyclic-monoid.cu     | 2 +-
 .../scan/applications/P1/rabin-karp-second-fingerprinting.cu    | 2 +-
 cub/benchmarks/bench/scan/applications/P1/running-min-max.cu    | 2 +-
 .../bench/scan/applications/P1/scan-over-unitriangular-group.cu | 2 +-
 cub/benchmarks/bench/scan/exclusive/base.cuh                    | 2 +-
 cub/benchmarks/bench/scan/exclusive/by_key.cu                   | 2 +-
 cub/benchmarks/bench/scan/policy_selector.h                     | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu
index 92f38e31892..c3f5ebcb299 100644
--- a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu
+++ b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include <cub/detail/choose_offset.cuh>
diff --git a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu
index 8ab3018ddc0..41213cf1b0d 100644
--- a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu
+++ b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include <cub/detail/choose_offset.cuh>
diff --git a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu
index 89421692c6b..d52ca9a450f 100644
--- a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu
+++ b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include <cub/detail/choose_offset.cuh>
diff --git a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu
index 1ae88d78742..82a158c0940 100644
--- a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu
+++ b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include <cub/detail/choose_offset.cuh>
diff --git a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu
index 6ebc38a49fc..02ee4feba28 100644
--- a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu
+++ b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include <cub/detail/choose_offset.cuh>
diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh
index c2f572c2244..a425fe54d95 100644
--- a/cub/benchmarks/bench/scan/exclusive/base.cuh
+++ b/cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 #pragma once
diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu
index e01363cc712..d1a50605a23 100644
--- a/cub/benchmarks/bench/scan/exclusive/by_key.cu
+++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 #include <cub/detail/choose_offset.cuh>
diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h
index 2d474315534..92ee907447e 100644
--- a/cub/benchmarks/bench/scan/policy_selector.h
+++ b/cub/benchmarks/bench/scan/policy_selector.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 #include <cub/device/device_scan.cuh>

From e9467aff52ae243622c9c0e59ade359c34213b12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Sat, 7 Feb 2026 21:38:34 -0800
Subject: [PATCH 04/31] c.parallel: centralize the handling of common cub
 types.

---
 c/parallel/src/radix_sort.cu       | 15 +-------
 c/parallel/src/reduce.cu           | 29 ++------------
 c/parallel/src/scan.cu             | 61 ++----------------------------
 c/parallel/src/segmented_reduce.cu | 30 ++-------------
 c/parallel/src/util/types.h        | 51 ++++++++++++++++++++++++-
 5 files changed, 61 insertions(+), 125 deletions(-)

diff --git a/c/parallel/src/radix_sort.cu b/c/parallel/src/radix_sort.cu
index 84db1db7d93..eef85b4fa0c 100644
--- a/c/parallel/src/radix_sort.cu
+++ b/c/parallel/src/radix_sort.cu
@@ -4,7 +4,7 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
@@ -213,18 +213,7 @@ try
   std::string offset_t;
   check(cccl_type_name_from_nvrtc<OffsetT>(&offset_t));
 
-  // TODO(bgruber): generalize this somewhere
-  const auto key_type = [&] {
-    switch (input_keys_it.value_type.type)
-    {
-      case CCCL_FLOAT32:
-        return cub::detail::type_t::float32;
-      case CCCL_FLOAT64:
-        return cub::detail::type_t::float64;
-      default:
-        return cub::detail::type_t::other;
-    }
-  }();
+  const auto key_type = cccl_type_enum_to_cub_type(input_keys_it.value_type.type);
 
   const auto policy_sel = cub::detail::radix_sort::policy_selector{
     static_cast<int>(input_keys_it.value_type.size),
diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu
index 8df0aeb5d8a..c673d3453c3 100644
--- a/c/parallel/src/reduce.cu
+++ b/c/parallel/src/reduce.cu
@@ -4,7 +4,7 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
@@ -223,31 +223,8 @@ try
   const auto policy_sel = [&] {
     using namespace cub::detail;
 
-    auto accum_type = type_t::other;
-    if (accum_t.type == CCCL_FLOAT32)
-    {
-      accum_type = type_t::float32;
-    }
-    else if (accum_t.type == CCCL_FLOAT64)
-    {
-      accum_type = type_t::float64;
-    }
-
-    auto operation_t = op_kind_t::other;
-    switch (op.type)
-    {
-      case CCCL_PLUS:
-        operation_t = op_kind_t::plus;
-        break;
-      case CCCL_MINIMUM:
-        operation_t = op_kind_t::min;
-        break;
-      case CCCL_MAXIMUM:
-        operation_t = op_kind_t::max;
-        break;
-      default:
-        break;
-    }
+    const auto accum_type  = cccl_type_enum_to_cub_type(accum_t.type);
+    const auto operation_t = cccl_op_kind_to_cub_op(op.type);
 
     const int offset_size = int{sizeof(OffsetT)};
     return cub::detail::reduce::policy_selector{accum_type, operation_t, offset_size, static_cast<int>(accum_t.size)};
diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 215b232a270..6a4cb529f7c 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -205,64 +205,12 @@ try
   const auto output_it_value_t = cccl_type_enum_to_name(output_it.value_type.type);
 
   const auto policy_sel = [&] {
-    using cub::detail::op_kind_t;
-    using cub::detail::type_t;
     using cub::detail::scan::policy_selector;
     using cub::detail::scan::primitive_accum;
     using cub::detail::scan::primitive_op;
 
-    auto accum_type = type_t::other;
-    switch (accum_t.type)
-    {
-      case CCCL_INT8:
-        accum_type = type_t::int8;
-        break;
-      case CCCL_INT16:
-        accum_type = type_t::int16;
-        break;
-      case CCCL_INT32:
-        accum_type = type_t::int32;
-        break;
-      case CCCL_INT64:
-        accum_type = type_t::int64;
-        break;
-      case CCCL_UINT8:
-        accum_type = type_t::uint8;
-        break;
-      case CCCL_UINT16:
-        accum_type = type_t::uint16;
-        break;
-      case CCCL_UINT32:
-        accum_type = type_t::uint32;
-        break;
-      case CCCL_UINT64:
-        accum_type = type_t::uint64;
-        break;
-      case CCCL_FLOAT32:
-        accum_type = type_t::float32;
-        break;
-      case CCCL_FLOAT64:
-        accum_type = type_t::float64;
-        break;
-      default:
-        break;
-    }
-
-    auto operation_t = op_kind_t::other;
-    switch (op.type)
-    {
-      case CCCL_PLUS:
-        operation_t = op_kind_t::plus;
-        break;
-      case CCCL_MINIMUM:
-        operation_t = op_kind_t::min;
-        break;
-      case CCCL_MAXIMUM:
-        operation_t = op_kind_t::max;
-        break;
-      default:
-        break;
-    }
+    const auto accum_type  = cccl_type_enum_to_cub_type(accum_t.type);
+    const auto operation_t = cccl_op_kind_to_cub_op(op.type);
 
     auto primitive_accum_t = primitive_accum::no;
     switch (accum_t.type)
@@ -285,10 +233,7 @@ try
         break;
     }
 
-    const auto primitive_op_t =
-      (op.type == CCCL_PLUS || op.type == CCCL_MINIMUM || op.type == CCCL_MAXIMUM)
-        ? primitive_op::yes
-        : primitive_op::no;
+    const auto primitive_op_t = (operation_t == cub::detail::op_kind_t::other) ? primitive_op::no : primitive_op::yes;
 
     const auto input_type      = input_it.value_type.type;
     const auto output_type     = output_it.value_type.type;
diff --git a/c/parallel/src/segmented_reduce.cu b/c/parallel/src/segmented_reduce.cu
index 93ca88066c6..a07c696ddfb 100644
--- a/c/parallel/src/segmented_reduce.cu
+++ b/c/parallel/src/segmented_reduce.cu
@@ -4,7 +4,7 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
@@ -154,35 +154,11 @@ try
   // OffsetT is checked to match have 64-bit size
   const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64);
 
-  // TODO(bgruber): share this with reduce.cu
   const auto policy_sel = [&] {
     using namespace cub::detail;
 
-    auto accum_type = type_t::other;
-    if (accum_t.type == CCCL_FLOAT32)
-    {
-      accum_type = type_t::float32;
-    }
-    else if (accum_t.type == CCCL_FLOAT64)
-    {
-      accum_type = type_t::float64;
-    }
-
-    auto operation_t = op_kind_t::other;
-    switch (op.type)
-    {
-      case CCCL_PLUS:
-        operation_t = op_kind_t::plus;
-        break;
-      case CCCL_MINIMUM:
-        operation_t = op_kind_t::min;
-        break;
-      case CCCL_MAXIMUM:
-        operation_t = op_kind_t::max;
-        break;
-      default:
-        break;
-    }
+    const auto accum_type  = cccl_type_enum_to_cub_type(accum_t.type);
+    const auto operation_t = cccl_op_kind_to_cub_op(op.type);
 
     const int offset_size = int{sizeof(OffsetT)};
     return cub::detail::segmented_reduce::policy_selector{
diff --git a/c/parallel/src/util/types.h b/c/parallel/src/util/types.h
index 10408939f80..7ff767b4a94 100644
--- a/c/parallel/src/util/types.h
+++ b/c/parallel/src/util/types.h
@@ -4,12 +4,14 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
 #pragma once
 
+#include <cub/device/dispatch/tuning/common.cuh>
+
 #include <cuda/std/cstdint>
 
 #include <string>
@@ -107,3 +109,50 @@ std::string cccl_type_enum_to_name(cccl_type_enum type, bool is_pointer = false)
 
   return result;
 }
+
+inline constexpr cub::detail::type_t cccl_type_enum_to_cub_type(cccl_type_enum type)
+{
+  switch (type)
+  {
+    case CCCL_INT8:
+      return cub::detail::type_t::int8;
+    case CCCL_INT16:
+      return cub::detail::type_t::int16;
+    case CCCL_INT32:
+      return cub::detail::type_t::int32;
+    case CCCL_INT64:
+      return cub::detail::type_t::int64;
+    case CCCL_UINT8:
+      return cub::detail::type_t::uint8;
+    case CCCL_UINT16:
+      return cub::detail::type_t::uint16;
+    case CCCL_UINT32:
+      return cub::detail::type_t::uint32;
+    case CCCL_UINT64:
+      return cub::detail::type_t::uint64;
+    case CCCL_FLOAT32:
+      return cub::detail::type_t::float32;
+    case CCCL_FLOAT64:
+      return cub::detail::type_t::float64;
+    case CCCL_FLOAT16:
+    case CCCL_STORAGE:
+    case CCCL_BOOLEAN:
+    default:
+      return cub::detail::type_t::other;
+  }
+}
+
+inline constexpr cub::detail::op_kind_t cccl_op_kind_to_cub_op(cccl_op_kind_t type)
+{
+  switch (type)
+  {
+    case CCCL_PLUS:
+      return cub::detail::op_kind_t::plus;
+    case CCCL_MINIMUM:
+      return cub::detail::op_kind_t::min;
+    case CCCL_MAXIMUM:
+      return cub::detail::op_kind_t::max;
+    default:
+      return cub::detail::op_kind_t::other;
+  }
+}

From c4c0c0982f69df07e9cf6951f369110be498ebfe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 12 Feb 2026 18:09:39 -0800
Subject: [PATCH 05/31] Resolve review comments.

---
 c/parallel/src/scan.cu                        | 46 ++++++-------------
 cub/benchmarks/bench/scan/policy_selector.h   |  4 +-
 cub/cub/device/dispatch/dispatch_scan.cuh     | 35 ++++++++++----
 .../device/dispatch/tuning/tuning_scan.cuh    |  8 ++--
 4 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 6a4cb529f7c..57edaf9ab6e 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -389,39 +389,19 @@ CUresult cccl_device_scan(
     CUdevice cu_device;
     check(cuCtxGetDevice(&cu_device));
 
-    if constexpr (std::is_same_v<InitValueT, cub::NullType>)
-    {
-      auto exec_status = cub::detail::scan::dispatch_with_accum<void>(
-        d_temp_storage,
-        *temp_storage_bytes,
-        indirect_arg_t{d_in},
-        indirect_arg_t{d_out},
-        indirect_arg_t{op},
-        init,
-        static_cast<OffsetT>(num_items),
-        stream,
-        *static_cast<cub::detail::scan::policy_selector*>(build.runtime_policy),
-        scan::scan_kernel_source{build},
-        cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});
-      error = static_cast<CUresult>(exec_status);
-    }
-    else
-    {
-      indirect_arg_t init_arg{init};
-      auto exec_status = cub::detail::scan::dispatch_with_accum<void>(
-        d_temp_storage,
-        *temp_storage_bytes,
-        indirect_arg_t{d_in},
-        indirect_arg_t{d_out},
-        indirect_arg_t{op},
-        init_arg,
-        static_cast<OffsetT>(num_items),
-        stream,
-        *static_cast<cub::detail::scan::policy_selector*>(build.runtime_policy),
-        scan::scan_kernel_source{build},
-        cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});
-      error = static_cast<CUresult>(exec_status);
-    }
+    auto exec_status = cub::detail::scan::dispatch_with_accum<void>(
+      d_temp_storage,
+      *temp_storage_bytes,
+      indirect_arg_t{d_in},
+      indirect_arg_t{d_out},
+      indirect_arg_t{op},
+      std::conditional_t<std::is_same_v<InitValueT, cub::NullType>, cub::NullType, indirect_arg_t>{init},
+      static_cast<OffsetT>(num_items),
+      stream,
+      *static_cast<cub::detail::scan::policy_selector*>(build.runtime_policy),
+      scan::scan_kernel_source{build},
+      cub::detail::CudaDriverLauncherFactory{cu_device, build.cc});
+    error = static_cast<CUresult>(exec_status);
   }
   catch (const std::exception& exc)
   {
diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h
index 92ee907447e..386e4b41278 100644
--- a/cub/benchmarks/bench/scan/policy_selector.h
+++ b/cub/benchmarks/bench/scan/policy_selector.h
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
-// SPDX-License-Identifier: BSD-3
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include <cub/device/device_scan.cuh>
 
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index fe3134e60e9..560ad485d28 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -319,8 +319,11 @@ struct DispatchScan
 #endif // CUB_DEBUG_LOG
 
     // Invoke init_kernel to initialize tile descriptors
-    launcher_factory(init_grid_size, INIT_KERNEL_THREADS, 0, stream, /* use_pdl */ true)
-      .doit(init_kernel, tile_state, num_tiles);
+    if (const auto error = CubDebug(launcher_factory(init_grid_size, INIT_KERNEL_THREADS, 0, stream, /* use_pdl */ true)
+                                      .doit(init_kernel, tile_state, num_tiles)))
+    {
+      return error;
+    }
 
     // Check for failure to launch
     if (const auto error = CubDebug(cudaPeekAtLastError()))
@@ -366,8 +369,12 @@ struct DispatchScan
 #endif // CUB_DEBUG_LOG
 
       // Invoke scan_kernel
-      launcher_factory(scan_grid_size, policy.Scan().BlockThreads(), 0, stream, /* use_pdl */ true)
-        .doit(scan_kernel, d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items);
+      if (const auto error = CubDebug(
+            launcher_factory(scan_grid_size, policy.Scan().BlockThreads(), 0, stream, /* use_pdl */ true)
+              .doit(scan_kernel, d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items)))
+      {
+        return error;
+      }
 
       // Check for failure to launch
       if (const auto error = CubDebug(cudaPeekAtLastError()))
@@ -521,6 +528,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   }
 
   const scan_policy active_policy = policy_selector(arch_id);
+#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+  NV_IF_TARGET(NV_IS_HOST,
+               (std::stringstream ss; ss << active_policy;
+                _CubLog("Dispatching DeviceScan to arch %d with tuning: %s\n", (int) arch_id, ss.str().c_str());))
+#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
 
   // Number of input tiles
   const int tile_size = active_policy.block_threads * active_policy.items_per_thread;
@@ -565,8 +577,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
 #endif // CUB_DEBUG_LOG
 
   // Invoke init_kernel to initialize tile descriptors
-  launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true)
-    .doit(kernel_source.InitKernel(), tile_state, num_tiles);
+  if (const auto error = CubDebug(launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true)
+                                    .doit(kernel_source.InitKernel(), tile_state, num_tiles)))
+  {
+    return error;
+  }
 
   // Check for failure to launch
   if (const auto error = CubDebug(cudaPeekAtLastError()))
@@ -612,8 +627,12 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
 #endif // CUB_DEBUG_LOG
 
     // Invoke scan_kernel
-    launcher_factory(scan_grid_size, active_policy.block_threads, 0, stream, /* use_pdl */ true)
-      .doit(kernel_source.ScanKernel(), d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items);
+    if (const auto error = CubDebug(
+          launcher_factory(scan_grid_size, active_policy.block_threads, 0, stream, /* use_pdl */ true)
+            .doit(kernel_source.ScanKernel(), d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items)))
+    {
+      return error;
+    }
 
     // Check for failure to launch
     if (const auto error = CubDebug(cudaPeekAtLastError()))
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 7db16f81d65..559111249dd 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -652,12 +652,14 @@ struct policy_selector
   int offset_size;
   type_t accum_type;
   op_kind_t operation_t;
-  primitive_accum primitive_accum_t;
-  primitive_op primitive_op_t;
+  // TODO(griwes): remove this field before policy_selector is publicly exposed
   bool benchmark_match;
 
   [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy
   {
+    primitive_accum primitive_accum_t = accum_type != type_t::other && accum_type != type_t::int128;
+    primitive_op primitive_op_t       = operation_t != op_kind_t::other;
+
     const bool large_values = accum_size > 128;
     const BlockLoadAlgorithm scan_transposed_load =
       large_values ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE;
@@ -1037,8 +1039,6 @@ struct policy_selector_from_types
       int{sizeof(OffsetT)},
       classify_type<AccumT>,
       classify_op<ScanOpT>,
-      is_primitive_accum<AccumT>(),
-      is_primitive_op<ScanOpT>(),
       benchmark_match};
     return policies(arch);
   }

From 2c2db7cccc947fc1313dce798f32f8d2d52c5bd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 12 Feb 2026 18:11:11 -0800
Subject: [PATCH 06/31] Fix c.parallel radix_sort breakage.

---
 c/parallel/src/radix_sort.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/c/parallel/src/radix_sort.cu b/c/parallel/src/radix_sort.cu
index eef85b4fa0c..c4b9717d7b5 100644
--- a/c/parallel/src/radix_sort.cu
+++ b/c/parallel/src/radix_sort.cu
@@ -245,6 +245,8 @@ struct __align__({3}) values_storage_t {{
 using device_radix_sort_policy = {5};
 using namespace cub;
 using namespace cub::detail::radix_sort;
+using cub::detail::delay_constructor_policy;
+using cub::detail::delay_constructor_kind;
 static_assert(device_radix_sort_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6}, "Host generated and JIT compiled policy mismatch");
 )XXX",
     input_keys_it.value_type.size, // 0

From 2a0ddf4171762df497d1e788ca0fc4af8be66c2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 19 Feb 2026 01:15:37 +0100
Subject: [PATCH 07/31] Compilation fixes.

---
 cub/cub/device/dispatch/dispatch_scan.cuh                 | 2 +-
 cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 4652cecd3e7..7617eee9345 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -510,7 +510,7 @@ struct DispatchScan
     // TODO(bgruber): we probably need to ensure alignment of d_temp_storage
     _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), "");
 
-    constexpr scan_warpspeed_policy warpspeed_policy = detail::scan::make_scan_warpspeed_policy<WarpspeedPolicy>();
+    constexpr auto warpspeed_policy = detail::scan::make_scan_warpspeed_policy<WarpspeedPolicy>();
     constexpr int smem_size_1_stage = detail::scan::smem_for_stages<InputT, OutputT, AccumT>(warpspeed_policy, 1);
     static_assert(smem_size_1_stage <= detail::max_smem_per_block); // this is ensured by scan_use_warpspeed
 
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
index 5b40fcc4d81..15d68d6ac90 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
@@ -82,6 +82,12 @@ struct warpspeedKernelPolicy
   static constexpr int num_total_threads    = NumTotalThreads;
   static constexpr int items_per_thread     = ItemsPerThread;
   static constexpr int tile_size            = TileSize;
+
+  static constexpr int num_reduce_warps     = NumReduceWarps;
+  static constexpr int num_scan_stor_warps  = NumScanStorWarps;
+  static constexpr int num_load_warps       = NumLoadWarps;
+  static constexpr int num_sched_warps      = NumSchedWarps;
+  static constexpr int num_look_ahead_warps = NumLookAheadWarps;
 };
 
 template <typename InputT, typename OutputT, typename AccumT>
@@ -854,7 +860,7 @@ _CCCL_API constexpr auto smem_for_stages(
   (void) output_size;
   const auto counts = make_scan_stage_counts(num_stages);
 
-  const int align_inout     = ::cuda::std::max(16, input_align, output_align);
+  const int align_inout     = ::cuda::std::max({16, input_align, output_align});
   const int inout_bytes     = policy.tile_size * input_size + 16;
   const auto reduce_squad   = policy.squadReduce();
   const int sum_thread_warp = (reduce_squad.threadCount() + reduce_squad.warpCount()) * accum_size;

From 22ece56ea5f43da424d7f5d9bc966ffb90cb49c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 19 Feb 2026 02:20:14 +0100
Subject: [PATCH 08/31] Go through dispatch_arch, unify dispatch paths for
 scan.

---
 cub/cub/device/dispatch/dispatch_scan.cuh     | 522 ++++++++----------
 .../device/dispatch/tuning/tuning_scan.cuh    |  14 +-
 cub/test/catch2_test_env_launch_helper.h      |  47 ++
 3 files changed, 275 insertions(+), 308 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 7617eee9345..672263e477e 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -23,6 +23,7 @@
 #endif // no system header
 
 #include <cub/agent/agent_scan.cuh>
+#include <cub/detail/arch_dispatch.cuh>
 #include <cub/detail/launcher/cuda_runtime.cuh>
 #include <cub/detail/warpspeed/warpspeed.cuh>
 #include <cub/device/dispatch/dispatch_common.cuh>
@@ -471,20 +472,22 @@ struct DispatchScan
     return cudaSuccess;
   }
 
-#if __cccl_ptx_isa >= 860
-  template <typename ActivePolicyT>
-  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm(ActivePolicyT)
+#if _CCCL_CUDACC_AT_LEAST(12, 8)
+  template <typename PolicySelectorT>
+  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm(
+    const detail::scan::scan_warpspeed_policy& warpspeed_policy, const PolicySelectorT& policy_selector)
   {
-    using InputT          = ::cuda::std::iter_value_t<InputIteratorT>;
-    using OutputT         = ::cuda::std::iter_value_t<OutputIteratorT>;
-    using WarpspeedPolicy = typename ActivePolicyT::WarpspeedPolicy;
-
     const int grid_dim =
-      static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(WarpspeedPolicy::tile_size)));
+      static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(warpspeed_policy.tile_size)));
 
     if (d_temp_storage == nullptr)
     {
-      temp_storage_bytes = grid_dim * kernel_source.look_ahead_tile_state_size();
+      temp_storage_bytes = static_cast<size_t>(grid_dim) * kernel_source.look_ahead_tile_state_size();
+      return cudaSuccess;
+    }
+
+    if (num_items == 0)
+    {
       return cudaSuccess;
     }
 
@@ -496,8 +499,8 @@ struct DispatchScan
     // number of stages to have an even workload across all SMs (improves small problem sizes), assuming 1 CTA per SM
     // +1 since it tends to improve performance
     // TODO(bgruber): make the +1 a tuning parameter
-    [[maybe_unused]] const int max_stages_for_even_workload =
-      static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(sm_count * WarpspeedPolicy::tile_size)) + 1);
+    const int max_stages_for_even_workload =
+      static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(sm_count * warpspeed_policy.tile_size)) + 1);
 
     // Maximum dynamic shared memory size that we can use for temporary storage.
     int max_dynamic_smem_size{};
@@ -510,20 +513,31 @@ struct DispatchScan
     // TODO(bgruber): we probably need to ensure alignment of d_temp_storage
     _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), "");
 
-    constexpr auto warpspeed_policy = detail::scan::make_scan_warpspeed_policy<WarpspeedPolicy>();
-    constexpr int smem_size_1_stage = detail::scan::smem_for_stages<InputT, OutputT, AccumT>(warpspeed_policy, 1);
-    static_assert(smem_size_1_stage <= detail::max_smem_per_block); // this is ensured by scan_use_warpspeed
-
     auto scan_kernel = kernel_source.ScanKernel();
     int num_stages   = 1;
-    int smem_size    = smem_size_1_stage;
+    int smem_size    = detail::scan::smem_for_stages(
+      warpspeed_policy,
+      num_stages,
+      policy_selector.input_value_size,
+      policy_selector.input_value_alignment,
+      policy_selector.output_value_size,
+      policy_selector.output_value_alignment,
+      policy_selector.accum_size,
+      policy_selector.accum_alignment);
 
     // When launched from the host, maximize the number of stages that we can fit inside the shared memory.
     NV_IF_TARGET(NV_IS_HOST, ({
                    while (num_stages <= max_stages_for_even_workload)
                    {
-                     const auto next_smem_size =
-                       detail::scan::smem_for_stages<InputT, OutputT, AccumT>(warpspeed_policy, num_stages + 1);
+                     const auto next_smem_size = detail::scan::smem_for_stages(
+                       warpspeed_policy,
+                       num_stages + 1,
+                       policy_selector.input_value_size,
+                       policy_selector.input_value_alignment,
+                       policy_selector.output_value_size,
+                       policy_selector.output_value_alignment,
+                       policy_selector.accum_size,
+                       policy_selector.accum_alignment);
                      if (next_smem_size > max_dynamic_smem_size)
                      {
                        // This number of stages failed, so stay at the current settings
@@ -576,7 +590,7 @@ struct DispatchScan
 
     // Invoke scan kernel
     {
-      constexpr int block_dim = WarpspeedPolicy::num_total_threads;
+      const int block_dim = warpspeed_policy.num_total_threads;
 
 #  ifdef CUB_DEBUG_LOG
       _CubLog("Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", grid_dim, block_dim, smem_size, (long long) stream);
@@ -612,26 +626,171 @@ struct DispatchScan
 
     return cudaSuccess;
   }
-#endif // __cccl_ptx_isa >= 860
+#endif // _CCCL_CUDACC_AT_LEAST(12, 8)
 
-  template <typename ActivePolicyT>
-  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {})
+  template <typename PolicyGetter, typename PolicySelectorT>
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
+  __invoke(PolicyGetter policy_getter, const PolicySelectorT& policy_selector)
   {
-#if __cccl_ptx_isa >= 860
-    if constexpr (detail::scan::scan_use_warpspeed<
-                    ActivePolicyT,
-                    THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<InputIteratorT>,
-                    THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<OutputIteratorT>,
-                    AccumT>)
+    CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter();
+
+    CUB_DETAIL_STATIC_ISH_ASSERT(active_policy.load_modifier != CacheLoadModifier::LOAD_LDG,
+                                 "The memory consistency model does not apply to texture accesses");
+
+#if !_CCCL_CUDACC_AT_LEAST(12, 8)
+    (void) policy_selector;
+#endif // !_CCCL_CUDACC_AT_LEAST(12, 8)
+
+#if _CCCL_CUDACC_AT_LEAST(12, 8)
+    if (kernel_source.use_warpspeed(active_policy))
+    {
+      return __invoke_lookahead_algorithm(*active_policy.warpspeed, policy_selector);
+    }
+#endif // _CCCL_CUDACC_AT_LEAST(12, 8)
+
+    // Number of input tiles
+    const int tile_size = active_policy.block_threads * active_policy.items_per_thread;
+    const int num_tiles = static_cast<int>(::cuda::ceil_div(num_items, tile_size));
+
+    auto tile_state = kernel_source.TileState();
+
+    // Specify temporary storage allocation requirements
+    size_t allocation_sizes[1];
+    if (const auto error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0])))
+    {
+      return error; // bytes needed for tile status descriptors
+    }
+
+    // Compute allocation pointers into the single storage blob (or compute
+    // the necessary size of the blob)
+    void* allocations[1] = {};
+    if (const auto error =
+          CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
     {
-      return __invoke_lookahead_algorithm(active_policy);
+      return error;
     }
-    else
-#endif // __cccl_ptx_isa >= 860
+
+    // Return if the caller is simply requesting the size of the storage allocation, or the problem is empty
+    if (d_temp_storage == nullptr || num_items == 0)
     {
-      return Invoke(
-        kernel_source.InitKernel(), kernel_source.ScanKernel(), detail::scan::MakeScanPolicyWrapper(active_policy));
+      return cudaSuccess;
     }
+
+    // Construct the tile status interface
+    if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])))
+    {
+      return error;
+    }
+
+    // Log init_kernel configuration
+    constexpr int init_kernel_threads = 128;
+    const int init_grid_size          = ::cuda::ceil_div(num_tiles, init_kernel_threads);
+
+#ifdef CUB_DEBUG_LOG
+    _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
+#endif // CUB_DEBUG_LOG
+
+    // Invoke init_kernel to initialize tile descriptors
+    if (const auto error = CubDebug(
+          launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true)
+            .doit(kernel_source.InitKernel(), kernel_source.make_tile_state_kernel_arg(tile_state), num_tiles)))
+    {
+      return error;
+    }
+
+    // Check for failure to launch
+    if (const auto error = CubDebug(cudaPeekAtLastError()))
+    {
+      return error;
+    }
+
+    // Sync the stream if specified to flush runtime errors
+    if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
+    {
+      return error;
+    }
+
+    // Get SM occupancy for scan_kernel
+    int scan_sm_occupancy;
+    if (const auto error = CubDebug(
+          launcher_factory.MaxSmOccupancy(scan_sm_occupancy, kernel_source.ScanKernel(), active_policy.block_threads)))
+    {
+      return error;
+    }
+
+    // Get max x-dimension of grid
+    int max_dim_x;
+    if (const auto error = CubDebug(launcher_factory.MaxGridDimX(max_dim_x)))
+    {
+      return error;
+    }
+
+    // Run grids in epochs (in case number of tiles exceeds max x-dimension
+    const int scan_grid_size = ::cuda::std::min(num_tiles, max_dim_x);
+    for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
+    {
+// Log scan_kernel configuration
+#ifdef CUB_DEBUG_LOG
+      _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items "
+              "per thread, %d SM occupancy\n",
+              start_tile,
+              scan_grid_size,
+              active_policy.block_threads,
+              (long long) stream,
+              active_policy.items_per_thread,
+              scan_sm_occupancy);
+#endif // CUB_DEBUG_LOG
+
+      // Invoke scan_kernel
+      if (const auto error = CubDebug(
+            launcher_factory(scan_grid_size, active_policy.block_threads, 0, stream, /* use_pdl */ true)
+              .doit(kernel_source.ScanKernel(),
+                    THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(d_in),
+                    THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(d_out),
+                    kernel_source.make_tile_state_kernel_arg(tile_state),
+                    start_tile,
+                    scan_op,
+                    init_value,
+                    num_items,
+                    /* num_stages, unused */ 1)))
+      {
+        return error;
+      }
+
+      // Check for failure to launch
+      if (const auto error = CubDebug(cudaPeekAtLastError()))
+      {
+        return error;
+      }
+
+      // Sync the stream if specified to flush runtime errors
+      if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
+      {
+        return error;
+      }
+    }
+
+    return cudaSuccess;
+  }
+
+  template <typename ActivePolicyT>
+  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT = {})
+  {
+    struct policy_getter
+    {
+      _CCCL_API _CCCL_FORCEINLINE constexpr auto operator()() const
+      {
+        return detail::scan::convert_policy<ActivePolicyT>();
+      }
+    };
+
+    using policy_selector_t = detail::scan::policy_selector_from_types<
+      detail::it_value_t<InputIteratorT>,
+      detail::it_value_t<OutputIteratorT>,
+      AccumT,
+      OffsetT,
+      ScanOpT>;
+    return __invoke(policy_getter{}, policy_selector_t{});
   }
 
   /**
@@ -761,280 +920,41 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
     return error;
   }
 
-  const scan_policy active_policy = policy_selector(arch_id);
 #if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
   NV_IF_TARGET(NV_IS_HOST,
-               (std::stringstream ss; ss << active_policy;
+               (std::stringstream ss; ss << policy_selector(arch_id);
                 _CubLog("Dispatching DeviceScan to arch %d with tuning: %s\n", (int) arch_id, ss.str().c_str());))
 #endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
 
-#if _CCCL_CUDACC_AT_LEAST(12, 8)
-  if (kernel_source.use_warpspeed(active_policy))
-  {
-    const int grid_dim =
-      static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(active_policy.warpspeed->tile_size)));
-
-    if (d_temp_storage == nullptr)
-    {
-      temp_storage_bytes = static_cast<size_t>(grid_dim) * kernel_source.look_ahead_tile_state_size();
-      return cudaSuccess;
-    }
-
-    if (num_items == 0)
-    {
-      return cudaSuccess;
-    }
-
-    int sm_count = 0;
-    if (const auto error = CubDebug(launcher_factory.MultiProcessorCount(sm_count)))
-    {
-      return error;
-    }
-
-    const int max_stages_for_even_workload = static_cast<int>(
-      ::cuda::ceil_div(num_items, static_cast<OffsetT>(sm_count * active_policy.warpspeed->tile_size)) + 1);
-
-    int max_dynamic_smem_size{};
-    if (const auto error =
-          CubDebug(launcher_factory.max_dynamic_smem_size_for(max_dynamic_smem_size, kernel_source.ScanKernel())))
-    {
-      return error;
-    }
-
-    _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), "");
-
-    auto scan_kernel = kernel_source.ScanKernel();
-    int num_stages   = 1;
-    int smem_size    = detail::scan::smem_for_stages(
-      *active_policy.warpspeed,
-      num_stages,
-      policy_selector.input_value_size,
-      policy_selector.input_value_alignment,
-      policy_selector.output_value_size,
-      policy_selector.output_value_alignment,
-      policy_selector.accum_size,
-      policy_selector.accum_alignment);
-
-    NV_IF_TARGET(NV_IS_HOST, ({
-                   while (num_stages <= max_stages_for_even_workload)
-                   {
-                     const auto next_smem_size = detail::scan::smem_for_stages(
-                       *active_policy.warpspeed,
-                       num_stages + 1,
-                       policy_selector.input_value_size,
-                       policy_selector.input_value_alignment,
-                       policy_selector.output_value_size,
-                       policy_selector.output_value_alignment,
-                       policy_selector.accum_size,
-                       policy_selector.accum_alignment);
-                     if (next_smem_size > max_dynamic_smem_size)
-                     {
-                       break;
-                     }
-
-                     smem_size = next_smem_size;
-                     ++num_stages;
-                   }
-
-                   if (const auto error = launcher_factory.set_max_dynamic_smem_size_for(scan_kernel, smem_size))
-                   {
-                     return error;
-                   }
-                 }))
-
-    // Invoke init kernel
-    {
-      constexpr auto init_kernel_threads = 128;
-      const auto init_grid_size          = ::cuda::ceil_div(grid_dim, init_kernel_threads);
-
-#  ifdef CUB_DEBUG_LOG
-      _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, , %lld>>>()\n",
-              init_grid_size,
-              init_kernel_threads,
-              (long long) stream);
-#  endif // CUB_DEBUG_LOG
-
-      if (const auto error = CubDebug(
-            launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true)
-              .doit(kernel_source.InitKernel(),
-                    kernel_source.look_ahead_make_tile_state_kernel_arg(d_temp_storage),
-                    grid_dim)))
-      {
-        return error;
-      }
-
-      if (const auto error = CubDebug(cudaPeekAtLastError()))
-      {
-        return error;
-      }
-
-      if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
-      {
-        return error;
-      }
-    }
-
-    // Invoke scan kernel
-    {
-      const int block_dim = active_policy.warpspeed->num_total_threads;
-
-#  ifdef CUB_DEBUG_LOG
-      _CubLog("Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", grid_dim, block_dim, smem_size, (long long) stream);
-#  endif // CUB_DEBUG_LOG
-
-      if (const auto error = CubDebug(
-            launcher_factory(grid_dim, block_dim, smem_size, stream, /* use_pdl */ true)
-              .doit(scan_kernel,
-                    d_in,
-                    d_out,
-                    kernel_source.look_ahead_make_tile_state_kernel_arg(d_temp_storage),
-                    /* start_tile, unused */ 0,
-                    ::cuda::std::move(scan_op),
-                    init_value,
-                    num_items,
-                    num_stages)))
-      {
-        return error;
-      }
-
-      if (const auto error = CubDebug(cudaPeekAtLastError()))
-      {
-        return error;
-      }
-
-      if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
-      {
-        return error;
-      }
-    }
-
-    return cudaSuccess;
-  }
-#endif // _CCCL_CUDACC_AT_LEAST(12, 8)
-
-  // Number of input tiles
-  const int tile_size = active_policy.block_threads * active_policy.items_per_thread;
-  const int num_tiles = static_cast<int>(::cuda::ceil_div(num_items, tile_size));
-
-  auto tile_state = kernel_source.TileState();
-
-  // Specify temporary storage allocation requirements
-  size_t allocation_sizes[1];
-  if (const auto error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0])))
+  struct fake_policy
   {
-    return error; // bytes needed for tile status descriptors
-  }
-
-  // Compute allocation pointers into the single storage blob (or compute
-  // the necessary size of the blob)
-  void* allocations[1] = {};
-  if (const auto error =
-        CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
-  {
-    return error;
-  }
-
-  // Return if the caller is simply requesting the size of the storage allocation, or the problem is empty
-  if (d_temp_storage == nullptr || num_items == 0)
-  {
-    return cudaSuccess;
-  }
-
-  // Construct the tile status interface
-  if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])))
-  {
-    return error;
-  }
-
-  // Log init_kernel configuration
-  constexpr int init_kernel_threads = 128;
-  const int init_grid_size          = ::cuda::ceil_div(num_tiles, init_kernel_threads);
-
-#ifdef CUB_DEBUG_LOG
-  _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
-#endif // CUB_DEBUG_LOG
-
-  // Invoke init_kernel to initialize tile descriptors
-  if (const auto error = CubDebug(
-        launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true)
-          .doit(kernel_source.InitKernel(), kernel_source.make_tile_state_kernel_arg(tile_state), num_tiles)))
-  {
-    return error;
-  }
-
-  // Check for failure to launch
-  if (const auto error = CubDebug(cudaPeekAtLastError()))
-  {
-    return error;
-  }
-
-  // Sync the stream if specified to flush runtime errors
-  if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
-  {
-    return error;
-  }
-
-  // Get SM occupancy for scan_kernel
-  int scan_sm_occupancy;
-  if (const auto error = CubDebug(
-        launcher_factory.MaxSmOccupancy(scan_sm_occupancy, kernel_source.ScanKernel(), active_policy.block_threads)))
-  {
-    return error;
-  }
-
-  // Get max x-dimension of grid
-  int max_dim_x;
-  if (const auto error = CubDebug(launcher_factory.MaxGridDimX(max_dim_x)))
-  {
-    return error;
-  }
-
-  // Run grids in epochs (in case number of tiles exceeds max x-dimension
-  const int scan_grid_size = ::cuda::std::min(num_tiles, max_dim_x);
-  for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
-  {
-// Log scan_kernel configuration
-#ifdef CUB_DEBUG_LOG
-    _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items "
-            "per thread, %d SM occupancy\n",
-            start_tile,
-            scan_grid_size,
-            active_policy.block_threads,
-            (long long) stream,
-            active_policy.items_per_thread,
-            scan_sm_occupancy);
-#endif // CUB_DEBUG_LOG
-
-    // Invoke scan_kernel
-    if (const auto error = CubDebug(
-          launcher_factory(scan_grid_size, active_policy.block_threads, 0, stream, /* use_pdl */ true)
-            .doit(kernel_source.ScanKernel(),
-                  d_in,
-                  d_out,
-                  kernel_source.make_tile_state_kernel_arg(tile_state),
-                  start_tile,
-                  scan_op,
-                  init_value,
-                  num_items,
-                  /* num_stages, unused */ 1)))
-    {
-      return error;
-    }
-
-    // Check for failure to launch
-    if (const auto error = CubDebug(cudaPeekAtLastError()))
-    {
-      return error;
-    }
-
-    // Sync the stream if specified to flush runtime errors
-    if (const auto error = CubDebug(detail::DebugSyncStream(stream)))
-    {
-      return error;
-    }
-  }
-
-  return cudaSuccess;
+    using MaxPolicy = void;
+  };
+
+  return dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) {
+    return DispatchScan<InputIteratorT,
+                        OutputIteratorT,
+                        ScanOpT,
+                        InitValueT,
+                        OffsetT,
+                        AccumT,
+                        EnforceInclusive,
+                        fake_policy,
+                        KernelSource,
+                        KernelLauncherFactory>{
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      num_items,
+      scan_op,
+      init_value,
+      stream,
+      -1 /* ptx_version, not used actually */,
+      kernel_source,
+      launcher_factory}
+      .__invoke(policy_getter, policy_selector);
+  });
 }
 
 template <
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index bff5bf3bc85..baa72c09e63 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -202,13 +202,13 @@ struct scan_policy
        << ", .delay_constructor = " << p.delay_constructor;
     if (p.warpspeed)
     {
-      os << ", scan_warpspeed_policy { .num_squads = " << scan_warpspeed_policy::num_squads << ", .num_reduce_warps = "
-         << p.warpspeed->num_reduce_warps << ", .num_scan_stor_warps = " << p.warpspeed->num_scan_stor_warps
-         << ", .num_load_warps = " << p.warpspeed->num_load_warps << ", .num_sched_warps = "
-         << p.warpspeed->num_sched_warps << ", .num_look_ahead_warps = " << p.warpspeed->num_look_ahead_warps
-         << ", .num_look_ahead_items = " << p.warpspeed->num_look_ahead_items << ", .num_total_threads = "
-         << p.warpspeed->num_total_threads << ", .items_per_thread = " << p.warpspeed->items_per_thread
-         << ", .tile_size = " << p.warpspeed->tile_size << "}";
+      os << ", .warpspeed = scan_warpspeed_policy { .num_reduce_warps = " << p.warpspeed->num_reduce_warps
+         << ", .num_scan_stor_warps = " << p.warpspeed->num_scan_stor_warps << ", .num_load_warps = "
+         << p.warpspeed->num_load_warps << ", .num_sched_warps = " << p.warpspeed->num_sched_warps
+         << ", .num_look_ahead_warps = " << p.warpspeed->num_look_ahead_warps << ", .num_look_ahead_items = "
+         << p.warpspeed->num_look_ahead_items << ", .num_total_threads = " << p.warpspeed->num_total_threads
+         << ", .items_per_thread = " << p.warpspeed->items_per_thread << ", .tile_size = " << p.warpspeed->tile_size
+         << "}";
     }
     return os << " }";
   }
diff --git a/cub/test/catch2_test_env_launch_helper.h b/cub/test/catch2_test_env_launch_helper.h
index 886e53f35ed..454a29b28e1 100644
--- a/cub/test/catch2_test_env_launch_helper.h
+++ b/cub/test/catch2_test_env_launch_helper.h
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#include <cub/util_device.cuh>
+
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
 #include <cuda/std/optional>
@@ -149,6 +151,51 @@ struct stream_registry_factory_t
     // Get max grid dimension
     return cudaDeviceGetAttribute(&max_grid_dim_x, cudaDevAttrMaxGridDimX, device_ordinal);
   }
+
+  CUB_RUNTIME_FUNCTION cudaError_t MemsetAsync(void* dst, unsigned char value, size_t num_bytes, cudaStream_t stream)
+  {
+    return cudaMemsetAsync(dst, value, num_bytes, stream);
+  }
+
+  CUB_RUNTIME_FUNCTION cudaError_t
+  MemcpyAsync(void* dst, const void* src, size_t num_bytes, cudaMemcpyKind kind, cudaStream_t stream)
+  {
+    return cudaMemcpyAsync(dst, src, num_bytes, kind, stream);
+  }
+
+  CUB_RUNTIME_FUNCTION cudaError_t MaxSharedMemory(int& max_shared_memory) const
+  {
+    int device = 0;
+    auto error = cudaGetDevice(&device);
+    if (error != cudaSuccess)
+    {
+      return error;
+    }
+
+    return cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, device);
+  }
+
+  template <typename Kernel>
+  CUB_RUNTIME_FUNCTION cudaError_t max_dynamic_smem_size_for(int& max_dynamic_smem_size, Kernel kernel_ptr)
+  {
+    NV_IF_ELSE_TARGET(NV_IS_HOST, //
+                      ({ return cub::MaxPotentialDynamicSmemBytes(max_dynamic_smem_size, kernel_ptr); }),
+                      ({
+                        cudaFuncAttributes func_attrs{};
+                        if (const auto error = cudaFuncGetAttributes(&func_attrs, kernel_ptr))
+                        {
+                          return error;
+                        }
+                        max_dynamic_smem_size = func_attrs.maxDynamicSharedSizeBytes;
+                        return cudaSuccess;
+                      }))
+  }
+
+  template <typename Kernel>
+  CUB_RUNTIME_FUNCTION cudaError_t set_max_dynamic_smem_size_for(Kernel kernel_ptr, int smem_size)
+  {
+    return cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
+  }
 };
 
 struct stream_scope

From d7f53336a4329d959d382e673f4f2616b1ad6d31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 19 Feb 2026 02:34:06 +0100
Subject: [PATCH 09/31] Remove cuda::std::optional from policies.

---
 c/parallel/src/scan.cu                        |  2 +-
 cub/cub/device/dispatch/dispatch_scan.cuh     |  8 ++--
 .../device/dispatch/kernels/kernel_scan.cuh   | 24 +++++------
 .../kernels/kernel_scan_warpspeed.cuh         |  1 +
 .../device/dispatch/tuning/tuning_scan.cuh    | 42 +++++++++++--------
 5 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index c324ccaca99..fdb2f74e4dd 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -312,7 +312,7 @@ try
   const bool use_warpspeed =
     active_policy.warpspeed
     && cub::detail::scan::use_warpspeed(
-      *active_policy.warpspeed,
+      active_policy.warpspeed,
       static_cast<int>(input_it.value_type.size),
       static_cast<int>(input_it.value_type.alignment),
       static_cast<int>(output_it.value_type.size),
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 672263e477e..309b69a1348 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -122,7 +122,7 @@ struct DeviceScanKernelSource
 #if _CCCL_CUDACC_AT_LEAST(12, 8)
     if (policy.warpspeed)
     {
-      return detail::scan::use_warpspeed<UnwrappedInputIteratorT, UnwrappedOutputIteratorT, AccumT>(*policy.warpspeed);
+      return detail::scan::use_warpspeed<UnwrappedInputIteratorT, UnwrappedOutputIteratorT, AccumT>(policy.warpspeed);
     }
 #else
     (void) policy;
@@ -141,7 +141,7 @@ struct has_warpspeed_policy<T, ::cuda::std::void_t<typename T::WarpspeedPolicy>>
 {};
 
 template <typename LegacyActivePolicy>
-_CCCL_API constexpr auto convert_warpspeed_policy() -> ::cuda::std::optional<scan_warpspeed_policy>
+_CCCL_API constexpr auto convert_warpspeed_policy() -> scan_warpspeed_policy
 {
 #if _CCCL_CUDACC_AT_LEAST(12, 8)
   if constexpr (has_warpspeed_policy<LegacyActivePolicy>::value)
@@ -149,7 +149,7 @@ _CCCL_API constexpr auto convert_warpspeed_policy() -> ::cuda::std::optional<sca
     return make_scan_warpspeed_policy<typename LegacyActivePolicy::WarpspeedPolicy>();
   }
 #endif // _CCCL_CUDACC_AT_LEAST(12, 8)
-  return ::cuda::std::nullopt;
+  return {};
 }
 
 // TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API
@@ -644,7 +644,7 @@ struct DispatchScan
 #if _CCCL_CUDACC_AT_LEAST(12, 8)
     if (kernel_source.use_warpspeed(active_policy))
     {
-      return __invoke_lookahead_algorithm(*active_policy.warpspeed, policy_selector);
+      return __invoke_lookahead_algorithm(active_policy.warpspeed, policy_selector);
     }
 #endif // _CCCL_CUDACC_AT_LEAST(12, 8)
 
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index d9fb0b2a0f1..35db22ee147 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -67,7 +67,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(128) void DeviceScanInitKernel(
 #if _CCCL_CUDACC_AT_LEAST(12, 8)
   constexpr scan_policy policy = PolicySelectorT{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
   if constexpr (policy.warpspeed
-                && detail::scan::use_warpspeed<InputIteratorT, OutputIteratorT, AccumT>(*policy.warpspeed))
+                && detail::scan::use_warpspeed<InputIteratorT, OutputIteratorT, AccumT>(policy.warpspeed))
   {
     device_scan_init_lookahead_body(tile_state.lookahead, num_tiles);
   }
@@ -204,19 +204,19 @@ __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block
 
 #if _CCCL_CUDACC_AT_LEAST(12, 8)
   if constexpr (policy.warpspeed
-                && detail::scan::use_warpspeed<InputIteratorT, OutputIteratorT, AccumT>(*policy.warpspeed))
+                && detail::scan::use_warpspeed<InputIteratorT, OutputIteratorT, AccumT>(policy.warpspeed))
   {
     using WarpspeedPolicyT = warpspeedKernelPolicy<
-      policy.warpspeed->num_squads,
-      policy.warpspeed->num_reduce_warps,
-      policy.warpspeed->num_scan_stor_warps,
-      policy.warpspeed->num_load_warps,
-      policy.warpspeed->num_sched_warps,
-      policy.warpspeed->num_look_ahead_warps,
-      policy.warpspeed->num_look_ahead_items,
-      policy.warpspeed->num_total_threads,
-      policy.warpspeed->items_per_thread,
-      policy.warpspeed->tile_size>;
+      scan_warpspeed_policy::num_squads,
+      policy.warpspeed.num_reduce_warps,
+      policy.warpspeed.num_scan_stor_warps,
+      policy.warpspeed.num_load_warps,
+      policy.warpspeed.num_sched_warps,
+      policy.warpspeed.num_look_ahead_warps,
+      policy.warpspeed.num_look_ahead_items,
+      policy.warpspeed.num_total_threads,
+      policy.warpspeed.items_per_thread,
+      policy.warpspeed.tile_size>;
     NV_IF_TARGET(
       NV_PROVIDES_SM_100, ({
         auto scan_params = scanKernelParams<it_value_t<InputIteratorT>, it_value_t<OutputIteratorT>, AccumT>{
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
index 15d68d6ac90..e6fd0c5f21f 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
@@ -149,6 +149,7 @@ template <typename WarpspeedPolicy>
 _CCCL_API constexpr scan_warpspeed_policy make_scan_warpspeed_policy()
 {
   return scan_warpspeed_policy{
+    true,
     WarpspeedPolicy::num_reduce_warps,
     WarpspeedPolicy::num_scan_stor_warps,
     WarpspeedPolicy::num_load_warps,
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index baa72c09e63..a379c7e667e 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -31,7 +31,6 @@
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/void_t.h>
-#include <cuda/std/optional>
 
 #if !_CCCL_COMPILER(NVRTC)
 #  include <ostream>
@@ -127,6 +126,7 @@ struct scan_warpspeed_policy
 {
   static constexpr int num_squads = 5;
 
+  bool valid = false;
   int num_reduce_warps;
   int num_scan_stor_warps;
   int num_load_warps;
@@ -138,6 +138,11 @@ struct scan_warpspeed_policy
   int items_per_thread;
   int tile_size;
 
+  _CCCL_API constexpr explicit operator bool() const noexcept
+  {
+    return valid;
+  }
+
   _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const
   {
     return warpspeed::SquadDesc{0, num_reduce_warps};
@@ -161,11 +166,11 @@ struct scan_warpspeed_policy
 
   _CCCL_API constexpr friend bool operator==(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs)
   {
-    return lhs.num_reduce_warps == rhs.num_reduce_warps && lhs.num_scan_stor_warps == rhs.num_scan_stor_warps
-        && lhs.num_load_warps == rhs.num_load_warps && lhs.num_sched_warps == rhs.num_sched_warps
-        && lhs.num_look_ahead_warps == rhs.num_look_ahead_warps && lhs.num_look_ahead_items == rhs.num_look_ahead_items
-        && lhs.num_total_threads == rhs.num_total_threads && lhs.items_per_thread == rhs.items_per_thread
-        && lhs.tile_size == rhs.tile_size;
+    return lhs.valid == rhs.valid && lhs.num_reduce_warps == rhs.num_reduce_warps
+        && lhs.num_scan_stor_warps == rhs.num_scan_stor_warps && lhs.num_load_warps == rhs.num_load_warps
+        && lhs.num_sched_warps == rhs.num_sched_warps && lhs.num_look_ahead_warps == rhs.num_look_ahead_warps
+        && lhs.num_look_ahead_items == rhs.num_look_ahead_items && lhs.num_total_threads == rhs.num_total_threads
+        && lhs.items_per_thread == rhs.items_per_thread && lhs.tile_size == rhs.tile_size;
   }
 };
 
@@ -178,7 +183,7 @@ struct scan_policy
   BlockStoreAlgorithm store_algorithm;
   BlockScanAlgorithm scan_algorithm;
   delay_constructor_policy delay_constructor;
-  ::cuda::std::optional<scan_warpspeed_policy> warpspeed = ::cuda::std::nullopt;
+  scan_warpspeed_policy warpspeed = {};
 
   _CCCL_API constexpr friend bool operator==(const scan_policy& lhs, const scan_policy& rhs)
   {
@@ -202,13 +207,13 @@ struct scan_policy
        << ", .delay_constructor = " << p.delay_constructor;
     if (p.warpspeed)
     {
-      os << ", .warpspeed = scan_warpspeed_policy { .num_reduce_warps = " << p.warpspeed->num_reduce_warps
-         << ", .num_scan_stor_warps = " << p.warpspeed->num_scan_stor_warps << ", .num_load_warps = "
-         << p.warpspeed->num_load_warps << ", .num_sched_warps = " << p.warpspeed->num_sched_warps
-         << ", .num_look_ahead_warps = " << p.warpspeed->num_look_ahead_warps << ", .num_look_ahead_items = "
-         << p.warpspeed->num_look_ahead_items << ", .num_total_threads = " << p.warpspeed->num_total_threads
-         << ", .items_per_thread = " << p.warpspeed->items_per_thread << ", .tile_size = " << p.warpspeed->tile_size
-         << "}";
+      os << ", .warpspeed = scan_warpspeed_policy { .valid = " << p.warpspeed.valid << ", .num_reduce_warps = "
+         << p.warpspeed.num_reduce_warps << ", .num_scan_stor_warps = " << p.warpspeed.num_scan_stor_warps
+         << ", .num_load_warps = " << p.warpspeed.num_load_warps << ", .num_sched_warps = "
+         << p.warpspeed.num_sched_warps << ", .num_look_ahead_warps = " << p.warpspeed.num_look_ahead_warps
+         << ", .num_look_ahead_items = " << p.warpspeed.num_look_ahead_items << ", .num_total_threads = "
+         << p.warpspeed.num_total_threads << ", .items_per_thread = " << p.warpspeed.items_per_thread
+         << ", .tile_size = " << p.warpspeed.tile_size << "}";
     }
     return os << " }";
   }
@@ -223,8 +228,8 @@ _CCCL_API constexpr auto make_mem_scaled_scan_policy(
   CacheLoadModifier load_modifier,
   BlockStoreAlgorithm store_algorithm,
   BlockScanAlgorithm scan_algorithm,
-  delay_constructor_policy delay_constructor             = {delay_constructor_kind::fixed_delay, 350, 450},
-  ::cuda::std::optional<scan_warpspeed_policy> warpspeed = ::cuda::std::nullopt) -> scan_policy
+  delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450},
+  scan_warpspeed_policy warpspeed            = {}) -> scan_policy
 {
   const auto scaled = scale_mem_bound(nominal_4b_block_threads, nominal_4b_items_per_thread, compute_t_size);
   return scan_policy{
@@ -777,12 +782,13 @@ constexpr _CCCL_HOST_DEVICE delay_constructor_policy default_delay_constructor_p
          : delay_constructor_policy{delay_constructor_kind::no_delay, 0, 450};
 }
 
-constexpr _CCCL_HOST_DEVICE ::cuda::std::optional<scan_warpspeed_policy>
+constexpr _CCCL_HOST_DEVICE scan_warpspeed_policy
 get_warpspeed_policy(::cuda::arch_id arch, int input_value_size, int accum_size)
 {
   if (arch >= ::cuda::arch_id::sm_100)
   {
     scan_warpspeed_policy warpspeed_policy{};
+    warpspeed_policy.valid = true;
 
     warpspeed_policy.num_reduce_warps     = 4;
     warpspeed_policy.num_scan_stor_warps  = 4;
@@ -806,7 +812,7 @@ get_warpspeed_policy(::cuda::arch_id arch, int input_value_size, int accum_size)
     return warpspeed_policy;
   }
 
-  return ::cuda::std::nullopt;
+  return {};
 }
 
 struct policy_selector

From 497638c0023db26546a9aa3f4d8f2ca9e991d58d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 19 Feb 2026 02:58:23 +0100
Subject: [PATCH 10/31] Pull scan_warpspeed_policy out into its own file.

---
 .../kernels/kernel_scan_warpspeed.cuh         |  1 +
 .../kernels/scan_warpspeed_policy.cuh         | 67 +++++++++++++++++++
 .../device/dispatch/tuning/tuning_scan.cuh    | 54 +--------------
 3 files changed, 69 insertions(+), 53 deletions(-)
 create mode 100644 cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh

diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
index e6fd0c5f21f..c8b1756653b 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
@@ -21,6 +21,7 @@
 #include <cub/detail/warpspeed/squad/load_store.cuh>
 #include <cub/detail/warpspeed/squad/squad.cuh>
 #include <cub/detail/warpspeed/values.cuh>
+#include <cub/device/dispatch/kernels/scan_warpspeed_policy.cuh>
 #include <cub/thread/thread_reduce.cuh>
 #include <cub/thread/thread_scan.cuh>
 #include <cub/warp/warp_reduce.cuh>
diff --git a/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh
new file mode 100644
index 00000000000..3409fc53866
--- /dev/null
+++ b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cub/detail/warpspeed/squad/squad_desc.cuh>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::scan
+{
+struct scan_warpspeed_policy
+{
+  static constexpr int num_squads = 5;
+
+  bool valid = false;
+  int num_reduce_warps;
+  int num_scan_stor_warps;
+  int num_load_warps;
+  int num_sched_warps;
+  int num_look_ahead_warps;
+
+  int num_look_ahead_items;
+  int num_total_threads;
+  int items_per_thread;
+  int tile_size;
+
+  _CCCL_API constexpr explicit operator bool() const noexcept
+  {
+    return valid;
+  }
+
+  _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const
+  {
+    return warpspeed::SquadDesc{0, num_reduce_warps};
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const
+  {
+    return warpspeed::SquadDesc{1, num_scan_stor_warps};
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const
+  {
+    return warpspeed::SquadDesc{2, num_load_warps};
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadSched() const
+  {
+    return warpspeed::SquadDesc{3, num_sched_warps};
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const
+  {
+    return warpspeed::SquadDesc{4, num_look_ahead_warps};
+  }
+
+  _CCCL_API constexpr friend bool operator==(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs)
+  {
+    return lhs.valid == rhs.valid && lhs.num_reduce_warps == rhs.num_reduce_warps
+        && lhs.num_scan_stor_warps == rhs.num_scan_stor_warps && lhs.num_load_warps == rhs.num_load_warps
+        && lhs.num_sched_warps == rhs.num_sched_warps && lhs.num_look_ahead_warps == rhs.num_look_ahead_warps
+        && lhs.num_look_ahead_items == rhs.num_look_ahead_items && lhs.num_total_threads == rhs.num_total_threads
+        && lhs.items_per_thread == rhs.items_per_thread && lhs.tile_size == rhs.tile_size;
+  }
+};
+} // namespace detail::scan
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index a379c7e667e..f7ea31fdc31 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -19,7 +19,7 @@
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
 #include <cub/detail/delay_constructor.cuh>
-#include <cub/detail/warpspeed/squad/squad_desc.cuh>
+#include <cub/device/dispatch/kernels/scan_warpspeed_policy.cuh>
 #include <cub/device/dispatch/tuning/common.cuh>
 #include <cub/thread/thread_load.cuh>
 #include <cub/util_device.cuh>
@@ -122,58 +122,6 @@ constexpr _CCCL_HOST_DEVICE offset_size classify_offset_size()
   return sizeof(OffsetT) == 4 ? offset_size::_4 : sizeof(OffsetT) == 8 ? offset_size::_8 : offset_size::unknown;
 }
 
-struct scan_warpspeed_policy
-{
-  static constexpr int num_squads = 5;
-
-  bool valid = false;
-  int num_reduce_warps;
-  int num_scan_stor_warps;
-  int num_load_warps;
-  int num_sched_warps;
-  int num_look_ahead_warps;
-
-  int num_look_ahead_items;
-  int num_total_threads;
-  int items_per_thread;
-  int tile_size;
-
-  _CCCL_API constexpr explicit operator bool() const noexcept
-  {
-    return valid;
-  }
-
-  _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const
-  {
-    return warpspeed::SquadDesc{0, num_reduce_warps};
-  }
-  _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const
-  {
-    return warpspeed::SquadDesc{1, num_scan_stor_warps};
-  }
-  _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const
-  {
-    return warpspeed::SquadDesc{2, num_load_warps};
-  }
-  _CCCL_API constexpr warpspeed::SquadDesc squadSched() const
-  {
-    return warpspeed::SquadDesc{3, num_sched_warps};
-  }
-  _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const
-  {
-    return warpspeed::SquadDesc{4, num_look_ahead_warps};
-  }
-
-  _CCCL_API constexpr friend bool operator==(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs)
-  {
-    return lhs.valid == rhs.valid && lhs.num_reduce_warps == rhs.num_reduce_warps
-        && lhs.num_scan_stor_warps == rhs.num_scan_stor_warps && lhs.num_load_warps == rhs.num_load_warps
-        && lhs.num_sched_warps == rhs.num_sched_warps && lhs.num_look_ahead_warps == rhs.num_look_ahead_warps
-        && lhs.num_look_ahead_items == rhs.num_look_ahead_items && lhs.num_total_threads == rhs.num_total_threads
-        && lhs.items_per_thread == rhs.items_per_thread && lhs.tile_size == rhs.tile_size;
-  }
-};
-
 struct scan_policy
 {
   int block_threads;

From 61db5eb8becb81a55058b17f99e085b22faefdd8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 19 Feb 2026 03:09:48 +0100
Subject: [PATCH 11/31] Check for is_constant_evaluated in new dispatch.

---
 cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
index c8b1756653b..c78dad4b27e 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
@@ -928,7 +928,7 @@ _CCCL_API constexpr bool use_warpspeed(
   bool output_trivially_copyable,
   bool output_default_constructible)
 {
-#if defined(__CUDA_ARCH__) && __cccl_ptx_isa < 860
+#if defined(__CUDA_ARCH__) && __cccl_ptx_isa < 860 || !defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED)
   (void) policy;
   (void) input_size;
   (void) input_align;

From 5f3aedaf5bf5aa7241872b4b7de04f9992034b6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Fri, 20 Feb 2026 15:35:09 +0100
Subject: [PATCH 12/31] Fix some thinkos.

---
 cub/cub/device/dispatch/dispatch_scan.cuh     | 18 ++++++++------
 .../device/dispatch/kernels/kernel_scan.cuh   | 24 +++++++++++--------
 .../kernels/kernel_scan_warpspeed.cuh         | 10 ++++----
 3 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 309b69a1348..13e9d428708 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -51,6 +51,10 @@
 #include <cuda_runtime_api.h>
 #include <cudaTypedefs.h>
 
+#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG)
+#  include <sstream>
+#endif
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail::scan
@@ -496,12 +500,6 @@ struct DispatchScan
     {
       return error;
     }
-    // number of stages to have an even workload across all SMs (improves small problem sizes), assuming 1 CTA per SM
-    // +1 since it tends to improve performance
-    // TODO(bgruber): make the +1 a tuning parameter
-    const int max_stages_for_even_workload =
-      static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(sm_count * warpspeed_policy.tile_size)) + 1);
-
     // Maximum dynamic shared memory size that we can use for temporary storage.
     int max_dynamic_smem_size{};
     if (const auto error =
@@ -527,6 +525,12 @@ struct DispatchScan
 
     // When launched from the host, maximize the number of stages that we can fit inside the shared memory.
     NV_IF_TARGET(NV_IS_HOST, ({
+                   // number of stages to have an even workload across all SMs (improves small problem sizes), assuming
+                   // 1 CTA per SM +1 since it tends to improve performance
+                   // TODO(bgruber): make the +1 a tuning parameter
+                   const int max_stages_for_even_workload = static_cast<int>(
+                     ::cuda::ceil_div(num_items, static_cast<OffsetT>(sm_count * warpspeed_policy.tile_size)) + 1);
+
                    while (num_stages <= max_stages_for_even_workload)
                    {
                      const auto next_smem_size = detail::scan::smem_for_stages(
@@ -560,7 +564,7 @@ struct DispatchScan
       const auto init_grid_size          = ::cuda::ceil_div(grid_dim, init_kernel_threads);
 
 #  ifdef CUB_DEBUG_LOG
-      _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, , %lld>>>()\n",
+      _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, %lld>>>()\n",
               init_grid_size,
               init_kernel_threads,
               (long long) stream);
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index 35db22ee147..b2679711d2c 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -111,22 +111,26 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte
     *d_num_selected_out = 0;
   }
 }
-template <typename ChainedPolicyT, typename InputIteratorT, typename OutputIteratorT, typename AccumT>
-[[nodiscard]] _CCCL_DEVICE_API _CCCL_CONSTEVAL int get_device_scan_launch_bounds() noexcept
+template <typename PolicySelector, typename InputIteratorT, typename OutputIteratorT, typename AccumT>
+_CCCL_API constexpr int get_device_scan_launch_bounds_helper() noexcept
 {
+  constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
 #if _CCCL_CUDACC_AT_LEAST(12, 8)
-  if constexpr (detail::scan::
-                  scan_use_warpspeed<typename ChainedPolicyT::ActivePolicy, InputIteratorT, OutputIteratorT, AccumT>)
+  if constexpr (policy.warpspeed
+                && detail::scan::use_warpspeed<InputIteratorT, OutputIteratorT, AccumT>(policy.warpspeed))
   {
-    return get_scan_block_threads<typename ChainedPolicyT::ActivePolicy>;
+    return policy.warpspeed.num_total_threads;
   }
-  else
 #endif // _CCCL_CUDACC_AT_LEAST(12, 8)
-  {
-    return static_cast<int>(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS);
-  }
+  return policy.block_threads;
 }
 
+// need a variable template to force constant evaluation, otherwise nvcc may emit
+// "bad attribute argument substitution" errors for __launch_bounds__
+template <typename PolicySelector, typename InputIteratorT, typename OutputIteratorT, typename AccumT>
+inline constexpr int get_device_scan_launch_bounds =
+  get_device_scan_launch_bounds_helper<PolicySelector, InputIteratorT, OutputIteratorT, AccumT>();
+
 /**
  * @brief Scan kernel entry point (multi-block)
  *
@@ -182,7 +186,7 @@ template <typename PolicySelector,
           typename AccumT,
           bool ForceInclusive,
           typename RealInitValueT = typename InitValueT::value_type>
-__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
+__launch_bounds__(get_device_scan_launch_bounds<PolicySelector, InputIteratorT, OutputIteratorT, AccumT>)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel(
     _CCCL_GRID_CONSTANT const InputIteratorT d_in,
     _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
index c78dad4b27e..049a3440358 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
@@ -862,12 +862,14 @@ _CCCL_API constexpr auto smem_for_stages(
   (void) output_size;
   const auto counts = make_scan_stage_counts(num_stages);
 
-  const int align_inout     = ::cuda::std::max({16, input_align, output_align});
-  const int inout_bytes     = policy.tile_size * input_size + 16;
+  const int align_inout = ::cuda::std::max({16, input_align, output_align});
+  const int inout_bytes = policy.tile_size * input_size + 16;
+  // Match sizeof(InOutT): round up to the alignment so each stage matches SmemResource<InOutT>.
+  const int inout_stride    = (inout_bytes + align_inout - 1) & ~(align_inout - 1);
   const auto reduce_squad   = policy.squadReduce();
   const int sum_thread_warp = (reduce_squad.threadCount() + reduce_squad.warpCount()) * accum_size;
 
-  void* inout_base = smemAllocator.alloc(static_cast<::cuda::std::uint32_t>(inout_bytes * num_stages), align_inout);
+  void* inout_base = smemAllocator.alloc(static_cast<::cuda::std::uint32_t>(inout_stride * num_stages), align_inout);
   void* next_block_idx_base = smemAllocator.alloc(
     static_cast<::cuda::std::uint32_t>(sizeof(uint4) * counts.num_block_idx_stages), alignof(uint4));
   void* sum_exclusive_base = smemAllocator.alloc(
@@ -876,7 +878,7 @@ _CCCL_API constexpr auto smem_for_stages(
     smemAllocator.alloc(static_cast<::cuda::std::uint32_t>(sum_thread_warp * num_stages), accum_align);
 
   ScanResourcesRaw res = {
-    warpspeed::SmemResourceRaw{syncHandler, inout_base, inout_bytes, inout_bytes, num_stages},
+    warpspeed::SmemResourceRaw{syncHandler, inout_base, inout_stride, inout_stride, num_stages},
     warpspeed::SmemResourceRaw{
       syncHandler,
       next_block_idx_base,

From 72239e23d8864447e880e1f3104cf5ee2e67e44a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Fri, 20 Feb 2026 16:06:47 +0100
Subject: [PATCH 13/31] Compilation fixes.

---
 cub/cub/device/dispatch/kernels/kernel_scan.cuh | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index b2679711d2c..cb45ef9543b 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -112,7 +112,7 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte
   }
 }
 template <typename PolicySelector, typename InputIteratorT, typename OutputIteratorT, typename AccumT>
-_CCCL_API constexpr int get_device_scan_launch_bounds_helper() noexcept
+_CCCL_DEVICE constexpr int get_device_scan_launch_bounds() noexcept
 {
   constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
 #if _CCCL_CUDACC_AT_LEAST(12, 8)
@@ -125,12 +125,6 @@ _CCCL_API constexpr int get_device_scan_launch_bounds_helper() noexcept
   return policy.block_threads;
 }
 
-// need a variable template to force constant evaluation, otherwise nvcc may emit
-// "bad attribute argument substitution" errors for __launch_bounds__
-template <typename PolicySelector, typename InputIteratorT, typename OutputIteratorT, typename AccumT>
-inline constexpr int get_device_scan_launch_bounds =
-  get_device_scan_launch_bounds_helper<PolicySelector, InputIteratorT, OutputIteratorT, AccumT>();
-
 /**
  * @brief Scan kernel entry point (multi-block)
  *
@@ -186,7 +180,7 @@ template <typename PolicySelector,
           typename AccumT,
           bool ForceInclusive,
           typename RealInitValueT = typename InitValueT::value_type>
-__launch_bounds__(get_device_scan_launch_bounds<PolicySelector, InputIteratorT, OutputIteratorT, AccumT>)
+__launch_bounds__(get_device_scan_launch_bounds<PolicySelector, InputIteratorT, OutputIteratorT, AccumT>())
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel(
     _CCCL_GRID_CONSTANT const InputIteratorT d_in,
     _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
@@ -210,7 +204,7 @@ __launch_bounds__(get_device_scan_launch_bounds<PolicySelector, InputIteratorT,
   if constexpr (policy.warpspeed
                 && detail::scan::use_warpspeed<InputIteratorT, OutputIteratorT, AccumT>(policy.warpspeed))
   {
-    using WarpspeedPolicyT = warpspeedKernelPolicy<
+    using WarpspeedPolicyT [[maybe_unused]] = warpspeedKernelPolicy<
       scan_warpspeed_policy::num_squads,
       policy.warpspeed.num_reduce_warps,
       policy.warpspeed.num_scan_stor_warps,

From 861b25cba42bfd89cab35294f4a8e944a16cb9b1 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 13:37:08 +0100
Subject: [PATCH 14/31] Remove %RANGE% declarations from header

---
 cub/benchmarks/bench/scan/policy_selector.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h
index 386e4b41278..b0a18d61c95 100644
--- a/cub/benchmarks/bench/scan/policy_selector.h
+++ b/cub/benchmarks/bench/scan/policy_selector.h
@@ -3,14 +3,6 @@
 
 #include <cub/device/device_scan.cuh>
 
-// %RANGE% TUNE_ITEMS ipt 7:24:1
-// %RANGE% TUNE_THREADS tpb 128:1024:32
-// %RANGE% TUNE_MAGIC_NS ns 0:2048:4
-// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
-// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5
-// %RANGE% TUNE_TRANSPOSE trp 0:1:1
-// %RANGE% TUNE_LOAD ld 0:1:1
-
 #if !TUNE_BASE
 template <typename AccumT>
 struct policy_selector

From 0a7d3e7e9f0488ec2706f8c9c27109c336b4d457 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Wed, 25 Feb 2026 13:42:10 +0100
Subject: [PATCH 15/31] Add delay_constructor_policy to look_back_helper.cuh

---
 cub/benchmarks/bench/scan/policy_selector.h        | 4 +++-
 nvbench_helper/nvbench_helper/look_back_helper.cuh | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h
index b0a18d61c95..e64813929f6 100644
--- a/cub/benchmarks/bench/scan/policy_selector.h
+++ b/cub/benchmarks/bench/scan/policy_selector.h
@@ -3,6 +3,8 @@
 
 #include <cub/device/device_scan.cuh>
 
+#include "look_back_helper.cuh"
+
 #if !TUNE_BASE
 template <typename AccumT>
 struct policy_selector
@@ -17,7 +19,7 @@ struct policy_selector
       TUNE_LOAD_MODIFIER,
       TUNE_STORE_ALGORITHM,
       cub::BLOCK_SCAN_WARP_SCANS,
-      cub::detail::delay_constructor_policy_from_type<delay_constructor_t>);
+      delay_constructor_policy);
   }
 };
 #endif // !TUNE_BASE
diff --git a/nvbench_helper/nvbench_helper/look_back_helper.cuh b/nvbench_helper/nvbench_helper/look_back_helper.cuh
index f7a97937dcb..81dd1a7a9db 100644
--- a/nvbench_helper/nvbench_helper/look_back_helper.cuh
+++ b/nvbench_helper/nvbench_helper/look_back_helper.cuh
@@ -16,4 +16,7 @@ using delay_constructor_t =
   cub::detail::delay_constructor_t<static_cast<cub::detail::delay_constructor_kind>(TUNE_DELAY_CONSTRUCTOR_ID),
                                    TUNE_MAGIC_NS,
                                    TUNE_L2_WRITE_LATENCY_NS>;
+
+inline constexpr auto delay_constructor_policy = cub::detail::delay_constructor_policy{
+  static_cast<cub::detail::delay_constructor_kind>(TUNE_DELAY_CONSTRUCTOR_ID), TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS};
 #endif // !TUNE_BASE

From 2a7e044035a7666cc1a99be3882e985f44f55466 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 26 Feb 2026 13:43:56 +0100
Subject: [PATCH 16/31] CI fixes.

---
 c/parallel/src/scan.cu                        | 20 ++-----------------
 ci/matrix.yaml                                |  4 ++++
 .../kernels/kernel_scan_warpspeed.cuh         |  5 ++++-
 3 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index fdb2f74e4dd..5df6035c55a 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -283,24 +283,8 @@ try
 
 #if _CCCL_CUDACC_AT_LEAST(12, 8)
   const auto is_trivial_type = [](cccl_type_enum type) {
-    switch (type)
-    {
-      case CCCL_INT8:
-      case CCCL_INT16:
-      case CCCL_INT32:
-      case CCCL_INT64:
-      case CCCL_UINT8:
-      case CCCL_UINT16:
-      case CCCL_UINT32:
-      case CCCL_UINT64:
-      case CCCL_FLOAT16:
-      case CCCL_FLOAT32:
-      case CCCL_FLOAT64:
-      case CCCL_BOOLEAN:
-        return true;
-      default:
-        return false;
-    }
+    // TODO: implement actual logic here when nontrivial custom types become supported
+    return true;
   };
 
   const bool input_contiguous             = input_it.type == cccl_iterator_kind_t::CCCL_POINTER;
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 6cc5c970233..237f6a2f681 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -184,6 +184,8 @@ workflows:
     # c.parallel -- pinned to gcc13 to match python
     - {jobs: ['test'],  project: ['cccl_c_parallel'], ctk: '12.X', cxx: ['gcc13', 'msvc'], gpu: ['rtx2080']}
     - {jobs: ['test'],  project: ['cccl_c_parallel'], ctk: '13.X', cxx: ['gcc13', 'msvc'], gpu: ['rtx2080', 'l4', 'h100']}
+    # RTX PRO 6000 coverage (limited due to small number of runners):
+    - {jobs: ['test'], project: 'cccl_c_parallel', ctk: '13.X', cxx: ['gcc13'], gpu: ['rtxpro6000']}
     # c.experimental.stf -- pinned to gcc13 to match python
     - {jobs: ['test'],  project: ['cccl_c_stf'], ctk: '12.X', cxx: 'gcc13', gpu: ['rtx2080']}
     - {jobs: ['test'],  project: ['cccl_c_stf'], ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']}
@@ -269,6 +271,8 @@ workflows:
     # c.parallel -- pinned to gcc13 to match python
     - {jobs: ['test'],  project: ['cccl_c_parallel'], ctk: '12.X', cxx: ['gcc13', 'msvc'], gpu: ['rtx2080']}
     - {jobs: ['test'],  project: ['cccl_c_parallel'], ctk: '13.X', cxx: ['gcc13', 'msvc'], gpu: ['rtx2080', 'l4', 'h100']}
+    # RTX PRO 6000 coverage (limited due to small number of runners):
+    - {jobs: ['test'], project: 'cccl_c_parallel', ctk: '13.X', cxx: ['gcc13'], gpu: ['rtxpro6000']}
     # c.experimental.stf -- pinned to gcc13 to match python
     - {jobs: ['test'],  project: ['cccl_c_stf'], ctk: '12.X', cxx: 'gcc13', gpu: ['rtx2080']}
     - {jobs: ['test'],  project: ['cccl_c_stf'], ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']}
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
index 049a3440358..4c1918dc613 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
@@ -930,7 +930,10 @@ _CCCL_API constexpr bool use_warpspeed(
   bool output_trivially_copyable,
   bool output_default_constructible)
 {
-#if defined(__CUDA_ARCH__) && __cccl_ptx_isa < 860 || !defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED)
+// We need `cuda::std::is_constant_evaluated` for the compile-time SMEM computation. And we need PTX ISA 8.6.
+// MSVC + nvcc < 13.1 just fails to compile `cub.test.device.scan.lid_1.types_0` with `Internal error` and nothing else.
+#if (defined(__CUDA_ARCH__) && __cccl_ptx_isa < 860) || !defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) \
+  || ((_CCCL_COMPILER(MSVC) && _CCCL_CUDA_COMPILER(NVCC, <, 13, 1)))
   (void) policy;
   (void) input_size;
   (void) input_align;

From 1353f063bde76485c5dc7db4ad772ae67c125808 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 26 Feb 2026 15:51:15 +0100
Subject: [PATCH 17/31] Refactor DeviceScan, remove warpspeed from policy_hub,
 address review comments.

---
 c/parallel/src/scan.cu                        |   4 +-
 cub/cub/device/device_scan.cuh                |  91 +++++----
 cub/cub/device/dispatch/dispatch_scan.cuh     | 101 +++++----
 .../device/dispatch/kernels/kernel_scan.cuh   |  15 +-
 .../kernels/kernel_scan_warpspeed.cuh         | 191 ++++--------------
 .../device/dispatch/tuning/tuning_scan.cuh    | 117 ++++-------
 cub/test/catch2_test_device_scan_env.cu       |  37 ++--
 7 files changed, 206 insertions(+), 350 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index 5df6035c55a..ab7aeb44f04 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -282,7 +282,7 @@ try
   const auto active_policy = policy_sel(arch_id);
 
 #if _CCCL_CUDACC_AT_LEAST(12, 8)
-  const auto is_trivial_type = [](cccl_type_enum type) {
+  const auto is_trivial_type = [](cccl_type_enum /* type */) {
     // TODO: implement actual logic here when nontrivial custom types become supported
     return true;
   };
@@ -450,7 +450,7 @@ CUresult cccl_device_scan(
     CUdevice cu_device;
     check(cuCtxGetDevice(&cu_device));
 
-    auto exec_status = cub::detail::scan::dispatch_with_accum<void>(
+    auto exec_status = cub::detail::scan::dispatch_with_accum<void, EnforceInclusive>(
       d_temp_storage,
       *temp_storage_bytes,
       indirect_arg_t{d_in},
diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh
index 2da3ba19adf..0ef257540e6 100644
--- a/cub/cub/device/device_scan.cuh
+++ b/cub/cub/device/device_scan.cuh
@@ -53,7 +53,7 @@ struct tuning
 struct default_tuning : tuning<default_tuning>
 {
   template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
-  using fn = policy_hub<InputValueT, OutputValueT, AccumT, OffsetT, ScanOpT>;
+  using selector = policy_selector_from_types<InputValueT, OutputValueT, AccumT, OffsetT, ScanOpT>;
 };
 } // namespace detail::scan
 
@@ -137,14 +137,23 @@ struct DeviceScan
                                                     cub::detail::it_value_t<InputIteratorT>,
                                                     typename InitValueT::value_type>>;
 
-    using policy_t = typename scan_tuning_t::
-      template fn<detail::it_value_t<InputIteratorT>, detail::it_value_t<OutputIteratorT>, accum_t, offset_t, ScanOpT>;
+    using policy_selector_t = typename scan_tuning_t::template selector<
+      detail::it_value_t<InputIteratorT>,
+      detail::it_value_t<OutputIteratorT>,
+      accum_t,
+      offset_t,
+      ScanOpT>;
 
-    using dispatch_t =
-      DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, offset_t, accum_t, EnforceInclusive, policy_t>;
-
-    return dispatch_t::Dispatch(
-      d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, static_cast<offset_t>(num_items), stream);
+    return detail::scan::dispatch<EnforceInclusive>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      scan_op,
+      init,
+      static_cast<offset_t>(num_items),
+      stream,
+      policy_selector_t{});
   }
 
   template <typename InputIteratorT,
@@ -295,15 +304,15 @@ struct DeviceScan
     // Initial value
     InitT init_value{};
 
-    return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, detail::InputValue<InitT>, OffsetT>::
-      Dispatch(d_temp_storage,
-               temp_storage_bytes,
-               d_in,
-               d_out,
-               ::cuda::std::plus<>{},
-               detail::InputValue<InitT>(init_value),
-               num_items,
-               stream);
+    return detail::scan::dispatch(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      ::cuda::std::plus<>{},
+      detail::InputValue<InitT>(init_value),
+      static_cast<OffsetT>(num_items),
+      stream);
   }
 
   //! @rst
@@ -587,14 +596,14 @@ struct DeviceScan
     // Unsigned integer type for global offsets
     using OffsetT = detail::choose_offset_t<NumItemsT>;
 
-    return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
+    return detail::scan::dispatch(
       d_temp_storage,
       temp_storage_bytes,
       d_in,
       d_out,
       scan_op,
       detail::InputValue<InitValueT>(init_value),
-      num_items,
+      static_cast<OffsetT>(num_items),
       stream);
   }
 
@@ -929,14 +938,14 @@ struct DeviceScan
     // Unsigned integer type for global offsets
     using OffsetT = detail::choose_offset_t<NumItemsT>;
 
-    return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
+    return detail::scan::dispatch(
       d_temp_storage,
       temp_storage_bytes,
       d_in,
       d_out,
       scan_op,
       detail::InputValue<InitValueT>(init_value),
-      num_items,
+      static_cast<OffsetT>(num_items),
       stream);
   }
 
@@ -1158,8 +1167,15 @@ struct DeviceScan
     // Unsigned integer type for global offsets
     using OffsetT = detail::choose_offset_t<NumItemsT>;
 
-    return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, NullType, OffsetT>::Dispatch(
-      d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream);
+    return detail::scan::dispatch(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      ::cuda::std::plus<>{},
+      NullType{},
+      static_cast<OffsetT>(num_items),
+      stream);
   }
 
   //! @rst
@@ -1355,8 +1371,8 @@ struct DeviceScan
     // Unsigned integer type for global offsets
     using OffsetT = detail::choose_offset_t<NumItemsT>;
 
-    return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
-      d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream);
+    return detail::scan::dispatch(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), static_cast<OffsetT>(num_items), stream);
   }
 
   //! @rst
@@ -1446,23 +1462,16 @@ struct DeviceScan
 
     // Unsigned integer type for global offsets
     using OffsetT = detail::choose_offset_t<NumItemsT>;
-    using AccumT  = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::it_value_t<InputIteratorT>, InitValueT>;
 
-    return DispatchScan<
-      InputIteratorT,
-      OutputIteratorT,
-      ScanOpT,
-      detail::InputValue<InitValueT>,
-      OffsetT,
-      AccumT,
-      ForceInclusive::Yes>::Dispatch(d_temp_storage,
-                                     temp_storage_bytes,
-                                     d_in,
-                                     d_out,
-                                     scan_op,
-                                     detail::InputValue<InitValueT>(init_value),
-                                     num_items,
-                                     stream);
+    return detail::scan::dispatch<ForceInclusive::Yes>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      scan_op,
+      detail::InputValue<InitValueT>(init_value),
+      static_cast<OffsetT>(num_items),
+      stream);
   }
 
   //! @rst
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 13e9d428708..a50a9604ae5 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -135,27 +135,6 @@ struct DeviceScanKernelSource
   }
 };
 
-// TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API
-template <typename T, typename = void>
-struct has_warpspeed_policy : ::cuda::std::false_type
-{};
-
-template <typename T>
-struct has_warpspeed_policy<T, ::cuda::std::void_t<typename T::WarpspeedPolicy>> : ::cuda::std::true_type
-{};
-
-template <typename LegacyActivePolicy>
-_CCCL_API constexpr auto convert_warpspeed_policy() -> scan_warpspeed_policy
-{
-#if _CCCL_CUDACC_AT_LEAST(12, 8)
-  if constexpr (has_warpspeed_policy<LegacyActivePolicy>::value)
-  {
-    return make_scan_warpspeed_policy<typename LegacyActivePolicy::WarpspeedPolicy>();
-  }
-#endif // _CCCL_CUDACC_AT_LEAST(12, 8)
-  return {};
-}
-
 // TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API
 template <typename LegacyActivePolicy>
 _CCCL_API constexpr auto convert_policy() -> scan_policy
@@ -168,12 +147,11 @@ _CCCL_API constexpr auto convert_policy() -> scan_policy
     scan_policy_t::LOAD_MODIFIER,
     scan_policy_t::STORE_ALGORITHM,
     scan_policy_t::SCAN_ALGORITHM,
-    detail::delay_constructor_policy_from_type<typename scan_policy_t::detail::delay_constructor_t>,
-    convert_warpspeed_policy<LegacyActivePolicy>()};
+    detail::delay_constructor_policy_from_type<typename scan_policy_t::detail::delay_constructor_t>};
 }
 
 // TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API
-template <typename PolicyHub>
+template <typename PolicyHub, typename InputValueT, typename OutputValueT, typename AccumT>
 struct policy_selector_from_hub
 {
   // this is only called in device code
@@ -181,6 +159,13 @@ struct policy_selector_from_hub
   {
     return convert_policy<typename PolicyHub::MaxPolicy::ActivePolicy>();
   }
+
+  static constexpr int input_value_size       = int{sizeof(InputValueT)};
+  static constexpr int input_value_alignment  = int{alignof(InputValueT)};
+  static constexpr int output_value_size      = int{sizeof(OutputValueT)};
+  static constexpr int output_value_alignment = int{alignof(OutputValueT)};
+  static constexpr int accum_size             = int{sizeof(AccumT)};
+  static constexpr int accum_alignment        = int{alignof(AccumT)};
 };
 } // namespace detail::scan
 
@@ -228,7 +213,8 @@ template <
   typename PolicyHub              = detail::scan::
     policy_hub<detail::it_value_t<InputIteratorT>, detail::it_value_t<OutputIteratorT>, AccumT, OffsetT, ScanOpT>,
   typename KernelSource = detail::scan::DeviceScanKernelSource<
-    detail::scan::policy_selector_from_hub<PolicyHub>,
+    detail::scan::
+      policy_selector_from_hub<PolicyHub, detail::it_value_t<InputIteratorT>, detail::it_value_t<OutputIteratorT>, AccumT>,
     THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<InputIteratorT>,
     THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<OutputIteratorT>,
     ScanOpT,
@@ -880,24 +866,31 @@ struct DispatchScan
 namespace detail::scan
 {
 template <
+  ForceInclusive EnforceInclusive = ForceInclusive::No,
   typename InputIteratorT,
   typename OutputIteratorT,
   typename ScanOpT,
   typename InitValueT,
   typename OffsetT,
-  typename AccumT                 = ::cuda::std::__accumulator_t<ScanOpT,
-                                                                 cub::detail::it_value_t<InputIteratorT>,
-                                                                 ::cuda::std::_If<::cuda::std::is_same_v<InitValueT, NullType>,
-                                                                                  cub::detail::it_value_t<InputIteratorT>,
-                                                                                  typename InitValueT::value_type>>,
-  ForceInclusive EnforceInclusive = ForceInclusive::No,
-  typename PolicySelector         = policy_selector_from_types<detail::it_value_t<InputIteratorT>,
-                                                               detail::it_value_t<OutputIteratorT>,
-                                                               AccumT,
-                                                               OffsetT,
-                                                               ScanOpT>,
-  typename KernelSource =
-    DeviceScanKernelSource<PolicySelector, InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT, AccumT, EnforceInclusive>,
+  typename AccumT         = ::cuda::std::__accumulator_t<ScanOpT,
+                                                         cub::detail::it_value_t<InputIteratorT>,
+                                                         ::cuda::std::_If<::cuda::std::is_same_v<InitValueT, NullType>,
+                                                                          cub::detail::it_value_t<InputIteratorT>,
+                                                                          typename InitValueT::value_type>>,
+  typename PolicySelector = policy_selector_from_types<detail::it_value_t<InputIteratorT>,
+                                                       detail::it_value_t<OutputIteratorT>,
+                                                       AccumT,
+                                                       OffsetT,
+                                                       ScanOpT>,
+  typename KernelSource   = DeviceScanKernelSource<
+      PolicySelector,
+      THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<InputIteratorT>,
+      THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<OutputIteratorT>,
+      ScanOpT,
+      InitValueT,
+      OffsetT,
+      AccumT,
+      EnforceInclusive>,
   typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
 #if _CCCL_HAS_CONCEPTS()
   requires scan_policy_selector<PolicySelector>
@@ -961,22 +954,28 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   });
 }
 
-template <
-  typename AccumT,
-  typename InputIteratorT,
-  typename OutputIteratorT,
-  typename ScanOpT,
-  typename InitValueT,
-  typename OffsetT,
-  ForceInclusive EnforceInclusive = ForceInclusive::No,
-  typename PolicySelector         = policy_selector_from_types<detail::it_value_t<InputIteratorT>,
+template <typename AccumT,
+          ForceInclusive EnforceInclusive = ForceInclusive::No,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename ScanOpT,
+          typename InitValueT,
+          typename OffsetT,
+          typename PolicySelector = policy_selector_from_types<detail::it_value_t<InputIteratorT>,
                                                                detail::it_value_t<OutputIteratorT>,
                                                                AccumT,
                                                                OffsetT,
                                                                ScanOpT>,
-  typename KernelSource =
-    DeviceScanKernelSource<PolicySelector, InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT, AccumT, EnforceInclusive>,
-  typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
+          typename KernelSource   = DeviceScanKernelSource<
+              PolicySelector,
+              THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<InputIteratorT>,
+              THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<OutputIteratorT>,
+              ScanOpT,
+              InitValueT,
+              OffsetT,
+              AccumT,
+              EnforceInclusive>,
+          typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
 CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_with_accum(
   void* d_temp_storage,
   size_t& temp_storage_bytes,
@@ -990,7 +989,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_with_accum(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {}) -> cudaError_t
 {
-  return dispatch<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT, AccumT>(
+  return dispatch<EnforceInclusive, InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT, AccumT>(
     d_temp_storage,
     temp_storage_bytes,
     d_in,
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index cb45ef9543b..d703d035d2b 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -112,7 +112,7 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte
   }
 }
 template <typename PolicySelector, typename InputIteratorT, typename OutputIteratorT, typename AccumT>
-_CCCL_DEVICE constexpr int get_device_scan_launch_bounds() noexcept
+[[nodiscard]] _CCCL_DEVICE_API _CCCL_CONSTEVAL int get_device_scan_launch_bounds() noexcept
 {
   constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
 #if _CCCL_CUDACC_AT_LEAST(12, 8)
@@ -204,22 +204,11 @@ __launch_bounds__(get_device_scan_launch_bounds<PolicySelector, InputIteratorT,
   if constexpr (policy.warpspeed
                 && detail::scan::use_warpspeed<InputIteratorT, OutputIteratorT, AccumT>(policy.warpspeed))
   {
-    using WarpspeedPolicyT [[maybe_unused]] = warpspeedKernelPolicy<
-      scan_warpspeed_policy::num_squads,
-      policy.warpspeed.num_reduce_warps,
-      policy.warpspeed.num_scan_stor_warps,
-      policy.warpspeed.num_load_warps,
-      policy.warpspeed.num_sched_warps,
-      policy.warpspeed.num_look_ahead_warps,
-      policy.warpspeed.num_look_ahead_items,
-      policy.warpspeed.num_total_threads,
-      policy.warpspeed.items_per_thread,
-      policy.warpspeed.tile_size>;
     NV_IF_TARGET(
       NV_PROVIDES_SM_100, ({
         auto scan_params = scanKernelParams<it_value_t<InputIteratorT>, it_value_t<OutputIteratorT>, AccumT>{
           d_in, d_out, tile_state.lookahead, num_items, num_stages};
-        device_scan_lookahead_body<WarpspeedPolicyT, ForceInclusive, RealInitValueT>(scan_params, scan_op, init_value);
+        device_scan_lookahead_body<PolicySelector, ForceInclusive, RealInitValueT>(scan_params, scan_op, init_value);
       }));
   }
   else
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
index 4c1918dc613..921036b836a 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
@@ -30,6 +30,7 @@
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/__device/arch_id.h>
 #include <cuda/__memory/align_down.h>
 #include <cuda/__memory/align_up.h>
 #include <cuda/__ptx/instructions/clusterlaunchcontrol.h>
@@ -43,53 +44,11 @@ CUB_NAMESPACE_BEGIN
 
 namespace detail::scan
 {
-template <int NumSquads,
-          int NumReduceWarps,
-          int NumScanStorWarps,
-          int NumLoadWarps,
-          int NumSchedWarps,
-          int NumLookAheadWarps,
-          int NumLookAheadItems,
-          int NumTotalThreads,
-          int ItemsPerThread,
-          int TileSize>
-struct warpspeedKernelPolicy
+template <typename PolicySelector>
+_CCCL_API constexpr scan_warpspeed_policy get_warpspeed_policy() noexcept
 {
-  static constexpr int num_squads = NumSquads;
-
-  // The squads cannot be static constexpr variables, as those are not device accessible
-  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept
-  {
-    return warpspeed::SquadDesc{0, NumReduceWarps};
-  }
-  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept
-  {
-    return warpspeed::SquadDesc{1, NumScanStorWarps};
-  }
-  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept
-  {
-    return warpspeed::SquadDesc{2, NumLoadWarps};
-  }
-  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadSched() noexcept
-  {
-    return warpspeed::SquadDesc{3, NumSchedWarps};
-  }
-  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLookback() noexcept
-  {
-    return warpspeed::SquadDesc{4, NumLookAheadWarps};
-  }
-
-  static constexpr int num_look_ahead_items = NumLookAheadItems;
-  static constexpr int num_total_threads    = NumTotalThreads;
-  static constexpr int items_per_thread     = ItemsPerThread;
-  static constexpr int tile_size            = TileSize;
-
-  static constexpr int num_reduce_warps     = NumReduceWarps;
-  static constexpr int num_scan_stor_warps  = NumScanStorWarps;
-  static constexpr int num_load_warps       = NumLoadWarps;
-  static constexpr int num_sched_warps      = NumSchedWarps;
-  static constexpr int num_look_ahead_warps = NumLookAheadWarps;
-};
+  return PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).warpspeed;
+}
 
 template <typename InputT, typename OutputT, typename AccumT>
 struct scanKernelParams
@@ -102,22 +61,27 @@ struct scanKernelParams
 };
 
 // Struct holding all scan kernel resources
-template <typename WarpspeedPolicy, typename InputT, typename OutputT, typename AccumT>
+template <typename PolicySelector, typename InputT, typename OutputT, typename AccumT>
 struct ScanResources
 {
+  _CCCL_API static constexpr scan_warpspeed_policy warpspeed_policy() noexcept
+  {
+    return get_warpspeed_policy<PolicySelector>();
+  }
+
   // align to at least 16 bytes (InputT/OutputT may be aligned higher) so each stage starts correctly aligned
   struct alignas(::cuda::std::max({::cuda::std::size_t{16}, alignof(InputT), alignof(OutputT)})) InOutT
   {
     // the tile_size size is a multiple of the warp size, and thus for sure a multiple of 16
-    static_assert(WarpspeedPolicy::tile_size % 16 == 0, "tile_size must be multiple of 16");
+    static_assert(ScanResources::warpspeed_policy().tile_size % 16 == 0, "tile_size must be multiple of 16");
 
     // therefore, unaligned inputs need exactly 16 bytes extra for overcopying (tail padding = 16 - head padding)
-    ::cuda::std::byte inout[WarpspeedPolicy::tile_size * sizeof(InputT) + 16];
+    ::cuda::std::byte inout[ScanResources::warpspeed_policy().tile_size * sizeof(InputT) + 16];
   };
   static_assert(alignof(InOutT) >= alignof(InputT));
   static_assert(alignof(InOutT) >= alignof(OutputT));
-  using SumThreadAndWarpT =
-    AccumT[WarpspeedPolicy::squadReduce().threadCount() + WarpspeedPolicy::squadReduce().warpCount()];
+  using SumThreadAndWarpT = AccumT[ScanResources::warpspeed_policy().squadReduce().threadCount()
+                                   + ScanResources::warpspeed_policy().squadReduce().warpCount()];
 
   warpspeed::SmemResource<InOutT> smemInOut; // will also be used to stage the output (as OutputT) for the bulk copy
   warpspeed::SmemResource<uint4> smemNextBlockIdx;
@@ -146,22 +110,6 @@ _CCCL_API constexpr scan_stage_counts make_scan_stage_counts(int num_stages)
   return {num_block_idx_stages, 2};
 }
 
-template <typename WarpspeedPolicy>
-_CCCL_API constexpr scan_warpspeed_policy make_scan_warpspeed_policy()
-{
-  return scan_warpspeed_policy{
-    true,
-    WarpspeedPolicy::num_reduce_warps,
-    WarpspeedPolicy::num_scan_stor_warps,
-    WarpspeedPolicy::num_load_warps,
-    WarpspeedPolicy::num_sched_warps,
-    WarpspeedPolicy::num_look_ahead_warps,
-    WarpspeedPolicy::num_look_ahead_items,
-    WarpspeedPolicy::num_total_threads,
-    WarpspeedPolicy::items_per_thread,
-    WarpspeedPolicy::tile_size};
-}
-
 template <typename Policy,
           typename SmemInOutT,
           typename SmemNextBlockIdxT,
@@ -176,7 +124,7 @@ _CCCL_API constexpr void setup_scan_resources(
   SmemSumExclusiveCtaT& smemSumExclusiveCta,
   SmemSumThreadAndWarpT& smemSumThreadAndWarp)
 {
-  const warpspeed::SquadDesc scanSquads[Policy::num_squads] = {
+  const warpspeed::SquadDesc scanSquads[scan_warpspeed_policy::num_squads] = {
     policy.squadReduce(),
     policy.squadScanStore(),
     policy.squadLoad(),
@@ -199,11 +147,11 @@ _CCCL_API constexpr void setup_scan_resources(
 
 // Function to allocate resources.
 
-template <typename WarpspeedPolicy, typename InputT, typename OutputT, typename AccumT>
-[[nodiscard]] _CCCL_API constexpr ScanResources<WarpspeedPolicy, InputT, OutputT, AccumT>
+template <typename PolicySelector, typename InputT, typename OutputT, typename AccumT>
+[[nodiscard]] _CCCL_API constexpr ScanResources<PolicySelector, InputT, OutputT, AccumT>
 allocResources(warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& smemAllocator, int numStages)
 {
-  using ScanResourcesT    = ScanResources<WarpspeedPolicy, InputT, OutputT, AccumT>;
+  using ScanResourcesT    = ScanResources<PolicySelector, InputT, OutputT, AccumT>;
   using InOutT            = typename ScanResourcesT::InOutT;
   using SumThreadAndWarpT = typename ScanResourcesT::SumThreadAndWarpT;
 
@@ -217,7 +165,7 @@ allocResources(warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& sm
   };
 
   setup_scan_resources(
-    make_scan_warpspeed_policy<WarpspeedPolicy>(),
+    ScanResourcesT::warpspeed_policy(),
     syncHandler,
     smemAllocator,
     res.smemInOut,
@@ -293,7 +241,7 @@ _CCCL_DEVICE_API Tp warpScanExclusive(const Tp regInput, ScanOpT& scan_op)
 // warp-specialization dispatch is performed once at the start of the kernel and
 // not in any of the hot loops (even if that may seem the case from a first
 // glance at the code).
-template <typename WarpspeedPolicy,
+template <typename PolicySelector,
           typename InputT,
           typename OutputT,
           typename AccumT,
@@ -310,17 +258,18 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody(
   ////////////////////////////////////////////////////////////////////////////////
   // Tuning dependent variables
   ////////////////////////////////////////////////////////////////////////////////
-  static constexpr warpspeed::SquadDesc squadReduce    = WarpspeedPolicy::squadReduce();
-  static constexpr warpspeed::SquadDesc squadScanStore = WarpspeedPolicy::squadScanStore();
-  static constexpr warpspeed::SquadDesc squadLoad      = WarpspeedPolicy::squadLoad();
-  static constexpr warpspeed::SquadDesc squadSched     = WarpspeedPolicy::squadSched();
-  static constexpr warpspeed::SquadDesc squadLookback  = WarpspeedPolicy::squadLookback();
+  static constexpr scan_warpspeed_policy policy        = get_warpspeed_policy<PolicySelector>();
+  static constexpr warpspeed::SquadDesc squadReduce    = policy.squadReduce();
+  static constexpr warpspeed::SquadDesc squadScanStore = policy.squadScanStore();
+  static constexpr warpspeed::SquadDesc squadLoad      = policy.squadLoad();
+  static constexpr warpspeed::SquadDesc squadSched     = policy.squadSched();
+  static constexpr warpspeed::SquadDesc squadLookback  = policy.squadLookback();
 
-  constexpr int tile_size            = WarpspeedPolicy::tile_size;
-  constexpr int num_look_ahead_items = WarpspeedPolicy::num_look_ahead_items;
+  constexpr int tile_size            = policy.tile_size;
+  constexpr int num_look_ahead_items = policy.num_look_ahead_items;
 
   // We might try to instantiate the kernel with hughe types which would lead to a small tile size. Ensure its never 0
-  constexpr int elemPerThread = WarpspeedPolicy::items_per_thread;
+  constexpr int elemPerThread = policy.items_per_thread;
   static_assert(elemPerThread * squadReduce.threadCount() == tile_size, "Invalid tuning policy");
 
   ////////////////////////////////////////////////////////////////////////////////
@@ -329,8 +278,8 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody(
   warpspeed::SyncHandler syncHandler{};
   warpspeed::SmemAllocator smemAllocator{};
 
-  ScanResources<WarpspeedPolicy, InputT, OutputT, AccumT> res =
-    allocResources<WarpspeedPolicy, InputT, OutputT, AccumT>(syncHandler, smemAllocator, params.numStages);
+  ScanResources<PolicySelector, InputT, OutputT, AccumT> res =
+    allocResources<PolicySelector, InputT, OutputT, AccumT>(syncHandler, smemAllocator, params.numStages);
 
   syncHandler.clusterInitSync(specialRegisters);
 
@@ -718,8 +667,7 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody(
       {
         // otherwise, issue multiple bulk copies in chunks of the input tile size
         // TODO(bgruber): I am sure this could be implemented a lot more efficiently
-        static constexpr int elem_per_chunk =
-          static_cast<int>(WarpspeedPolicy::tile_size * sizeof(InputT) / sizeof(OutputT));
+        static constexpr int elem_per_chunk = static_cast<int>(policy.tile_size * sizeof(InputT) / sizeof(OutputT));
         for (int chunk_offset = 0; chunk_offset < valid_items; chunk_offset += elem_per_chunk)
         {
           const int chunk_size = ::cuda::std::min(valid_items - chunk_offset, elem_per_chunk);
@@ -774,14 +722,7 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody(
 
 #endif // __cccl_ptx_isa >= 860
 
-template <typename ActivePolicy, class = void>
-inline constexpr int get_scan_block_threads = 1;
-
-template <typename ActivePolicy>
-inline constexpr int get_scan_block_threads<ActivePolicy, ::cuda::std::void_t<typename ActivePolicy::WarpspeedPolicy>> =
-  ActivePolicy::WarpspeedPolicy::num_total_threads;
-
-template <typename WarpspeedPolicy,
+template <typename PolicySelector,
           bool ForceInclusive,
           typename RealInitValueT,
           typename InputT,
@@ -796,18 +737,20 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void device_scan_lookahead_body(
   // Cache special registers at start of kernel
   warpspeed::SpecialRegisters specialRegisters = warpspeed::getSpecialRegisters();
 
+  static constexpr scan_warpspeed_policy policy = get_warpspeed_policy<PolicySelector>();
+
   // Dispatch for warp-specialization
-  static constexpr warpspeed::SquadDesc scanSquads[WarpspeedPolicy::num_squads] = {
-    WarpspeedPolicy::squadReduce(),
-    WarpspeedPolicy::squadScanStore(),
-    WarpspeedPolicy::squadLoad(),
-    WarpspeedPolicy::squadSched(),
-    WarpspeedPolicy::squadLookback(),
+  static constexpr warpspeed::SquadDesc scanSquads[scan_warpspeed_policy::num_squads] = {
+    policy.squadReduce(),
+    policy.squadScanStore(),
+    policy.squadLoad(),
+    policy.squadSched(),
+    policy.squadLookback(),
   };
 
   // we need to force inline the lambda, but clang in CUDA mode only likes the GNU syntax
   warpspeed::squadDispatch(specialRegisters, scanSquads, [&](warpspeed::Squad squad) _CCCL_FORCEINLINE_LAMBDA {
-    kernelBody<WarpspeedPolicy, InputT, OutputT, AccumT, ScanOpT, RealInitValueT, ForceInclusive>(
+    kernelBody<PolicySelector, InputT, OutputT, AccumT, ScanOpT, RealInitValueT, ForceInclusive>(
       squad, specialRegisters, params, ::cuda::std::move(scan_op), static_cast<RealInitValueT>(init_value));
   });
 #endif // __cccl_ptx_isa >= 860
@@ -978,54 +921,6 @@ _CCCL_API constexpr bool use_warpspeed(const scan_warpspeed_policy& policy)
     ::cuda::std::is_trivially_copyable_v<OutputT>,
     ::cuda::std::is_default_constructible_v<OutputT>);
 }
-
-#if 0
-// we check the required shared memory inside a template, so the error message shows the amount in case of failure
-template <int RequiredSharedMemory>
-_CCCL_API constexpr void verify_smem()
-{
-  static_assert(RequiredSharedMemory <= max_smem_per_block,
-                "Single stage configuration exceeds architecture independent SMEM (48KiB)");
-}
-#endif
-
-template <typename WarpspeedPolicy, typename InputIteratorT, typename OutputIteratorT, typename AccumT>
-_CCCL_API constexpr auto one_stage_fits_48KiB_SMEM() -> bool
-{
-  using InputT                           = it_value_t<InputIteratorT>;
-  using OutputT                          = it_value_t<OutputIteratorT>;
-  constexpr scan_warpspeed_policy policy = make_scan_warpspeed_policy<WarpspeedPolicy>();
-  constexpr int smem_size_1_stage        = smem_for_stages<InputT, OutputT, AccumT>(policy, 1);
-// We can turn this on to report if a single stage of the warpspeed scan would exceed 48KiB if SMEM.
-#if 0
-  verify_smem<smem_size_1_stage>();
-#endif
-  return smem_size_1_stage <= max_smem_per_block;
-}
-
-template <typename Policy, typename InputIterator, typename OutputIterator, typename AccumT, typename = void>
-inline constexpr bool scan_use_warpspeed = false;
-
-// We need `cuda::std::is_constant_evaluated` for the compile-time SMEM computation. And we need PTX ISA 8.6.
-// MSVC + nvcc < 13.1 just fails to compile `cub.test.device.scan.lid_1.types_0` with `Internal error` and nothing else.
-#if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && __cccl_ptx_isa >= 860 \
-  && !(_CCCL_COMPILER(MSVC) && _CCCL_CUDA_COMPILER(NVCC, <, 13, 1))
-template <typename Policy, typename InputIteratorT, typename OutputIteratorT, typename AccumT>
-inline constexpr bool scan_use_warpspeed<Policy,
-                                         InputIteratorT,
-                                         OutputIteratorT,
-                                         AccumT,
-                                         ::cuda::std::void_t<typename Policy::WarpspeedPolicy>> =
-  // for bulk copy: input and output iterators must be contiguous and their value types must be trivially copyable
-  THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InputIteratorT>
-  && ::cuda::std::is_trivially_copyable_v<it_value_t<InputIteratorT>>
-  && THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>
-  && ::cuda::std::is_trivially_copyable_v<it_value_t<OutputIteratorT>>
-  // for bulk copy store: we need to prepare a buffer of output types in SMEM
-  && ::cuda::std::is_default_constructible_v<it_value_t<OutputIteratorT>>
-  // need to fit one stage into 48KiB so binaries stay forward compatible with future GPUs
-  && one_stage_fits_48KiB_SMEM<typename Policy::WarpspeedPolicy, InputIteratorT, OutputIteratorT, AccumT>();
-#endif // defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && __cccl_ptx_isa >= 860
 } // namespace detail::scan
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index f7ea31fdc31..89138cd1132 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -642,77 +642,6 @@ struct policy_hub
 
     using ScanPolicyT =
       decltype(select_agent_policy100<sm100_tuning<InputValueT, AccumT, OffsetT, classify_op<ScanOpT>>, InputValueT>(0));
-
-#if __cccl_ptx_isa >= 860
-    struct WarpspeedPolicy
-    {
-      // Squad definitions
-      static constexpr int num_squads           = 5;
-      static constexpr int num_threads_per_warp = 32;
-
-      // TODO(bgruber): tune this
-      static constexpr int num_reduce_warps     = 4;
-      static constexpr int num_scan_stor_warps  = 4;
-      static constexpr int num_load_warps       = 1;
-      static constexpr int num_sched_warps      = 1;
-      static constexpr int num_look_ahead_warps = 1;
-
-      // TODO(bgruber): 5 is a bit better for complex<float>
-      static constexpr int num_look_ahead_items = sizeof(AccumT) == 2 ? 3 : 4;
-
-      // Deduced definitions
-      static constexpr int num_total_warps =
-        num_reduce_warps + num_scan_stor_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
-      static constexpr int num_total_threads = num_total_warps * num_threads_per_warp;
-
-      static constexpr int squad_reduce_thread_count = num_reduce_warps * num_threads_per_warp;
-
-      // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM)
-      // 2-byte types and double needed special handling
-      static constexpr int items_per_thread =
-        ::cuda::std::max(256 / (sizeof(InputValueT) == 2 ? 2 : int{sizeof(AccumT)}) - 1, 1);
-      // TODO(bgruber): the special handling of double below is a LOT faster, but exceeds 48KiB SMEM
-      // clang-format off
-      // |   F64   |      I32      |     72576      |  11.295 us |       2.44% |  11.917 us |       8.02% |     0.622 us |   5.50% |   SLOW   |
-      // |   F64   |      I32      |    1056384     |  16.162 us |       6.24% |  15.847 us |       5.57% |    -0.315 us |  -1.95% |   SAME   |
-      // |   F64   |      I32      |    16781184    |  65.696 us |       1.64% |  60.650 us |       3.37% |    -5.046 us |  -7.68% |   FAST   |
-      // |   F64   |      I32      |   268442496    | 863.896 us |       0.22% | 679.100 us |       0.93% |  -184.796 us | -21.39% |   FAST   |
-      // |   F64   |      I32      |   1073745792   |   3.418 ms |       0.12% |   2.662 ms |       0.46% |  -755.740 us | -22.11% |   FAST   |
-      // |   F64   |      I64      |     72576      |  12.301 us |       8.18% |  12.987 us |       5.75% |     0.686 us |   5.58% |   SAME   |
-      // |   F64   |      I64      |    1056384     |  16.775 us |       5.70% |  16.091 us |       6.14% |    -0.684 us |  -4.08% |   SAME   |
-      // |   F64   |      I64      |    16781184    |  66.970 us |       1.41% |  58.024 us |       3.17% |    -8.946 us | -13.36% |   FAST   |
-      // |   F64   |      I64      |   268442496    | 863.826 us |       0.23% | 676.465 us |       0.98% |  -187.360 us | -21.69% |   FAST   |
-      // |   F64   |      I64      |   1073745792   |   3.419 ms |       0.11% |   2.664 ms |       0.48% |  -755.409 us | -22.09% |   FAST   |
-      // |   F64   |      I64      |   4294975104   |  13.641 ms |       0.05% |  10.575 ms |       0.24% | -3065.815 us | -22.48% |   FAST   |
-      // clang-format on
-      // (256 / (sizeof(InputValueT) == 2 ? 2 : (::cuda::std::is_same_v<InputValueT, double> ? 4 : sizeof(AccumT))) -
-      // 1);
-
-      static constexpr int tile_size = items_per_thread * squad_reduce_thread_count;
-
-      // The squads cannot be static constexpr variables, as those are not device accessible
-      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept
-      {
-        return warpspeed::SquadDesc{0, num_reduce_warps};
-      }
-      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept
-      {
-        return warpspeed::SquadDesc{1, num_scan_stor_warps};
-      }
-      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept
-      {
-        return warpspeed::SquadDesc{2, num_load_warps};
-      }
-      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadSched() noexcept
-      {
-        return warpspeed::SquadDesc{3, num_sched_warps};
-      }
-      [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLookback() noexcept
-      {
-        return warpspeed::SquadDesc{4, num_look_ahead_warps};
-      }
-    };
-#endif // __cccl_ptx_isa >= 860
   };
 
   using MaxPolicy = Policy1000;
@@ -738,14 +667,17 @@ get_warpspeed_policy(::cuda::arch_id arch, int input_value_size, int accum_size)
     scan_warpspeed_policy warpspeed_policy{};
     warpspeed_policy.valid = true;
 
+    // TODO(bgruber): tune this
     warpspeed_policy.num_reduce_warps     = 4;
     warpspeed_policy.num_scan_stor_warps  = 4;
     warpspeed_policy.num_load_warps       = 1;
     warpspeed_policy.num_sched_warps      = 1;
     warpspeed_policy.num_look_ahead_warps = 1;
 
+    // TODO(bgruber): 5 is a bit better for complex<float>
     warpspeed_policy.num_look_ahead_items = accum_size == 2 ? 3 : 4;
 
+    // Deduced definitions
     const auto num_threads_per_warp = 32;
     const auto num_total_warps =
       warpspeed_policy.num_reduce_warps + warpspeed_policy.num_scan_stor_warps + warpspeed_policy.num_load_warps
@@ -754,8 +686,27 @@ get_warpspeed_policy(::cuda::arch_id arch, int input_value_size, int accum_size)
 
     const auto squad_reduce_thread_count = warpspeed_policy.num_reduce_warps * num_threads_per_warp;
 
+    // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM)
+    // 2-byte types and double needed special handling
     warpspeed_policy.items_per_thread = ::cuda::std::max(256 / (input_value_size == 2 ? 2 : accum_size) - 1, 1);
-    warpspeed_policy.tile_size        = warpspeed_policy.items_per_thread * squad_reduce_thread_count;
+    // TODO(bgruber): the special handling of double below is a LOT faster, but exceeds 48KiB SMEM
+    // clang-format off
+    // |   F64   |      I32      |     72576      |  11.295 us |       2.44% |  11.917 us |       8.02% |     0.622 us |   5.50% |   SLOW   |
+    // |   F64   |      I32      |    1056384     |  16.162 us |       6.24% |  15.847 us |       5.57% |    -0.315 us |  -1.95% |   SAME   |
+    // |   F64   |      I32      |    16781184    |  65.696 us |       1.64% |  60.650 us |       3.37% |    -5.046 us |  -7.68% |   FAST   |
+    // |   F64   |      I32      |   268442496    | 863.896 us |       0.22% | 679.100 us |       0.93% |  -184.796 us | -21.39% |   FAST   |
+    // |   F64   |      I32      |   1073745792   |   3.418 ms |       0.12% |   2.662 ms |       0.46% |  -755.740 us | -22.11% |   FAST   |
+    // |   F64   |      I64      |     72576      |  12.301 us |       8.18% |  12.987 us |       5.75% |     0.686 us |   5.58% |   SAME   |
+    // |   F64   |      I64      |    1056384     |  16.775 us |       5.70% |  16.091 us |       6.14% |    -0.684 us |  -4.08% |   SAME   |
+    // |   F64   |      I64      |    16781184    |  66.970 us |       1.41% |  58.024 us |       3.17% |    -8.946 us | -13.36% |   FAST   |
+    // |   F64   |      I64      |   268442496    | 863.826 us |       0.23% | 676.465 us |       0.98% |  -187.360 us | -21.69% |   FAST   |
+    // |   F64   |      I64      |   1073745792   |   3.419 ms |       0.11% |   2.664 ms |       0.48% |  -755.409 us | -22.09% |   FAST   |
+    // |   F64   |      I64      |   4294975104   |  13.641 ms |       0.05% |  10.575 ms |       0.24% | -3065.815 us | -22.48% |   FAST   |
+    // clang-format on
+    // (256 / (sizeof(InputValueT) == 2 ? 2 : (::cuda::std::is_same_v<InputValueT, double> ? 4 : sizeof(AccumT))) -
+    // 1);
+
+    warpspeed_policy.tile_size = warpspeed_policy.items_per_thread * squad_reduce_thread_count;
 
     return warpspeed_policy;
   }
@@ -1172,6 +1123,25 @@ struct policy_selector
 static_assert(scan_policy_selector<policy_selector>);
 #endif // _CCCL_HAS_CONCEPTS()
 
+template <typename ScanOpT, typename InputValueT, typename OutputValueT, typename AccumT, typename = void>
+struct benchmark_match_for_policy_selector
+{
+  static constexpr bool value = false;
+};
+
+template <typename ScanOpT, typename InputValueT, typename OutputValueT, typename AccumT>
+struct benchmark_match_for_policy_selector<
+  ScanOpT,
+  InputValueT,
+  OutputValueT,
+  AccumT,
+  ::cuda::std::void_t<::cuda::std::__accumulator_t<ScanOpT, InputValueT, InputValueT>>>
+{
+  static constexpr bool value =
+    sizeof(AccumT) == sizeof(::cuda::std::__accumulator_t<ScanOpT, InputValueT, InputValueT>)
+    && sizeof(InputValueT) == sizeof(OutputValueT);
+};
+
 // stateless version which can be passed to kernels
 template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
 struct policy_selector_from_types
@@ -1185,8 +1155,7 @@ struct policy_selector_from_types
   [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy
   {
     constexpr bool benchmark_match =
-      sizeof(AccumT) == sizeof(::cuda::std::__accumulator_t<ScanOpT, InputValueT, InputValueT>)
-      && sizeof(InputValueT) == sizeof(OutputValueT);
+      benchmark_match_for_policy_selector<ScanOpT, InputValueT, OutputValueT, AccumT>::value;
 
     constexpr auto policies = policy_selector{
       input_value_size,
diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu
index ce295020b54..5fc417274ed 100644
--- a/cub/test/catch2_test_device_scan_env.cu
+++ b/cub/test/catch2_test_device_scan_env.cu
@@ -14,6 +14,7 @@ struct stream_registry_factory_t;
 
 #include <thrust/device_vector.h>
 
+#include <cuda/__device/arch_id.h>
 #include <cuda/iterator>
 
 #include "catch2_test_env_launch_helper.h"
@@ -45,36 +46,23 @@ struct block_size_check_t
   }
 };
 
-struct block_size_retreiver_t
-{
-  int* ptr;
-
-  template <class ActivePolicyT>
-  cudaError_t Invoke()
-  {
-    *ptr = ActivePolicyT::ScanPolicyT::BLOCK_THREADS;
-    return cudaSuccess;
-  }
-};
-
 TEST_CASE("Device scan exclusive scan works with default environment", "[scan][device]")
 {
   using num_items_t = int;
   using value_t     = int;
   using offset_t    = cub::detail::choose_offset_t<num_items_t>;
 
-  using policy_t =
-    cub::detail::scan::default_tuning::fn<value_t, value_t, value_t, offset_t, block_size_check_t>::MaxPolicy;
+  using selector_t =
+    cub::detail::scan::default_tuning::selector<value_t, value_t, value_t, offset_t, block_size_check_t>;
 
   int current_device{};
   REQUIRE(cudaSuccess == cudaGetDevice(&current_device));
 
-  int ptx_version{};
-  REQUIRE(cudaSuccess == cub::PtxVersion(ptx_version, current_device));
+  cudaDeviceProp device_props{};
+  REQUIRE(cudaSuccess == cudaGetDeviceProperties(&device_props, current_device));
 
-  int target_block_size{};
-  block_size_retreiver_t block_size_retreiver{&target_block_size};
-  REQUIRE(cudaSuccess == policy_t::Invoke(ptx_version, block_size_retreiver));
+  const auto target_block_size =
+    selector_t{}(cuda::to_arch_id(cuda::compute_capability{device_props.major, device_props.minor})).block_threads;
 
   num_items_t num_items = 1;
   c2h::device_vector<int> d_block_size(1);
@@ -114,8 +102,8 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev
 template <int BlockThreads>
 struct scan_tuning : cub::detail::scan::tuning<scan_tuning<BlockThreads>>
 {
-  template <class /* InputValueT */, class /* OutputValueT */, class AccumT, class /* Offset */, class /* ScanOpT */>
-  struct fn
+  template <class InputValueT, class OutputValueT, class AccumT, class OffsetT, class ScanOpT>
+  struct policy_hub
   {
     struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
     {
@@ -139,6 +127,13 @@ struct scan_tuning : cub::detail::scan::tuning<scan_tuning<BlockThreads>>
 
     using MaxPolicy = Policy500;
   };
+
+  template <class InputValueT, class OutputValueT, class AccumT, class OffsetT, class ScanOpT>
+  using selector =
+    cub::detail::scan::policy_selector_from_hub<policy_hub<InputValueT, OutputValueT, AccumT, OffsetT, ScanOpT>,
+                                                InputValueT,
+                                                OutputValueT,
+                                                AccumT>;
 };
 
 struct get_reduce_tuning_query_t

From 863c8743ebaed4addc47757e836e4fe1381da3ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 26 Feb 2026 16:36:01 +0100
Subject: [PATCH 18/31] Operators for scan_warpspeed_policy.

---
 .../kernels/scan_warpspeed_policy.cuh         | 21 +++++++++++++++++++
 .../device/dispatch/tuning/tuning_scan.cuh    |  8 +------
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh
index 3409fc53866..e4f254dcd6b 100644
--- a/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh
+++ b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh
@@ -7,6 +7,10 @@
 
 #include <cub/detail/warpspeed/squad/squad_desc.cuh>
 
+#if !_CCCL_COMPILER(NVRTC)
+#  include <ostream>
+#endif
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail::scan
@@ -61,6 +65,23 @@ struct scan_warpspeed_policy
         && lhs.num_look_ahead_items == rhs.num_look_ahead_items && lhs.num_total_threads == rhs.num_total_threads
         && lhs.items_per_thread == rhs.items_per_thread && lhs.tile_size == rhs.tile_size;
   }
+
+  _CCCL_API constexpr friend bool operator!=(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs)
+  {
+    return !(lhs == rhs);
+  }
+
+#if !_CCCL_COMPILER(NVRTC)
+  friend ::std::ostream& operator<<(::std::ostream& os, const scan_warpspeed_policy& p)
+  {
+    return os
+        << "scan_warpspeed_policy { .valid = " << p.valid << ", .num_reduce_warps = " << p.num_reduce_warps
+        << ", .num_scan_stor_warps = " << p.num_scan_stor_warps << ", .num_load_warps = " << p.num_load_warps
+        << ", .num_sched_warps = " << p.num_sched_warps << ", .num_look_ahead_warps = " << p.num_look_ahead_warps
+        << ", .num_look_ahead_items = " << p.num_look_ahead_items << ", .num_total_threads = " << p.num_total_threads
+        << ", .items_per_thread = " << p.items_per_thread << ", .tile_size = " << p.tile_size << " }";
+  }
+#endif // !_CCCL_COMPILER(NVRTC)
 };
 } // namespace detail::scan
 
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 89138cd1132..9683694e085 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -155,13 +155,7 @@ struct scan_policy
        << ", .delay_constructor = " << p.delay_constructor;
     if (p.warpspeed)
     {
-      os << ", .warpspeed = scan_warpspeed_policy { .valid = " << p.warpspeed.valid << ", .num_reduce_warps = "
-         << p.warpspeed.num_reduce_warps << ", .num_scan_stor_warps = " << p.warpspeed.num_scan_stor_warps
-         << ", .num_load_warps = " << p.warpspeed.num_load_warps << ", .num_sched_warps = "
-         << p.warpspeed.num_sched_warps << ", .num_look_ahead_warps = " << p.warpspeed.num_look_ahead_warps
-         << ", .num_look_ahead_items = " << p.warpspeed.num_look_ahead_items << ", .num_total_threads = "
-         << p.warpspeed.num_total_threads << ", .items_per_thread = " << p.warpspeed.items_per_thread
-         << ", .tile_size = " << p.warpspeed.tile_size << "}";
+      os << ", .warpspeed = " << p.warpspeed;
     }
     return os << " }";
   }

From 699d4bfd209127e4941b1bcad2b0aaef78ed42c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 26 Feb 2026 16:49:56 +0100
Subject: [PATCH 19/31] Fix clang build.

---
 cub/cub/device/dispatch/dispatch_scan.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index a50a9604ae5..f19e4f0fb73 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -768,7 +768,8 @@ struct DispatchScan
   {
     struct policy_getter
     {
-      _CCCL_API _CCCL_FORCEINLINE constexpr auto operator()() const
+      // host-device not api, because clang warns about exclude_from_explicit_instantiation in local types
+      _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr auto operator()() const
       {
         return detail::scan::convert_policy<ActivePolicyT>();
       }

From ab85bfee3b3266553fd35b9dbfb892f8a99df4ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 26 Feb 2026 17:25:14 +0100
Subject: [PATCH 20/31] Fix a missed test.

---
 cub/test/catch2_test_device_scan_env.cu | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu
index 29e47f0fd48..d489e320942 100644
--- a/cub/test/catch2_test_device_scan_env.cu
+++ b/cub/test/catch2_test_device_scan_env.cu
@@ -207,18 +207,17 @@ TEST_CASE("Device scan inclusive-scan works with default environment", "[scan][d
   using value_t     = int;
   using offset_t    = cub::detail::choose_offset_t<num_items_t>;
 
-  using policy_t =
-    cub::detail::scan::default_tuning::fn<value_t, value_t, value_t, offset_t, block_size_check_t>::MaxPolicy;
+  using selector_t =
+    cub::detail::scan::default_tuning::selector<value_t, value_t, value_t, offset_t, block_size_check_t>;
 
   int current_device{};
   REQUIRE(cudaSuccess == cudaGetDevice(&current_device));
 
-  int ptx_version{};
-  REQUIRE(cudaSuccess == cub::PtxVersion(ptx_version, current_device));
+  cudaDeviceProp device_props{};
+  REQUIRE(cudaSuccess == cudaGetDeviceProperties(&device_props, current_device));
 
-  int target_block_size{};
-  block_size_retreiver_t block_size_retreiver{&target_block_size};
-  REQUIRE(cudaSuccess == policy_t::Invoke(ptx_version, block_size_retreiver));
+  const auto target_block_size =
+    selector_t{}(cuda::to_arch_id(cuda::compute_capability{device_props.major, device_props.minor})).block_threads;
 
   num_items_t num_items = 1;
   c2h::device_vector<int> d_block_size(1);

From 372bcb90239db59440aeb4de01ffc2826585ca09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 26 Feb 2026 17:57:58 +0100
Subject: [PATCH 21/31] Fix clang-cuda concept checks.

---
 cub/cub/device/dispatch/dispatch_scan.cuh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index f19e4f0fb73..54490c9fde8 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -154,8 +154,9 @@ _CCCL_API constexpr auto convert_policy() -> scan_policy
 template <typename PolicyHub, typename InputValueT, typename OutputValueT, typename AccumT>
 struct policy_selector_from_hub
 {
-  // this is only called in device code
-  _CCCL_DEVICE_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> scan_policy
+  // Called from device code during dispatch, and from host code when clang-cuda evaluates
+  // scan_policy_selector concept checks.
+  _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> scan_policy
   {
     return convert_policy<typename PolicyHub::MaxPolicy::ActivePolicy>();
   }

From 27ab1c4a27d8fec43becbf457e6545977a86a7fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 26 Feb 2026 18:39:04 +0100
Subject: [PATCH 22/31] Fix classifications of bool and min/max.

---
 c/parallel/src/util/types.h               | 3 ++-
 cub/cub/device/dispatch/tuning/common.cuh | 9 +++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/c/parallel/src/util/types.h b/c/parallel/src/util/types.h
index 7ff767b4a94..837025a69c2 100644
--- a/c/parallel/src/util/types.h
+++ b/c/parallel/src/util/types.h
@@ -114,6 +114,8 @@ inline constexpr cub::detail::type_t cccl_type_enum_to_cub_type(cccl_type_enum t
 {
   switch (type)
   {
+    case CCCL_BOOLEAN:
+      return cub::detail::type_t::boolean;
     case CCCL_INT8:
       return cub::detail::type_t::int8;
     case CCCL_INT16:
@@ -136,7 +138,6 @@ inline constexpr cub::detail::type_t cccl_type_enum_to_cub_type(cccl_type_enum t
       return cub::detail::type_t::float64;
     case CCCL_FLOAT16:
     case CCCL_STORAGE:
-    case CCCL_BOOLEAN:
     default:
       return cub::detail::type_t::other;
   }
diff --git a/cub/cub/device/dispatch/tuning/common.cuh b/cub/cub/device/dispatch/tuning/common.cuh
index 273eedfbd82..c41686b7971 100644
--- a/cub/cub/device/dispatch/tuning/common.cuh
+++ b/cub/cub/device/dispatch/tuning/common.cuh
@@ -29,6 +29,7 @@ namespace detail
 // libcu++
 enum class type_t
 {
+  boolean,
   int8,
   int16,
   int32,
@@ -50,6 +51,8 @@ inline constexpr auto classify_type = type_t::other;
 template <>
 inline constexpr auto classify_type<char> = ::cuda::std::is_signed_v<char> ? type_t::int8 : type_t::uint8;
 template <>
+inline constexpr auto classify_type<bool> = type_t::boolean;
+template <>
 inline constexpr auto classify_type<signed char> = type_t::int8;
 template <>
 inline constexpr auto classify_type<unsigned char> = type_t::uint8;
@@ -101,6 +104,12 @@ inline constexpr auto classify_op = op_kind_t::other;
 template <typename T>
 inline constexpr auto classify_op<::cuda::std::plus<T>> = op_kind_t::plus;
 
+template <typename T>
+inline constexpr auto classify_op<::cuda::minimum<T>> = op_kind_t::min;
+
+template <typename T>
+inline constexpr auto classify_op<::cuda::maximum<T>> = op_kind_t::max;
+
 struct iterator_info
 {
   int value_type_size;

From 16bc4522733f69ad504a3933cd946706d4d15d3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Fri, 27 Feb 2026 05:33:11 +0100
Subject: [PATCH 23/31] Add missing includes.

---
 cub/cub/device/dispatch/tuning/common.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cub/cub/device/dispatch/tuning/common.cuh b/cub/cub/device/dispatch/tuning/common.cuh
index c41686b7971..8811afa012c 100644
--- a/cub/cub/device/dispatch/tuning/common.cuh
+++ b/cub/cub/device/dispatch/tuning/common.cuh
@@ -18,6 +18,8 @@
 #include <thrust/type_traits/is_contiguous_iterator.h>
 #include <thrust/type_traits/is_trivially_relocatable.h>
 
+#include <cuda/__functional/maximum.h>
+#include <cuda/__functional/minimum.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/is_signed.h>
 

From 565a017f9565620ece4594cdbd99cea50d04d52a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 12 Mar 2026 20:56:12 -0500
Subject: [PATCH 24/31] Codegen fixes.

---
 c/parallel/src/scan.cu                        |  2 +
 cub/benchmarks/bench/scan/exclusive/base.cuh  |  4 +-
 .../kernels/kernel_scan_warpspeed.cuh         | 64 +++++++++++++++++--
 .../device/dispatch/tuning/tuning_scan.cuh    | 23 ++++++-
 4 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu
index ab7aeb44f04..b4fac8e294b 100644
--- a/c/parallel/src/scan.cu
+++ b/c/parallel/src/scan.cu
@@ -264,6 +264,7 @@ try
     const bool types_match = input_type == output_type && input_type == accum_t.type;
     const bool benchmark_match =
       operation_t != cub::detail::op_kind_t::other && types_match && input_type != CCCL_STORAGE;
+    const bool accum_is_primitive_or_trivially_copy_constructible = true;
 
     return policy_selector{
       static_cast<int>(input_it.value_type.size),
@@ -275,6 +276,7 @@ try
       int{sizeof(OffsetT)},
       accum_type,
       operation_t,
+      accum_is_primitive_or_trivially_copy_constructible,
       benchmark_match};
   }();
 
diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh
index a425fe54d95..65f4b35a6fd 100644
--- a/cub/benchmarks/bench/scan/exclusive/base.cuh
+++ b/cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -49,8 +49,8 @@ try
   thrust::device_vector<T> input = generate(elements);
   thrust::device_vector<T> output(elements);
 
-  T* d_input  = thrust::raw_pointer_cast(input.data());
-  T* d_output = thrust::raw_pointer_cast(output.data());
+  const T* d_input = thrust::raw_pointer_cast(input.data());
+  T* d_output      = thrust::raw_pointer_cast(output.data());
 
   state.add_element_count(elements);
   state.add_global_memory_reads<T>(elements, "Size");
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
index 921036b836a..1869b2a4e1d 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
@@ -50,6 +50,62 @@ _CCCL_API constexpr scan_warpspeed_policy get_warpspeed_policy() noexcept
   return PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).warpspeed;
 }
 
+template <typename PolicySelector>
+struct static_warpspeed_policy_adapter
+{
+  _CCCL_API static constexpr scan_warpspeed_policy policy() noexcept
+  {
+    return get_warpspeed_policy<PolicySelector>();
+  }
+
+  _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const
+  {
+    return policy().squadReduce();
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const
+  {
+    return policy().squadScanStore();
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const
+  {
+    return policy().squadLoad();
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadSched() const
+  {
+    return policy().squadSched();
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const
+  {
+    return policy().squadLookback();
+  }
+};
+
+struct runtime_warpspeed_policy_adapter
+{
+  const scan_warpspeed_policy& policy;
+
+  _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const
+  {
+    return policy.squadReduce();
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const
+  {
+    return policy.squadScanStore();
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const
+  {
+    return policy.squadLoad();
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadSched() const
+  {
+    return policy.squadSched();
+  }
+  _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const
+  {
+    return policy.squadLookback();
+  }
+};
+
 template <typename InputT, typename OutputT, typename AccumT>
 struct scanKernelParams
 {
@@ -110,13 +166,13 @@ _CCCL_API constexpr scan_stage_counts make_scan_stage_counts(int num_stages)
   return {num_block_idx_stages, 2};
 }
 
-template <typename Policy,
+template <typename PolicyAdapter,
           typename SmemInOutT,
           typename SmemNextBlockIdxT,
           typename SmemSumExclusiveCtaT,
           typename SmemSumThreadAndWarpT>
 _CCCL_API constexpr void setup_scan_resources(
-  const Policy& policy,
+  const PolicyAdapter& policy,
   warpspeed::SyncHandler& syncHandler,
   warpspeed::SmemAllocator& smemAllocator,
   SmemInOutT& smemInOut,
@@ -165,7 +221,7 @@ allocResources(warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& sm
   };
 
   setup_scan_resources(
-    ScanResourcesT::warpspeed_policy(),
+    static_warpspeed_policy_adapter<PolicySelector>{},
     syncHandler,
     smemAllocator,
     res.smemInOut,
@@ -834,7 +890,7 @@ _CCCL_API constexpr auto smem_for_stages(
   };
 
   setup_scan_resources(
-    policy,
+    runtime_warpspeed_policy_adapter{policy},
     syncHandler,
     smemAllocator,
     res.smemInOut,
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 9accc75e657..f5d45576fbb 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -30,6 +30,7 @@
 #include <cuda/std/__functional/invoke.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/enable_if.h>
+#include <cuda/std/__type_traits/is_trivially_copy_constructible.h>
 #include <cuda/std/__type_traits/void_t.h>
 
 #if !_CCCL_COMPILER(NVRTC)
@@ -712,6 +713,7 @@ struct policy_selector
   int offset_size;
   type_t accum_type;
   op_kind_t operation_t;
+  bool accum_is_primitive_or_trivially_copy_constructible;
   // TODO(griwes): remove this field before policy_selector is publicly exposed
   bool benchmark_match;
 
@@ -726,7 +728,7 @@ struct policy_selector
       large_values ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE;
     const BlockStoreAlgorithm scan_transposed_store =
       large_values ? BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE;
-    const auto default_delay = default_delay_constructor_policy(primitive_accum_t == primitive_accum::yes);
+    const auto default_delay = default_delay_constructor_policy(accum_is_primitive_or_trivially_copy_constructible);
 
     const auto warpspeed_policy = get_warpspeed_policy(arch, input_value_size, accum_size);
 
@@ -947,6 +949,21 @@ struct policy_selector
       }
     }
 
+    // Keep sm_86 aligned with legacy policy_hub behavior: policy_hub resets to default policy for 86.
+    if (arch >= ::cuda::arch_id::sm_86)
+    {
+      return make_mem_scaled_scan_policy(
+        128,
+        15,
+        accum_size,
+        scan_transposed_load,
+        LOAD_DEFAULT,
+        scan_transposed_store,
+        BLOCK_SCAN_WARP_SCANS,
+        default_delay,
+        warpspeed_policy);
+    }
+
     if (arch >= ::cuda::arch_id::sm_80)
     {
       if (primitive_op_t == primitive_op::yes)
@@ -1144,6 +1161,9 @@ struct policy_selector_from_types
     constexpr bool benchmark_match =
       benchmark_match_for_policy_selector<ScanOpT, InputValueT, OutputValueT, AccumT>::value;
 
+    constexpr bool accum_is_primitive_or_trivially_copy_constructible =
+      is_primitive<AccumT>::value || ::cuda::std::is_trivially_copy_constructible_v<AccumT>;
+
     constexpr auto policies = policy_selector{
       input_value_size,
       input_value_alignment,
@@ -1154,6 +1174,7 @@ struct policy_selector_from_types
       int{sizeof(OffsetT)},
       classify_type<AccumT>,
       classify_op<ScanOpT>,
+      accum_is_primitive_or_trivially_copy_constructible,
       benchmark_match};
     return policies(arch);
   }

From 9cd3ca020ecab2de835132a7fd0e292e9067846f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Thu, 12 Mar 2026 21:49:16 -0500
Subject: [PATCH 25/31] Review comments.

---
 cub/cub/device/device_scan.cuh                | 14 ++----
 cub/cub/device/dispatch/dispatch_scan.cuh     | 44 +++++++++----------
 .../kernels/kernel_scan_warpspeed.cuh         |  6 +--
 cub/test/catch2_test_device_scan_env.cu       | 44 +++++--------------
 4 files changed, 38 insertions(+), 70 deletions(-)

diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh
index f6a22e99828..f584d58d158 100644
--- a/cub/cub/device/device_scan.cuh
+++ b/cub/cub/device/device_scan.cuh
@@ -49,12 +49,6 @@ struct tuning
     return static_cast<const Derived&>(*this);
   }
 };
-
-struct default_tuning : tuning<default_tuning>
-{
-  template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
-  using selector = policy_selector_from_types<InputValueT, OutputValueT, AccumT, OffsetT, ScanOpT>;
-};
 } // namespace detail::scan
 
 //! @rst
@@ -124,9 +118,6 @@ struct DeviceScan
     ::cuda::execution::determinism::__determinism_holder_t<Determinism>,
     cudaStream_t stream)
   {
-    using scan_tuning_t = ::cuda::std::execution::
-      __query_result_or_t<TuningEnvT, detail::scan::get_tuning_query_t, detail::scan::default_tuning>;
-
     // Unsigned integer type for global offsets
     using offset_t = detail::choose_offset_t<NumItemsT>;
 
@@ -137,13 +128,16 @@ struct DeviceScan
                                                     cub::detail::it_value_t<InputIteratorT>,
                                                     typename InitValueT::value_type>>;
 
-    using policy_selector_t = typename scan_tuning_t::template selector<
+    using default_policy_selector_t = detail::scan::policy_selector_from_types<
       detail::it_value_t<InputIteratorT>,
       detail::it_value_t<OutputIteratorT>,
       accum_t,
       offset_t,
       ScanOpT>;
 
+    using policy_selector_t = ::cuda::std::execution::
+      __query_result_or_t<TuningEnvT, detail::scan::get_tuning_query_t, default_policy_selector_t>;
+
     return detail::scan::dispatch<EnforceInclusive>(
       d_temp_storage,
       temp_storage_bytes,
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 54490c9fde8..6cc8c2bafbc 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -160,13 +160,6 @@ struct policy_selector_from_hub
   {
     return convert_policy<typename PolicyHub::MaxPolicy::ActivePolicy>();
   }
-
-  static constexpr int input_value_size       = int{sizeof(InputValueT)};
-  static constexpr int input_value_alignment  = int{alignof(InputValueT)};
-  static constexpr int output_value_size      = int{sizeof(OutputValueT)};
-  static constexpr int output_value_alignment = int{alignof(OutputValueT)};
-  static constexpr int accum_size             = int{sizeof(AccumT)};
-  static constexpr int accum_alignment        = int{alignof(AccumT)};
 };
 } // namespace detail::scan
 
@@ -463,11 +456,14 @@ struct DispatchScan
     return cudaSuccess;
   }
 
-#if _CCCL_CUDACC_AT_LEAST(12, 8)
-  template <typename PolicySelectorT>
-  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm(
-    const detail::scan::scan_warpspeed_policy& warpspeed_policy, const PolicySelectorT& policy_selector)
+#if __cccl_ptx_isa >= 860
+  template <typename PolicyGetter, typename PolicySelectorT>
+  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t
+  __invoke_lookahead_algorithm(PolicyGetter policy_getter, const PolicySelectorT& policy_selector)
   {
+    CUB_DETAIL_CONSTEXPR_ISH auto active_policy          = policy_getter();
+    CUB_DETAIL_CONSTEXPR_ISH const auto warpspeed_policy = active_policy.warpspeed;
+
     const int grid_dim =
       static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(warpspeed_policy.tile_size)));
 
@@ -498,17 +494,21 @@ struct DispatchScan
     // TODO(bgruber): we probably need to ensure alignment of d_temp_storage
     _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), "");
 
-    auto scan_kernel = kernel_source.ScanKernel();
-    int num_stages   = 1;
-    int smem_size    = detail::scan::smem_for_stages(
+    auto scan_kernel                               = kernel_source.ScanKernel();
+    CUB_DETAIL_CONSTEXPR_ISH int smem_size_1_stage = detail::scan::smem_for_stages(
       warpspeed_policy,
-      num_stages,
+      1,
       policy_selector.input_value_size,
       policy_selector.input_value_alignment,
       policy_selector.output_value_size,
       policy_selector.output_value_alignment,
       policy_selector.accum_size,
       policy_selector.accum_alignment);
+    CUB_DETAIL_STATIC_ISH_ASSERT(smem_size_1_stage <= detail::max_smem_per_block,
+                                 "Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)");
+
+    int num_stages = 1;
+    int smem_size  = smem_size_1_stage;
 
     // When launched from the host, maximize the number of stages that we can fit inside the shared memory.
     NV_IF_TARGET(NV_IS_HOST, ({
@@ -617,7 +617,7 @@ struct DispatchScan
 
     return cudaSuccess;
   }
-#endif // _CCCL_CUDACC_AT_LEAST(12, 8)
+#endif // __cccl_ptx_isa >= 860
 
   template <typename PolicyGetter, typename PolicySelectorT>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
@@ -628,16 +628,12 @@ struct DispatchScan
     CUB_DETAIL_STATIC_ISH_ASSERT(active_policy.load_modifier != CacheLoadModifier::LOAD_LDG,
                                  "The memory consistency model does not apply to texture accesses");
 
-#if !_CCCL_CUDACC_AT_LEAST(12, 8)
-    (void) policy_selector;
-#endif // !_CCCL_CUDACC_AT_LEAST(12, 8)
-
-#if _CCCL_CUDACC_AT_LEAST(12, 8)
-    if (kernel_source.use_warpspeed(active_policy))
+#if __cccl_ptx_isa >= 860
+    if CUB_DETAIL_CONSTEXPR_ISH (kernel_source.use_warpspeed(active_policy))
     {
-      return __invoke_lookahead_algorithm(active_policy.warpspeed, policy_selector);
+      return __invoke_lookahead_algorithm(policy_getter, policy_selector);
     }
-#endif // _CCCL_CUDACC_AT_LEAST(12, 8)
+#endif // __cccl_ptx_isa >= 860
 
     // Number of input tiles
     const int tile_size = active_policy.block_threads * active_policy.items_per_thread;
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
index 1869b2a4e1d..92b84623932 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh
@@ -211,12 +211,12 @@ allocResources(warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& sm
   using InOutT            = typename ScanResourcesT::InOutT;
   using SumThreadAndWarpT = typename ScanResourcesT::SumThreadAndWarpT;
 
-  const auto counts = make_scan_stage_counts(numStages);
+  const auto [num_block_idx_stages, num_sum_exclusive_cta_stages] = make_scan_stage_counts(numStages);
 
   ScanResourcesT res = {
     warpspeed::SmemResource<InOutT>(syncHandler, smemAllocator, warpspeed::Stages{numStages}),
-    warpspeed::SmemResource<uint4>(syncHandler, smemAllocator, warpspeed::Stages{counts.num_block_idx_stages}),
-    warpspeed::SmemResource<AccumT>(syncHandler, smemAllocator, warpspeed::Stages{counts.num_sum_exclusive_cta_stages}),
+    warpspeed::SmemResource<uint4>(syncHandler, smemAllocator, warpspeed::Stages{num_block_idx_stages}),
+    warpspeed::SmemResource<AccumT>(syncHandler, smemAllocator, warpspeed::Stages{num_sum_exclusive_cta_stages}),
     warpspeed::SmemResource<SumThreadAndWarpT>(syncHandler, smemAllocator, warpspeed::Stages{numStages}),
   };
 
diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu
index d489e320942..2ccdb97587b 100644
--- a/cub/test/catch2_test_device_scan_env.cu
+++ b/cub/test/catch2_test_device_scan_env.cu
@@ -55,7 +55,7 @@ TEST_CASE("Device scan exclusive scan works with default environment", "[scan][d
   using offset_t    = cub::detail::choose_offset_t<num_items_t>;
 
   using selector_t =
-    cub::detail::scan::default_tuning::selector<value_t, value_t, value_t, offset_t, block_size_check_t>;
+    cub::detail::scan::policy_selector_from_types<value_t, value_t, value_t, offset_t, block_size_check_t>;
 
   int current_device{};
   REQUIRE(cudaSuccess == cudaGetDevice(&current_device));
@@ -104,38 +104,16 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev
 template <int BlockThreads>
 struct scan_tuning : cub::detail::scan::tuning<scan_tuning<BlockThreads>>
 {
-  template <class InputValueT, class OutputValueT, class AccumT, class OffsetT, class ScanOpT>
-  struct policy_hub
+  _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::scan::scan_policy
   {
-    struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
-    {
-      struct ScanPolicyT
-      {
-        static constexpr int BLOCK_THREADS                      = BlockThreads;
-        static constexpr int ITEMS_PER_THREAD                   = 1;
-        static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE;
-
-        static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::CacheLoadModifier::LOAD_DEFAULT;
-        static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM =
-          cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE;
-        static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING;
-
-        struct detail
-        {
-          using delay_constructor_t = cub::detail::default_delay_constructor_t<AccumT>;
-        };
-      };
-    };
-
-    using MaxPolicy = Policy500;
-  };
-
-  template <class InputValueT, class OutputValueT, class AccumT, class OffsetT, class ScanOpT>
-  using selector =
-    cub::detail::scan::policy_selector_from_hub<policy_hub<InputValueT, OutputValueT, AccumT, OffsetT, ScanOpT>,
-                                                InputValueT,
-                                                OutputValueT,
-                                                AccumT>;
+    return {BlockThreads,
+            1,
+            cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE,
+            cub::CacheLoadModifier::LOAD_DEFAULT,
+            cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE,
+            cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING,
+            cub::detail::delay_constructor_policy{cub::detail::delay_constructor_kind::fixed_delay, 350, 450}};
+  }
 };
 
 struct get_reduce_tuning_query_t
@@ -208,7 +186,7 @@ TEST_CASE("Device scan inclusive-scan works with default environment", "[scan][d
   using offset_t    = cub::detail::choose_offset_t<num_items_t>;
 
   using selector_t =
-    cub::detail::scan::default_tuning::selector<value_t, value_t, value_t, offset_t, block_size_check_t>;
+    cub::detail::scan::policy_selector_from_types<value_t, value_t, value_t, offset_t, block_size_check_t>;
 
   int current_device{};
   REQUIRE(cudaSuccess == cudaGetDevice(&current_device));

From 81b7a7f7efc2dacb4ef1d97d42c98475c9dd2084 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Fri, 13 Mar 2026 01:16:25 -0500
Subject: [PATCH 26/31] More abstraction layers to restore constexprness.

---
 cub/cub/device/dispatch/dispatch_scan.cuh     | 78 ++++++++++++++-----
 .../device/dispatch/tuning/tuning_scan.cuh    | 24 ++++++
 2 files changed, 81 insertions(+), 21 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 3a8ab97b60e..f5057b3c62a 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -500,18 +500,37 @@ struct DispatchScan
     // TODO(bgruber): we probably need to ensure alignment of d_temp_storage
     _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), "");
 
-    auto scan_kernel                               = kernel_source.ScanKernel();
-    CUB_DETAIL_CONSTEXPR_ISH int smem_size_1_stage = detail::scan::smem_for_stages(
-      warpspeed_policy,
-      1,
-      policy_selector.input_value_size,
-      policy_selector.input_value_alignment,
-      policy_selector.output_value_size,
-      policy_selector.output_value_alignment,
-      policy_selector.accum_size,
-      policy_selector.accum_alignment);
-    CUB_DETAIL_STATIC_ISH_ASSERT(smem_size_1_stage <= detail::max_smem_per_block,
-                                 "Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)");
+    using selector_smem_info_t = detail::scan::selector_smem_info<PolicySelectorT>;
+
+    auto scan_kernel      = kernel_source.ScanKernel();
+    int smem_size_1_stage = 0;
+    if constexpr (selector_smem_info_t::has_static_layout)
+    {
+      CUB_DETAIL_CONSTEXPR_ISH int static_smem_size_1_stage = detail::scan::smem_for_stages(
+        warpspeed_policy,
+        1,
+        selector_smem_info_t::input_value_size,
+        selector_smem_info_t::input_value_alignment,
+        selector_smem_info_t::output_value_size,
+        selector_smem_info_t::output_value_alignment,
+        selector_smem_info_t::accum_size,
+        selector_smem_info_t::accum_alignment);
+      CUB_DETAIL_STATIC_ISH_ASSERT(static_smem_size_1_stage <= detail::max_smem_per_block,
+                                   "Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)");
+      smem_size_1_stage = static_smem_size_1_stage;
+    }
+    else
+    {
+      smem_size_1_stage = detail::scan::smem_for_stages(
+        warpspeed_policy,
+        1,
+        policy_selector.input_value_size,
+        policy_selector.input_value_alignment,
+        policy_selector.output_value_size,
+        policy_selector.output_value_alignment,
+        policy_selector.accum_size,
+        policy_selector.accum_alignment);
+    }
 
     int num_stages = 1;
     int smem_size  = smem_size_1_stage;
@@ -526,15 +545,32 @@ struct DispatchScan
 
                    while (num_stages <= max_stages_for_even_workload)
                    {
-                     const auto next_smem_size = detail::scan::smem_for_stages(
-                       warpspeed_policy,
-                       num_stages + 1,
-                       policy_selector.input_value_size,
-                       policy_selector.input_value_alignment,
-                       policy_selector.output_value_size,
-                       policy_selector.output_value_alignment,
-                       policy_selector.accum_size,
-                       policy_selector.accum_alignment);
+                     const auto next_smem_size = [&] {
+                       if constexpr (selector_smem_info_t::has_static_layout)
+                       {
+                         return detail::scan::smem_for_stages(
+                           warpspeed_policy,
+                           num_stages + 1,
+                           selector_smem_info_t::input_value_size,
+                           selector_smem_info_t::input_value_alignment,
+                           selector_smem_info_t::output_value_size,
+                           selector_smem_info_t::output_value_alignment,
+                           selector_smem_info_t::accum_size,
+                           selector_smem_info_t::accum_alignment);
+                       }
+                       else
+                       {
+                         return detail::scan::smem_for_stages(
+                           warpspeed_policy,
+                           num_stages + 1,
+                           policy_selector.input_value_size,
+                           policy_selector.input_value_alignment,
+                           policy_selector.output_value_size,
+                           policy_selector.output_value_alignment,
+                           policy_selector.accum_size,
+                           policy_selector.accum_alignment);
+                       }
+                     }();
                      if (next_smem_size > max_dynamic_smem_size)
                      {
                        // This number of stages failed, so stay at the current settings
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 4e75aee1fa7..c8042855284 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -1339,6 +1339,30 @@ struct policy_selector_from_types
     return policies(arch);
   }
 };
+
+template <typename PolicySelectorT, typename = void>
+struct selector_smem_info
+{
+  static constexpr bool has_static_layout = false;
+};
+
+template <typename PolicySelectorT>
+struct selector_smem_info<PolicySelectorT,
+                          ::cuda::std::void_t<decltype(PolicySelectorT::input_value_size),
+                                              decltype(PolicySelectorT::input_value_alignment),
+                                              decltype(PolicySelectorT::output_value_size),
+                                              decltype(PolicySelectorT::output_value_alignment),
+                                              decltype(PolicySelectorT::accum_size),
+                                              decltype(PolicySelectorT::accum_alignment)>>
+{
+  static constexpr bool has_static_layout     = true;
+  static constexpr int input_value_size       = PolicySelectorT::input_value_size;
+  static constexpr int input_value_alignment  = PolicySelectorT::input_value_alignment;
+  static constexpr int output_value_size      = PolicySelectorT::output_value_size;
+  static constexpr int output_value_alignment = PolicySelectorT::output_value_alignment;
+  static constexpr int accum_size             = PolicySelectorT::accum_size;
+  static constexpr int accum_alignment        = PolicySelectorT::accum_alignment;
+};
 } // namespace detail::scan
 
 CUB_NAMESPACE_END

From 029b195cbf3b1501e86f8ae2136dac9acda03f82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Fri, 13 Mar 2026 03:22:49 -0500
Subject: [PATCH 27/31] Correctly check for the constants.

---
 cub/cub/device/dispatch/tuning/tuning_scan.cuh | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index c8042855284..25568afc489 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -30,6 +30,7 @@
 #include <cuda/std/__functional/invoke.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/enable_if.h>
+#include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/is_trivially_copy_constructible.h>
 #include <cuda/std/__type_traits/void_t.h>
 
@@ -1347,13 +1348,14 @@ struct selector_smem_info
 };
 
 template <typename PolicySelectorT>
-struct selector_smem_info<PolicySelectorT,
-                          ::cuda::std::void_t<decltype(PolicySelectorT::input_value_size),
-                                              decltype(PolicySelectorT::input_value_alignment),
-                                              decltype(PolicySelectorT::output_value_size),
-                                              decltype(PolicySelectorT::output_value_alignment),
-                                              decltype(PolicySelectorT::accum_size),
-                                              decltype(PolicySelectorT::accum_alignment)>>
+struct selector_smem_info<
+  PolicySelectorT,
+  ::cuda::std::void_t<decltype(::cuda::std::integral_constant<int, PolicySelectorT::input_value_size>{}),
+                      decltype(::cuda::std::integral_constant<int, PolicySelectorT::input_value_alignment>{}),
+                      decltype(::cuda::std::integral_constant<int, PolicySelectorT::output_value_size>{}),
+                      decltype(::cuda::std::integral_constant<int, PolicySelectorT::output_value_alignment>{}),
+                      decltype(::cuda::std::integral_constant<int, PolicySelectorT::accum_size>{}),
+                      decltype(::cuda::std::integral_constant<int, PolicySelectorT::accum_alignment>{})>>
 {
   static constexpr bool has_static_layout     = true;
   static constexpr int input_value_size       = PolicySelectorT::input_value_size;

From ac03691eda336c53ee23da4a3400af7c95ea3475 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Fri, 13 Mar 2026 11:41:36 -0500
Subject: [PATCH 28/31] Another abstraction layer, to remove a constexpr
 reference to `this`.

---
 cub/cub/device/dispatch/dispatch_scan.cuh | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index f5057b3c62a..3f6fe35a082 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -135,6 +135,18 @@ struct DeviceScanKernelSource
   }
 };
 
+template <typename KernelSourceT>
+CUB_RUNTIME_FUNCTION constexpr bool
+use_warpspeed_constexpr_ish(const KernelSourceT& kernel_source, const scan_policy& policy)
+{
+#if defined(CUB_DEFINE_RUNTIME_POLICIES)
+  return kernel_source.use_warpspeed(policy);
+#else
+  (void) kernel_source;
+  return KernelSourceT::use_warpspeed(policy);
+#endif
+}
+
 // TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API
 template <typename LegacyActivePolicy>
 _CCCL_API constexpr auto convert_policy() -> scan_policy
@@ -671,7 +683,7 @@ struct DispatchScan
                                  "The memory consistency model does not apply to texture accesses");
 
 #if __cccl_ptx_isa >= 860
-    if CUB_DETAIL_CONSTEXPR_ISH (kernel_source.use_warpspeed(active_policy))
+    if CUB_DETAIL_CONSTEXPR_ISH (detail::scan::use_warpspeed_constexpr_ish(kernel_source, active_policy))
     {
       return __invoke_lookahead_algorithm(policy_getter, policy_selector);
     }

From a6ed3cd5de353ae4c93e053792bf7f7259f05aba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Fri, 13 Mar 2026 12:01:39 -0500
Subject: [PATCH 29/31] I kinda hate this but I think it has to be like this.

---
 cub/cub/device/dispatch/dispatch_scan.cuh | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 3f6fe35a082..8ef0ee3c71b 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -135,18 +135,6 @@ struct DeviceScanKernelSource
   }
 };
 
-template <typename KernelSourceT>
-CUB_RUNTIME_FUNCTION constexpr bool
-use_warpspeed_constexpr_ish(const KernelSourceT& kernel_source, const scan_policy& policy)
-{
-#if defined(CUB_DEFINE_RUNTIME_POLICIES)
-  return kernel_source.use_warpspeed(policy);
-#else
-  (void) kernel_source;
-  return KernelSourceT::use_warpspeed(policy);
-#endif
-}
-
 // TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API
 template <typename LegacyActivePolicy>
 _CCCL_API constexpr auto convert_policy() -> scan_policy
@@ -683,10 +671,17 @@ struct DispatchScan
                                  "The memory consistency model does not apply to texture accesses");
 
 #if __cccl_ptx_isa >= 860
-    if CUB_DETAIL_CONSTEXPR_ISH (detail::scan::use_warpspeed_constexpr_ish(kernel_source, active_policy))
+#  if defined(CUB_DEFINE_RUNTIME_POLICIES)
+    if (kernel_source.use_warpspeed(active_policy))
+    {
+      return __invoke_lookahead_algorithm(policy_getter, policy_selector);
+    }
+#  else
+    if CUB_DETAIL_CONSTEXPR_ISH (KernelSource::use_warpspeed(active_policy))
     {
       return __invoke_lookahead_algorithm(policy_getter, policy_selector);
     }
+#  endif
 #endif // __cccl_ptx_isa >= 860
 
     // Number of input tiles

From 3bb81697cc2fd75b736da38ab50e07c64bdb821f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Fri, 13 Mar 2026 12:32:28 -0500
Subject: [PATCH 30/31] Silence a warning.

---
 cub/cub/device/dispatch/dispatch_scan.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 8ef0ee3c71b..bf46fd0f9ba 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -663,7 +663,7 @@ struct DispatchScan
 
   template <typename PolicyGetter, typename PolicySelectorT>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
-  __invoke(PolicyGetter policy_getter, const PolicySelectorT& policy_selector)
+  __invoke(PolicyGetter policy_getter, [[maybe_unused]] const PolicySelectorT& policy_selector)
   {
     CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter();
 

From 5dbcfd6182a8a8d23865f5d3daef1e23af4754eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20=27Griwes=27=20Dominiak?= <griwes@griwes.info>
Date: Fri, 13 Mar 2026 13:51:27 -0500
Subject: [PATCH 31/31] Silence MSVC unreachable code warning.

---
 cub/cub/device/dispatch/dispatch_scan.cuh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index bf46fd0f9ba..0bcb0439d03 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -661,6 +661,10 @@ struct DispatchScan
   }
 #endif // __cccl_ptx_isa >= 860
 
+  // On Windows, the `if CUB_DETAIL_CONSTEXPR_ISH` results in `warning C4702: unreachable code`.
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_MSVC(4702)
+
   template <typename PolicyGetter, typename PolicySelectorT>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
   __invoke(PolicyGetter policy_getter, [[maybe_unused]] const PolicySelectorT& policy_selector)
@@ -809,6 +813,8 @@ struct DispatchScan
     return cudaSuccess;
   }
 
+  _CCCL_DIAG_POP
+
   template <typename ActivePolicyT>
   CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT = {})
   {