diff --git a/c/parallel/include/cccl/c/scan.h b/c/parallel/include/cccl/c/scan.h index a9db31c764a..5b80668e123 100644 --- a/c/parallel/include/cccl/c/scan.h +++ b/c/parallel/include/cccl/c/scan.h @@ -34,6 +34,7 @@ typedef struct cccl_device_scan_build_result_t CUkernel scan_kernel; bool force_inclusive; cccl_init_kind_t init_kind; + bool use_warpspeed; size_t description_bytes_per_tile; size_t payload_bytes_per_tile; void* runtime_policy; diff --git a/c/parallel/src/radix_sort.cu b/c/parallel/src/radix_sort.cu index 975281478d2..2033c2d937e 100644 --- a/c/parallel/src/radix_sort.cu +++ b/c/parallel/src/radix_sort.cu @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -223,18 +223,7 @@ try std::string offset_t; check(cccl_type_name_from_nvrtc(&offset_t)); - // TODO(bgruber): generalize this somewhere - const auto key_type = [&] { - switch (input_keys_it.value_type.type) - { - case CCCL_FLOAT32: - return cub::detail::type_t::float32; - case CCCL_FLOAT64: - return cub::detail::type_t::float64; - default: - return cub::detail::type_t::other; - } - }(); + const auto key_type = cccl_type_enum_to_cub_type(input_keys_it.value_type.type); const auto policy_sel = cub::detail::radix_sort::policy_selector{ static_cast(input_keys_it.value_type.size), @@ -268,6 +257,8 @@ using device_radix_sort_policy = {5}; using namespace cub; using namespace cub::detail; using namespace cub::detail::radix_sort; +using cub::detail::delay_constructor_policy; +using cub::detail::delay_constructor_kind; static_assert(device_radix_sort_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6}, "Host generated and JIT 
compiled policy mismatch"); )XXX", input_keys_it.value_type.size, // 0 diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 8df0aeb5d8a..c673d3453c3 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -223,31 +223,8 @@ try const auto policy_sel = [&] { using namespace cub::detail; - auto accum_type = type_t::other; - if (accum_t.type == CCCL_FLOAT32) - { - accum_type = type_t::float32; - } - else if (accum_t.type == CCCL_FLOAT64) - { - accum_type = type_t::float64; - } - - auto operation_t = op_kind_t::other; - switch (op.type) - { - case CCCL_PLUS: - operation_t = op_kind_t::plus; - break; - case CCCL_MINIMUM: - operation_t = op_kind_t::min; - break; - case CCCL_MAXIMUM: - operation_t = op_kind_t::max; - break; - default: - break; - } + const auto accum_type = cccl_type_enum_to_cub_type(accum_t.type); + const auto operation_t = cccl_op_kind_to_cub_op(op.type); const int offset_size = int{sizeof(OffsetT)}; return cub::detail::reduce::policy_selector{accum_type, operation_t, offset_size, static_cast(accum_t.size)}; diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 4c039525cb5..cba53e453e0 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -4,13 +4,12 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// #include #include -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -56,33 +56,6 @@ enum class InitKind NoInit, }; -struct scan_runtime_tuning_policy -{ - cub::detail::RuntimeScanAgentPolicy scan; - - auto Scan() const - { - return scan; - } - - void CheckLoadModifier() const - { - if (scan.LoadModifier() == cub::CacheLoadModifier::LOAD_LDG) - { - throw std::runtime_error("The memory consistency model does not apply to texture " - "accesses"); - } - } - - using MaxPolicy = scan_runtime_tuning_policy; - - template - cudaError_t Invoke(int, F& op) - { - return op.template Invoke(*this); - } -}; - static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_type_info init) { // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet @@ -135,8 +108,8 @@ std::string get_scan_kernel_name( bool force_inclusive, cccl_init_kind_t init_kind) { - std::string chained_policy_t; - check(cccl_type_name_from_nvrtc(&chained_policy_t)); + std::string policy_selector_t; + check(cccl_type_name_from_nvrtc(&policy_selector_t)); const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); @@ -177,7 +150,7 @@ std::string get_scan_kernel_name( auto tile_state_t = std::format("cub::ScanTileState<{0}>", accum_cpp_t); return std::format( "cub::detail::scan::DeviceScanKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}>", - chained_policy_t, // 0 + policy_selector_t, // 0 input_iterator_t, // 1 output_iterator_t, // 2 tile_state_t, // 3 @@ -189,20 +162,6 @@ std::string get_scan_kernel_name( init_t); // 9 } -template -struct dynamic_scan_policy_t -{ - using MaxPolicy = dynamic_scan_policy_t; - - 
template - cudaError_t Invoke(int device_ptx_version, F& op) - { - return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); - } - - cccl_type_info accumulator_type; -}; - struct scan_kernel_source { cccl_device_scan_build_result_t& build; @@ -219,11 +178,16 @@ struct scan_kernel_source { return build.scan_kernel; } - scan_tile_state TileState() + scan_tile_state TileState() const { return {build.description_bytes_per_tile, build.payload_bytes_per_tile}; } + bool use_warpspeed(const cub::detail::scan::scan_policy& /*policy*/) const + { + return build.use_warpspeed; + } + std::size_t look_ahead_tile_state_size() const { return look_ahead_tile_state_alignment(); @@ -287,8 +251,77 @@ try const auto output_it_value_t = cccl_type_enum_to_name(output_it.value_type.type); - std::string policy_hub_expr = std::format( - "cub::detail::scan::policy_hub<{}, {}, {}, {}, {}>", + const auto policy_sel = [&] { + using cub::detail::scan::policy_selector; + using cub::detail::scan::primitive_accum; + using cub::detail::scan::primitive_op; + + const auto accum_type = cccl_type_enum_to_cub_type(accum_t.type); + const auto operation_t = cccl_op_kind_to_cub_op(op.type); + const auto input_type = input_it.value_type.type; + const auto input_type_t = cccl_type_enum_to_cub_type(input_type); + + const auto output_type = output_it.value_type.type; + const bool types_match = input_type == output_type && input_type == accum_t.type; + const bool benchmark_match = + operation_t != cub::detail::op_kind_t::other && types_match && input_type != CCCL_STORAGE; + const bool accum_is_primitive_or_trivially_copy_constructible = true; + + return policy_selector{ + static_cast(input_it.value_type.size), + static_cast(input_it.value_type.alignment), + static_cast(output_it.value_type.size), + static_cast(output_it.value_type.alignment), + static_cast(accum_t.size), + static_cast(accum_t.alignment), + int{sizeof(OffsetT)}, + input_type_t, + accum_type, + operation_t, + 
accum_is_primitive_or_trivially_copy_constructible, + benchmark_match}; + }(); + + const auto arch_id = cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}); + const auto active_policy = policy_sel(arch_id); + +#if _CCCL_CUDACC_AT_LEAST(12, 8) + const auto is_trivial_type = [](cccl_type_enum /* type */) { + // TODO: implement actual logic here when nontrivial custom types become supported + return true; + }; + + const bool input_contiguous = input_it.type == cccl_iterator_kind_t::CCCL_POINTER; + const bool output_contiguous = output_it.type == cccl_iterator_kind_t::CCCL_POINTER; + const bool input_trivially_copyable = is_trivial_type(input_it.value_type.type); + const bool output_trivially_copyable = is_trivial_type(output_it.value_type.type); + const bool output_default_constructible = output_trivially_copyable; + + const bool use_warpspeed = + active_policy.warpspeed + && cub::detail::scan::use_warpspeed( + active_policy.warpspeed, + static_cast(input_it.value_type.size), + static_cast(input_it.value_type.alignment), + static_cast(output_it.value_type.size), + static_cast(output_it.value_type.alignment), + static_cast(accum_t.size), + static_cast(accum_t.alignment), + input_contiguous, + output_contiguous, + input_trivially_copyable, + output_trivially_copyable, + output_default_constructible); +#else + const bool use_warpspeed = false; +#endif + + // TODO(bgruber): drop this if tuning policies become formattable + std::stringstream policy_sel_str; + policy_sel_str << active_policy; + + std::string policy_selector_expr = std::format( + "cub::detail::scan::policy_selector_from_types<{}, {}, {}, {}, {}>", input_it_value_t, output_it_value_t, accum_cpp, @@ -307,20 +340,20 @@ struct __align__({1}) storage_t {{ {2} {3} {4} -using device_scan_policy = {5}::MaxPolicy; - -#include -__device__ consteval auto& policy_generator() {{ - return ptx_json::id() - = cub::detail::scan::ScanPolicyWrapper::EncodedPolicy(); -}} +using device_scan_policy = {5}; +using 
namespace cub; +using namespace cub::detail::scan; +using cub::detail::delay_constructor_policy; +using cub::detail::delay_constructor_kind; +static_assert(device_scan_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6}, "Host generated and JIT compiled policy mismatch"); )XXX", input_it.value_type.size, // 0 input_it.value_type.alignment, // 1 input_iterator_src, // 2 output_iterator_src, // 3 op_src, // 4 - policy_hub_expr); // 5 + policy_selector_expr, // 5 + policy_sel_str.view()); // 6 #if false // CCCL_DEBUGGING_SWITCH fflush(stderr); @@ -344,7 +377,6 @@ __device__ consteval auto& policy_generator() {{ "-rdc=true", "-dlto", "-DCUB_DISABLE_CDP", - "-DCUB_ENABLE_POLICY_PTX_JSON", "-std=c++20"}; cccl::detail::extend_args_with_build_config(args, config); @@ -379,11 +411,6 @@ __device__ consteval auto& policy_generator() {{ auto [description_bytes_per_tile, payload_bytes_per_tile] = get_tile_state_bytes_per_tile(accum_t, accum_cpp, args.data(), args.size(), arch); - nlohmann::json runtime_policy = cub::detail::ptx_json::parse("device_scan_policy", {result.data.get(), result.size}); - - using cub::detail::RuntimeScanAgentPolicy; - auto scan_policy = RuntimeScanAgentPolicy::from_json(runtime_policy, "ScanPolicyT"); - build_ptr->cc = cc; build_ptr->cubin = (void*) result.data.release(); build_ptr->cubin_size = result.size; @@ -392,7 +419,8 @@ __device__ consteval auto& policy_generator() {{ build_ptr->init_kind = init_kind; build_ptr->description_bytes_per_tile = description_bytes_per_tile; build_ptr->payload_bytes_per_tile = payload_bytes_per_tile; - build_ptr->runtime_policy = new scan::scan_runtime_tuning_policy{scan_policy}; + build_ptr->runtime_policy = new cub::detail::scan::policy_selector{policy_sel}; + build_ptr->use_warpspeed = use_warpspeed; return CUDA_SUCCESS; } @@ -426,30 +454,18 @@ CUresult cccl_device_scan( CUdevice cu_device; check(cuCtxGetDevice(&cu_device)); - auto exec_status = cub::DispatchScan< - indirect_arg_t, - indirect_arg_t, - 
indirect_arg_t, - std::conditional_t, cub::NullType, indirect_arg_t>, - cuda::std::size_t, - void, - EnforceInclusive, - scan::scan_runtime_tuning_policy, - scan::scan_kernel_source, - cub::detail::CudaDriverLauncherFactory>:: - Dispatch( - d_temp_storage, - *temp_storage_bytes, - d_in, - d_out, - op, - init, - num_items, - stream, - {build}, - cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - *reinterpret_cast(build.runtime_policy)); - + auto exec_status = cub::detail::scan::dispatch_with_accum( + d_temp_storage, + *temp_storage_bytes, + indirect_arg_t{d_in}, + indirect_arg_t{d_out}, + indirect_arg_t{op}, + std::conditional_t, cub::NullType, indirect_arg_t>{init}, + static_cast(num_items), + stream, + *static_cast(build.runtime_policy), + scan::scan_kernel_source{build}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}); error = static_cast(exec_status); } catch (const std::exception& exc) @@ -591,7 +607,8 @@ try return CUDA_ERROR_INVALID_VALUE; } std::unique_ptr cubin(reinterpret_cast(build_ptr->cubin)); - std::unique_ptr policy(reinterpret_cast(build_ptr->runtime_policy)); + std::unique_ptr policy( + static_cast(build_ptr->runtime_policy)); check(cuLibraryUnload(build_ptr->library)); return CUDA_SUCCESS; diff --git a/c/parallel/src/segmented_reduce.cu b/c/parallel/src/segmented_reduce.cu index 2c61508e606..f81201e1902 100644 --- a/c/parallel/src/segmented_reduce.cu +++ b/c/parallel/src/segmented_reduce.cu @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// @@ -154,35 +154,11 @@ try // OffsetT is checked to match have 64-bit size const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64); - // TODO(bgruber): share this with reduce.cu const auto policy_sel = [&] { using namespace cub::detail; - auto accum_type = type_t::other; - if (accum_t.type == CCCL_FLOAT32) - { - accum_type = type_t::float32; - } - else if (accum_t.type == CCCL_FLOAT64) - { - accum_type = type_t::float64; - } - - auto operation_t = op_kind_t::other; - switch (op.type) - { - case CCCL_PLUS: - operation_t = op_kind_t::plus; - break; - case CCCL_MINIMUM: - operation_t = op_kind_t::min; - break; - case CCCL_MAXIMUM: - operation_t = op_kind_t::max; - break; - default: - break; - } + const auto accum_type = cccl_type_enum_to_cub_type(accum_t.type); + const auto operation_t = cccl_op_kind_to_cub_op(op.type); const int offset_size = int{sizeof(OffsetT)}; return cub::detail::segmented_reduce::policy_selector{ diff --git a/c/parallel/src/util/types.h b/c/parallel/src/util/types.h index 9d2acb62659..837025a69c2 100644 --- a/c/parallel/src/util/types.h +++ b/c/parallel/src/util/types.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// @@ -114,6 +114,8 @@ inline constexpr cub::detail::type_t cccl_type_enum_to_cub_type(cccl_type_enum t { switch (type) { + case CCCL_BOOLEAN: + return cub::detail::type_t::boolean; case CCCL_INT8: return cub::detail::type_t::int8; case CCCL_INT16: @@ -140,3 +142,18 @@ inline constexpr cub::detail::type_t cccl_type_enum_to_cub_type(cccl_type_enum t return cub::detail::type_t::other; } } + +inline constexpr cub::detail::op_kind_t cccl_op_kind_to_cub_op(cccl_op_kind_t type) +{ + switch (type) + { + case CCCL_PLUS: + return cub::detail::op_kind_t::plus; + case CCCL_MINIMUM: + return cub::detail::op_kind_t::min; + case CCCL_MAXIMUM: + return cub::detail::op_kind_t::max; + default: + return cub::detail::op_kind_t::other; + } +} diff --git a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu index f6acf8ae6c0..c3f5ebcb299 100644 --- a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu +++ b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include @@ -34,26 +34,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* @@ -125,15 +109,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list; - using dispatch_t = - cub::DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); auto mu = static_cast(state.get_float64("Mu{io}")); @@ -161,14 +136,39 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list(elements); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); // for validation, use diff --git 
a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu index 98da18d7106..7aa9d79c2e1 100644 --- a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu +++ b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include @@ -30,26 +30,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* Consider free monoid with two generators, ``q`` and ``p``, modulo defining relationship (``p * q == 1``). 
@@ -112,15 +96,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list using output_it_t = pair_t*; using offset_t = cub::detail::choose_offset_t; -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector output(elements); @@ -148,14 +123,39 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list cudaStream_t bench_stream = state.get_cuda_stream(); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); } diff --git a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu index 182277add7c..acb7da72051 100644 --- a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu +++ b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include @@ -36,26 +36,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* Denote epsilon, the identity element, be an empty sequence, and consider @@ -308,15 +292,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector input = generate(elements); @@ -338,13 +313,26 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list( + nullptr, + tmp_size, + inp_it, + out_it, + op_t{p}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( + cub::detail::scan::dispatch_with_accum( d_tmp, tmp_size, inp_it, @@ -352,7 +340,12 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list{} +#endif // !TUNE_BASE + ); }); // for validation uncomment these two lines diff --git a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu index 
cf37c7f6312..82a158c0940 100644 --- a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu +++ b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include @@ -37,26 +37,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* Given input sequence of values, compute sequence of @@ -215,15 +199,6 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) using output_it_t = pair_t*; using offset_t = cub::detail::choose_offset_t; -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector output(elements); @@ -241,13 +216,39 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) cudaStream_t bench_stream = state.get_cuda_stream().get_stream(); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, inp_it, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + inp_it, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector 
tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch(d_tmp, tmp_size, inp_it, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + inp_it, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); // for verification use diff --git a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu index 7135ed7cd5a..92811c32dc4 100644 --- a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu +++ b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include @@ -35,26 +35,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { template @@ -126,15 +110,6 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) using output_it_t = tuple_t*; using offset_t = cub::detail::choose_offset_t; -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); cudaStream_t bench_stream = state.get_cuda_stream().get_stream(); @@ -159,14 +134,39 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) auto d_output = thrust::raw_pointer_cast(output.data()); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + d_input, + d_output, + op_t{}, + 
wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); // for validation use (recommended for integral types and smallish input sizes) diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index e5242d5f2aa..3a3eb99d0a1 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -13,105 +13,10 @@ # if !USES_WARPSPEED() # include # endif // !USES_WARPSPEED() - -template -struct policy_hub_t -{ - struct MaxPolicy : cub::ChainedPolicy<300, MaxPolicy, MaxPolicy> - { -# if USES_WARPSPEED() - struct WarpspeedPolicy - { - static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS; - static constexpr int num_look_ahead_items = TUNE_NUM_LOOKBACK_ITEMS; - static constexpr int items_per_thread = TUNE_ITEMS_PLUS_ONE - 1; - - // the rest are fixed or derived definitions - - static constexpr int num_squads = 5; - static constexpr int num_threads_per_warp = 32; - static constexpr int num_load_warps = 1; - static constexpr int num_sched_warps = 1; - static constexpr int num_look_ahead_warps = 1; - - static constexpr int num_total_warps = - 2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; - static constexpr int num_total_threads = num_total_warps * num_threads_per_warp; - - static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp; - - static constexpr int tile_size = items_per_thread * squad_reduce_thread_count; - - using SquadDesc = cub::detail::warpspeed::SquadDesc; - - // The squads cannot be static constexpr variables, as those are not device accessible - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadReduce() noexcept - { - return SquadDesc{0, num_reduce_and_scan_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadScanStore() noexcept - { - return 
SquadDesc{1, num_reduce_and_scan_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLoad() noexcept - { - return SquadDesc{2, num_load_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadSched() noexcept - { - return SquadDesc{3, num_sched_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLookback() noexcept - { - return SquadDesc{4, num_look_ahead_warps}; - } - }; -# else // USES_WARPSPEED() -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# elif TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD - - template - using agent_policy_t = cub::AgentScanPolicy< - NOMINAL_BLOCK_THREADS_4B, - NOMINAL_ITEMS_PER_THREAD_4B, - ComputeT, - LOAD_ALGORITHM, - LOAD_MODIFIER, - STORE_ALGORITHM, - SCAN_ALGORITHM, - cub::detail::MemBoundScaling, - delay_constructor_t>; - - using ScanPolicyT = - agent_policy_t; -# endif // USES_WARPSPEED() - }; -}; #endif // TUNE_BASE +#include "../policy_selector.h" + template static void basic(nvbench::state& state, nvbench::type_list) try @@ -126,15 +31,6 @@ try static_assert(sizeof(offset_t) == sizeof(size_t)); // warpspeed scan uses size_t internally #endif // USES_WARPSPEED() -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); if (sizeof(offset_t) == 4 && elements > std::numeric_limits::max()) { @@ -145,15 +41,15 @@ try thrust::device_vector input = generate(elements); thrust::device_vector output(elements); - T* 
d_input = thrust::raw_pointer_cast(input.data()); - T* d_output = thrust::raw_pointer_cast(output.data()); + const T* d_input = thrust::raw_pointer_cast(input.data()); + T* d_output = thrust::raw_pointer_cast(output.data()); state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); state.add_global_memory_writes(elements); size_t tmp_size; - dispatch_t::Dispatch( + cub::detail::scan::dispatch_with_accum( nullptr, tmp_size, d_input, @@ -161,11 +57,16 @@ try op_t{}, wrapped_init_t{T{}}, static_cast(input.size()), - 0 /* stream */); + 0 /* stream */ +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); - thrust::device_vector tmp(tmp_size); + thrust::device_vector tmp(tmp_size, thrust::no_init); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( + cub::detail::scan::dispatch_with_accum( thrust::raw_pointer_cast(tmp.data()), tmp_size, d_input, @@ -173,7 +74,12 @@ try op_t{}, wrapped_init_t{T{}}, static_cast(input.size()), - launch.get_stream()); + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); } catch (const std::bad_alloc&) diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu index 57d7b873f35..2d5c53d6b99 100644 --- a/cub/benchmarks/bench/scan/exclusive/by_key.cu +++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. 
// SPDX-License-Identifier: BSD-3 #include @@ -106,7 +106,7 @@ static void scan(nvbench::state& state, nvbench::type_list(elements), 0 /* stream */); - thrust::device_vector tmp(tmp_size); + thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h new file mode 100644 index 00000000000..f1f30d93a90 --- /dev/null +++ b/cub/benchmarks/bench/scan/policy_selector.h @@ -0,0 +1,73 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#if !TUNE_BASE +# include "look_back_helper.cuh" +#endif // !TUNE_BASE + +#ifndef USES_WARPSPEED +# define USES_WARPSPEED() 0 +#endif + +#if !TUNE_BASE +template +struct policy_selector +{ +# if USES_WARPSPEED() + _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::scan::scan_policy + { + static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS; + static constexpr int num_look_ahead_items = TUNE_NUM_LOOKBACK_ITEMS; + static constexpr int items_per_thread = TUNE_ITEMS_PLUS_ONE - 1; + + static constexpr int num_threads_per_warp = 32; + static constexpr int num_load_warps = 1; + static constexpr int num_sched_warps = 1; + static constexpr int num_look_ahead_warps = 1; + + static constexpr int num_total_warps = + 2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; + static constexpr int num_total_threads = num_total_warps * num_threads_per_warp; + static constexpr int squad_reduce_threads = num_reduce_and_scan_warps * num_threads_per_warp; + static constexpr int tile_size = items_per_thread * squad_reduce_threads; + + auto warpspeed_policy = cub::detail::scan::scan_warpspeed_policy{ + true, 
+ num_reduce_and_scan_warps, + num_reduce_and_scan_warps, + num_load_warps, + num_sched_warps, + num_look_ahead_warps, + num_look_ahead_items, + num_total_threads, + items_per_thread, + tile_size}; + + return cub::detail::scan::scan_policy{ + num_total_threads, + items_per_thread, + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_STORE_WARP_TRANSPOSE, + cub::BLOCK_SCAN_WARP_SCANS, + cub::detail::delay_constructor_policy{cub::detail::delay_constructor_kind::fixed_delay, 350, 450}, + warpspeed_policy}; + } +# else + _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::scan::scan_policy + { + return cub::detail::scan::make_mem_scaled_scan_policy( + TUNE_THREADS, + TUNE_ITEMS, + int{sizeof(AccumT)}, + TUNE_LOAD_ALGORITHM, + TUNE_LOAD_MODIFIER, + TUNE_STORE_ALGORITHM, + cub::BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy); + } +# endif +}; +#endif // !TUNE_BASE diff --git a/cub/cub/detail/launcher/cuda_driver.cuh b/cub/cub/detail/launcher/cuda_driver.cuh index 61e5f3d2e89..a4dc9782458 100644 --- a/cub/cub/detail/launcher/cuda_driver.cuh +++ b/cub/cub/detail/launcher/cuda_driver.cuh @@ -141,6 +141,60 @@ struct CudaDriverLauncherFactory cuDeviceGetAttribute(&max_shared_memory, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device)); } + _CCCL_HIDE_FROM_ABI CUB_RUNTIME_FUNCTION ::cudaError_t + max_dynamic_smem_size_for(int& max_dynamic_smem_size, ::CUkernel kernel_ptr) const + { + max_dynamic_smem_size = -1; + + ::CUfunction kernel_fn; + auto status = static_cast<::cudaError_t>(::cuKernelGetFunction(&kernel_fn, kernel_ptr)); + if (status != cudaSuccess) + { + return status; + } + + int static_smem_size = 0; + status = static_cast<::cudaError_t>( + ::cuFuncGetAttribute(&static_smem_size, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_fn)); + if (status != cudaSuccess) + { + return status; + } + + int reserved_smem_size = 0; + status = static_cast<::cudaError_t>( + ::cuDeviceGetAttribute(&reserved_smem_size, 
CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK, device)); + if (status != cudaSuccess) + { + return status; + } + + int max_smem_size_optin = 0; + status = static_cast<::cudaError_t>( + ::cuDeviceGetAttribute(&max_smem_size_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device)); + if (status != cudaSuccess) + { + return status; + } + + max_dynamic_smem_size = max_smem_size_optin - reserved_smem_size - static_smem_size; + return cudaSuccess; + } + + _CCCL_HIDE_FROM_ABI CUB_RUNTIME_FUNCTION ::cudaError_t + set_max_dynamic_smem_size_for(::CUkernel kernel_ptr, int smem_size) const + { + ::CUfunction kernel_fn; + auto status = static_cast<::cudaError_t>(::cuKernelGetFunction(&kernel_fn, kernel_ptr)); + if (status != cudaSuccess) + { + return status; + } + + return static_cast<::cudaError_t>( + ::cuFuncSetAttribute(kernel_fn, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem_size)); + } + CUdevice device; int cc; }; diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 6a1e6577ade..bfc2fc83413 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -52,12 +52,6 @@ struct tuning return static_cast(*this); } }; - -struct default_tuning : tuning -{ - template - using fn = policy_hub; -}; } // namespace detail::scan //! 
@rst @@ -127,9 +121,6 @@ struct DeviceScan ::cuda::execution::determinism::__determinism_holder_t, cudaStream_t stream) { - using scan_tuning_t = ::cuda::std::execution:: - __query_result_or_t; - // Unsigned integer type for global offsets using offset_t = detail::choose_offset_t; @@ -140,14 +131,26 @@ struct DeviceScan cub::detail::it_value_t, typename InitValueT::value_type>>; - using policy_t = typename scan_tuning_t:: - template fn, detail::it_value_t, accum_t, offset_t, ScanOpT>; + using default_policy_selector_t = detail::scan::policy_selector_from_types< + detail::it_value_t, + detail::it_value_t, + accum_t, + offset_t, + ScanOpT>; - using dispatch_t = - DispatchScan; + using policy_selector_t = ::cuda::std::execution:: + __query_result_or_t; - return dispatch_t::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, static_cast(num_items), stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init, + static_cast(num_items), + stream, + policy_selector_t{}); } template , detail::InputValue, OffsetT>:: - Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - ::cuda::std::plus<>{}, - detail::InputValue(init_value), - num_items, - stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + ::cuda::std::plus<>{}, + detail::InputValue(init_value), + static_cast(num_items), + stream); } //! 
@rst @@ -597,14 +600,14 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan, OffsetT>::Dispatch( + return detail::scan::dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, detail::InputValue(init_value), - num_items, + static_cast(num_items), stream); } @@ -939,14 +942,14 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan, OffsetT>::Dispatch( + return detail::scan::dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, detail::InputValue(init_value), - num_items, + static_cast(num_items), stream); } @@ -1168,8 +1171,15 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan, NullType, OffsetT>::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + ::cuda::std::plus<>{}, + NullType{}, + static_cast(num_items), + stream); } //! @rst @@ -1365,8 +1375,8 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); + return detail::scan::dispatch( + d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), static_cast(num_items), stream); } //! 
@rst @@ -1456,23 +1466,16 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - using AccumT = ::cuda::std::__accumulator_t, InitValueT>; - return DispatchScan< - InputIteratorT, - OutputIteratorT, - ScanOpT, - detail::InputValue, - OffsetT, - AccumT, - ForceInclusive::Yes>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - detail::InputValue(init_value), - num_items, - stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + detail::InputValue(init_value), + static_cast(num_items), + stream); } //! @rst diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 4662b798153..096242314ec 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -23,6 +23,7 @@ #endif // no system header #include +#include #include #include #include @@ -42,18 +43,24 @@ #include #include #include +#include #include #include +#include #include #include #include +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::scan { -template ) + DeviceScanInitKernel) CUB_DEFINE_KERNEL_GETTER( ScanKernel, - DeviceScanKernel*>(ts)); return arg; } + + CUB_RUNTIME_FUNCTION static constexpr bool use_warpspeed(const scan_policy& policy) + { +#if _CCCL_CUDACC_AT_LEAST(12, 8) + if (policy.warpspeed) + { + return detail::scan::use_warpspeed(policy.warpspeed); + } +#else + (void) policy; +#endif + return false; + } +}; + +// TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API +template +_CCCL_API constexpr auto convert_policy() -> scan_policy +{ + using scan_policy_t = typename LegacyActivePolicy::ScanPolicyT; + return scan_policy{ + scan_policy_t::BLOCK_THREADS, + scan_policy_t::ITEMS_PER_THREAD, + scan_policy_t::LOAD_ALGORITHM, + scan_policy_t::LOAD_MODIFIER, + 
scan_policy_t::STORE_ALGORITHM, + scan_policy_t::SCAN_ALGORITHM, + detail::delay_constructor_policy_from_type}; +} + +// TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API +template +struct policy_selector_from_hub +{ + // Called from device code during dispatch, and from host code when clang-cuda evaluates + // scan_policy_selector concept checks. + _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> scan_policy + { + return convert_policy(); + } }; } // namespace detail::scan @@ -145,6 +192,7 @@ struct DeviceScanKernelSource * Enum flag to specify whether to enforce inclusive scan. * */ +// TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0 template < typename InputIteratorT, typename OutputIteratorT, @@ -160,7 +208,8 @@ template < typename PolicyHub = detail::scan:: policy_hub, detail::it_value_t, AccumT, OffsetT, ScanOpT>, typename KernelSource = detail::scan::DeviceScanKernelSource< - typename PolicyHub::MaxPolicy, + detail::scan:: + policy_selector_from_hub, detail::it_value_t, AccumT>, THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, ScanOpT, @@ -250,6 +299,7 @@ struct DispatchScan * @param[in] max_policy * Struct encoding chain of algorithm tuning policies */ + // TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0 CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -408,8 +458,9 @@ struct DispatchScan } #if __cccl_ptx_isa >= 860 - template - CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm(ActivePolicyT) + template + CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t + __invoke_lookahead_algorithm(PolicyGetter policy_getter, const PolicySelectorT& policy_selector) { if (num_items == 0) { @@ -417,16 +468,20 @@ struct DispatchScan return cudaSuccess; } - using InputT = 
::cuda::std::iter_value_t; - using OutputT = ::cuda::std::iter_value_t; - using WarpspeedPolicy = typename ActivePolicyT::WarpspeedPolicy; + CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter(); + CUB_DETAIL_CONSTEXPR_ISH const auto warpspeed_policy = active_policy.warpspeed; const int grid_dim = - static_cast(::cuda::ceil_div(num_items, static_cast(WarpspeedPolicy::tile_size))); + static_cast(::cuda::ceil_div(num_items, static_cast(warpspeed_policy.tile_size))); if (d_temp_storage == nullptr) { - temp_storage_bytes = grid_dim * kernel_source.look_ahead_tile_state_size(); + temp_storage_bytes = static_cast(grid_dim) * kernel_source.look_ahead_tile_state_size(); + return cudaSuccess; + } + + if (num_items == 0) + { return cudaSuccess; } @@ -435,12 +490,6 @@ struct DispatchScan { return error; } - // number of stages to have an even workload across all SMs (improves small problem sizes), assuming 1 CTA per SM - // +1 since it tends to improve performance - // TODO(bgruber): make the +1 a tuning parameter - [[maybe_unused]] const int max_stages_for_even_workload = - static_cast(::cuda::ceil_div(num_items, static_cast(sm_count * WarpspeedPolicy::tile_size)) + 1); - // Maximum dynamic shared memory size that we can use for temporary storage. 
int max_dynamic_smem_size{}; if (const auto error = @@ -452,19 +501,77 @@ struct DispatchScan // TODO(bgruber): we probably need to ensure alignment of d_temp_storage _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), ""); - constexpr int smem_size_1_stage = detail::scan::smem_for_stages(1); - static_assert(smem_size_1_stage <= detail::max_smem_per_block); // this is ensured by scan_use_warpspeed + using selector_smem_info_t = detail::scan::selector_smem_info; - auto scan_kernel = kernel_source.ScanKernel(); - int num_stages = 1; - int smem_size = smem_size_1_stage; + auto scan_kernel = kernel_source.ScanKernel(); + int smem_size_1_stage = 0; + if constexpr (selector_smem_info_t::has_static_layout) + { + CUB_DETAIL_CONSTEXPR_ISH int static_smem_size_1_stage = detail::scan::smem_for_stages( + warpspeed_policy, + 1, + selector_smem_info_t::input_value_size, + selector_smem_info_t::input_value_alignment, + selector_smem_info_t::output_value_size, + selector_smem_info_t::output_value_alignment, + selector_smem_info_t::accum_size, + selector_smem_info_t::accum_alignment); + CUB_DETAIL_STATIC_ISH_ASSERT(static_smem_size_1_stage <= detail::max_smem_per_block, + "Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)"); + smem_size_1_stage = static_smem_size_1_stage; + } + else + { + smem_size_1_stage = detail::scan::smem_for_stages( + warpspeed_policy, + 1, + policy_selector.input_value_size, + policy_selector.input_value_alignment, + policy_selector.output_value_size, + policy_selector.output_value_alignment, + policy_selector.accum_size, + policy_selector.accum_alignment); + } + + int num_stages = 1; + int smem_size = smem_size_1_stage; // When launched from the host, maximize the number of stages that we can fit inside the shared memory. 
NV_IF_TARGET(NV_IS_HOST, ({ + // number of stages to have an even workload across all SMs (improves small problem sizes), assuming + // 1 CTA per SM +1 since it tends to improve performance + // TODO(bgruber): make the +1 a tuning parameter + const int max_stages_for_even_workload = static_cast( + ::cuda::ceil_div(num_items, static_cast(sm_count * warpspeed_policy.tile_size)) + 1); + while (num_stages <= max_stages_for_even_workload) { - const auto next_smem_size = - detail::scan::smem_for_stages(num_stages + 1); + const auto next_smem_size = [&] { + if constexpr (selector_smem_info_t::has_static_layout) + { + return detail::scan::smem_for_stages( + warpspeed_policy, + num_stages + 1, + selector_smem_info_t::input_value_size, + selector_smem_info_t::input_value_alignment, + selector_smem_info_t::output_value_size, + selector_smem_info_t::output_value_alignment, + selector_smem_info_t::accum_size, + selector_smem_info_t::accum_alignment); + } + else + { + return detail::scan::smem_for_stages( + warpspeed_policy, + num_stages + 1, + policy_selector.input_value_size, + policy_selector.input_value_alignment, + policy_selector.output_value_size, + policy_selector.output_value_alignment, + policy_selector.accum_size, + policy_selector.accum_alignment); + } + }(); if (next_smem_size > max_dynamic_smem_size) { // This number of stages failed, so stay at the current settings @@ -487,7 +594,7 @@ struct DispatchScan const auto init_grid_size = ::cuda::ceil_div(grid_dim, init_kernel_threads); # ifdef CUB_DEBUG_LOG - _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, , %lld>>>()\n", + _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); @@ -517,7 +624,7 @@ struct DispatchScan // Invoke scan kernel { - constexpr int block_dim = WarpspeedPolicy::num_total_threads; + const int block_dim = warpspeed_policy.num_total_threads; # ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", 
grid_dim, block_dim, smem_size, (long long) stream); @@ -555,24 +662,179 @@ struct DispatchScan } #endif // __cccl_ptx_isa >= 860 - template - CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {}) + // On Windows, the `if CUB_DETAIL_CONSTEXPR_ISH` results in `warning C4702: unreachable code`. + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4702) + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + __invoke(PolicyGetter policy_getter, [[maybe_unused]] const PolicySelectorT& policy_selector) { + CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter(); + + CUB_DETAIL_STATIC_ISH_ASSERT(active_policy.load_modifier != CacheLoadModifier::LOAD_LDG, + "The memory consistency model does not apply to texture accesses"); + #if __cccl_ptx_isa >= 860 - if constexpr (detail::scan::scan_use_warpspeed< - ActivePolicyT, - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, - AccumT>) +# if defined(CUB_DEFINE_RUNTIME_POLICIES) + if (kernel_source.use_warpspeed(active_policy)) { - return __invoke_lookahead_algorithm(active_policy); + return __invoke_lookahead_algorithm(policy_getter, policy_selector); } - else +# else + if CUB_DETAIL_CONSTEXPR_ISH (KernelSource::use_warpspeed(active_policy)) + { + return __invoke_lookahead_algorithm(policy_getter, policy_selector); + } +# endif #endif // __cccl_ptx_isa >= 860 + + // Number of input tiles + const int tile_size = active_policy.block_threads * active_policy.items_per_thread; + const int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + + auto tile_state = kernel_source.TileState(); + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (const auto error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0]))) + { + return error; // bytes needed for tile status descriptors + } + + // Compute allocation pointers into the single storage blob (or 
compute + // the necessary size of the blob) + void* allocations[1] = {}; + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + { + return error; + } + + // Return if the caller is simply requesting the size of the storage allocation, or the problem is empty + if (d_temp_storage == nullptr || num_items == 0) + { + return cudaSuccess; + } + + // Construct the tile status interface + if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) { - return Invoke( - kernel_source.InitKernel(), kernel_source.ScanKernel(), detail::scan::MakeScanPolicyWrapper(active_policy)); + return error; } + + // Log init_kernel configuration + constexpr int init_kernel_threads = 128; + const int init_grid_size = ::cuda::ceil_div(num_tiles, init_kernel_threads); + +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); +#endif // CUB_DEBUG_LOG + + // Invoke init_kernel to initialize tile descriptors + if (const auto error = CubDebug( + launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true) + .doit(kernel_source.InitKernel(), kernel_source.make_tile_state_kernel_arg(tile_state), num_tiles))) + { + return error; + } + + // Check for failure to launch + if (const auto error = CubDebug(cudaPeekAtLastError())) + { + return error; + } + + // Sync the stream if specified to flush runtime errors + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (const auto error = CubDebug( + launcher_factory.MaxSmOccupancy(scan_sm_occupancy, kernel_source.ScanKernel(), active_policy.block_threads))) + { + return error; + } + + // Get max x-dimension of grid + int max_dim_x; + if (const auto error = CubDebug(launcher_factory.MaxGridDimX(max_dim_x))) + { + return error; + } + 
+ // Run grids in epochs (in case the number of tiles exceeds the max x-dimension)
Struct encoding chain of algorithm tuning policies */ + // TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0 template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -654,4 +917,145 @@ struct DispatchScan } }; +namespace detail::scan +{ +template < + ForceInclusive EnforceInclusive = ForceInclusive::No, + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename InitValueT, + typename OffsetT, + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If<::cuda::std::is_same_v, + cub::detail::it_value_t, + typename InitValueT::value_type>>, + typename PolicySelector = policy_selector_from_types, + detail::it_value_t, + AccumT, + OffsetT, + ScanOpT>, + typename KernelSource = DeviceScanKernelSource< + PolicySelector, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + ScanOpT, + InitValueT, + OffsetT, + AccumT, + EnforceInclusive>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +#if _CCCL_HAS_CONCEPTS() + requires scan_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitValueT init_value, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}) -> cudaError_t +{ + static_assert(::cuda::std::is_unsigned_v && sizeof(OffsetT) >= 4, + "DispatchScan only supports unsigned offset types of at least 4-bytes"); + + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(launcher_factory.PtxArchId(arch_id))) + { + return error; + } + +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET(NV_IS_HOST, + (std::stringstream ss; ss << policy_selector(arch_id); + 
_CubLog("Dispatching DeviceScan to arch %d with tuning: %s\n", (int) arch_id, ss.str().c_str());)) +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + + struct fake_policy + { + using MaxPolicy = void; + }; + + return dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) { + return DispatchScan{ + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + scan_op, + init_value, + stream, + -1 /* ptx_version, not used actually */, + kernel_source, + launcher_factory} + .__invoke(policy_getter, policy_selector); + }); +} + +template , + detail::it_value_t, + AccumT, + OffsetT, + ScanOpT>, + typename KernelSource = DeviceScanKernelSource< + PolicySelector, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + ScanOpT, + InitValueT, + OffsetT, + AccumT, + EnforceInclusive>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_with_accum( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitValueT init_value, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}) -> cudaError_t +{ + return dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + policy_selector, + kernel_source, + launcher_factory); +} +} // namespace detail::scan + CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index 5346385f0ba..d703d035d2b 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -53,7 +53,11 @@ union tile_state_kernel_arg_t * @param[in] num_tiles * Number of tiles */ -template +template CUB_DETAIL_KERNEL_ATTRIBUTES 
__launch_bounds__(128) void DeviceScanInitKernel( tile_state_kernel_arg_t tile_state, int num_tiles) { @@ -61,8 +65,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(128) void DeviceScanInitKernel( _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // beneficial for all problem sizes in cub.bench.scan.exclusive.sum.base #if _CCCL_CUDACC_AT_LEAST(12, 8) - if constexpr (detail::scan:: - scan_use_warpspeed) + constexpr scan_policy policy = PolicySelectorT{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + if constexpr (policy.warpspeed + && detail::scan::use_warpspeed(policy.warpspeed)) { device_scan_init_lookahead_body(tile_state.lookahead, num_tiles); } @@ -106,28 +111,26 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte *d_num_selected_out = 0; } } -template +template [[nodiscard]] _CCCL_DEVICE_API _CCCL_CONSTEVAL int get_device_scan_launch_bounds() noexcept { + constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); #if _CCCL_CUDACC_AT_LEAST(12, 8) - if constexpr (detail::scan:: - scan_use_warpspeed) + if constexpr (policy.warpspeed + && detail::scan::use_warpspeed(policy.warpspeed)) { - return get_scan_block_threads; + return policy.warpspeed.num_total_threads; } - else #endif // _CCCL_CUDACC_AT_LEAST(12, 8) - { - return static_cast(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS); - } + return policy.block_threads; } /** * @brief Scan kernel entry point (multi-block) * * - * @tparam ChainedPolicyT - * Chained tuning policy + * @tparam PolicySelector + * Policy selector for tuning * * @tparam InputIteratorT * Random-access input iterator type for reading scan inputs @iterator @@ -167,7 +170,7 @@ template -__launch_bounds__(get_device_scan_launch_bounds(), 1) +__launch_bounds__(get_device_scan_launch_bounds()) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( _CCCL_GRID_CONSTANT const InputIteratorT d_in, _CCCL_GRID_CONSTANT const OutputIteratorT d_out, @@ -192,23 +195,41 @@ 
__launch_bounds__(get_device_scan_launch_bounds) + if constexpr (policy.warpspeed + && detail::scan::use_warpspeed(policy.warpspeed)) { - NV_IF_TARGET(NV_PROVIDES_SM_100, ({ - auto scan_params = scanKernelParams, it_value_t, AccumT>{ - d_in, d_out, tile_state.lookahead, num_items, num_stages}; - device_scan_lookahead_body( - scan_params, scan_op, init_value); - })); + NV_IF_TARGET( + NV_PROVIDES_SM_100, ({ + auto scan_params = scanKernelParams, it_value_t, AccumT>{ + d_in, d_out, tile_state.lookahead, num_items, num_stages}; + device_scan_lookahead_body(scan_params, scan_op, init_value); + })); } else #endif // _CCCL_CUDACC_AT_LEAST(12, 8) { + using ScanPolicyT = AgentScanPolicy< + policy.block_threads, + policy.items_per_thread, + AccumT, + policy.load_algorithm, + policy.load_modifier, + policy.store_algorithm, + policy.scan_algorithm, + NoScaling, + delay_constructor_t>; + // Thread block type for scanning input tiles using AgentScanT = detail::scan::AgentScan< - typename ActivePolicy::ScanPolicyT, + ScanPolicyT, InputIteratorT, OutputIteratorT, ScanOpT, diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index 9134b70e4dc..92b84623932 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -42,6 +44,68 @@ CUB_NAMESPACE_BEGIN namespace detail::scan { +template +_CCCL_API constexpr scan_warpspeed_policy get_warpspeed_policy() noexcept +{ + return PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).warpspeed; +} + +template +struct static_warpspeed_policy_adapter +{ + _CCCL_API static constexpr scan_warpspeed_policy policy() noexcept + { + return get_warpspeed_policy(); + } + + _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const + { + return 
policy().squadReduce(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const + { + return policy().squadScanStore(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const + { + return policy().squadLoad(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadSched() const + { + return policy().squadSched(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const + { + return policy().squadLookback(); + } +}; + +struct runtime_warpspeed_policy_adapter +{ + const scan_warpspeed_policy& policy; + + _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const + { + return policy.squadReduce(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const + { + return policy.squadScanStore(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const + { + return policy.squadLoad(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadSched() const + { + return policy.squadSched(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const + { + return policy.squadLookback(); + } +}; + template struct scanKernelParams { @@ -53,76 +117,117 @@ struct scanKernelParams }; // Struct holding all scan kernel resources -template +template struct ScanResources { + _CCCL_API static constexpr scan_warpspeed_policy warpspeed_policy() noexcept + { + return get_warpspeed_policy(); + } + // align to at least 16 bytes (InputT/OutputT may be aligned higher) so each stage starts correctly aligned struct alignas(::cuda::std::max({::cuda::std::size_t{16}, alignof(InputT), alignof(OutputT)})) InOutT { // the tile_size size is a multiple of the warp size, and thus for sure a multiple of 16 - static_assert(WarpspeedPolicy::tile_size % 16 == 0, "tile_size must be multiple of 16"); + static_assert(ScanResources::warpspeed_policy().tile_size % 16 == 0, "tile_size must be multiple of 16"); // therefore, unaligned inputs need exactly 16 bytes extra for overcopying (tail padding = 16 - head padding) - ::cuda::std::byte 
inout[WarpspeedPolicy::tile_size * sizeof(InputT) + 16]; + ::cuda::std::byte inout[ScanResources::warpspeed_policy().tile_size * sizeof(InputT) + 16]; }; static_assert(alignof(InOutT) >= alignof(InputT)); static_assert(alignof(InOutT) >= alignof(OutputT)); - using SumThreadAndWarpT = - AccumT[WarpspeedPolicy::squadReduce().threadCount() + WarpspeedPolicy::squadReduce().warpCount()]; + using SumThreadAndWarpT = AccumT[ScanResources::warpspeed_policy().squadReduce().threadCount() + + ScanResources::warpspeed_policy().squadReduce().warpCount()]; warpspeed::SmemResource smemInOut; // will also be used to stage the output (as OutputT) for the bulk copy warpspeed::SmemResource smemNextBlockIdx; warpspeed::SmemResource smemSumExclusiveCta; warpspeed::SmemResource smemSumThreadAndWarp; }; + +struct ScanResourcesRaw +{ + warpspeed::SmemResourceRaw smemInOut; + warpspeed::SmemResourceRaw smemNextBlockIdx; + warpspeed::SmemResourceRaw smemSumExclusiveCta; + warpspeed::SmemResourceRaw smemSumThreadAndWarp; +}; + +struct scan_stage_counts +{ + int num_block_idx_stages; + int num_sum_exclusive_cta_stages; +}; + +_CCCL_API constexpr scan_stage_counts make_scan_stage_counts(int num_stages) +{ + int num_block_idx_stages = num_stages - 1; + num_block_idx_stages = num_block_idx_stages < 1 ? 
1 : num_block_idx_stages; + return {num_block_idx_stages, 2}; +} + +template +_CCCL_API constexpr void setup_scan_resources( + const PolicyAdapter& policy, + warpspeed::SyncHandler& syncHandler, + warpspeed::SmemAllocator& smemAllocator, + SmemInOutT& smemInOut, + SmemNextBlockIdxT& smemNextBlockIdx, + SmemSumExclusiveCtaT& smemSumExclusiveCta, + SmemSumThreadAndWarpT& smemSumThreadAndWarp) +{ + const warpspeed::SquadDesc scanSquads[scan_warpspeed_policy::num_squads] = { + policy.squadReduce(), + policy.squadScanStore(), + policy.squadLoad(), + policy.squadSched(), + policy.squadLookback(), + }; + + smemInOut.addPhase(syncHandler, smemAllocator, policy.squadLoad()); + smemInOut.addPhase(syncHandler, smemAllocator, {policy.squadReduce(), policy.squadScanStore()}); + + smemNextBlockIdx.addPhase(syncHandler, smemAllocator, policy.squadSched()); + smemNextBlockIdx.addPhase(syncHandler, smemAllocator, scanSquads); + + smemSumExclusiveCta.addPhase(syncHandler, smemAllocator, policy.squadLookback()); + smemSumExclusiveCta.addPhase(syncHandler, smemAllocator, policy.squadScanStore()); + + smemSumThreadAndWarp.addPhase(syncHandler, smemAllocator, policy.squadReduce()); + smemSumThreadAndWarp.addPhase(syncHandler, smemAllocator, policy.squadScanStore()); +} + // Function to allocate resources. -template -[[nodiscard]] _CCCL_API constexpr ScanResources +template +[[nodiscard]] _CCCL_API constexpr ScanResources allocResources(warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& smemAllocator, int numStages) { - using ScanResourcesT = ScanResources; + using ScanResourcesT = ScanResources; using InOutT = typename ScanResourcesT::InOutT; using SumThreadAndWarpT = typename ScanResourcesT::SumThreadAndWarpT; - // If numBlockIdxStages is one less than the number of stages, we find a small - // speedup compared to setting it equal to num_stages. Not sure why. 
- int numBlockIdxStages = numStages - 1; - // Ensure we have at least 1 stage - numBlockIdxStages = numBlockIdxStages < 1 ? 1 : numBlockIdxStages; - - // We do not need too many sumExclusiveCta stages. The lookback warp is the - // bottleneck. As soon as it produces a new value, it will be consumed by the - // scanStore squad, releasing the stage. - int numSumExclusiveCtaStages = 2; + const auto [num_block_idx_stages, num_sum_exclusive_cta_stages] = make_scan_stage_counts(numStages); ScanResourcesT res = { warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{numStages}), - warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{numBlockIdxStages}), - warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{numSumExclusiveCtaStages}), + warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{num_block_idx_stages}), + warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{num_sum_exclusive_cta_stages}), warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{numStages}), }; - // asdfasdf - constexpr warpspeed::SquadDesc scanSquads[WarpspeedPolicy::num_squads] = { - WarpspeedPolicy::squadReduce(), - WarpspeedPolicy::squadScanStore(), - WarpspeedPolicy::squadLoad(), - WarpspeedPolicy::squadSched(), - WarpspeedPolicy::squadLookback(), - }; - res.smemInOut.addPhase(syncHandler, smemAllocator, WarpspeedPolicy::squadLoad()); - res.smemInOut.addPhase( - syncHandler, smemAllocator, {WarpspeedPolicy::squadReduce(), WarpspeedPolicy::squadScanStore()}); - - res.smemNextBlockIdx.addPhase(syncHandler, smemAllocator, WarpspeedPolicy::squadSched()); - res.smemNextBlockIdx.addPhase(syncHandler, smemAllocator, scanSquads); - - res.smemSumExclusiveCta.addPhase(syncHandler, smemAllocator, WarpspeedPolicy::squadLookback()); - res.smemSumExclusiveCta.addPhase(syncHandler, smemAllocator, WarpspeedPolicy::squadScanStore()); - - res.smemSumThreadAndWarp.addPhase(syncHandler, smemAllocator, 
WarpspeedPolicy::squadReduce()); - res.smemSumThreadAndWarp.addPhase(syncHandler, smemAllocator, WarpspeedPolicy::squadScanStore()); + setup_scan_resources( + static_warpspeed_policy_adapter{}, + syncHandler, + smemAllocator, + res.smemInOut, + res.smemNextBlockIdx, + res.smemSumExclusiveCta, + res.smemSumThreadAndWarp); return res; } @@ -192,7 +297,7 @@ _CCCL_DEVICE_API Tp warpScanExclusive(const Tp regInput, ScanOpT& scan_op) // warp-specialization dispatch is performed once at the start of the kernel and // not in any of the hot loops (even if that may seem the case from a first // glance at the code). -template (); + static constexpr warpspeed::SquadDesc squadReduce = policy.squadReduce(); + static constexpr warpspeed::SquadDesc squadScanStore = policy.squadScanStore(); + static constexpr warpspeed::SquadDesc squadLoad = policy.squadLoad(); + static constexpr warpspeed::SquadDesc squadSched = policy.squadSched(); + static constexpr warpspeed::SquadDesc squadLookback = policy.squadLookback(); - constexpr int tile_size = WarpspeedPolicy::tile_size; - constexpr int num_look_ahead_items = WarpspeedPolicy::num_look_ahead_items; + constexpr int tile_size = policy.tile_size; + constexpr int num_look_ahead_items = policy.num_look_ahead_items; // We might try to instantiate the kernel with hughe types which would lead to a small tile size. 
Ensure its never 0 - constexpr int elemPerThread = WarpspeedPolicy::items_per_thread; + constexpr int elemPerThread = policy.items_per_thread; static_assert(elemPerThread * squadReduce.threadCount() == tile_size, "Invalid tuning policy"); //////////////////////////////////////////////////////////////////////////////// @@ -228,8 +334,8 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody( warpspeed::SyncHandler syncHandler{}; warpspeed::SmemAllocator smemAllocator{}; - ScanResources res = - allocResources(syncHandler, smemAllocator, params.numStages); + ScanResources res = + allocResources(syncHandler, smemAllocator, params.numStages); syncHandler.clusterInitSync(specialRegisters); @@ -617,8 +723,7 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody( { // otherwise, issue multiple bulk copies in chunks of the input tile size // TODO(bgruber): I am sure this could be implemented a lot more efficiently - static constexpr int elem_per_chunk = - static_cast(WarpspeedPolicy::tile_size * sizeof(InputT) / sizeof(OutputT)); + static constexpr int elem_per_chunk = static_cast(policy.tile_size * sizeof(InputT) / sizeof(OutputT)); for (int chunk_offset = 0; chunk_offset < valid_items; chunk_offset += elem_per_chunk) { const int chunk_size = ::cuda::std::min(valid_items - chunk_offset, elem_per_chunk); @@ -673,14 +778,7 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody( #endif // __cccl_ptx_isa >= 860 -template -inline constexpr int get_scan_block_threads = 1; - -template -inline constexpr int get_scan_block_threads> = - ActivePolicy::WarpspeedPolicy::num_total_threads; - -template (); + // Dispatch for warp-specialization - static constexpr warpspeed::SquadDesc scanSquads[WarpspeedPolicy::num_squads] = { - WarpspeedPolicy::squadReduce(), - WarpspeedPolicy::squadScanStore(), - WarpspeedPolicy::squadLoad(), - WarpspeedPolicy::squadSched(), - WarpspeedPolicy::squadLookback(), + static constexpr warpspeed::SquadDesc scanSquads[scan_warpspeed_policy::num_squads] = { + 
policy.squadReduce(), + policy.squadScanStore(), + policy.squadLoad(), + policy.squadSched(), + policy.squadLookback(), }; // we need to force inline the lambda, but clang in CUDA mode only likes the GNU syntax warpspeed::squadDispatch(specialRegisters, scanSquads, [&](warpspeed::Squad squad) _CCCL_FORCEINLINE_LAMBDA { - kernelBody( + kernelBody( squad, specialRegisters, params, ::cuda::std::move(scan_op), static_cast(init_value)); }); #endif // __cccl_ptx_isa >= 860 @@ -746,66 +846,137 @@ device_scan_init_lookahead_body(warpspeed::tile_state_t* tile_states, co } } -template -_CCCL_API constexpr auto smem_for_stages(int num_stages) -> int +_CCCL_API constexpr auto smem_for_stages( + const scan_warpspeed_policy& policy, + int num_stages, + int input_size, + int input_align, + int output_size, + int output_align, + int accum_size, + int accum_align) -> int { warpspeed::SyncHandler syncHandler{}; warpspeed::SmemAllocator smemAllocator{}; - (void) scan::allocResources(syncHandler, smemAllocator, num_stages); + (void) output_size; + const auto counts = make_scan_stage_counts(num_stages); + + const int align_inout = ::cuda::std::max({16, input_align, output_align}); + const int inout_bytes = policy.tile_size * input_size + 16; + // Match sizeof(InOutT): round up to the alignment so each stage matches SmemResource. 
+ const int inout_stride = (inout_bytes + align_inout - 1) & ~(align_inout - 1); + const auto reduce_squad = policy.squadReduce(); + const int sum_thread_warp = (reduce_squad.threadCount() + reduce_squad.warpCount()) * accum_size; + + void* inout_base = smemAllocator.alloc(static_cast<::cuda::std::uint32_t>(inout_stride * num_stages), align_inout); + void* next_block_idx_base = smemAllocator.alloc( + static_cast<::cuda::std::uint32_t>(sizeof(uint4) * counts.num_block_idx_stages), alignof(uint4)); + void* sum_exclusive_base = smemAllocator.alloc( + static_cast<::cuda::std::uint32_t>(accum_size * counts.num_sum_exclusive_cta_stages), accum_align); + void* sum_thread_warp_base = + smemAllocator.alloc(static_cast<::cuda::std::uint32_t>(sum_thread_warp * num_stages), accum_align); + + ScanResourcesRaw res = { + warpspeed::SmemResourceRaw{syncHandler, inout_base, inout_stride, inout_stride, num_stages}, + warpspeed::SmemResourceRaw{ + syncHandler, + next_block_idx_base, + static_cast(sizeof(uint4)), + static_cast(sizeof(uint4)), + counts.num_block_idx_stages}, + warpspeed::SmemResourceRaw{ + syncHandler, sum_exclusive_base, accum_size, accum_size, counts.num_sum_exclusive_cta_stages}, + warpspeed::SmemResourceRaw{syncHandler, sum_thread_warp_base, sum_thread_warp, sum_thread_warp, num_stages}, + }; + + setup_scan_resources( + runtime_warpspeed_policy_adapter{policy}, + syncHandler, + smemAllocator, + res.smemInOut, + res.smemNextBlockIdx, + res.smemSumExclusiveCta, + res.smemSumThreadAndWarp); syncHandler.mHasInitialized = true; // avoid assertion in destructor return static_cast(smemAllocator.sizeBytes()); } -#if 0 -// we check the required shared memory inside a template, so the error message shows the amount in case of failure -template -_CCCL_API constexpr void verify_smem() +template +_CCCL_API constexpr auto smem_for_stages(const scan_warpspeed_policy& policy, int num_stages) -> int { - static_assert(RequiredSharedMemory <= max_smem_per_block, - "Single stage 
configuration exceeds architecture independent SMEM (48KiB)"); + return smem_for_stages( + policy, + num_stages, + static_cast(sizeof(InputT)), + static_cast(alignof(InputT)), + static_cast(sizeof(OutputT)), + static_cast(alignof(OutputT)), + static_cast(sizeof(AccumT)), + static_cast(alignof(AccumT))); } -#endif -template -_CCCL_API constexpr auto one_stage_fits_48KiB_SMEM() -> bool +_CCCL_API constexpr bool use_warpspeed( + const scan_warpspeed_policy& policy, + int input_size, + int input_align, + int output_size, + int output_align, + int accum_size, + int accum_align, + bool input_contiguous, + bool output_contiguous, + bool input_trivially_copyable, + bool output_trivially_copyable, + bool output_default_constructible) { - using InputT = it_value_t; - using OutputT = it_value_t; - constexpr int smem_size_1_stage = smem_for_stages(1); -// We can turn this on to report if a single stage of the warpspeed scan would exceed 48KiB if SMEM. -#if 0 - verify_smem(); +// We need `cuda::std::is_constant_evaluated` for the compile-time SMEM computation. And we need PTX ISA 8.6. +// MSVC + nvcc < 13.1 just fails to compile `cub.test.device.scan.lid_1.types_0` with `Internal error` and nothing else. 
+#if (defined(__CUDA_ARCH__) && __cccl_ptx_isa < 860) || !defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) \ + || ((_CCCL_COMPILER(MSVC) && _CCCL_CUDA_COMPILER(NVCC, <, 13, 1))) + (void) policy; + (void) input_size; + (void) input_align; + (void) output_size; + (void) output_align; + (void) accum_size; + (void) accum_align; + (void) input_contiguous; + (void) output_contiguous; + (void) input_trivially_copyable; + (void) output_trivially_copyable; + (void) output_default_constructible; + return false; +#else + if (!input_contiguous || !output_contiguous || !input_trivially_copyable || !output_trivially_copyable + || !output_default_constructible) + { + return false; + } + + return smem_for_stages(policy, 1, input_size, input_align, output_size, output_align, accum_size, accum_align) + <= static_cast(max_smem_per_block); #endif - return smem_size_1_stage <= max_smem_per_block; } -template -inline constexpr bool scan_use_warpspeed = false; - -// detect the use via CCCL.C (pre-compiled dispatch and JIT pass) and disable the new kernel. -// See https://github.com/NVIDIA/cccl/issues/6821 for more details. -// We also need `cuda::std::is_constant_evaluated` for the compile-time SMEM computation. And we need PTX ISA 8.6. -// MSVC + nvcc < 13.1 just fails to compile `cub.test.device.scan.lid_1.types_0` with `Internal error` and nothing else. 
-#if !defined(CUB_ENABLE_POLICY_PTX_JSON) && !defined(CUB_DEFINE_RUNTIME_POLICIES) \ - && defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && __cccl_ptx_isa >= 860 \ - && !(_CCCL_COMPILER(MSVC) && _CCCL_CUDA_COMPILER(NVCC, <, 13, 1)) -template -inline constexpr bool scan_use_warpspeed> = - // for bulk copy: input and output iterators must be contiguous and their value types must be trivially copyable - THRUST_NS_QUALIFIER::is_contiguous_iterator_v - && ::cuda::std::is_trivially_copyable_v> - && THRUST_NS_QUALIFIER::is_contiguous_iterator_v - && ::cuda::std::is_trivially_copyable_v> - // for bulk copy store: we need to prepare a buffer of output types in SMEM - && ::cuda::std::is_default_constructible_v> - // need to fit one stage into 48KiB so binaries stay forward compatible with future GPUs - && one_stage_fits_48KiB_SMEM(); -#endif // !defined(CUB_ENABLE_POLICY_PTX_JSON) && !defined(CUB_DEFINE_RUNTIME_POLICIES) && - // defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && __cccl_ptx_isa >= 860 +template +_CCCL_API constexpr bool use_warpspeed(const scan_warpspeed_policy& policy) +{ + using InputT = it_value_t; + using OutputT = it_value_t; + return use_warpspeed( + policy, + static_cast(sizeof(InputT)), + static_cast(alignof(InputT)), + static_cast(sizeof(OutputT)), + static_cast(alignof(OutputT)), + static_cast(sizeof(AccumT)), + static_cast(alignof(AccumT)), + THRUST_NS_QUALIFIER::is_contiguous_iterator_v, + THRUST_NS_QUALIFIER::is_contiguous_iterator_v, + ::cuda::std::is_trivially_copyable_v, + ::cuda::std::is_trivially_copyable_v, + ::cuda::std::is_default_constructible_v); +} } // namespace detail::scan CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh new file mode 100644 index 00000000000..e4f254dcd6b --- /dev/null +++ b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#include + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + +CUB_NAMESPACE_BEGIN + +namespace detail::scan +{ +struct scan_warpspeed_policy +{ + static constexpr int num_squads = 5; + + bool valid = false; + int num_reduce_warps; + int num_scan_stor_warps; + int num_load_warps; + int num_sched_warps; + int num_look_ahead_warps; + + int num_look_ahead_items; + int num_total_threads; + int items_per_thread; + int tile_size; + + _CCCL_API constexpr explicit operator bool() const noexcept + { + return valid; + } + + _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const + { + return warpspeed::SquadDesc{0, num_reduce_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const + { + return warpspeed::SquadDesc{1, num_scan_stor_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const + { + return warpspeed::SquadDesc{2, num_load_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadSched() const + { + return warpspeed::SquadDesc{3, num_sched_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const + { + return warpspeed::SquadDesc{4, num_look_ahead_warps}; + } + + _CCCL_API constexpr friend bool operator==(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs) + { + return lhs.valid == rhs.valid && lhs.num_reduce_warps == rhs.num_reduce_warps + && lhs.num_scan_stor_warps == rhs.num_scan_stor_warps && lhs.num_load_warps == rhs.num_load_warps + && lhs.num_sched_warps == rhs.num_sched_warps && lhs.num_look_ahead_warps == rhs.num_look_ahead_warps + && lhs.num_look_ahead_items == rhs.num_look_ahead_items && lhs.num_total_threads == rhs.num_total_threads + && lhs.items_per_thread == rhs.items_per_thread && lhs.tile_size == rhs.tile_size; + } + + _CCCL_API constexpr friend bool operator!=(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs) + { + return !(lhs == rhs); + } + 
+#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const scan_warpspeed_policy& p) + { + return os + << "scan_warpspeed_policy { .valid = " << p.valid << ", .num_reduce_warps = " << p.num_reduce_warps + << ", .num_scan_stor_warps = " << p.num_scan_stor_warps << ", .num_load_warps = " << p.num_load_warps + << ", .num_sched_warps = " << p.num_sched_warps << ", .num_look_ahead_warps = " << p.num_look_ahead_warps + << ", .num_look_ahead_items = " << p.num_look_ahead_items << ", .num_total_threads = " << p.num_total_threads + << ", .items_per_thread = " << p.items_per_thread << ", .tile_size = " << p.tile_size << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) +}; +} // namespace detail::scan + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/common.cuh b/cub/cub/device/dispatch/tuning/common.cuh index 273eedfbd82..8811afa012c 100644 --- a/cub/cub/device/dispatch/tuning/common.cuh +++ b/cub/cub/device/dispatch/tuning/common.cuh @@ -18,6 +18,8 @@ #include #include +#include +#include #include #include @@ -29,6 +31,7 @@ namespace detail // libcu++ enum class type_t { + boolean, int8, int16, int32, @@ -50,6 +53,8 @@ inline constexpr auto classify_type = type_t::other; template <> inline constexpr auto classify_type = ::cuda::std::is_signed_v ? 
type_t::int8 : type_t::uint8; template <> +inline constexpr auto classify_type = type_t::boolean; +template <> inline constexpr auto classify_type = type_t::int8; template <> inline constexpr auto classify_type = type_t::uint8; @@ -101,6 +106,12 @@ inline constexpr auto classify_op = op_kind_t::other; template inline constexpr auto classify_op<::cuda::std::plus> = op_kind_t::plus; +template +inline constexpr auto classify_op<::cuda::minimum> = op_kind_t::min; + +template +inline constexpr auto classify_op<::cuda::maximum> = op_kind_t::max; + struct iterator_info { int value_type_size; diff --git a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh index 3957c6165c3..1b5d3a61259 100644 --- a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved. 
// SPDX-License-Identifier: BSD-3 #pragma once @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,9 @@ CUB_NAMESPACE_BEGIN namespace detail::radix_sort { +using detail::scan::make_mem_scaled_scan_policy; +using detail::scan::scan_policy; + struct radix_sort_histogram_policy { int block_threads; @@ -145,63 +149,6 @@ _CCCL_API constexpr auto make_reg_scaled_radix_sort_onesweep_policy( store_algorithm}; } -// TODO(bgruber): move this into the scan tuning header -struct scan_policy -{ - int block_threads; - int items_per_thread; - BlockLoadAlgorithm load_algorithm; - CacheLoadModifier load_modifier; - BlockStoreAlgorithm store_algorithm; - BlockScanAlgorithm scan_algorithm; - delay_constructor_policy delay_constructor; - - _CCCL_API constexpr friend bool operator==(const scan_policy& lhs, const scan_policy& rhs) - { - return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread - && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier - && lhs.store_algorithm == rhs.store_algorithm && lhs.scan_algorithm == rhs.scan_algorithm - && lhs.delay_constructor == rhs.delay_constructor; - } - - _CCCL_API constexpr friend bool operator!=(const scan_policy& lhs, const scan_policy& rhs) - { - return !(lhs == rhs); - } - -#if !_CCCL_COMPILER(NVRTC) - friend ::std::ostream& operator<<(::std::ostream& os, const scan_policy& p) - { - return os - << "scan_policy { .block_threads = " << p.block_threads << ", .items_per_thread = " << p.items_per_thread - << ", .load_algorithm = " << p.load_algorithm << ", .load_modifier = " << p.load_modifier - << ", .store_algorithm = " << p.store_algorithm << ", .scan_algorithm = " << p.scan_algorithm - << ", .delay_constructor = " << p.delay_constructor << " }"; - } -#endif // !_CCCL_COMPILER(NVRTC) -}; - -_CCCL_API constexpr auto make_mem_scaled_scan_policy( - int nominal_4b_block_threads, - int nominal_4b_items_per_thread, - int compute_t_size, - 
BlockLoadAlgorithm load_algorithm, - CacheLoadModifier load_modifier, - BlockStoreAlgorithm store_algorithm, - BlockScanAlgorithm scan_algorithm, - delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}) -> scan_policy -{ - const auto scaled = scale_mem_bound(nominal_4b_block_threads, nominal_4b_items_per_thread, compute_t_size); - return scan_policy{ - scaled.block_threads, - scaled.items_per_thread, - load_algorithm, - load_modifier, - store_algorithm, - scan_algorithm, - delay_constructor}; -} - struct radix_sort_downsweep_policy { int block_threads; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index a83d2bdf6ee..44d43d942db 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -18,18 +18,26 @@ #include #include #include -#include +#include +#include #include #include #include #include +#include #include #include #include #include +#include +#include #include +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::scan @@ -116,6 +124,69 @@ constexpr _CCCL_HOST_DEVICE offset_size classify_offset_size() return sizeof(OffsetT) == 4 ? offset_size::_4 : sizeof(OffsetT) == 8 ? 
offset_size::_8 : offset_size::unknown; } +struct scan_policy +{ + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_algorithm; + CacheLoadModifier load_modifier; + BlockStoreAlgorithm store_algorithm; + BlockScanAlgorithm scan_algorithm; + delay_constructor_policy delay_constructor; + scan_warpspeed_policy warpspeed = {}; + + _CCCL_API constexpr friend bool operator==(const scan_policy& lhs, const scan_policy& rhs) + { + return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread + && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier + && lhs.store_algorithm == rhs.store_algorithm && lhs.scan_algorithm == rhs.scan_algorithm + && lhs.delay_constructor == rhs.delay_constructor && lhs.warpspeed == rhs.warpspeed; + } + + _CCCL_API constexpr friend bool operator!=(const scan_policy& lhs, const scan_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const scan_policy& p) + { + os << "scan_policy { .block_threads = " << p.block_threads << ", .items_per_thread = " << p.items_per_thread + << ", .load_algorithm = " << p.load_algorithm << ", .load_modifier = " << p.load_modifier + << ", .store_algorithm = " << p.store_algorithm << ", .scan_algorithm = " << p.scan_algorithm + << ", .delay_constructor = " << p.delay_constructor; + if (p.warpspeed) + { + os << ", .warpspeed = " << p.warpspeed; + } + return os << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) +}; + +_CCCL_API constexpr auto make_mem_scaled_scan_policy( + int nominal_4b_block_threads, + int nominal_4b_items_per_thread, + int compute_t_size, + BlockLoadAlgorithm load_algorithm, + CacheLoadModifier load_modifier, + BlockStoreAlgorithm store_algorithm, + BlockScanAlgorithm scan_algorithm, + delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}, + scan_warpspeed_policy warpspeed = {}) -> scan_policy +{ + const auto 
scaled = scale_mem_bound(nominal_4b_block_threads, nominal_4b_items_per_thread, compute_t_size); + return scan_policy{ + scaled.block_threads, + scaled.items_per_thread, + load_algorithm, + load_modifier, + store_algorithm, + scan_algorithm, + delay_constructor, + warpspeed}; +} + template (), @@ -257,6 +329,7 @@ struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_si #endif // clang-format on +// TODO(griwes): remove for CCCL 4.0 when we drop the public scan dispatcher template MakeScanPolicyWrapper(PolicyT polic return ScanPolicyWrapper{policy}; } +// TODO(griwes): remove this in CCCL 4.0 when we remove the public scan dispatcher template struct policy_hub { @@ -680,6 +754,616 @@ struct policy_hub using MaxPolicy = Policy1200; }; + +#if _CCCL_HAS_CONCEPTS() +template +concept scan_policy_selector = policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + +constexpr _CCCL_HOST_DEVICE bool is_arithmetic_type(type_t type) +{ + switch (type) + { + case type_t::boolean: + case type_t::int8: + case type_t::int16: + case type_t::int32: + case type_t::int64: + case type_t::int128: + case type_t::uint8: + case type_t::uint16: + case type_t::uint32: + case type_t::uint64: + case type_t::uint128: + case type_t::float32: + case type_t::float64: + return true; + case type_t::other: + return false; + } + + return false; +} + +constexpr _CCCL_HOST_DEVICE scan_warpspeed_policy get_warpspeed_policy( + ::cuda::arch_id arch, int input_value_size, int accum_size, type_t input_type, op_kind_t operation_t) +{ + if (arch >= ::cuda::arch_id::sm_100) + { + scan_warpspeed_policy warpspeed_policy{}; + warpspeed_policy.valid = true; + + // TODO(bgruber): tune this +#if _CCCL_COMPILER(NVHPC) + warpspeed_policy.num_reduce_warps = 2; + warpspeed_policy.num_scan_stor_warps = 2; +#else // _CCCL_COMPILER(NVHPC) + warpspeed_policy.num_reduce_warps = 4; + warpspeed_policy.num_scan_stor_warps = 4; +#endif // _CCCL_COMPILER(NVHPC) + warpspeed_policy.num_load_warps = 1; + 
warpspeed_policy.num_sched_warps = 1; + warpspeed_policy.num_look_ahead_warps = 1; + + // TODO(bgruber): 5 is a bit better for complex + warpspeed_policy.num_look_ahead_items = accum_size == 2 ? 3 : 4; + + // Deduced definitions + const auto num_threads_per_warp = 32; + const auto num_total_warps = + warpspeed_policy.num_reduce_warps + warpspeed_policy.num_scan_stor_warps + warpspeed_policy.num_load_warps + + warpspeed_policy.num_sched_warps + warpspeed_policy.num_look_ahead_warps; + warpspeed_policy.num_total_threads = num_total_warps * num_threads_per_warp; + + const auto squad_reduce_thread_count = warpspeed_policy.num_reduce_warps * num_threads_per_warp; + + // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM) + // 2-byte types and double needed special handling + auto items_per_thread = ::cuda::std::max(256 / (input_value_size == 2 ? 2 : accum_size) - 1, 1); + + if (arch >= ::cuda::arch_id::sm_120 && operation_t == op_kind_t::other && is_arithmetic_type(input_type)) + { + if (input_value_size == 4 || input_value_size == 8) + { + items_per_thread = 127; + } + else + { + items_per_thread = ::cuda::std::min(items_per_thread, input_value_size <= 2 ? 
63 : 127); + } + } + + warpspeed_policy.items_per_thread = items_per_thread; + // TODO(bgruber): the special handling of double below is a LOT faster, but exceeds 48KiB SMEM + // clang-format off + // | F64 | I32 | 72576 | 11.295 us | 2.44% | 11.917 us | 8.02% | 0.622 us | 5.50% | SLOW | + // | F64 | I32 | 1056384 | 16.162 us | 6.24% | 15.847 us | 5.57% | -0.315 us | -1.95% | SAME | + // | F64 | I32 | 16781184 | 65.696 us | 1.64% | 60.650 us | 3.37% | -5.046 us | -7.68% | FAST | + // | F64 | I32 | 268442496 | 863.896 us | 0.22% | 679.100 us | 0.93% | -184.796 us | -21.39% | FAST | + // | F64 | I32 | 1073745792 | 3.418 ms | 0.12% | 2.662 ms | 0.46% | -755.740 us | -22.11% | FAST | + // | F64 | I64 | 72576 | 12.301 us | 8.18% | 12.987 us | 5.75% | 0.686 us | 5.58% | SAME | + // | F64 | I64 | 1056384 | 16.775 us | 5.70% | 16.091 us | 6.14% | -0.684 us | -4.08% | SAME | + // | F64 | I64 | 16781184 | 66.970 us | 1.41% | 58.024 us | 3.17% | -8.946 us | -13.36% | FAST | + // | F64 | I64 | 268442496 | 863.826 us | 0.23% | 676.465 us | 0.98% | -187.360 us | -21.69% | FAST | + // | F64 | I64 | 1073745792 | 3.419 ms | 0.11% | 2.664 ms | 0.48% | -755.409 us | -22.09% | FAST | + // | F64 | I64 | 4294975104 | 13.641 ms | 0.05% | 10.575 ms | 0.24% | -3065.815 us | -22.48% | FAST | + // clang-format on + // (256 / (sizeof(InputValueT) == 2 ? 2 : (::cuda::std::is_same_v ? 
4 : sizeof(AccumT))) - + // 1); + + warpspeed_policy.tile_size = warpspeed_policy.items_per_thread * squad_reduce_thread_count; + + return warpspeed_policy; + } + + return {}; +} + +struct policy_selector +{ + int input_value_size; + int input_value_alignment; + int output_value_size; + int output_value_alignment; + int accum_size; + int accum_alignment; + int offset_size; + type_t input_type; + type_t accum_type; + op_kind_t operation_t; + bool accum_is_primitive_or_trivially_copy_constructible; + // TODO(griwes): remove this field before policy_selector is publicly exposed + bool benchmark_match; + + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy + { + const primitive_accum primitive_accum_t = + accum_type != type_t::other && accum_type != type_t::int128 ? primitive_accum::yes : primitive_accum::no; + const primitive_op primitive_op_t = operation_t != op_kind_t::other ? primitive_op::yes : primitive_op::no; + + const bool large_values = accum_size > 128; + const BlockLoadAlgorithm scan_transposed_load = + large_values ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; + const BlockStoreAlgorithm scan_transposed_store = + large_values ? 
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; + const auto default_delay = default_delay_constructor_policy(accum_is_primitive_or_trivially_copy_constructible); + + const auto warpspeed_policy = get_warpspeed_policy(arch, input_value_size, accum_size, input_type, operation_t); + + if (arch >= ::cuda::arch_id::sm_100) + { + if (benchmark_match && operation_t == op_kind_t::plus && primitive_accum_t == primitive_accum::yes) + { + if (offset_size == 4) + { + switch (input_value_size) + { + case 1: + // ipt_18.tpb_512.ns_768.dcid_7.l2w_820.trp_1.ld_0 1.188818 1.005682 1.173041 1.305288 + return make_mem_scaled_scan_policy( + 512, + 18, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 768, 820}, + warpspeed_policy); + case 2: + // ipt_13.tpb_512.ns_1384.dcid_7.l2w_720.trp_1.ld_0 1.128443 1.002841 1.119688 1.307692 + return make_mem_scaled_scan_policy( + 512, + 13, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 1384, 720}, + warpspeed_policy); + case 4: + // ipt_22.tpb_384.ns_1904.dcid_6.l2w_830.trp_1.ld_0 1.148442 0.997167 1.139902 1.462651 + return make_mem_scaled_scan_policy( + 384, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, 1904, 830}, + warpspeed_policy); + case 8: + // ipt_23.tpb_416.ns_772.dcid_5.l2w_710.trp_1.ld_0 1.089468 1.015581 1.085630 1.264583 + return make_mem_scaled_scan_policy( + 416, + 23, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, 772, 710}, + 
warpspeed_policy); + default: + break; + } + } + else if (offset_size == 8) + { + switch (input_value_size) + { + case 1: + // ipt_14.tpb_384.ns_228.dcid_7.l2w_775.trp_1.ld_1 1.107210 1.000000 1.100637 1.307692 + return make_mem_scaled_scan_policy( + 384, + 14, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 228, 775}, + warpspeed_policy); + case 2: + // todo(gonidelis): Regresses for large inputs. Find better tuning. + // ipt_13.tpb_288.ns_1520.dcid_5.l2w_895.trp_1.ld_1 1.080934 0.983509 1.077724 1.305288 + break; + case 4: + // ipt_19.tpb_416.ns_956.dcid_7.l2w_550.trp_1.ld_1 1.146142 0.994350 1.137459 1.455636 + return make_mem_scaled_scan_policy( + 416, + 19, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 956, 550}, + warpspeed_policy); + case 8: + if (accum_type == type_t::float64) + { + break; + } + // ipt_22.tpb_320.ns_328.dcid_2.l2w_965.trp_1.ld_0 1.080133 1.000000 1.075577 1.248963 + return make_mem_scaled_scan_policy( + 320, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backoff, 328, 965}, + warpspeed_policy); + default: + break; + } + } + } + } + + if (arch >= ::cuda::arch_id::sm_90) + { + if (primitive_op_t == primitive_op::yes) + { + if (primitive_accum_t == primitive_accum::yes) + { + switch (accum_size) + { + case 1: + return make_mem_scaled_scan_policy( + 192, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 168, 1140}, + warpspeed_policy); + case 2: + return make_mem_scaled_scan_policy( + 512, + 12, + accum_size, + 
BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 376, 1125}, + warpspeed_policy); + case 4: + if (accum_type == type_t::float32) + { + return make_mem_scaled_scan_policy( + 128, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 688, 1140}, + warpspeed_policy); + } + return make_mem_scaled_scan_policy( + 128, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 648, 1245}, + warpspeed_policy); + case 8: + if (accum_type == type_t::float64) + { + return make_mem_scaled_scan_policy( + 224, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 576, 1215}, + warpspeed_policy); + } + return make_mem_scaled_scan_policy( + 224, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 632, 1290}, + warpspeed_policy); + default: + break; + } + } + +#if _CCCL_HAS_INT128() + if (primitive_accum_t == primitive_accum::no && accum_size == 16 + && (accum_type == type_t::int128 || accum_type == type_t::uint128)) + { + return make_mem_scaled_scan_policy( + 576, + 21, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 860, 630}, + warpspeed_policy); + } +#endif + } + } + + // Keep sm_86 aligned with legacy policy_hub behavior: policy_hub resets to default policy for 86. 
+ if (arch >= ::cuda::arch_id::sm_86) + { + return make_mem_scaled_scan_policy( + 128, + 15, + accum_size, + scan_transposed_load, + LOAD_DEFAULT, + scan_transposed_store, + BLOCK_SCAN_WARP_SCANS, + default_delay, + warpspeed_policy); + } + + if (arch >= ::cuda::arch_id::sm_80) + { + if (primitive_op_t == primitive_op::yes) + { + if (primitive_accum_t == primitive_accum::yes) + { + switch (accum_size) + { + case 1: + return make_mem_scaled_scan_policy( + 320, + 14, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 368, 725}, + warpspeed_policy); + case 2: + return make_mem_scaled_scan_policy( + 352, + 16, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 488, 1040}, + warpspeed_policy); + case 4: + if (accum_type == type_t::float32) + { + return make_mem_scaled_scan_policy( + 288, + 8, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 724, 1050}, + warpspeed_policy); + } + return make_mem_scaled_scan_policy( + 320, + 12, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 268, 1180}, + warpspeed_policy); + case 8: + if (accum_type == type_t::float64) + { + return make_mem_scaled_scan_policy( + 384, + 12, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 388, 1100}, + warpspeed_policy); + } + return make_mem_scaled_scan_policy( + 288, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + 
BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 716, 785}, + warpspeed_policy); + default: + break; + } + } + +#if _CCCL_HAS_INT128() + if (primitive_accum_t == primitive_accum::no && accum_size == 16 + && (accum_type == type_t::int128 || accum_type == type_t::uint128)) + { + return make_mem_scaled_scan_policy( + 640, + 24, + accum_size, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_STORE_DIRECT, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::no_delay, 0, 1200}, + warpspeed_policy); + } +#endif + } + } + + if (arch >= ::cuda::arch_id::sm_75) + { + if (benchmark_match && operation_t == op_kind_t::plus && primitive_accum_t == primitive_accum::yes + && offset_size == 8 && input_value_size == 4) + { + // ipt_7.tpb_128.ns_628.dcid_1.l2w_520.trp_1.ld_0 + return make_mem_scaled_scan_policy( + 128, + 7, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 628, 520}, + warpspeed_policy); + } + + return make_mem_scaled_scan_policy( + 128, + 15, + accum_size, + scan_transposed_load, + LOAD_DEFAULT, + scan_transposed_store, + BLOCK_SCAN_WARP_SCANS, + default_delay, + warpspeed_policy); + } + + if (arch >= ::cuda::arch_id::sm_60) + { + return make_mem_scaled_scan_policy( + 128, + 15, + accum_size, + scan_transposed_load, + LOAD_DEFAULT, + scan_transposed_store, + BLOCK_SCAN_WARP_SCANS, + default_delay, + warpspeed_policy); + } + + return make_mem_scaled_scan_policy( + 128, + 12, + accum_size, + BLOCK_LOAD_DIRECT, + LOAD_CA, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING, + default_delay, + warpspeed_policy); + } +}; + +#if _CCCL_HAS_CONCEPTS() +static_assert(scan_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() + +template +struct benchmark_match_for_policy_selector +{ + static constexpr bool value = false; +}; + +template +struct benchmark_match_for_policy_selector< + 
ScanOpT, + InputValueT, + OutputValueT, + AccumT, + ::cuda::std::void_t<::cuda::std::__accumulator_t>> +{ + static constexpr bool value = + sizeof(AccumT) == sizeof(::cuda::std::__accumulator_t) + && sizeof(InputValueT) == sizeof(OutputValueT); +}; + +// stateless version which can be passed to kernels +template +struct policy_selector_from_types +{ + static constexpr int input_value_size = int{sizeof(InputValueT)}; + static constexpr int input_value_alignment = int{alignof(InputValueT)}; + static constexpr int output_value_size = int{sizeof(OutputValueT)}; + static constexpr int output_value_alignment = int{alignof(OutputValueT)}; + static constexpr int accum_size = int{sizeof(AccumT)}; + static constexpr int accum_alignment = int{alignof(AccumT)}; + static constexpr type_t input_type = classify_type; + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy + { + constexpr bool benchmark_match = + benchmark_match_for_policy_selector::value; + + constexpr bool accum_is_primitive_or_trivially_copy_constructible = + is_primitive::value || ::cuda::std::is_trivially_copy_constructible_v; + + constexpr auto policies = policy_selector{ + input_value_size, + input_value_alignment, + output_value_size, + output_value_alignment, + accum_size, + accum_alignment, + int{sizeof(OffsetT)}, + input_type, + classify_type, + classify_op, + accum_is_primitive_or_trivially_copy_constructible, + benchmark_match}; + return policies(arch); + } +}; + +template +struct selector_smem_info +{ + static constexpr bool has_static_layout = false; +}; + +template +struct selector_smem_info< + PolicySelectorT, + ::cuda::std::void_t{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{})>> +{ + static constexpr bool has_static_layout = true; + static constexpr int input_value_size = 
PolicySelectorT::input_value_size; + static constexpr int input_value_alignment = PolicySelectorT::input_value_alignment; + static constexpr int output_value_size = PolicySelectorT::output_value_size; + static constexpr int output_value_alignment = PolicySelectorT::output_value_alignment; + static constexpr int accum_size = PolicySelectorT::accum_size; + static constexpr int accum_alignment = PolicySelectorT::accum_alignment; +}; } // namespace detail::scan CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu index e8a692b6051..2ccdb97587b 100644 --- a/cub/test/catch2_test_device_scan_env.cu +++ b/cub/test/catch2_test_device_scan_env.cu @@ -14,6 +14,7 @@ struct stream_registry_factory_t; #include +#include #include #include "catch2_test_env_launch_helper.h" @@ -47,36 +48,23 @@ struct block_size_check_t } }; -struct block_size_retreiver_t -{ - int* ptr; - - template - cudaError_t Invoke() - { - *ptr = ActivePolicyT::ScanPolicyT::BLOCK_THREADS; - return cudaSuccess; - } -}; - TEST_CASE("Device scan exclusive scan works with default environment", "[scan][device]") { using num_items_t = int; using value_t = int; using offset_t = cub::detail::choose_offset_t; - using policy_t = - cub::detail::scan::default_tuning::fn::MaxPolicy; + using selector_t = + cub::detail::scan::policy_selector_from_types; int current_device{}; REQUIRE(cudaSuccess == cudaGetDevice(¤t_device)); - int ptx_version{}; - REQUIRE(cudaSuccess == cub::PtxVersion(ptx_version, current_device)); + cudaDeviceProp device_props{}; + REQUIRE(cudaSuccess == cudaGetDeviceProperties(&device_props, current_device)); - int target_block_size{}; - block_size_retreiver_t block_size_retreiver{&target_block_size}; - REQUIRE(cudaSuccess == policy_t::Invoke(ptx_version, block_size_retreiver)); + const auto target_block_size = + selector_t{}(cuda::to_arch_id(cuda::compute_capability{device_props.major, device_props.minor})).block_threads; num_items_t num_items = 
1; c2h::device_vector d_block_size(1); @@ -116,31 +104,16 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev template struct scan_tuning : cub::detail::scan::tuning> { - template - struct fn + _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::scan::scan_policy { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> - { - struct ScanPolicyT - { - static constexpr int BLOCK_THREADS = BlockThreads; - static constexpr int ITEMS_PER_THREAD = 1; - static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE; - - static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::CacheLoadModifier::LOAD_DEFAULT; - static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = - cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE; - static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING; - - struct detail - { - using delay_constructor_t = cub::detail::default_delay_constructor_t; - }; - }; - }; - - using MaxPolicy = Policy500; - }; + return {BlockThreads, + 1, + cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE, + cub::CacheLoadModifier::LOAD_DEFAULT, + cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE, + cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING, + cub::detail::delay_constructor_policy{cub::detail::delay_constructor_kind::fixed_delay, 350, 450}}; + } }; struct get_reduce_tuning_query_t @@ -212,18 +185,17 @@ TEST_CASE("Device scan inclusive-scan works with default environment", "[scan][d using value_t = int; using offset_t = cub::detail::choose_offset_t; - using policy_t = - cub::detail::scan::default_tuning::fn::MaxPolicy; + using selector_t = + cub::detail::scan::policy_selector_from_types; int current_device{}; REQUIRE(cudaSuccess == cudaGetDevice(¤t_device)); - int ptx_version{}; - REQUIRE(cudaSuccess == cub::PtxVersion(ptx_version, current_device)); + cudaDeviceProp device_props{}; + REQUIRE(cudaSuccess 
== cudaGetDeviceProperties(&device_props, current_device)); - int target_block_size{}; - block_size_retreiver_t block_size_retreiver{&target_block_size}; - REQUIRE(cudaSuccess == policy_t::Invoke(ptx_version, block_size_retreiver)); + const auto target_block_size = + selector_t{}(cuda::to_arch_id(cuda::compute_capability{device_props.major, device_props.minor})).block_threads; num_items_t num_items = 1; c2h::device_vector d_block_size(1); diff --git a/cub/test/catch2_test_env_launch_helper.h b/cub/test/catch2_test_env_launch_helper.h index 886e53f35ed..454a29b28e1 100644 --- a/cub/test/catch2_test_env_launch_helper.h +++ b/cub/test/catch2_test_env_launch_helper.h @@ -3,6 +3,8 @@ #pragma once +#include + #include #include @@ -149,6 +151,51 @@ struct stream_registry_factory_t // Get max grid dimension return cudaDeviceGetAttribute(&max_grid_dim_x, cudaDevAttrMaxGridDimX, device_ordinal); } + + CUB_RUNTIME_FUNCTION cudaError_t MemsetAsync(void* dst, unsigned char value, size_t num_bytes, cudaStream_t stream) + { + return cudaMemsetAsync(dst, value, num_bytes, stream); + } + + CUB_RUNTIME_FUNCTION cudaError_t + MemcpyAsync(void* dst, const void* src, size_t num_bytes, cudaMemcpyKind kind, cudaStream_t stream) + { + return cudaMemcpyAsync(dst, src, num_bytes, kind, stream); + } + + CUB_RUNTIME_FUNCTION cudaError_t MaxSharedMemory(int& max_shared_memory) const + { + int device = 0; + auto error = cudaGetDevice(&device); + if (error != cudaSuccess) + { + return error; + } + + return cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, device); + } + + template + CUB_RUNTIME_FUNCTION cudaError_t max_dynamic_smem_size_for(int& max_dynamic_smem_size, Kernel kernel_ptr) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, // + ({ return cub::MaxPotentialDynamicSmemBytes(max_dynamic_smem_size, kernel_ptr); }), + ({ + cudaFuncAttributes func_attrs{}; + if (const auto error = cudaFuncGetAttributes(&func_attrs, kernel_ptr)) + { + return error; + } + max_dynamic_smem_size 
= func_attrs.maxDynamicSharedSizeBytes; + return cudaSuccess; + })) + } + + template + CUB_RUNTIME_FUNCTION cudaError_t set_max_dynamic_smem_size_for(Kernel kernel_ptr, int smem_size) + { + return cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + } }; struct stream_scope