From b7f08007846157c2e2c15f99aa25be2b6bf9632b Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Sat, 15 Nov 2025 18:39:56 +0530
Subject: [PATCH 01/14] FEA: Added const _CCCL_GRID_CONSTANT to non-mutable
 kernel parameters

---
 .../dispatch/kernels/kernel_for_each.cuh      |  4 +-
 .../dispatch/kernels/kernel_histogram.cuh     |  8 ++--
 .../dispatch/kernels/kernel_merge_sort.cuh    | 36 +++++++--------
 .../dispatch/kernels/kernel_radix_sort.cuh    | 28 +++++------
 .../device/dispatch/kernels/kernel_reduce.cuh | 46 +++++++++----------
 .../device/dispatch/kernels/kernel_scan.cuh   | 12 ++---
 .../kernels/kernel_segmented_radix_sort.cuh   |  8 ++--
 .../kernels/kernel_segmented_reduce.cuh       | 20 ++++----
 .../kernels/kernel_segmented_sort.cuh         | 18 ++++----
 .../kernels/kernel_three_way_partition.cuh    | 12 ++---
 .../dispatch/kernels/kernel_transform.cuh     |  8 ++--
 .../dispatch/kernels/kernel_unique_by_key.cuh | 10 ++--
 12 files changed, 105 insertions(+), 105 deletions(-)
diff --git a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
index 81ecf9de39c..7af08a50164 100644
--- a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
@@ -86,7 +86,7 @@ using can_regain_copy_freedom =
 
 // This kernel is used when the block size is not known at compile time
 template <class ChainedPolicyT, class OffsetT, class OpT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
+CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OpT op)
 {
   using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
   using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
@@ -111,7 +111,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
 template <class ChainedPolicyT, class OffsetT, class OpT>
 CUB_DETAIL_KERNEL_ATTRIBUTES //
 __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
-  void static_kernel(OffsetT num_items, OpT op)
+  void static_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OpT op)
 {
   using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
   using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
index e4520813221..ee407165fa5 100644
--- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
@@ -452,10 +452,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK
     ::cuda::std::array<CounterT*, NumActiveChannels> d_privatized_histograms_wrapper,
     ::cuda::std::array<OutputDecodeOpT, NumActiveChannels> output_decode_op_wrapper,
     ::cuda::std::array<PrivatizedDecodeOpT, NumActiveChannels> privatized_decode_op_wrapper,
-    OffsetT num_row_pixels,
-    OffsetT num_rows,
-    OffsetT row_stride_samples,
-    int tiles_per_row,
+    _CCCL_GRID_CONSTANT const OffsetT num_row_pixels,
+    _CCCL_GRID_CONSTANT const OffsetT num_rows,
+    _CCCL_GRID_CONSTANT const OffsetT row_stride_samples,
+    _CCCL_GRID_CONSTANT const int tiles_per_row,
     GridQueue<int> tile_queue)
 {
   // Thread block type for compositing input tiles
diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
index 115602e5080..ec5d111eeef 100644
--- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
@@ -167,15 +167,15 @@ __launch_bounds__(
     KeyT,
     ValueT>::policy_t::BLOCK_THREADS)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel(
-    bool ping,
-    KeyInputIteratorT keys_in,
-    ValueInputIteratorT items_in,
+    _CCCL_GRID_CONSTANT const bool ping,
+    _CCCL_GRID_CONSTANT const KeyInputIteratorT keys_in,
+    _CCCL_GRID_CONSTANT const ValueInputIteratorT items_in,
     KeyIteratorT keys_out,
     ValueIteratorT items_out,
-    OffsetT keys_count,
+    _CCCL_GRID_CONSTANT const OffsetT keys_count,
     KeyT* tmp_keys_out,
     ValueT* tmp_items_out,
-    CompareOpT compare_op,
+    _CCCL_GRID_CONSTANT const CompareOpT compare_op,
     vsmem_t vsmem)
 {
   using MergeSortHelperT = typename VSMemHelperT::template MergeSortVSMemHelperT<
@@ -221,15 +221,15 @@ __launch_bounds__(
 
 template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename KeyT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(
-  bool ping,
-  KeyIteratorT keys_ping,
+  _CCCL_GRID_CONSTANT const bool ping,
+  _CCCL_GRID_CONSTANT const KeyIteratorT keys_ping,
   KeyT* keys_pong,
-  OffsetT keys_count,
-  OffsetT num_partitions,
+  _CCCL_GRID_CONSTANT const OffsetT keys_count,
+  _CCCL_GRID_CONSTANT const OffsetT num_partitions,
   OffsetT* merge_partitions,
-  CompareOpT compare_op,
-  OffsetT target_merged_tiles_number,
-  int items_per_tile)
+  _CCCL_GRID_CONSTANT const CompareOpT compare_op,
+  _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number,
+  _CCCL_GRID_CONSTANT const int items_per_tile)
 {
   OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
 
@@ -272,15 +272,15 @@ __launch_bounds__(
     KeyT,
     ValueT>::policy_t::BLOCK_THREADS)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel(
-    bool ping,
-    KeyIteratorT keys_ping,
-    ValueIteratorT items_ping,
-    OffsetT keys_count,
+    _CCCL_GRID_CONSTANT const bool ping,
+    _CCCL_GRID_CONSTANT const KeyIteratorT keys_ping,
+    _CCCL_GRID_CONSTANT const ValueIteratorT items_ping,
+    _CCCL_GRID_CONSTANT const OffsetT keys_count,
     KeyT* keys_pong,
     ValueT* items_pong,
-    CompareOpT compare_op,
+    _CCCL_GRID_CONSTANT const CompareOpT compare_op,
     OffsetT* merge_partitions,
-    OffsetT target_merged_tiles_number,
+    _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number,
     vsmem_t vsmem)
 {
   using MergeSortHelperT = typename VSMemHelperT::template MergeSortVSMemHelperT<
diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
index cf09fb0bef4..bcbd19e3a6c 100644
--- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
@@ -85,9 +85,9 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel(
     const KeyT* d_keys,
     OffsetT* d_spine,
-    OffsetT /*num_items*/,
-    int current_bit,
-    int num_bits,
+    _CCCL_GRID_CONSTANT const OffsetT /*num_items*/,
+    _CCCL_GRID_CONSTANT const int current_bit,
+    _CCCL_GRID_CONSTANT const int num_bits,
     GridEvenShare<OffsetT> even_share,
     DecomposerT decomposer = {})
 {
@@ -144,7 +144,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp
  */
 template <typename ChainedPolicyT, typename OffsetT>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
-  CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, int num_counts)
+  CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, _CCCL_GRID_CONSTANT const int num_counts)
 {
   // Parameterize the AgentScan type for the current configuration
   using AgentScanT =
@@ -243,9 +243,9 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo
     const ValueT* d_values_in,
     ValueT* d_values_out,
     OffsetT* d_spine,
-    OffsetT num_items,
-    int current_bit,
-    int num_bits,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int current_bit,
+    _CCCL_GRID_CONSTANT const int num_bits,
     GridEvenShare<OffsetT> even_share,
     DecomposerT decomposer = {})
 {
@@ -332,8 +332,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE
     const ValueT* d_values_in,
     ValueT* d_values_out,
     OffsetT num_items,
-    int current_bit,
-    int end_bit,
+    _CCCL_GRID_CONSTANT const int current_bit,
+    _CCCL_GRID_CONSTANT const int end_bit,
     DecomposerT decomposer = {})
 {
   // Constants
@@ -444,7 +444,7 @@ template <typename ChainedPolicyT,
           typename DecomposerT = identity_decomposer_t>
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel(
-  OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit, DecomposerT decomposer = {})
+  OffsetT* d_bins_out, const KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, DecomposerT decomposer = {})
 {
   using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy;
   using AgentT = AgentRadixSortHistogram<HistogramPolicyT, Order == SortOrder::Descending, KeyT, OffsetT, DecomposerT>;
@@ -470,10 +470,10 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy
     KeyT* d_keys_out,
     const KeyT* d_keys_in,
     ValueT* d_values_out,
-    const ValueT* d_values_in,
-    PortionOffsetT num_items,
-    int current_bit,
-    int num_bits,
+    ValueT* d_values_in,
+    _CCCL_GRID_CONSTANT const PortionOffsetT num_items,
+    _CCCL_GRID_CONSTANT const int current_bit,
+    _CCCL_GRID_CONSTANT const int num_bits,
     DecomposerT decomposer = {})
 {
   using OnesweepPolicyT = typename ChainedPolicyT::ActivePolicy::OnesweepPolicy;
diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
index 602b33857a3..f28c103fa9c 100644
--- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
@@ -129,12 +129,12 @@ template <typename ChainedPolicyT,
           typename TransformOpT>
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceReduceKernel(
-  InputIteratorT d_in,
+  _CCCL_GRID_CONSTANT const InputIteratorT d_in,
   AccumT* d_out,
-  OffsetT num_items,
+  _CCCL_GRID_CONSTANT const OffsetT num_items,
   GridEvenShare<OffsetT> even_share,
-  ReductionOpT reduction_op,
-  TransformOpT transform_op)
+  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+  _CCCL_GRID_CONSTANT const TransformOpT transform_op)
 {
   // Thread block type for reducing input tiles
   using AgentReduceT =
@@ -214,12 +214,12 @@ template <typename ChainedPolicyT,
           typename TransformOpT = ::cuda::std::identity>
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS),
-  1) void DeviceReduceSingleTileKernel(InputIteratorT d_in,
+  1) void DeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
                                        OutputIteratorT d_out,
-                                       OffsetT num_items,
-                                       ReductionOpT reduction_op,
-                                       InitT init,
-                                       TransformOpT transform_op)
+                                       _CCCL_GRID_CONSTANT const OffsetT num_items,
+                                       _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+                                       _CCCL_GRID_CONSTANT const InitT init,
+                                       _CCCL_GRID_CONSTANT const TransformOpT transform_op)
 {
   // Thread block type for reducing input tiles
   using AgentReduceT =
@@ -298,11 +298,11 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
 template <typename ChainedPolicyT, typename InputIteratorT, typename ReductionOpT, typename AccumT, typename TransformOpT>
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeterministicDeviceReduceKernel(
-  InputIteratorT d_in,
+  _CCCL_GRID_CONSTANT const InputIteratorT d_in,
   AccumT* d_out,
-  int num_items,
-  ReductionOpT reduction_op,
-  TransformOpT transform_op,
+  _CCCL_GRID_CONSTANT const int num_items,
+  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+  _CCCL_GRID_CONSTANT const TransformOpT transform_op,
   const int reduce_grid_size)
 {
   using reduce_policy_t = typename ChainedPolicyT::ActivePolicy::ReducePolicy;
@@ -431,12 +431,12 @@ template <typename ChainedPolicyT,
           typename TransformOpT = ::cuda::std::identity>
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS),
-  1) void DeterministicDeviceReduceSingleTileKernel(InputIteratorT d_in,
+  1) void DeterministicDeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
                                                     OutputIteratorT d_out,
-                                                    int num_items,
-                                                    ReductionOpT reduction_op,
-                                                    InitT init,
-                                                    TransformOpT transform_op)
+                                                    _CCCL_GRID_CONSTANT const int num_items,
+                                                    _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+                                                    _CCCL_GRID_CONSTANT const InitT init,
+                                                    _CCCL_GRID_CONSTANT const TransformOpT transform_op)
 {
   using single_tile_policy_t = typename ChainedPolicyT::ActivePolicy::SingleTilePolicy;
 
@@ -498,13 +498,13 @@ template <typename ChainedPolicyT,
           typename TransformOpT>
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
   ChainedPolicyT::ActivePolicy::ReduceNondeterministicPolicy::
-    BLOCK_THREADS)) void NondeterministicDeviceReduceAtomicKernel(InputIteratorT d_in,
+    BLOCK_THREADS)) void NondeterministicDeviceReduceAtomicKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
                                                                   OutputIteratorT d_out,
-                                                                  OffsetT num_items,
+                                                                  _CCCL_GRID_CONSTANT const OffsetT num_items,
                                                                   GridEvenShare<OffsetT> even_share,
-                                                                  ReductionOpT reduction_op,
-                                                                  InitT init,
-                                                                  TransformOpT transform_op)
+                                                                  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+                                                                  _CCCL_GRID_CONSTANT const InitT init,
+                                                                  _CCCL_GRID_CONSTANT const TransformOpT transform_op)
 {
   NV_IF_TARGET(NV_PROVIDES_SM_60,
                (),
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index df596b2d463..cfa32871c7a 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -37,7 +37,7 @@ namespace detail::scan
  *   Number of tiles
  */
 template <typename ScanTileStateT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles)
+CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);
@@ -64,7 +64,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state
  */
 template <typename ScanTileStateT, typename NumSelectedIteratorT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void
-DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out)
+DeviceCompactInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);
@@ -136,13 +136,13 @@ template <typename ChainedPolicyT,
           typename RealInitValueT = typename InitValueT::value_type>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel(
-    InputIteratorT d_in,
+    _CCCL_GRID_CONSTANT const InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanTileStateT tile_state,
-    int start_tile,
-    ScanOpT scan_op,
+    _CCCL_GRID_CONSTANT const int start_tile,
+    _CCCL_GRID_CONSTANT const ScanOpT scan_op,
     InitValueT init_value,
-    OffsetT num_items)
+    _CCCL_GRID_CONSTANT const OffsetT num_items)
 {
   using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT;
 
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
index 4dbe0645d32..13088889941 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
@@ -102,10 +102,10 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen
     KeyT* d_keys_out,
     const ValueT* d_values_in,
     ValueT* d_values_out,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets,
-    int current_bit,
-    int pass_bits,
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
+    _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
+    _CCCL_GRID_CONSTANT const int current_bit,
+    _CCCL_GRID_CONSTANT const int pass_bits,
     DecomposerT decomposer = {})
 {
   //
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
index cc8182707c0..f5fd0f7fea6 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
@@ -101,12 +101,12 @@ template <typename ChainedPolicyT,
           typename AccumT>
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceSegmentedReduceKernel(
-  InputIteratorT d_in,
+  _CCCL_GRID_CONSTANT const InputIteratorT d_in,
   OutputIteratorT d_out,
-  BeginOffsetIteratorT d_begin_offsets,
-  EndOffsetIteratorT d_end_offsets,
-  ReductionOpT reduction_op,
-  InitT init)
+  _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
+  _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
+  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+  _CCCL_GRID_CONSTANT const InitT init)
 {
   // Thread block type for reducing input tiles
   using AgentReduceT =
@@ -188,12 +188,12 @@ template <typename ChainedPolicyT,
           typename AccumT>
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceFixedSizeSegmentedReduceKernel(
-  InputIteratorT d_in,
+  _CCCL_GRID_CONSTANT const InputIteratorT d_in,
   OutputIteratorT d_out,
-  OffsetT segment_size,
-  int num_segments,
-  ReductionOpT reduction_op,
-  InitT init)
+  _CCCL_GRID_CONSTANT const OffsetT segment_size,
+  _CCCL_GRID_CONSTANT const int num_segments,
+  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+  _CCCL_GRID_CONSTANT const InitT init)
 {
   using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
 
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
index 4133c7448b2..e13833ffa74 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
@@ -133,8 +133,8 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD
     const ValueT* d_values_in_orig,
     ValueT* d_values_out_orig,
     device_double_buffer<ValueT> d_values_double_buffer,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets)
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
+    _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets)
 {
   using ActivePolicyT       = typename ChainedPolicyT::ActivePolicy;
   using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy;
@@ -304,17 +304,17 @@ template <SortOrder Order,
           typename OffsetT>
 __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallSegmentPolicy::BLOCK_THREADS)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall(
-    local_segment_index_t small_segments,
-    local_segment_index_t medium_segments,
-    local_segment_index_t medium_blocks,
+    _CCCL_GRID_CONSTANT const local_segment_index_t small_segments,
+    _CCCL_GRID_CONSTANT const local_segment_index_t medium_segments,
+    _CCCL_GRID_CONSTANT const local_segment_index_t medium_blocks,
     const local_segment_index_t* d_small_segments_indices,
     const local_segment_index_t* d_medium_segments_indices,
     const KeyT* d_keys_in,
     KeyT* d_keys_out,
     const ValueT* d_values_in,
     ValueT* d_values_out,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets)
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
+    _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets)
 {
   using local_segment_index_t = local_segment_index_t;
 
@@ -432,8 +432,8 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD
     const ValueT* d_values_in_orig,
     ValueT* d_values_out_orig,
     device_double_buffer<ValueT> d_values_double_buffer,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets)
+    const _CCCL_GRID_CONSTANT BeginOffsetIteratorT d_begin_offsets,
+    const _CCCL_GRID_CONSTANT EndOffsetIteratorT d_end_offsets)
 {
   using ActivePolicyT         = typename ChainedPolicyT::ActivePolicy;
   using LargeSegmentPolicyT   = typename ActivePolicyT::LargeSegmentPolicy;
diff --git a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
index e67b9acbea6..29d776a9a9c 100644
--- a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
@@ -114,16 +114,16 @@ template <typename ChainedPolicyT,
           typename StreamingContextT>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionKernel(
-    InputIteratorT d_in,
+    _CCCL_GRID_CONSTANT const InputIteratorT d_in,
     FirstOutputIteratorT d_first_part_out,
     SecondOutputIteratorT d_second_part_out,
     UnselectedOutputIteratorT d_unselected_out,
     NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_status,
-    SelectFirstPartOp select_first_part_op,
-    SelectSecondPartOp select_second_part_op,
-    OffsetT num_items,
-    int num_tiles,
+    _CCCL_GRID_CONSTANT const SelectFirstPartOp select_first_part_op,
+    _CCCL_GRID_CONSTANT const SelectSecondPartOp select_second_part_op,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int num_tiles,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context)
 {
   using AgentThreeWayPartitionPolicyT = typename ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy;
@@ -181,7 +181,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO
  */
 template <typename ScanTileStateT, typename NumSelectedIteratorT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void
-DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out)
+DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
index 21987e61578..1126427d5d6 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
@@ -962,11 +962,11 @@ template <typename MaxPolicy,
           typename... RandomAccessIteartorsIn>
 __launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads)
   CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel(
-    Offset num_items,
-    int num_elem_per_thread,
+    _CCCL_GRID_CONSTANT const Offset num_items,
+    _CCCL_GRID_CONSTANT const int num_elem_per_thread,
     [[maybe_unused]] bool can_vectorize,
-    Predicate pred,
-    F f,
+    _CCCL_GRID_CONSTANT const Predicate pred,
+    _CCCL_GRID_CONSTANT const F f,
     RandomAccessIteratorOut out,
     kernel_arg<RandomAccessIteartorsIn>... ins)
 {
diff --git a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
index bf0e55bd45f..7ea2d0f2297 100644
--- a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
@@ -131,15 +131,15 @@ __launch_bounds__(int(
     EqualityOpT,
     OffsetT>::agent_policy_t::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel(
-    KeyInputIteratorT d_keys_in,
-    ValueInputIteratorT d_values_in,
+    _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in,
+    _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
     KeyOutputIteratorT d_keys_out,
     ValueOutputIteratorT d_values_out,
     NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_state,
-    EqualityOpT equality_op,
-    OffsetT num_items,
-    int num_tiles,
+    _CCCL_GRID_CONSTANT const EqualityOpT equality_op,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int num_tiles,
     vsmem_t vsmem)
 {
   using VsmemHelperT = typename VSMemHelperT::template VSMemHelperDefaultFallbackPolicyT<

From 6f71b0969e6ae7da44c23384d5fc6c653a70e49f Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Sat, 15 Nov 2025 22:21:53 +0530
Subject: [PATCH 02/14] FEA: Added _CCCL_GRID_CONSTANT const to more kernels

---
 .../dispatch/dispatch_adjacent_difference.cuh |  8 ++--
 .../device/dispatch/dispatch_batch_memcpy.cuh | 30 +++++++--------
 cub/cub/device/dispatch/dispatch_merge.cuh    | 30 +++++++--------
 .../dispatch/dispatch_reduce_by_key.cuh       | 18 ++++-----
 cub/cub/device/dispatch/dispatch_rle.cuh      | 14 +++----
 .../device/dispatch/dispatch_scan_by_key.cuh  | 22 +++++------
 .../device/dispatch/dispatch_select_if.cuh    | 16 ++++----
 cub/cub/device/dispatch/dispatch_topk.cuh     | 38 +++++++++----------
 8 files changed, 88 insertions(+), 88 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index a38402ed6ac..f8e2d10fd4d 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -48,11 +48,11 @@ template <typename ChainedPolicyT,
           bool MayAlias,
           bool ReadLeft>
 CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(
-  InputIteratorT input,
+  _CCCL_GRID_CONSTANT const InputIteratorT input,
   InputT* first_tile_previous,
-  OutputIteratorT result,
-  DifferenceOpT difference_op,
-  OffsetT num_items)
+  _CCCL_GRID_CONSTANT const OutputIteratorT result,
+  _CCCL_GRID_CONSTANT const DifferenceOpT difference_op,
+  _CCCL_GRID_CONSTANT const OffsetT num_items)
 {
   using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy;
 
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index 33f73fa6a56..6e1d6708712 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -83,12 +83,12 @@ template <typename ChainedPolicyT,
           CopyAlg MemcpyOpt>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void MultiBlockBatchMemcpyKernel(
-    InputBufferIt input_buffer_it,
-    OutputBufferIt output_buffer_it,
-    BufferSizeIteratorT buffer_sizes,
-    BufferTileOffsetItT buffer_tile_offsets,
+    _CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it,
+    _CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it,
+    _CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes,
+    _CCCL_GRID_CONSTANT const BufferTileOffsetItT buffer_tile_offsets,
     TileT buffer_offset_tile,
-    TileOffsetT last_tile_offset)
+    _CCCL_GRID_CONSTANT const TileOffsetT last_tile_offset)
 {
   using StatusWord    = typename TileT::StatusWord;
   using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT;
@@ -210,16 +210,16 @@ template <typename ChainedPolicyT,
           CopyAlg MemcpyOpt>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void BatchMemcpyKernel(
-    InputBufferIt input_buffer_it,
-    OutputBufferIt output_buffer_it,
-    BufferSizeIteratorT buffer_sizes,
-    BufferOffsetT num_buffers,
-    BlevBufferSrcsOutItT blev_buffer_srcs,
-    BlevBufferDstsOutItT blev_buffer_dsts,
-    BlevBufferSizesOutItT blev_buffer_sizes,
-    BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
-    BLevBufferOffsetTileState blev_buffer_scan_state,
-    BLevBlockOffsetTileState blev_block_scan_state)
+    _CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it,
+    _CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it,
+    _CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes,
+    _CCCL_GRID_CONSTANT const BufferOffsetT num_buffers,
+    _CCCL_GRID_CONSTANT const BlevBufferSrcsOutItT blev_buffer_srcs,
+    _CCCL_GRID_CONSTANT const BlevBufferDstsOutItT blev_buffer_dsts,
+    _CCCL_GRID_CONSTANT const BlevBufferSizesOutItT blev_buffer_sizes,
+    _CCCL_GRID_CONSTANT const BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
+    _CCCL_GRID_CONSTANT const BLevBufferOffsetTileState blev_buffer_scan_state,
+    _CCCL_GRID_CONSTANT const BLevBlockOffsetTileState blev_block_scan_state)
 {
   // Internal type used for storing a buffer's size
   using BufferSizeT = it_value_t<BufferSizeIteratorT>;
diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh
index 6ed2facded6..3749aa4c842 100644
--- a/cub/cub/device/dispatch/dispatch_merge.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge.cuh
@@ -73,13 +73,13 @@ template <typename MaxPolicy,
           typename Offset,
           typename CompareOp>
 CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel(
-  KeyIt1 keys1,
-  Offset keys1_count,
-  KeyIt2 keys2,
-  Offset keys2_count,
-  Offset num_diagonals,
+  _CCCL_GRID_CONSTANT const KeyIt1 keys1,
+  _CCCL_GRID_CONSTANT const Offset keys1_count,
+  _CCCL_GRID_CONSTANT const KeyIt2 keys2,
+  _CCCL_GRID_CONSTANT const Offset keys2_count,
+  _CCCL_GRID_CONSTANT const Offset num_diagonals,
   Offset* key1_beg_offsets,
-  CompareOp compare_op)
+  _CCCL_GRID_CONSTANT const CompareOp compare_op)
 {
   // items_per_tile must be the same of the merge kernel later, so we have to consider whether a fallback agent will be
   // selected for the merge agent that changes the tile size
@@ -122,15 +122,15 @@ __launch_bounds__(
                      Offset,
                      CompareOp>::type::policy::BLOCK_THREADS)
   CUB_DETAIL_KERNEL_ATTRIBUTES void device_merge_kernel(
-    KeyIt1 keys1,
-    ValueIt1 items1,
-    Offset num_keys1,
-    KeyIt2 keys2,
-    ValueIt2 items2,
-    Offset num_keys2,
-    KeyIt3 keys_result,
-    ValueIt3 items_result,
-    CompareOp compare_op,
+    _CCCL_GRID_CONSTANT const KeyIt1 keys1,
+    _CCCL_GRID_CONSTANT const ValueIt1 items1,
+    _CCCL_GRID_CONSTANT const Offset num_keys1,
+    _CCCL_GRID_CONSTANT const KeyIt2 keys2,
+    _CCCL_GRID_CONSTANT const ValueIt2 items2,
+    _CCCL_GRID_CONSTANT const Offset num_keys2,
+    _CCCL_GRID_CONSTANT const KeyIt3 keys_result,
+    _CCCL_GRID_CONSTANT const ValueIt3 items_result,
+    _CCCL_GRID_CONSTANT const CompareOp compare_op,
     Offset* key1_beg_offsets,
     vsmem_t global_temp_storage)
 {
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 6e310446d26..651f73a04f4 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -181,16 +181,16 @@ template <typename ChainedPolicyT,
           typename StreamingContextT>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel(
-    KeysInputIteratorT d_keys_in,
-    UniqueOutputIteratorT d_unique_out,
-    ValuesInputIteratorT d_values_in,
-    AggregatesOutputIteratorT d_aggregates_out,
-    NumRunsOutputIteratorT d_num_runs_out,
+    _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
+    _CCCL_GRID_CONSTANT const UniqueOutputIteratorT d_unique_out,
+    _CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in,
+    _CCCL_GRID_CONSTANT const AggregatesOutputIteratorT d_aggregates_out,
+    _CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out,
     ScanTileStateT tile_state,
-    int start_tile,
-    EqualityOpT equality_op,
-    ReductionOpT reduction_op,
-    OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int start_tile,
+    _CCCL_GRID_CONSTANT const EqualityOpT equality_op,
+    _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
     vsmem_t vsmem)
 {
diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh
index c757bcc3d6b..9cb567ca023 100644
--- a/cub/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/cub/device/dispatch/dispatch_rle.cuh
@@ -170,14 +170,14 @@ template <typename ChainedPolicyT,
           typename StreamingContextT>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRleSweepKernel(
-    InputIteratorT d_in,
-    OffsetsOutputIteratorT d_offsets_out,
-    LengthsOutputIteratorT d_lengths_out,
-    NumRunsOutputIteratorT d_num_runs_out,
+    _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+    _CCCL_GRID_CONSTANT const OffsetsOutputIteratorT d_offsets_out,
+    _CCCL_GRID_CONSTANT const LengthsOutputIteratorT d_lengths_out,
+    _CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out,
     ScanTileStateT tile_status,
-    EqualityOpT equality_op,
-    OffsetT num_items,
-    int num_tiles,
+    _CCCL_GRID_CONSTANT const EqualityOpT equality_op,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int num_tiles,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context)
 {
   using AgentRlePolicyT = typename ChainedPolicyT::ActivePolicy::RleSweepPolicyT;
diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index 1617e3ecc7e..a005966b6a0 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -117,16 +117,16 @@ template <typename ChainedPolicyT,
           typename KeyT = cub::detail::it_value_t<KeysInputIteratorT>>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyKernel(
-    KeysInputIteratorT d_keys_in,
+    _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
     KeyT* d_keys_prev_in,
-    ValuesInputIteratorT d_values_in,
-    ValuesOutputIteratorT d_values_out,
+    _CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in,
+    _CCCL_GRID_CONSTANT const ValuesOutputIteratorT d_values_out,
     ScanByKeyTileStateT tile_state,
-    int start_tile,
-    EqualityOp equality_op,
-    ScanOpT scan_op,
-    InitValueT init_value,
-    OffsetT num_items)
+    _CCCL_GRID_CONSTANT const int start_tile,
+    _CCCL_GRID_CONSTANT const EqualityOp equality_op,
+    _CCCL_GRID_CONSTANT const ScanOpT scan_op,
+    _CCCL_GRID_CONSTANT const InitValueT init_value,
+    _CCCL_GRID_CONSTANT const OffsetT num_items)
 {
   using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT;
 
@@ -153,10 +153,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THRE
 template <typename ScanTileStateT, typename KeysInputIteratorT, typename OffsetT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyInitKernel(
   ScanTileStateT tile_state,
-  KeysInputIteratorT d_keys_in,
+  _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
   cub::detail::it_value_t<KeysInputIteratorT>* d_keys_prev_in,
-  OffsetT items_per_tile,
-  int num_tiles)
+  _CCCL_GRID_CONSTANT const OffsetT items_per_tile,
+  _CCCL_GRID_CONSTANT const int num_tiles)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 666cd3d5b85..51c393d775d 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -312,15 +312,15 @@ __launch_bounds__(int(
     OffsetT,
     StreamingContextT>::agent_policy_t::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSelectSweepKernel(
-    InputIteratorT d_in,
-    FlagsInputIteratorT d_flags,
-    SelectedOutputIteratorT d_selected_out,
-    NumSelectedIteratorT d_num_selected_out,
+    _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+    _CCCL_GRID_CONSTANT const FlagsInputIteratorT d_flags,
+    _CCCL_GRID_CONSTANT const SelectedOutputIteratorT d_selected_out,
+    _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_status,
-    SelectOpT select_op,
-    EqualityOpT equality_op,
-    OffsetT num_items,
-    int num_tiles,
+    _CCCL_GRID_CONSTANT const SelectOpT select_op,
+    _CCCL_GRID_CONSTANT const EqualityOpT equality_op,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int num_tiles,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
     vsmem_t vsmem)
 {
diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh
index 02ffcbc4c3f..ac5d8ac730a 100644
--- a/cub/cub/device/dispatch/dispatch_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_topk.cuh
@@ -112,22 +112,22 @@ template <typename ChainedPolicyT,
           bool IsFirstPass>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceTopKKernel(
-    const KeyInputIteratorT d_keys_in,
-    KeyOutputIteratorT d_keys_out,
-    const ValueInputIteratorT d_values_in,
-    ValueOutputIteratorT d_values_out,
+    _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in,
+    _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out,
+    _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
+    _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
     KeyInT* in_buf,
     OffsetT* in_idx_buf,
     KeyInT* out_buf,
     OffsetT* out_idx_buf,
     Counter<it_value_t<KeyInputIteratorT>, OffsetT, OutOffsetT>* counter,
     OffsetT* histogram,
-    OffsetT num_items,
-    OutOffsetT k,
-    OffsetT buffer_length,
-    ExtractBinOpT extract_bin_op,
-    IdentifyCandidatesOpT identify_candidates_op,
-    int pass)
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const OutOffsetT k,
+    _CCCL_GRID_CONSTANT const OffsetT buffer_length,
+    _CCCL_GRID_CONSTANT const ExtractBinOpT extract_bin_op,
+    _CCCL_GRID_CONSTANT const IdentifyCandidatesOpT identify_candidates_op,
+    _CCCL_GRID_CONSTANT const int pass)
 {
   using agent_topk_policy_t = typename ChainedPolicyT::ActivePolicy::topk_policy_t;
   using agent_topk_t =
@@ -168,18 +168,18 @@ template <typename ChainedPolicyT,
           typename IdentifyCandidatesOpT>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceTopKLastFilterKernel(
-    const KeyInputIteratorT d_keys_in,
-    KeyOutputIteratorT d_keys_out,
-    const ValueInputIteratorT d_values_in,
-    ValueOutputIteratorT d_values_out,
+    _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in,
+    _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out,
+    _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
+    _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
     KeyInT* in_buf,
     OffsetT* in_idx_buf,
     Counter<it_value_t<KeyInputIteratorT>, OffsetT, OutOffsetT>* counter,
-    OffsetT num_items,
-    OutOffsetT k,
-    OffsetT buffer_length,
-    IdentifyCandidatesOpT identify_candidates_op,
-    int pass)
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const OutOffsetT k,
+    _CCCL_GRID_CONSTANT const OffsetT buffer_length,
+    _CCCL_GRID_CONSTANT const IdentifyCandidatesOpT identify_candidates_op,
+    _CCCL_GRID_CONSTANT const int pass)
 {
   using agent_topk_policy_t = typename ChainedPolicyT::ActivePolicy::topk_policy_t;
   using extract_bin_op_t    = NullType;

From ffd18c160a50c6884e47976808b79de77a324c8f Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Mon, 17 Nov 2025 22:04:19 +0530
Subject: [PATCH 03/14] FEA: Added const _CCCL_GRID_CONSTANT to non-mutable
 kernel parameters

---
 .../device/dispatch/dispatch_adjacent_difference.cuh   |  2 +-
 cub/cub/device/dispatch/dispatch_batch_memcpy.cuh      |  2 +-
 cub/cub/device/dispatch/kernels/kernel_histogram.cuh   |  2 +-
 cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh  |  4 ++--
 cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh  | 10 +++++-----
 cub/cub/device/dispatch/kernels/kernel_reduce.cuh      |  8 ++++----
 cub/cub/device/dispatch/kernels/kernel_scan.cuh        |  4 ++--
 .../dispatch/kernels/kernel_segmented_radix_sort.cuh   |  2 +-
 .../dispatch/kernels/kernel_segmented_reduce.cuh       |  4 ++--
 .../dispatch/kernels/kernel_three_way_partition.cuh    | 10 +++++-----
 .../device/dispatch/kernels/kernel_unique_by_key.cuh   |  6 +++---
 11 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index f8e2d10fd4d..c1adb05ede2 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -33,7 +33,7 @@ namespace detail::adjacent_difference
 {
 template <typename AgentDifferenceInitT, typename InputIteratorT, typename InputT, typename OffsetT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void
-DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile)
+DeviceAdjacentDifferenceInitKernel(_CCCL_GRID_CONSTANT const InputIteratorT first, InputT* result, _CCCL_GRID_CONSTANT const OffsetT num_tiles, _CCCL_GRID_CONSTANT const int items_per_tile)
 {
   const int tile_idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
   AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile);
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index 6e1d6708712..312278ba33e 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -61,7 +61,7 @@ template <typename BufferOffsetScanTileStateT, typename BlockOffsetScanTileState
 CUB_DETAIL_KERNEL_ATTRIBUTES void InitTileStateKernel(
   BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
   BlockOffsetScanTileStateT block_offset_scan_tile_state,
-  TileOffsetT num_tiles)
+  _CCCL_GRID_CONSTANT const TileOffsetT num_tiles)
 {
   // Initialize tile status
   buffer_offset_scan_tile_state.InitializeStatus(num_tiles);
diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
index ee407165fa5..6a59d2cac1a 100644
--- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
@@ -445,7 +445,7 @@ template <typename ChainedPolicyT,
           typename OffsetT>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramSweepKernel(
-    SampleIteratorT d_samples,
+    _CCCL_GRID_CONSTANT const SampleIteratorT d_samples,
     ::cuda::std::array<int, NumActiveChannels> num_output_bins_wrapper,
     ::cuda::std::array<int, NumActiveChannels> num_privatized_bins_wrapper,
     ::cuda::std::array<CounterT*, NumActiveChannels> d_output_histograms_wrapper,
diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
index ec5d111eeef..f1b1086f252 100644
--- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
@@ -170,8 +170,8 @@ __launch_bounds__(
     _CCCL_GRID_CONSTANT const bool ping,
     _CCCL_GRID_CONSTANT const KeyInputIteratorT keys_in,
     _CCCL_GRID_CONSTANT const ValueInputIteratorT items_in,
-    KeyIteratorT keys_out,
-    ValueIteratorT items_out,
+    _CCCL_GRID_CONSTANT const KeyIteratorT keys_out,
+    _CCCL_GRID_CONSTANT const ValueIteratorT items_out,
     _CCCL_GRID_CONSTANT const OffsetT keys_count,
     KeyT* tmp_keys_out,
     ValueT* tmp_items_out,
diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
index bcbd19e3a6c..698cc8c4763 100644
--- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
@@ -89,7 +89,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp
     _CCCL_GRID_CONSTANT const int current_bit,
     _CCCL_GRID_CONSTANT const int num_bits,
     GridEvenShare<OffsetT> even_share,
-    DecomposerT decomposer = {})
+    _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   using ActiveUpsweepPolicyT =
     ::cuda::std::_If<ALT_DIGIT_BITS,
@@ -247,7 +247,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo
     _CCCL_GRID_CONSTANT const int current_bit,
     _CCCL_GRID_CONSTANT const int num_bits,
     GridEvenShare<OffsetT> even_share,
-    DecomposerT decomposer = {})
+    _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   using ActiveUpsweepPolicyT =
     ::cuda::std::_If<ALT_DIGIT_BITS,
@@ -334,7 +334,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE
     OffsetT num_items,
     _CCCL_GRID_CONSTANT const int current_bit,
     _CCCL_GRID_CONSTANT const int end_bit,
-    DecomposerT decomposer = {})
+    _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   // Constants
   static constexpr int BLOCK_THREADS    = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS;
@@ -444,7 +444,7 @@ template <typename ChainedPolicyT,
           typename DecomposerT = identity_decomposer_t>
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel(
-  OffsetT* d_bins_out, const KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, DecomposerT decomposer = {})
+  OffsetT* d_bins_out, KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy;
   using AgentT = AgentRadixSortHistogram<HistogramPolicyT, Order == SortOrder::Descending, KeyT, OffsetT, DecomposerT>;
@@ -474,7 +474,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy
     _CCCL_GRID_CONSTANT const PortionOffsetT num_items,
     _CCCL_GRID_CONSTANT const int current_bit,
     _CCCL_GRID_CONSTANT const int num_bits,
-    DecomposerT decomposer = {})
+    _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   using OnesweepPolicyT = typename ChainedPolicyT::ActivePolicy::OnesweepPolicy;
   using AgentT =
diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
index f28c103fa9c..2d681d4831c 100644
--- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
@@ -215,7 +215,7 @@ template <typename ChainedPolicyT,
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS),
   1) void DeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
-                                       OutputIteratorT d_out,
+                                       _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
                                        _CCCL_GRID_CONSTANT const OffsetT num_items,
                                        _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
                                        _CCCL_GRID_CONSTANT const InitT init,
@@ -303,7 +303,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   _CCCL_GRID_CONSTANT const int num_items,
   _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
   _CCCL_GRID_CONSTANT const TransformOpT transform_op,
-  const int reduce_grid_size)
+  _CCCL_GRID_CONSTANT const int reduce_grid_size)
 {
   using reduce_policy_t = typename ChainedPolicyT::ActivePolicy::ReducePolicy;
 
@@ -432,7 +432,7 @@ template <typename ChainedPolicyT,
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS),
   1) void DeterministicDeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
-                                                    OutputIteratorT d_out,
+                                                    _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
                                                     _CCCL_GRID_CONSTANT const int num_items,
                                                     _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
                                                     _CCCL_GRID_CONSTANT const InitT init,
@@ -499,7 +499,7 @@ template <typename ChainedPolicyT,
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
   ChainedPolicyT::ActivePolicy::ReduceNondeterministicPolicy::
     BLOCK_THREADS)) void NondeterministicDeviceReduceAtomicKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
-                                                                  OutputIteratorT d_out,
+                                                                  _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
                                                                   _CCCL_GRID_CONSTANT const OffsetT num_items,
                                                                   GridEvenShare<OffsetT> even_share,
                                                                   _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index cfa32871c7a..3ba866cdbe4 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -64,7 +64,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state
  */
 template <typename ScanTileStateT, typename NumSelectedIteratorT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void
-DeviceCompactInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out)
+DeviceCompactInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);
@@ -137,7 +137,7 @@ template <typename ChainedPolicyT,
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel(
     _CCCL_GRID_CONSTANT const InputIteratorT d_in,
-    OutputIteratorT d_out,
+    _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
     ScanTileStateT tile_state,
     _CCCL_GRID_CONSTANT const int start_tile,
     _CCCL_GRID_CONSTANT const ScanOpT scan_op,
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
index 13088889941..8856956f254 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
@@ -106,7 +106,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen
     _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
     _CCCL_GRID_CONSTANT const int current_bit,
     _CCCL_GRID_CONSTANT const int pass_bits,
-    DecomposerT decomposer = {})
+    _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   //
   // Constants
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
index f5fd0f7fea6..8a859631f05 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
@@ -102,7 +102,7 @@ template <typename ChainedPolicyT,
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceSegmentedReduceKernel(
   _CCCL_GRID_CONSTANT const InputIteratorT d_in,
-  OutputIteratorT d_out,
+  _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
   _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
   _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
   _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
@@ -189,7 +189,7 @@ template <typename ChainedPolicyT,
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceFixedSizeSegmentedReduceKernel(
   _CCCL_GRID_CONSTANT const InputIteratorT d_in,
-  OutputIteratorT d_out,
+  _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
   _CCCL_GRID_CONSTANT const OffsetT segment_size,
   _CCCL_GRID_CONSTANT const int num_segments,
   _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
diff --git a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
index 29d776a9a9c..b9b46c1eeb8 100644
--- a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
@@ -115,10 +115,10 @@ template <typename ChainedPolicyT,
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionKernel(
     _CCCL_GRID_CONSTANT const InputIteratorT d_in,
-    FirstOutputIteratorT d_first_part_out,
-    SecondOutputIteratorT d_second_part_out,
-    UnselectedOutputIteratorT d_unselected_out,
-    NumSelectedIteratorT d_num_selected_out,
+    _CCCL_GRID_CONSTANT const FirstOutputIteratorT d_first_part_out,
+    _CCCL_GRID_CONSTANT const SecondOutputIteratorT d_second_part_out,
+    _CCCL_GRID_CONSTANT const UnselectedOutputIteratorT d_unselected_out,
+    _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_status,
     _CCCL_GRID_CONSTANT const SelectFirstPartOp select_first_part_op,
     _CCCL_GRID_CONSTANT const SelectSecondPartOp select_second_part_op,
@@ -181,7 +181,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO
  */
 template <typename ScanTileStateT, typename NumSelectedIteratorT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void
-DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out)
+DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);
diff --git a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
index 7ea2d0f2297..564d25b1e33 100644
--- a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
@@ -133,9 +133,9 @@ __launch_bounds__(int(
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel(
     _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in,
     _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
-    KeyOutputIteratorT d_keys_out,
-    ValueOutputIteratorT d_values_out,
-    NumSelectedIteratorT d_num_selected_out,
+    _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out,
+    _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
+    _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_state,
     _CCCL_GRID_CONSTANT const EqualityOpT equality_op,
     _CCCL_GRID_CONSTANT const OffsetT num_items,

From 413d9bb81190382bf1bb24ed2caaa04de1a7757c Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Tue, 18 Nov 2025 14:04:49 +0530
Subject: [PATCH 04/14] Removed _CCCL_GRID_CONSTANT const from all operators

---
 .../dispatch/dispatch_adjacent_difference.cuh |  2 +-
 cub/cub/device/dispatch/dispatch_merge.cuh    |  4 ++--
 .../dispatch/dispatch_reduce_by_key.cuh       |  4 ++--
 cub/cub/device/dispatch/dispatch_rle.cuh      |  2 +-
 .../device/dispatch/dispatch_scan_by_key.cuh  |  2 +-
 .../device/dispatch/dispatch_select_if.cuh    |  4 ++--
 cub/cub/device/dispatch/dispatch_topk.cuh     |  6 +++---
 .../dispatch/kernels/kernel_for_each.cuh      |  3 +--
 .../dispatch/kernels/kernel_merge_sort.cuh    |  4 ++--
 .../device/dispatch/kernels/kernel_reduce.cuh | 20 +++++++++----------
 .../device/dispatch/kernels/kernel_scan.cuh   |  2 +-
 .../kernels/kernel_segmented_reduce.cuh       |  4 ++--
 .../kernels/kernel_three_way_partition.cuh    |  4 ++--
 .../dispatch/kernels/kernel_unique_by_key.cuh |  2 +-
 14 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index e2da4dc3483..999f87945d3 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -54,7 +54,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(
   _CCCL_GRID_CONSTANT const InputIteratorT input,
   InputT* first_tile_previous,
   _CCCL_GRID_CONSTANT const OutputIteratorT result,
-  _CCCL_GRID_CONSTANT const DifferenceOpT difference_op,
+  DifferenceOpT difference_op,
   _CCCL_GRID_CONSTANT const OffsetT num_items)
 {
   using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy;
diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh
index 3749aa4c842..08ed4441143 100644
--- a/cub/cub/device/dispatch/dispatch_merge.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge.cuh
@@ -79,7 +79,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel(
   _CCCL_GRID_CONSTANT const Offset keys2_count,
   _CCCL_GRID_CONSTANT const Offset num_diagonals,
   Offset* key1_beg_offsets,
-  _CCCL_GRID_CONSTANT const CompareOp compare_op)
+  CompareOp compare_op)
 {
   // items_per_tile must be the same of the merge kernel later, so we have to consider whether a fallback agent will be
   // selected for the merge agent that changes the tile size
@@ -130,7 +130,7 @@ __launch_bounds__(
     _CCCL_GRID_CONSTANT const Offset num_keys2,
     _CCCL_GRID_CONSTANT const KeyIt3 keys_result,
     _CCCL_GRID_CONSTANT const ValueIt3 items_result,
-    _CCCL_GRID_CONSTANT const CompareOp compare_op,
+    CompareOp compare_op,
     Offset* key1_beg_offsets,
     vsmem_t global_temp_storage)
 {
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 651f73a04f4..ed466646113 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -188,8 +188,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
     _CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out,
     ScanTileStateT tile_state,
     _CCCL_GRID_CONSTANT const int start_tile,
-    _CCCL_GRID_CONSTANT const EqualityOpT equality_op,
-    _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+    EqualityOpT equality_op,
+    ReductionOpT reduction_op,
     _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
     vsmem_t vsmem)
diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh
index 9cb567ca023..78c24152740 100644
--- a/cub/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/cub/device/dispatch/dispatch_rle.cuh
@@ -175,7 +175,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA
     _CCCL_GRID_CONSTANT const LengthsOutputIteratorT d_lengths_out,
     _CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out,
     ScanTileStateT tile_status,
-    _CCCL_GRID_CONSTANT const EqualityOpT equality_op,
+    EqualityOpT equality_op,
     _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const int num_tiles,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context)
diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index a005966b6a0..59499f4c096 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -123,7 +123,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THRE
     _CCCL_GRID_CONSTANT const ValuesOutputIteratorT d_values_out,
     ScanByKeyTileStateT tile_state,
     _CCCL_GRID_CONSTANT const int start_tile,
-    _CCCL_GRID_CONSTANT const EqualityOp equality_op,
+    EqualityOp equality_op,
     _CCCL_GRID_CONSTANT const ScanOpT scan_op,
     _CCCL_GRID_CONSTANT const InitValueT init_value,
     _CCCL_GRID_CONSTANT const OffsetT num_items)
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 51c393d775d..66f0afaeb4e 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -317,8 +317,8 @@ __launch_bounds__(int(
     _CCCL_GRID_CONSTANT const SelectedOutputIteratorT d_selected_out,
     _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_status,
-    _CCCL_GRID_CONSTANT const SelectOpT select_op,
-    _CCCL_GRID_CONSTANT const EqualityOpT equality_op,
+    SelectOpT select_op,
+    EqualityOpT equality_op,
     _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const int num_tiles,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh
index ac5d8ac730a..624c5d598ad 100644
--- a/cub/cub/device/dispatch/dispatch_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_topk.cuh
@@ -125,8 +125,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads
     _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const OutOffsetT k,
     _CCCL_GRID_CONSTANT const OffsetT buffer_length,
-    _CCCL_GRID_CONSTANT const ExtractBinOpT extract_bin_op,
-    _CCCL_GRID_CONSTANT const IdentifyCandidatesOpT identify_candidates_op,
+    ExtractBinOpT extract_bin_op,
+    IdentifyCandidatesOpT identify_candidates_op,
     _CCCL_GRID_CONSTANT const int pass)
 {
   using agent_topk_policy_t = typename ChainedPolicyT::ActivePolicy::topk_policy_t;
@@ -178,7 +178,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads
     _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const OutOffsetT k,
     _CCCL_GRID_CONSTANT const OffsetT buffer_length,
-    _CCCL_GRID_CONSTANT const IdentifyCandidatesOpT identify_candidates_op,
+    IdentifyCandidatesOpT identify_candidates_op,
     _CCCL_GRID_CONSTANT const int pass)
 {
   using agent_topk_policy_t = typename ChainedPolicyT::ActivePolicy::topk_policy_t;
diff --git a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
index 648f4535ac8..9e924848a1e 100644
--- a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
@@ -86,8 +86,7 @@ using can_regain_copy_freedom =
 
 // This kernel is used when the block size is not known at compile time
 template <class ChainedPolicyT, class OffsetT, class OpT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void
-dynamic_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OpT op)
+CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, OpT op)
 {
   using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
   using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
index f1b1086f252..659c48e99bf 100644
--- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
@@ -227,7 +227,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(
   _CCCL_GRID_CONSTANT const OffsetT keys_count,
   _CCCL_GRID_CONSTANT const OffsetT num_partitions,
   OffsetT* merge_partitions,
-  _CCCL_GRID_CONSTANT const CompareOpT compare_op,
+  CompareOpT compare_op,
   _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number,
   _CCCL_GRID_CONSTANT const int items_per_tile)
 {
@@ -278,7 +278,7 @@ __launch_bounds__(
     _CCCL_GRID_CONSTANT const OffsetT keys_count,
     KeyT* keys_pong,
     ValueT* items_pong,
-    _CCCL_GRID_CONSTANT const CompareOpT compare_op,
+    CompareOpT compare_op,
     OffsetT* merge_partitions,
     _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number,
     vsmem_t vsmem)
diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
index 2d681d4831c..3df150d7e27 100644
--- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
@@ -133,8 +133,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   AccumT* d_out,
   _CCCL_GRID_CONSTANT const OffsetT num_items,
   GridEvenShare<OffsetT> even_share,
-  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
-  _CCCL_GRID_CONSTANT const TransformOpT transform_op)
+  ReductionOpT reduction_op,
+  TransformOpT transform_op)
 {
   // Thread block type for reducing input tiles
   using AgentReduceT =
@@ -217,9 +217,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   1) void DeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
                                        _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
                                        _CCCL_GRID_CONSTANT const OffsetT num_items,
-                                       _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+                                       ReductionOpT reduction_op,
                                        _CCCL_GRID_CONSTANT const InitT init,
-                                       _CCCL_GRID_CONSTANT const TransformOpT transform_op)
+                                       TransformOpT transform_op)
 {
   // Thread block type for reducing input tiles
   using AgentReduceT =
@@ -301,8 +301,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   _CCCL_GRID_CONSTANT const InputIteratorT d_in,
   AccumT* d_out,
   _CCCL_GRID_CONSTANT const int num_items,
-  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
-  _CCCL_GRID_CONSTANT const TransformOpT transform_op,
+  ReductionOpT reduction_op,
+  TransformOpT transform_op,
   _CCCL_GRID_CONSTANT const int reduce_grid_size)
 {
   using reduce_policy_t = typename ChainedPolicyT::ActivePolicy::ReducePolicy;
@@ -434,9 +434,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   1) void DeterministicDeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
                                                     _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
                                                     _CCCL_GRID_CONSTANT const int num_items,
-                                                    _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+                                                    ReductionOpT reduction_op,
                                                     _CCCL_GRID_CONSTANT const InitT init,
-                                                    _CCCL_GRID_CONSTANT const TransformOpT transform_op)
+                                                    TransformOpT transform_op)
 {
   using single_tile_policy_t = typename ChainedPolicyT::ActivePolicy::SingleTilePolicy;
 
@@ -502,9 +502,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
                                                                   _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
                                                                   _CCCL_GRID_CONSTANT const OffsetT num_items,
                                                                   GridEvenShare<OffsetT> even_share,
-                                                                  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+                                                                  ReductionOpT reduction_op,
                                                                   _CCCL_GRID_CONSTANT const InitT init,
-                                                                  _CCCL_GRID_CONSTANT const TransformOpT transform_op)
+                                                                  TransformOpT transform_op)
 {
   NV_IF_TARGET(NV_PROVIDES_SM_60,
                (),
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index 76f515faddb..3da879d750b 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -143,7 +143,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
     _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
     ScanTileStateT tile_state,
     _CCCL_GRID_CONSTANT const int start_tile,
-    _CCCL_GRID_CONSTANT const ScanOpT scan_op,
+    ScanOpT scan_op,
     InitValueT init_value,
     _CCCL_GRID_CONSTANT const OffsetT num_items)
 {
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
index 8a859631f05..954b1508282 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
@@ -105,7 +105,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
   _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
   _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
-  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+  ReductionOpT reduction_op,
   _CCCL_GRID_CONSTANT const InitT init)
 {
   // Thread block type for reducing input tiles
@@ -192,7 +192,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
   _CCCL_GRID_CONSTANT const OffsetT segment_size,
   _CCCL_GRID_CONSTANT const int num_segments,
-  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+  ReductionOpT reduction_op,
   _CCCL_GRID_CONSTANT const InitT init)
 {
   using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
diff --git a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
index d0533bf516a..961f03587cf 100644
--- a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
@@ -120,8 +120,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO
     _CCCL_GRID_CONSTANT const UnselectedOutputIteratorT d_unselected_out,
     _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_status,
-    _CCCL_GRID_CONSTANT const SelectFirstPartOp select_first_part_op,
-    _CCCL_GRID_CONSTANT const SelectSecondPartOp select_second_part_op,
+    SelectFirstPartOp select_first_part_op,
+    SelectSecondPartOp select_second_part_op,
     _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const int num_tiles,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context)
diff --git a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
index 564d25b1e33..3c87c90f040 100644
--- a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
@@ -137,7 +137,7 @@ __launch_bounds__(int(
     _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
     _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_state,
-    _CCCL_GRID_CONSTANT const EqualityOpT equality_op,
+    EqualityOpT equality_op,
     _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const int num_tiles,
     vsmem_t vsmem)

From 5b6236082e2aa3fb49800af36ac8ff692378fb43 Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Sat, 15 Nov 2025 18:39:56 +0530
Subject: [PATCH 05/14] FEA: Added const _CCCL_GRID_CONSTANT to non-mutable
 kernel parameters

Fixes
---
 cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh | 2 +-
 cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh | 8 ++++++++
 cub/cub/device/dispatch/kernels/kernel_reduce.cuh     | 5 +++--
 cub/cub/device/dispatch/kernels/kernel_scan.cuh       | 9 +++------
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
index 659c48e99bf..dc3773d02b0 100644
--- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
@@ -278,7 +278,7 @@ __launch_bounds__(
     _CCCL_GRID_CONSTANT const OffsetT keys_count,
     KeyT* keys_pong,
     ValueT* items_pong,
-    CompareOpT compare_op,
+    _CCCL_GRID_CONSTANT const CompareOpT compare_op,
     OffsetT* merge_partitions,
     _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number,
     vsmem_t vsmem)
diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
index 01a8d1fdd57..d1c7063d0fb 100644
--- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
@@ -444,12 +444,16 @@ template <typename ChainedPolicyT,
           typename DecomposerT = identity_decomposer_t>
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel(
+<<<<<<< HEAD
   OffsetT* d_bins_out,
   KeyT* d_keys_in,
   _CCCL_GRID_CONSTANT const OffsetT num_items,
   _CCCL_GRID_CONSTANT const int start_bit,
   _CCCL_GRID_CONSTANT const int end_bit,
   _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
+=======
+  OffsetT* d_bins_out, const KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, DecomposerT decomposer = {})
+>>>>>>> 4b1217559 (FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters)
 {
   using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy;
   using AgentT = AgentRadixSortHistogram<HistogramPolicyT, Order == SortOrder::Descending, KeyT, OffsetT, DecomposerT>;
@@ -479,7 +483,11 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy
     _CCCL_GRID_CONSTANT const PortionOffsetT num_items,
     _CCCL_GRID_CONSTANT const int current_bit,
     _CCCL_GRID_CONSTANT const int num_bits,
+<<<<<<< HEAD
     _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
+=======
+    DecomposerT decomposer = {})
+>>>>>>> 4b1217559 (FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters)
 {
   using OnesweepPolicyT = typename ChainedPolicyT::ActivePolicy::OnesweepPolicy;
   using AgentT =
diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
index 3df150d7e27..27f081801a9 100644
--- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
@@ -133,8 +133,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   AccumT* d_out,
   _CCCL_GRID_CONSTANT const OffsetT num_items,
   GridEvenShare<OffsetT> even_share,
-  ReductionOpT reduction_op,
-  TransformOpT transform_op)
+  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
+  _CCCL_GRID_CONSTANT const TransformOpT transform_op)
 {
   // Thread block type for reducing input tiles
   using AgentReduceT =
@@ -215,6 +215,7 @@ template <typename ChainedPolicyT,
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS),
   1) void DeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
+<<<<<<< HEAD
                                        _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
                                        _CCCL_GRID_CONSTANT const OffsetT num_items,
                                        ReductionOpT reduction_op,
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index f27e781b7c8..2ab114f6743 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -39,8 +39,7 @@ namespace detail::scan
  *   Number of tiles
  */
 template <typename ScanTileStateT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void
-DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles)
+CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles)
 {
   _CCCL_PDL_GRID_DEPENDENCY_SYNC();
   _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // beneficial for all problem sizes in cub.bench.scan.exclusive.sum.base
@@ -68,10 +67,8 @@ DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int nu
  *   (i.e., length of `d_selected_out`)
  */
 template <typename ScanTileStateT, typename NumSelectedIteratorT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceCompactInitKernel(
-  ScanTileStateT tile_state,
-  _CCCL_GRID_CONSTANT const int num_tiles,
-  _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out)
+CUB_DETAIL_KERNEL_ATTRIBUTES void
+DeviceCompactInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);

From 3240916503c27f7830beb2d6ff897cb295434768 Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Tue, 18 Nov 2025 14:04:49 +0530
Subject: [PATCH 06/14] Removed _CCCL_GRID_CONSTANT const from all operators

Fixes
---
 cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh | 2 +-
 cub/cub/device/dispatch/kernels/kernel_reduce.cuh     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
index dc3773d02b0..659c48e99bf 100644
--- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
@@ -278,7 +278,7 @@ __launch_bounds__(
     _CCCL_GRID_CONSTANT const OffsetT keys_count,
     KeyT* keys_pong,
     ValueT* items_pong,
-    _CCCL_GRID_CONSTANT const CompareOpT compare_op,
+    CompareOpT compare_op,
     OffsetT* merge_partitions,
     _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number,
     vsmem_t vsmem)
diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
index 27f081801a9..cd2ccab79dc 100644
--- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
@@ -133,8 +133,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   AccumT* d_out,
   _CCCL_GRID_CONSTANT const OffsetT num_items,
   GridEvenShare<OffsetT> even_share,
-  _CCCL_GRID_CONSTANT const ReductionOpT reduction_op,
-  _CCCL_GRID_CONSTANT const TransformOpT transform_op)
+  ReductionOpT reduction_op,
+  TransformOpT transform_op)
 {
   // Thread block type for reducing input tiles
   using AgentReduceT =

From 5a888029474eb24f5d5be143c1bccf5546f9acd1 Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Sun, 23 Nov 2025 00:10:54 +0530
Subject: [PATCH 07/14] Fixes

Fixes

More Fixes
---
 .../dispatch/dispatch_adjacent_difference.cuh |  2 +-
 .../dispatch/kernels/kernel_for_each.cuh      |  2 +-
 .../dispatch/kernels/kernel_merge_sort.cuh    |  2 +-
 .../dispatch/kernels/kernel_radix_sort.cuh    | 26 +++++++------------
 .../kernels/kernel_segmented_radix_sort.cuh   |  4 +--
 .../kernels/kernel_segmented_scan.cuh         | 16 ++++++------
 .../kernels/kernel_segmented_sort.cuh         | 10 +++----
 .../dispatch/kernels/kernel_transform.cuh     |  2 +-
 8 files changed, 29 insertions(+), 35 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index 999f87945d3..eb0502b66f4 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -34,7 +34,7 @@ namespace detail::adjacent_difference
 template <typename AgentDifferenceInitT, typename InputIteratorT, typename InputT, typename OffsetT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(
   _CCCL_GRID_CONSTANT const InputIteratorT first,
-  InputT* result,
+  _CCCL_GRID_CONSTANT const InputT* result,
   _CCCL_GRID_CONSTANT const OffsetT num_tiles,
   _CCCL_GRID_CONSTANT const int items_per_tile)
 {
diff --git a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
index 9e924848a1e..760f221ee9f 100644
--- a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
@@ -111,7 +111,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(_CCCL_GRID_CONSTANT const Offse
 template <class ChainedPolicyT, class OffsetT, class OpT>
 CUB_DETAIL_KERNEL_ATTRIBUTES //
 __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
-  void static_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OpT op)
+  void static_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, OpT op)
 {
   using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
   using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
index 659c48e99bf..e21c6a2a8bb 100644
--- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
@@ -175,7 +175,7 @@ __launch_bounds__(
     _CCCL_GRID_CONSTANT const OffsetT keys_count,
     KeyT* tmp_keys_out,
     ValueT* tmp_items_out,
-    _CCCL_GRID_CONSTANT const CompareOpT compare_op,
+    CompareOpT compare_op,
     vsmem_t vsmem)
 {
   using MergeSortHelperT = typename VSMemHelperT::template MergeSortVSMemHelperT<
diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
index d1c7063d0fb..f18a3b0d633 100644
--- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
@@ -83,7 +83,7 @@ template <typename ChainedPolicyT,
 __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS)
                                        : int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel(
-    const KeyT* d_keys,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys,
     OffsetT* d_spine,
     _CCCL_GRID_CONSTANT const OffsetT /*num_items*/,
     _CCCL_GRID_CONSTANT const int current_bit,
@@ -238,9 +238,9 @@ template <typename ChainedPolicyT,
 __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS)
                                        : int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel(
-    const KeyT* d_keys_in,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
     KeyT* d_keys_out,
-    const ValueT* d_values_in,
+    _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
     ValueT* d_values_out,
     OffsetT* d_spine,
     _CCCL_GRID_CONSTANT const OffsetT num_items,
@@ -327,9 +327,9 @@ template <typename ChainedPolicyT,
           typename DecomposerT = identity_decomposer_t>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel(
-    const KeyT* d_keys_in,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
     KeyT* d_keys_out,
-    const ValueT* d_values_in,
+    _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
     ValueT* d_values_out,
     OffsetT num_items,
     _CCCL_GRID_CONSTANT const int current_bit,
@@ -444,16 +444,14 @@ template <typename ChainedPolicyT,
           typename DecomposerT = identity_decomposer_t>
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel(
-<<<<<<< HEAD
   OffsetT* d_bins_out,
   KeyT* d_keys_in,
+  OffsetT* d_bins_out,
+  _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
   _CCCL_GRID_CONSTANT const OffsetT num_items,
   _CCCL_GRID_CONSTANT const int start_bit,
   _CCCL_GRID_CONSTANT const int end_bit,
   _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
-=======
-  OffsetT* d_bins_out, const KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, DecomposerT decomposer = {})
->>>>>>> 4b1217559 (FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters)
 {
   using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy;
   using AgentT = AgentRadixSortHistogram<HistogramPolicyT, Order == SortOrder::Descending, KeyT, OffsetT, DecomposerT>;
@@ -475,19 +473,15 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy
     AtomicOffsetT* d_lookback,
     AtomicOffsetT* d_ctrs,
     OffsetT* d_bins_out,
-    const OffsetT* d_bins_in,
+    _CCCL_GRID_CONSTANT const OffsetT* const d_bins_in,
     KeyT* d_keys_out,
-    const KeyT* d_keys_in,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
     ValueT* d_values_out,
-    ValueT* d_values_in,
+    _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
     _CCCL_GRID_CONSTANT const PortionOffsetT num_items,
     _CCCL_GRID_CONSTANT const int current_bit,
     _CCCL_GRID_CONSTANT const int num_bits,
-<<<<<<< HEAD
     _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
-=======
-    DecomposerT decomposer = {})
->>>>>>> 4b1217559 (FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters)
 {
   using OnesweepPolicyT = typename ChainedPolicyT::ActivePolicy::OnesweepPolicy;
   using AgentT =
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
index 8856956f254..25ffa659347 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
@@ -98,9 +98,9 @@ template <typename ChainedPolicyT,
 __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS
                                        : ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel(
-    const KeyT* d_keys_in,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
     KeyT* d_keys_out,
-    const ValueT* d_values_in,
+    _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
     ValueT* d_values_out,
     _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
     _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh
index cc0431b8b0b..373ad649a0b 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh
@@ -35,14 +35,14 @@ template <typename ChainedPolicyT,
           typename ActualInitValueT = typename InitValueT::value_type>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::segmented_scan_policy_t::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void device_segmented_scan_kernel(
-    InputIteratorT d_in,
-    OutputIteratorT d_out,
-    BeginOffsetIteratorInputT begin_offset_d_in,
-    EndOffsetIteratorInputT end_offset_d_in,
-    BeginOffsetIteratorOutputT begin_offset_d_out,
-    OffsetT n_segments,
-    ScanOpT scan_op,
-    InitValueT init_value)
+    _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+    _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorInputT begin_offset_d_in,
+    _CCCL_GRID_CONSTANT const EndOffsetIteratorInputT end_offset_d_in,
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorOutputT begin_offset_d_out,
+    _CCCL_GRID_CONSTANT const OffsetT n_segments,
+    _CCCL_GRID_CONSTANT const ScanOpT scan_op,
+    _CCCL_GRID_CONSTANT const InitValueT init_value)
 {
   using segmented_scan_policy_t = typename ChainedPolicyT::ActivePolicy::segmented_scan_policy_t;
 
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
index e13833ffa74..99135d9bb33 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
@@ -307,11 +307,11 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallSegmentPolicy::BLOCK_THREAD
     _CCCL_GRID_CONSTANT const local_segment_index_t small_segments,
     _CCCL_GRID_CONSTANT const local_segment_index_t medium_segments,
     _CCCL_GRID_CONSTANT const local_segment_index_t medium_blocks,
-    const local_segment_index_t* d_small_segments_indices,
-    const local_segment_index_t* d_medium_segments_indices,
-    const KeyT* d_keys_in,
+    _CCCL_GRID_CONSTANT const local_segment_index_t* const d_small_segments_indices,
+    _CCCL_GRID_CONSTANT const local_segment_index_t* const d_medium_segments_indices,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
     KeyT* d_keys_out,
-    const ValueT* d_values_in,
+    _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
     ValueT* d_values_out,
     _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
     _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets)
@@ -425,7 +425,7 @@ template <SortOrder Order,
           typename OffsetT>
 __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelLarge(
-    const local_segment_index_t* d_segments_indices,
+    _CCCL_GRID_CONSTANT const local_segment_index_t* const d_segments_indices,
     const KeyT* d_keys_in_orig,
     KeyT* d_keys_out_orig,
     device_double_buffer<KeyT> d_keys_double_buffer,
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
index 27fddae480f..59fbb3b30a4 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
@@ -989,7 +989,7 @@ __launch_bounds__(get_block_threads<typename MaxPolicy::ActivePolicy>)
     [[maybe_unused]] bool can_vectorize,
     _CCCL_GRID_CONSTANT const Predicate pred,
     _CCCL_GRID_CONSTANT const F f,
-    RandomAccessIteratorOut out,
+    _CCCL_GRID_CONSTANT const RandomAccessIteratorOut out,
     kernel_arg<RandomAccessIteartorsIn>... ins)
 {
   _CCCL_ASSERT(blockDim.y == 1 && blockDim.z == 1, "transform_kernel only supports 1D blocks");

From bd65528257aae69a43b742d9f2b6de4fa9588fcc Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Sun, 23 Nov 2025 13:15:18 +0530
Subject: [PATCH 08/14] More Fixes

---
 cub/cub/device/dispatch/kernels/kernel_scan.cuh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index 2ab114f6743..33f2d344625 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -39,7 +39,8 @@ namespace detail::scan
  *   Number of tiles
  */
 template <typename ScanTileStateT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles)
+CUB_DETAIL_KERNEL_ATTRIBUTES void
+DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles)
 {
   _CCCL_PDL_GRID_DEPENDENCY_SYNC();
   _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // beneficial for all problem sizes in cub.bench.scan.exclusive.sum.base
@@ -67,8 +68,8 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state
  *   (i.e., length of `d_selected_out`)
  */
 template <typename ScanTileStateT, typename NumSelectedIteratorT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void
-DeviceCompactInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out)
+CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceCompactInitKernel(
+  ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);

From 0e50ba35ba34ba9a29d3bf87fe57e099381f9a01 Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Wed, 26 Nov 2025 19:49:32 +0530
Subject: [PATCH 09/14] Minor Fixes

---
 .../dispatch/dispatch_adjacent_difference.cuh |  2 +-
 .../device/dispatch/dispatch_scan_by_key.cuh  |  2 +-
 .../dispatch/dispatch_segmented_sort.cuh      | 28 ++++++++---------
 cub/cub/device/dispatch/dispatch_topk.cuh     | 14 ++++-----
 .../dispatch/kernels/kernel_merge_sort.cuh    | 14 ++++-----
 .../dispatch/kernels/kernel_radix_sort.cuh    | 31 +++++++++----------
 .../device/dispatch/kernels/kernel_reduce.cuh |  5 ++-
 .../kernels/kernel_segmented_radix_sort.cuh   |  4 +--
 .../kernels/kernel_segmented_sort.cuh         |  4 +--
 .../dispatch/kernels/kernel_transform.cuh     |  2 +-
 10 files changed, 52 insertions(+), 54 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index eb0502b66f4..5f592133924 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -52,7 +52,7 @@ template <typename ChainedPolicyT,
           bool ReadLeft>
 CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(
   _CCCL_GRID_CONSTANT const InputIteratorT input,
-  InputT* first_tile_previous,
+  _CCCL_GRID_CONSTANT InputT* const first_tile_previous,
   _CCCL_GRID_CONSTANT const OutputIteratorT result,
   DifferenceOpT difference_op,
   _CCCL_GRID_CONSTANT const OffsetT num_items)
diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index 59499f4c096..6a15a808927 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -118,7 +118,7 @@ template <typename ChainedPolicyT,
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyKernel(
     _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
-    KeyT* d_keys_prev_in,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_prev_in,
     _CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in,
     _CCCL_GRID_CONSTANT const ValuesOutputIteratorT d_values_out,
     ScanByKeyTileStateT tile_state,
diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
index a4c523211db..dee5e3d8d8f 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
@@ -178,22 +178,22 @@ template <typename WrappedPolicyT,
           typename EndOffsetIteratorT,
           typename KernelLauncherFactory>
 __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContinuationKernel(
-  LargeKernelT large_kernel,
-  SmallKernelT small_kernel,
-  local_segment_index_t num_segments,
-  KeyT* d_current_keys,
-  KeyT* d_final_keys,
+  _CCCL_GRID_CONSTANT const LargeKernelT large_kernel,
+  _CCCL_GRID_CONSTANT const SmallKernelT small_kernel,
+  _CCCL_GRID_CONSTANT const local_segment_index_t num_segments,
+  _CCCL_GRID_CONSTANT KeyT* const d_current_keys,
+  _CCCL_GRID_CONSTANT KeyT* const d_final_keys,
   device_double_buffer<KeyT> d_keys_double_buffer,
-  ValueT* d_current_values,
-  ValueT* d_final_values,
+  _CCCL_GRID_CONSTANT ValueT* const d_current_values,
+  _CCCL_GRID_CONSTANT ValueT* const d_final_values,
   device_double_buffer<ValueT> d_values_double_buffer,
-  BeginOffsetIteratorT d_begin_offsets,
-  EndOffsetIteratorT d_end_offsets,
-  local_segment_index_t* group_sizes,
-  local_segment_index_t* large_and_medium_segments_indices,
-  local_segment_index_t* small_segments_indices,
-  KernelLauncherFactory launcher_factory,
-  WrappedPolicyT wrapped_policy)
+  _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
+  _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
+  _CCCL_GRID_CONSTANT local_segment_index_t* const group_sizes,
+  _CCCL_GRID_CONSTANT local_segment_index_t* const large_and_medium_segments_indices,
+  _CCCL_GRID_CONSTANT local_segment_index_t* const small_segments_indices,
+  _CCCL_GRID_CONSTANT const KernelLauncherFactory launcher_factory,
+  _CCCL_GRID_CONSTANT const WrappedPolicyT wrapped_policy)
 {
   // In case of CDP:
   // 1. each CTA has a different main stream
diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh
index 624c5d598ad..7fc26327c65 100644
--- a/cub/cub/device/dispatch/dispatch_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_topk.cuh
@@ -116,12 +116,12 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads
     _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out,
     _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
     _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
-    KeyInT* in_buf,
-    OffsetT* in_idx_buf,
-    KeyInT* out_buf,
-    OffsetT* out_idx_buf,
+    _CCCL_GRID_CONSTANT KeyInT* const in_buf,
+    _CCCL_GRID_CONSTANT OffsetT* const in_idx_buf,
+    _CCCL_GRID_CONSTANT KeyInT* const out_buf,
+    _CCCL_GRID_CONSTANT OffsetT* const out_idx_buf,
     Counter<it_value_t<KeyInputIteratorT>, OffsetT, OutOffsetT>* counter,
-    OffsetT* histogram,
+    _CCCL_GRID_CONSTANT OffsetT* const histogram,
     _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const OutOffsetT k,
     _CCCL_GRID_CONSTANT const OffsetT buffer_length,
@@ -172,8 +172,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads
     _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out,
     _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
     _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
-    KeyInT* in_buf,
-    OffsetT* in_idx_buf,
+    _CCCL_GRID_CONSTANT KeyInT* const in_buf,
+    _CCCL_GRID_CONSTANT OffsetT* const in_idx_buf,
     Counter<it_value_t<KeyInputIteratorT>, OffsetT, OutOffsetT>* counter,
     _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const OutOffsetT k,
diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
index e21c6a2a8bb..3f428f7c7ba 100644
--- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
@@ -173,8 +173,8 @@ __launch_bounds__(
     _CCCL_GRID_CONSTANT const KeyIteratorT keys_out,
     _CCCL_GRID_CONSTANT const ValueIteratorT items_out,
     _CCCL_GRID_CONSTANT const OffsetT keys_count,
-    KeyT* tmp_keys_out,
-    ValueT* tmp_items_out,
+    _CCCL_GRID_CONSTANT KeyT* const tmp_keys_out,
+    _CCCL_GRID_CONSTANT ValueT* const tmp_items_out,
     CompareOpT compare_op,
     vsmem_t vsmem)
 {
@@ -223,10 +223,10 @@ template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename
 CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(
   _CCCL_GRID_CONSTANT const bool ping,
   _CCCL_GRID_CONSTANT const KeyIteratorT keys_ping,
-  KeyT* keys_pong,
+  _CCCL_GRID_CONSTANT KeyT* const keys_pong,
   _CCCL_GRID_CONSTANT const OffsetT keys_count,
   _CCCL_GRID_CONSTANT const OffsetT num_partitions,
-  OffsetT* merge_partitions,
+  _CCCL_GRID_CONSTANT OffsetT* const merge_partitions,
   CompareOpT compare_op,
   _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number,
   _CCCL_GRID_CONSTANT const int items_per_tile)
@@ -276,10 +276,10 @@ __launch_bounds__(
     _CCCL_GRID_CONSTANT const KeyIteratorT keys_ping,
     _CCCL_GRID_CONSTANT const ValueIteratorT items_ping,
     _CCCL_GRID_CONSTANT const OffsetT keys_count,
-    KeyT* keys_pong,
-    ValueT* items_pong,
+    _CCCL_GRID_CONSTANT KeyT* const keys_pong,
+    _CCCL_GRID_CONSTANT ValueT* const items_pong,
     CompareOpT compare_op,
-    OffsetT* merge_partitions,
+    _CCCL_GRID_CONSTANT OffsetT* const merge_partitions,
     _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number,
     vsmem_t vsmem)
 {
diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
index f18a3b0d633..51619687d0b 100644
--- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
@@ -84,7 +84,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp
                                        : int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel(
     _CCCL_GRID_CONSTANT const KeyT* const d_keys,
-    OffsetT* d_spine,
+    _CCCL_GRID_CONSTANT OffsetT* const d_spine,
     _CCCL_GRID_CONSTANT const OffsetT /*num_items*/,
     _CCCL_GRID_CONSTANT const int current_bit,
     _CCCL_GRID_CONSTANT const int num_bits,
@@ -144,7 +144,8 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp
  */
 template <typename ChainedPolicyT, typename OffsetT>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
-  CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, _CCCL_GRID_CONSTANT const int num_counts)
+  CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(
+    _CCCL_GRID_CONSTANT OffsetT* const d_spine, _CCCL_GRID_CONSTANT const int num_counts)
 {
   // Parameterize the AgentScan type for the current configuration
   using AgentScanT =
@@ -239,10 +240,10 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo
                                        : int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel(
     _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
-    KeyT* d_keys_out,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_out,
     _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
-    ValueT* d_values_out,
-    OffsetT* d_spine,
+    _CCCL_GRID_CONSTANT ValueT* const d_values_out,
+    _CCCL_GRID_CONSTANT OffsetT* const d_spine,
     _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const int current_bit,
     _CCCL_GRID_CONSTANT const int num_bits,
@@ -328,9 +329,9 @@ template <typename ChainedPolicyT,
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel(
     _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
-    KeyT* d_keys_out,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_out,
     _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
-    ValueT* d_values_out,
+    _CCCL_GRID_CONSTANT ValueT* const d_values_out,
     OffsetT num_items,
     _CCCL_GRID_CONSTANT const int current_bit,
     _CCCL_GRID_CONSTANT const int end_bit,
@@ -444,9 +445,7 @@ template <typename ChainedPolicyT,
           typename DecomposerT = identity_decomposer_t>
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel(
-  OffsetT* d_bins_out,
-  KeyT* d_keys_in,
-  OffsetT* d_bins_out,
+  _CCCL_GRID_CONSTANT OffsetT* const d_bins_out,
   _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
   _CCCL_GRID_CONSTANT const OffsetT num_items,
   _CCCL_GRID_CONSTANT const int start_bit,
@@ -470,13 +469,13 @@ template <typename ChainedPolicyT,
           typename DecomposerT   = identity_decomposer_t>
 CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS)
   DeviceRadixSortOnesweepKernel(
-    AtomicOffsetT* d_lookback,
-    AtomicOffsetT* d_ctrs,
-    OffsetT* d_bins_out,
+    _CCCL_GRID_CONSTANT AtomicOffsetT* const d_lookback,
+    _CCCL_GRID_CONSTANT AtomicOffsetT* const d_ctrs,
+    _CCCL_GRID_CONSTANT OffsetT* const d_bins_out,
     _CCCL_GRID_CONSTANT const OffsetT* const d_bins_in,
-    KeyT* d_keys_out,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_out,
     _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
-    ValueT* d_values_out,
+    _CCCL_GRID_CONSTANT ValueT* const d_values_out,
     _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
     _CCCL_GRID_CONSTANT const PortionOffsetT num_items,
     _CCCL_GRID_CONSTANT const int current_bit,
@@ -515,7 +514,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy
  * Exclusive sum kernel
  */
 template <typename ChainedPolicyT, typename OffsetT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins)
+CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(_CCCL_GRID_CONSTANT OffsetT* const d_bins)
 {
   using ExclusiveSumPolicyT     = typename ChainedPolicyT::ActivePolicy::ExclusiveSumPolicy;
   constexpr int RADIX_BITS      = ExclusiveSumPolicyT::RADIX_BITS;
diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
index cd2ccab79dc..5dc935c547e 100644
--- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
@@ -130,7 +130,7 @@ template <typename ChainedPolicyT,
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceReduceKernel(
   _CCCL_GRID_CONSTANT const InputIteratorT d_in,
-  AccumT* d_out,
+  _CCCL_GRID_CONSTANT AccumT* const d_out,
   _CCCL_GRID_CONSTANT const OffsetT num_items,
   GridEvenShare<OffsetT> even_share,
   ReductionOpT reduction_op,
@@ -215,7 +215,6 @@ template <typename ChainedPolicyT,
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS),
   1) void DeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
-<<<<<<< HEAD
                                        _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
                                        _CCCL_GRID_CONSTANT const OffsetT num_items,
                                        ReductionOpT reduction_op,
@@ -300,7 +299,7 @@ template <typename ChainedPolicyT, typename InputIteratorT, typename ReductionOp
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeterministicDeviceReduceKernel(
   _CCCL_GRID_CONSTANT const InputIteratorT d_in,
-  AccumT* d_out,
+  _CCCL_GRID_CONSTANT AccumT* const d_out,
   _CCCL_GRID_CONSTANT const int num_items,
   ReductionOpT reduction_op,
   TransformOpT transform_op,
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
index 25ffa659347..5ac1ad9cfb5 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
@@ -99,9 +99,9 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen
                                        : ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel(
     _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
-    KeyT* d_keys_out,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_out,
     _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
-    ValueT* d_values_out,
+    _CCCL_GRID_CONSTANT ValueT* const d_values_out,
     _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
     _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
     _CCCL_GRID_CONSTANT const int current_bit,
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
index 99135d9bb33..073afd3a1e3 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
@@ -310,9 +310,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallSegmentPolicy::BLOCK_THREAD
     _CCCL_GRID_CONSTANT const local_segment_index_t* const d_small_segments_indices,
     _CCCL_GRID_CONSTANT const local_segment_index_t* const d_medium_segments_indices,
     _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
-    KeyT* d_keys_out,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_out,
     _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
-    ValueT* d_values_out,
+    _CCCL_GRID_CONSTANT ValueT* const d_values_out,
     _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
     _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets)
 {
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
index 59fbb3b30a4..1a863935534 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
@@ -986,7 +986,7 @@ __launch_bounds__(get_block_threads<typename MaxPolicy::ActivePolicy>)
   CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel(
     _CCCL_GRID_CONSTANT const Offset num_items,
     _CCCL_GRID_CONSTANT const int num_elem_per_thread,
-    [[maybe_unused]] bool can_vectorize,
+    _CCCL_GRID_CONSTANT [[maybe_unused]] const bool can_vectorize,
     _CCCL_GRID_CONSTANT const Predicate pred,
     _CCCL_GRID_CONSTANT const F f,
     _CCCL_GRID_CONSTANT const RandomAccessIteratorOut out,

From 76c4b1094dc2671dd01b555531e95c0c91b0881e Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Thu, 4 Dec 2025 14:24:31 +0530
Subject: [PATCH 10/14] Minor Fixes

---
 .../device/dispatch/kernels/kernel_reduce.cuh | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
index 99655ef1826..6b6ee775642 100644
--- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
@@ -134,9 +134,9 @@ template <typename ArchPolicies,
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
   ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10})
-    .reduce_policy.block_threads)) void DeviceReduceKernel(InputIteratorT d_in,
-                                                           AccumT* d_out,
-                                                           OffsetT num_items,
+    .reduce_policy.block_threads)) void DeviceReduceKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
+                                                           _CCCL_GRID_CONSTANT AccumT* const d_out,
+                                                           _CCCL_GRID_CONSTANT const OffsetT num_items,
                                                            GridEvenShare<OffsetT> even_share,
                                                            ReductionOpT reduction_op,
                                                            TransformOpT transform_op)
@@ -227,9 +227,9 @@ template <typename ArchPolicies,
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   int(ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile_policy.block_threads),
-  1) void DeviceReduceSingleTileKernel(InputIteratorT d_in,
-                                       OutputIteratorT d_out,
-                                       OffsetT num_items,
+  1) void DeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
+                                       _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
+                                       _CCCL_GRID_CONSTANT const OffsetT num_items,
                                        ReductionOpT reduction_op,
                                        _CCCL_GRID_CONSTANT const InitT init,
                                        TransformOpT transform_op)
@@ -520,12 +520,12 @@ template <typename ArchPolicies,
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
   ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10})
     .reduce_nondeterministic_policy
-    .block_threads)) void NondeterministicDeviceReduceAtomicKernel(InputIteratorT d_in,
-                                                                   OutputIteratorT d_out,
-                                                                   OffsetT num_items,
+    .block_threads)) void NondeterministicDeviceReduceAtomicKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
+                                                                   _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
+                                                                   _CCCL_GRID_CONSTANT const OffsetT num_items,
                                                                    GridEvenShare<OffsetT> even_share,
                                                                    ReductionOpT reduction_op,
-                                                                   InitT init,
+                                                                   _CCCL_GRID_CONSTANT const InitT init,
                                                                    TransformOpT transform_op)
 {
   NV_IF_TARGET(NV_PROVIDES_SM_60,

From b2bf44a506c539ef2abd861c6dd4a7c03bf156a3 Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Sat, 13 Dec 2025 11:33:06 +0530
Subject: [PATCH 11/14] Add _CCCL_GRID_CONSTANT const to ::cuda::std::array for
 Histogram

---
 cub/cub/agent/agent_histogram.cuh             | 16 +++++------
 .../dispatch/kernels/kernel_histogram.cuh     | 28 +++++++++----------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh
index d49aecdc75d..7e2112e5e2b 100644
--- a/cub/cub/agent/agent_histogram.cuh
+++ b/cub/cub/agent/agent_histogram.cuh
@@ -203,12 +203,12 @@ struct AgentHistogram
   _TempStorage& temp_storage;
   WrappedSampleIteratorT d_wrapped_samples; // with cache modifier applied, if possible
   SampleT* d_native_samples; // possibly nullptr if unavailable
-  int* num_output_bins; // one for each channel
-  int* num_privatized_bins; // one for each channel
+  const int* num_output_bins; // one for each channel (read-only)
+  const int* num_privatized_bins; // one for each channel (read-only)
   CounterT* d_privatized_histograms[NumActiveChannels]; // one for each channel
   CounterT** d_output_histograms; // in global memory
-  OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel
-  PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel
+  const OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel (read-only)
+  const PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel (read-only)
   bool prefer_smem; // for privatized counterss
 
   template <typename TwoDimSubscriptableCounterT>
@@ -564,12 +564,12 @@ struct AgentHistogram
   _CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram(
     TempStorage& temp_storage,
     SampleIteratorT d_samples,
-    int* num_output_bins,
-    int* num_privatized_bins,
+    const int* num_output_bins,
+    const int* num_privatized_bins,
     CounterT** d_output_histograms,
     CounterT** d_privatized_histograms,
-    OutputDecodeOpT* output_decode_op,
-    PrivatizedDecodeOpT* privatized_decode_op)
+    const OutputDecodeOpT* output_decode_op,
+    const PrivatizedDecodeOpT* privatized_decode_op)
       : temp_storage(temp_storage.Alias())
       , d_wrapped_samples(d_samples)
       , d_native_samples(NativePointer(d_wrapped_samples))
diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
index bf86c9be4b2..77df86d4259 100644
--- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
@@ -47,7 +47,7 @@ struct Transforms
 
     // Method for converting samples to bin-ids
     template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) const
     {
       /// Level iterator wrapper type
       // Wrap the native input pointer with CacheModifiedInputIterator
@@ -185,13 +185,13 @@ struct Transforms
 
     // All types but __half:
     template <typename T>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(T sample, T max_level, T min_level)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(T sample, T max_level, T min_level) const
     {
       return sample >= min_level && sample < max_level;
     }
 
 #if _CCCL_HAS_NVFP16()
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(__half sample, __half max_level, __half min_level)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(__half sample, __half max_level, __half min_level) const
     {
       NV_IF_TARGET(
         NV_PROVIDES_SM_53,
@@ -214,7 +214,7 @@ struct Transforms
     //! @brief Bin computation for floating point (and extended floating point) types
     template <typename T>
     _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int
-    ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::true_type /* is_fp */)
+    ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::true_type /* is_fp */) const
     {
       return static_cast<int>((sample - min_level) * scale.reciprocal);
     }
@@ -222,14 +222,14 @@ struct Transforms
     //! @brief Bin computation for custom types and __[u]int128
     template <typename T>
     _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int
-    ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::false_type /* is_fp */)
+    ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::false_type /* is_fp */) const
     {
       return static_cast<int>(((sample - min_level) * scale.fraction.bins) / scale.fraction.range);
     }
 
     //! @brief Bin computation for integral types of up to 64-bit types
     template <typename T, ::cuda::std::enable_if_t<is_integral_excl_int128<T>::value, int> = 0>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) const
     {
       return static_cast<int>(
         (static_cast<IntArithmeticT>(sample - min_level) * static_cast<IntArithmeticT>(scale.fraction.bins))
@@ -237,13 +237,13 @@ struct Transforms
     }
 
     template <typename T, ::cuda::std::enable_if_t<!is_integral_excl_int128<T>::value, int> = 0>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) const
     {
       return this->ComputeBin(sample, min_level, scale, ::cuda::std::is_floating_point<T>{});
     }
 
 #if _CCCL_HAS_NVFP16()
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(__half sample, __half min_level, ScaleT scale)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(__half sample, __half min_level, ScaleT scale) const
     {
       NV_IF_TARGET(
         NV_PROVIDES_SM_53,
@@ -264,7 +264,7 @@ struct Transforms
 
     // Method for converting samples to bin-ids
     template <CacheLoadModifier LOAD_MODIFIER>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid) const
     {
       const CommonT common_sample = static_cast<CommonT>(sample);
 
@@ -296,7 +296,7 @@ struct Transforms
 
     // Method for converting samples to bin-ids
     template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) const
     {
       if (valid)
       {
@@ -445,12 +445,12 @@ template <typename ChainedPolicyT,
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramSweepKernel(
     _CCCL_GRID_CONSTANT const SampleIteratorT d_samples,
-    ::cuda::std::array<int, NumActiveChannels> num_output_bins_wrapper,
-    ::cuda::std::array<int, NumActiveChannels> num_privatized_bins_wrapper,
+    _CCCL_GRID_CONSTANT const ::cuda::std::array<int, NumActiveChannels> num_output_bins_wrapper,
+    _CCCL_GRID_CONSTANT const ::cuda::std::array<int, NumActiveChannels> num_privatized_bins_wrapper,
     ::cuda::std::array<CounterT*, NumActiveChannels> d_output_histograms_wrapper,
     ::cuda::std::array<CounterT*, NumActiveChannels> d_privatized_histograms_wrapper,
-    ::cuda::std::array<OutputDecodeOpT, NumActiveChannels> output_decode_op_wrapper,
-    ::cuda::std::array<PrivatizedDecodeOpT, NumActiveChannels> privatized_decode_op_wrapper,
+    _CCCL_GRID_CONSTANT const ::cuda::std::array<OutputDecodeOpT, NumActiveChannels> output_decode_op_wrapper,
+    _CCCL_GRID_CONSTANT const ::cuda::std::array<PrivatizedDecodeOpT, NumActiveChannels> privatized_decode_op_wrapper,
     _CCCL_GRID_CONSTANT const OffsetT num_row_pixels,
     _CCCL_GRID_CONSTANT const OffsetT num_rows,
     _CCCL_GRID_CONSTANT const OffsetT row_stride_samples,

From 77363bf59f87b375c823ee3d725dfaca2fc1b68a Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Sat, 13 Dec 2025 11:34:00 +0530
Subject: [PATCH 12/14] Add _CCCL_GRID_CONSTANT const to ::cuda::std::array for
 Histogram

---
 cub/cub/agent/agent_histogram.cuh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh
index 7e2112e5e2b..683693821f7 100644
--- a/cub/cub/agent/agent_histogram.cuh
+++ b/cub/cub/agent/agent_histogram.cuh
@@ -207,8 +207,10 @@ struct AgentHistogram
   const int* num_privatized_bins; // one for each channel (read-only)
   CounterT* d_privatized_histograms[NumActiveChannels]; // one for each channel
   CounterT** d_output_histograms; // in global memory
-  const OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel (read-only)
-  const PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel (read-only)
+  const OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each
+                                           // channel (read-only)
+  const PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each
+                                                   // channel (read-only)
   bool prefer_smem; // for privatized counterss
 
   template <typename TwoDimSubscriptableCounterT>

From a234d17b9953501342e525e13fb7f557ee086018 Mon Sep 17 00:00:00 2001
From: Aryamaan Singh <aryamaan.singh05@gmail.com>
Date: Mon, 2 Feb 2026 17:22:46 +0530
Subject: [PATCH 13/14] Fixes and Updates

---
 .../dispatch/kernels/kernel_radix_sort.cuh     | 17 +++++++++--------
 .../device/dispatch/kernels/kernel_reduce.cuh  | 18 +++++++++---------
 .../dispatch/kernels/kernel_transform.cuh      | 12 ++++++------
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
index 9d888eccd45..7c32b0ffd21 100644
--- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
@@ -143,7 +143,8 @@ __launch_bounds__(int(ALT_DIGIT_BITS ? PolicySelector{}(::cuda::arch_id{CUB_PTX_
  */
 template <typename PolicySelector, typename OffsetT>
 __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).scan.block_threads, 1)
-  CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, int num_counts)
+  CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(
+    _CCCL_GRID_CONSTANT OffsetT* const d_spine, _CCCL_GRID_CONSTANT const int num_counts)
 {
   static constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).scan;
   using ScanPolicy                    = AgentScanPolicy<
@@ -451,12 +452,12 @@ template <typename PolicySelector,
           typename DecomposerT = identity_decomposer_t>
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10})
-    .histogram.block_threads) void DeviceRadixSortHistogramKernel(OffsetT* d_bins_out,
-                                                                  const KeyT* d_keys_in,
-                                                                  OffsetT num_items,
-                                                                  int start_bit,
-                                                                  int end_bit,
-                                                                  DecomposerT decomposer = {})
+    .histogram.block_threads) void DeviceRadixSortHistogramKernel(_CCCL_GRID_CONSTANT OffsetT* const d_bins_out,
+                                                                  _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
+                                                                  _CCCL_GRID_CONSTANT const OffsetT num_items,
+                                                                  _CCCL_GRID_CONSTANT const int start_bit,
+                                                                  _CCCL_GRID_CONSTANT const int end_bit,
+                                                                  _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   static constexpr radix_sort_histogram_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).histogram;
 
@@ -535,7 +536,7 @@ __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).onesweep.
  * Exclusive sum kernel
  */
 template <typename PolicySelector, typename OffsetT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins)
+CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(_CCCL_GRID_CONSTANT OffsetT* const d_bins)
 {
   static constexpr radix_sort_exclusive_sum_policy policy =
     PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).exclusive_sum;
diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
index 50b0f1c0d5c..6ca08016d7a 100644
--- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
@@ -134,9 +134,9 @@ template <typename PolicySelector,
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce.block_threads)) void DeviceReduceKernel(
-  InputIteratorT d_in,
-  AccumT* d_out,
-  OffsetT num_items,
+  _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+  _CCCL_GRID_CONSTANT AccumT* const d_out,
+  _CCCL_GRID_CONSTANT const OffsetT num_items,
   GridEvenShare<OffsetT> even_share,
   ReductionOpT reduction_op,
   TransformOpT transform_op)
@@ -227,9 +227,9 @@ template <typename PolicySelector,
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile.block_threads),
-  1) void DeviceReduceSingleTileKernel(InputIteratorT d_in,
-                                       OutputIteratorT d_out,
-                                       OffsetT num_items,
+  1) void DeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
+                                       _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
+                                       _CCCL_GRID_CONSTANT const OffsetT num_items,
                                        ReductionOpT reduction_op,
                                        _CCCL_GRID_CONSTANT const InitT init,
                                        TransformOpT transform_op)
@@ -520,9 +520,9 @@ template <typename PolicySelector,
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
   PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10})
     .reduce_nondeterministic
-    .block_threads)) void NondeterministicDeviceReduceAtomicKernel(InputIteratorT d_in,
-                                                                   OutputIteratorT d_out,
-                                                                   OffsetT num_items,
+    .block_threads)) void NondeterministicDeviceReduceAtomicKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
+                                                                   _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
+                                                                   _CCCL_GRID_CONSTANT const OffsetT num_items,
                                                                    GridEvenShare<OffsetT> even_share,
                                                                    ReductionOpT reduction_op,
                                                                    _CCCL_GRID_CONSTANT const InitT init,
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
index 30b78d491b0..0a4b1958c5e 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
@@ -1013,12 +1013,12 @@ template <typename PolicySelector,
   requires transform_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
 __launch_bounds__(get_block_threads<PolicySelector>) CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel(
-  Offset num_items,
-  int num_elem_per_thread,
-  [[maybe_unused]] bool can_vectorize,
-  Predicate pred,
-  F f,
-  RandomAccessIteratorOut out,
+  _CCCL_GRID_CONSTANT const Offset num_items,
+  _CCCL_GRID_CONSTANT const int num_elem_per_thread,
+  _CCCL_GRID_CONSTANT [[maybe_unused]] const bool can_vectorize,
+  _CCCL_GRID_CONSTANT const Predicate pred,
+  _CCCL_GRID_CONSTANT const F f,
+  _CCCL_GRID_CONSTANT const RandomAccessIteratorOut out,
   kernel_arg<RandomAccessIteratorsIn>... ins)
 {
   _CCCL_ASSERT(blockDim.y == 1 && blockDim.z == 1, "transform_kernel only supports 1D blocks");

From 967ce0b5be7cc105744c40ea47278993993849b9 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Fri, 6 Feb 2026 16:23:52 +0100
Subject: [PATCH 14/14] FIx

---
 cub/cub/device/dispatch/dispatch_adjacent_difference.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index f792d59723d..6e885c59e99 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -34,7 +34,7 @@ namespace detail::adjacent_difference
 template <typename AgentDifferenceInitT, typename InputIteratorT, typename InputT, typename OffsetT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(
   _CCCL_GRID_CONSTANT const InputIteratorT first,
-  _CCCL_GRID_CONSTANT const InputT* result,
+  _CCCL_GRID_CONSTANT InputT* const result,
   _CCCL_GRID_CONSTANT const OffsetT num_tiles,
   _CCCL_GRID_CONSTANT const int items_per_tile)
 {