diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh
index 4c4ba87ed92..568297418ee 100644
--- a/cub/cub/agent/agent_histogram.cuh
+++ b/cub/cub/agent/agent_histogram.cuh
@@ -226,12 +226,14 @@ struct AgentHistogram
   _TempStorage& temp_storage;
   WrappedSampleIteratorT d_wrapped_samples; // with cache modifier applied, if possible
   SampleT* d_native_samples; // possibly nullptr if unavailable
-  int* num_output_bins; // one for each channel
-  int* num_privatized_bins; // one for each channel
+  const int* num_output_bins; // one for each channel
+  const int* num_privatized_bins; // one for each channel
   CounterT* d_privatized_histograms[NumActiveChannels]; // one for each channel
   CounterT** d_output_histograms; // in global memory
-  OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel
-  PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel
+  const OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each
+                                           // channel
+  const PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each
+                                                   // channel
   bool prefer_smem; // for privatized counterss
 
   template <typename TwoDimSubscriptableCounterT>
@@ -587,12 +589,12 @@ struct AgentHistogram
   _CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram(
     TempStorage& temp_storage,
     SampleIteratorT d_samples,
-    int* num_output_bins,
-    int* num_privatized_bins,
+    const int* num_output_bins,
+    const int* num_privatized_bins,
     CounterT** d_output_histograms,
     CounterT** d_privatized_histograms,
-    OutputDecodeOpT* output_decode_op,
-    PrivatizedDecodeOpT* privatized_decode_op)
+    const OutputDecodeOpT* output_decode_op,
+    const PrivatizedDecodeOpT* privatized_decode_op)
       : temp_storage(temp_storage.Alias())
       , d_wrapped_samples(d_samples)
       , d_native_samples(NativePointer(d_wrapped_samples))
diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index 95e88e71050..ccdc9231c68 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -39,8 +39,11 @@ CUB_NAMESPACE_BEGIN
 namespace detail::adjacent_difference
 {
 template <typename AgentDifferenceInitT, typename InputIteratorT, typename InputT, typename OffsetT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void
-DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile)
+CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(
+  _CCCL_GRID_CONSTANT const InputIteratorT first,
+  _CCCL_GRID_CONSTANT InputT* const result,
+  _CCCL_GRID_CONSTANT const OffsetT num_tiles,
+  _CCCL_GRID_CONSTANT const int items_per_tile)
 {
   const int tile_idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
   AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile);
@@ -55,11 +58,11 @@ template <typename PolicySelector,
           bool MayAlias,
           bool ReadLeft>
 CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(
-  InputIteratorT input,
-  InputT* first_tile_previous,
-  OutputIteratorT result,
+  _CCCL_GRID_CONSTANT const InputIteratorT input,
+  _CCCL_GRID_CONSTANT InputT* const first_tile_previous,
+  _CCCL_GRID_CONSTANT const OutputIteratorT result,
   DifferenceOpT difference_op,
-  OffsetT num_items)
+  _CCCL_GRID_CONSTANT const OffsetT num_items)
 {
   static_assert(::cuda::std::is_empty_v<PolicySelector>);
   static constexpr adjacent_difference_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index 33f73fa6a56..312278ba33e 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -61,7 +61,7 @@ template <typename BufferOffsetScanTileStateT, typename BlockOffsetScanTileState
 CUB_DETAIL_KERNEL_ATTRIBUTES void InitTileStateKernel(
   BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
   BlockOffsetScanTileStateT block_offset_scan_tile_state,
-  TileOffsetT num_tiles)
+  _CCCL_GRID_CONSTANT const TileOffsetT num_tiles)
 {
   // Initialize tile status
   buffer_offset_scan_tile_state.InitializeStatus(num_tiles);
@@ -83,12 +83,12 @@ template <typename ChainedPolicyT,
           CopyAlg MemcpyOpt>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void MultiBlockBatchMemcpyKernel(
-    InputBufferIt input_buffer_it,
-    OutputBufferIt output_buffer_it,
-    BufferSizeIteratorT buffer_sizes,
-    BufferTileOffsetItT buffer_tile_offsets,
+    _CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it,
+    _CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it,
+    _CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes,
+    _CCCL_GRID_CONSTANT const BufferTileOffsetItT buffer_tile_offsets,
     TileT buffer_offset_tile,
-    TileOffsetT last_tile_offset)
+    _CCCL_GRID_CONSTANT const TileOffsetT last_tile_offset)
 {
   using StatusWord    = typename TileT::StatusWord;
   using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT;
@@ -210,16 +210,16 @@ template <typename ChainedPolicyT,
           CopyAlg MemcpyOpt>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void BatchMemcpyKernel(
-    InputBufferIt input_buffer_it,
-    OutputBufferIt output_buffer_it,
-    BufferSizeIteratorT buffer_sizes,
-    BufferOffsetT num_buffers,
-    BlevBufferSrcsOutItT blev_buffer_srcs,
-    BlevBufferDstsOutItT blev_buffer_dsts,
-    BlevBufferSizesOutItT blev_buffer_sizes,
-    BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
-    BLevBufferOffsetTileState blev_buffer_scan_state,
-    BLevBlockOffsetTileState blev_block_scan_state)
+    _CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it,
+    _CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it,
+    _CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes,
+    _CCCL_GRID_CONSTANT const BufferOffsetT num_buffers,
+    _CCCL_GRID_CONSTANT const BlevBufferSrcsOutItT blev_buffer_srcs,
+    _CCCL_GRID_CONSTANT const BlevBufferDstsOutItT blev_buffer_dsts,
+    _CCCL_GRID_CONSTANT const BlevBufferSizesOutItT blev_buffer_sizes,
+    _CCCL_GRID_CONSTANT const BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
+    _CCCL_GRID_CONSTANT const BLevBufferOffsetTileState blev_buffer_scan_state,
+    _CCCL_GRID_CONSTANT const BLevBlockOffsetTileState blev_block_scan_state)
 {
   // Internal type used for storing a buffer's size
   using BufferSizeT = it_value_t<BufferSizeIteratorT>;
diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh
index 7f5a13550b9..2fc376d674e 100644
--- a/cub/cub/device/dispatch/dispatch_merge.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge.cuh
@@ -89,11 +89,11 @@ template <typename PolicySelector,
           typename Offset,
           typename CompareOp>
 CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel(
-  KeyIt1 keys1,
-  Offset keys1_count,
-  KeyIt2 keys2,
-  Offset keys2_count,
-  Offset num_diagonals,
+  _CCCL_GRID_CONSTANT const KeyIt1 keys1,
+  _CCCL_GRID_CONSTANT const Offset keys1_count,
+  _CCCL_GRID_CONSTANT const KeyIt2 keys2,
+  _CCCL_GRID_CONSTANT const Offset keys2_count,
+  _CCCL_GRID_CONSTANT const Offset num_diagonals,
   Offset* key1_beg_offsets,
   CompareOp compare_op)
 {
@@ -137,14 +137,14 @@ __launch_bounds__(
                      Offset,
                      CompareOp>::type::block_threads)
   CUB_DETAIL_KERNEL_ATTRIBUTES void device_merge_kernel(
-    KeyIt1 keys1,
-    ValueIt1 items1,
-    Offset num_keys1,
-    KeyIt2 keys2,
-    ValueIt2 items2,
-    Offset num_keys2,
-    KeyIt3 keys_result,
-    ValueIt3 items_result,
+    _CCCL_GRID_CONSTANT const KeyIt1 keys1,
+    _CCCL_GRID_CONSTANT const ValueIt1 items1,
+    _CCCL_GRID_CONSTANT const Offset num_keys1,
+    _CCCL_GRID_CONSTANT const KeyIt2 keys2,
+    _CCCL_GRID_CONSTANT const ValueIt2 items2,
+    _CCCL_GRID_CONSTANT const Offset num_keys2,
+    _CCCL_GRID_CONSTANT const KeyIt3 keys_result,
+    _CCCL_GRID_CONSTANT const ValueIt3 items_result,
     CompareOp compare_op,
     Offset* key1_beg_offsets,
     vsmem_t global_temp_storage)
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 14fbf3ae15c..e884a5472ed 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -190,16 +190,16 @@ template <typename PolicySelector,
 #endif
 __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel(
-    KeysInputIteratorT d_keys_in,
-    UniqueOutputIteratorT d_unique_out,
-    ValuesInputIteratorT d_values_in,
-    AggregatesOutputIteratorT d_aggregates_out,
-    NumRunsOutputIteratorT d_num_runs_out,
+    _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
+    _CCCL_GRID_CONSTANT const UniqueOutputIteratorT d_unique_out,
+    _CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in,
+    _CCCL_GRID_CONSTANT const AggregatesOutputIteratorT d_aggregates_out,
+    _CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out,
     ScanTileStateT tile_state,
-    int start_tile,
+    _CCCL_GRID_CONSTANT const int start_tile,
     EqualityOpT equality_op,
     ReductionOpT reduction_op,
-    OffsetT num_items,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
     vsmem_t vsmem)
 {
diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh
index 2854b88833c..3d4c010782f 100644
--- a/cub/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/cub/device/dispatch/dispatch_rle.cuh
@@ -178,14 +178,14 @@ template <typename PolicySelector,
 #endif // _CCCL_HAS_CONCEPTS()
 __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRleSweepKernel(
-    InputIteratorT d_in,
-    OffsetsOutputIteratorT d_offsets_out,
-    LengthsOutputIteratorT d_lengths_out,
-    NumRunsOutputIteratorT d_num_runs_out,
+    _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+    _CCCL_GRID_CONSTANT const OffsetsOutputIteratorT d_offsets_out,
+    _CCCL_GRID_CONSTANT const LengthsOutputIteratorT d_lengths_out,
+    _CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out,
     ScanTileStateT tile_status,
     EqualityOpT equality_op,
-    OffsetT num_items,
-    int num_tiles,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int num_tiles,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context)
 {
   static constexpr non_trivial_runs::rle_non_trivial_runs_policy policy =
diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index f8772d121b3..c4ba7e75eaf 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -120,16 +120,16 @@ template <typename ChainedPolicyT,
           typename KeyT = cub::detail::it_value_t<KeysInputIteratorT>>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyKernel(
-    KeysInputIteratorT d_keys_in,
-    KeyT* d_keys_prev_in,
-    ValuesInputIteratorT d_values_in,
-    ValuesOutputIteratorT d_values_out,
+    _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_prev_in,
+    _CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in,
+    _CCCL_GRID_CONSTANT const ValuesOutputIteratorT d_values_out,
     ScanByKeyTileStateT tile_state,
-    int start_tile,
+    _CCCL_GRID_CONSTANT const int start_tile,
     EqualityOp equality_op,
-    ScanOpT scan_op,
-    InitValueT init_value,
-    OffsetT num_items)
+    _CCCL_GRID_CONSTANT const ScanOpT scan_op,
+    _CCCL_GRID_CONSTANT const InitValueT init_value,
+    _CCCL_GRID_CONSTANT const OffsetT num_items)
 {
   using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT;
 
@@ -156,10 +156,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THRE
 template <typename ScanTileStateT, typename KeysInputIteratorT, typename OffsetT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyInitKernel(
   ScanTileStateT tile_state,
-  KeysInputIteratorT d_keys_in,
+  _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
   cub::detail::it_value_t<KeysInputIteratorT>* d_keys_prev_in,
-  OffsetT items_per_tile,
-  int num_tiles)
+  _CCCL_GRID_CONSTANT const OffsetT items_per_tile,
+  _CCCL_GRID_CONSTANT const int num_tiles)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);
diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
index 728ec456f88..6404054aa02 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
@@ -183,25 +183,25 @@ template <typename LargeKernelT,
           typename EndOffsetIteratorT,
           typename KernelLauncherFactory>
 __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContinuationKernel(
-  LargeKernelT large_kernel,
-  SmallKernelT small_kernel,
-  local_segment_index_t num_segments,
-  KeyT* d_current_keys,
-  KeyT* d_final_keys,
+  _CCCL_GRID_CONSTANT const LargeKernelT large_kernel,
+  _CCCL_GRID_CONSTANT const SmallKernelT small_kernel,
+  _CCCL_GRID_CONSTANT const local_segment_index_t num_segments,
+  _CCCL_GRID_CONSTANT KeyT* const d_current_keys,
+  _CCCL_GRID_CONSTANT KeyT* const d_final_keys,
   device_double_buffer<KeyT> d_keys_double_buffer,
-  ValueT* d_current_values,
-  ValueT* d_final_values,
+  _CCCL_GRID_CONSTANT ValueT* const d_current_values,
+  _CCCL_GRID_CONSTANT ValueT* const d_final_values,
   device_double_buffer<ValueT> d_values_double_buffer,
-  BeginOffsetIteratorT d_begin_offsets,
-  EndOffsetIteratorT d_end_offsets,
-  local_segment_index_t* group_sizes,
-  local_segment_index_t* large_and_medium_segments_indices,
-  local_segment_index_t* small_segments_indices,
-  KernelLauncherFactory launcher_factory,
-  int large_block_threads,
-  int small_block_threads,
-  int medium_segments_per_block,
-  int small_segments_per_block)
+  _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
+  _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
+  _CCCL_GRID_CONSTANT local_segment_index_t* const group_sizes,
+  _CCCL_GRID_CONSTANT local_segment_index_t* const large_and_medium_segments_indices,
+  _CCCL_GRID_CONSTANT local_segment_index_t* const small_segments_indices,
+  _CCCL_GRID_CONSTANT const KernelLauncherFactory launcher_factory,
+  _CCCL_GRID_CONSTANT const int large_block_threads,
+  _CCCL_GRID_CONSTANT const int small_block_threads,
+  _CCCL_GRID_CONSTANT const int medium_segments_per_block,
+  _CCCL_GRID_CONSTANT const int small_segments_per_block)
 {
   // In case of CDP:
   // 1. each CTA has a different main stream
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index f70c3e2189a..2f2d611ba8d 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -315,15 +315,15 @@ __launch_bounds__(int(
     OffsetT,
     StreamingContextT>::agent_policy_t::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSelectSweepKernel(
-    InputIteratorT d_in,
-    FlagsInputIteratorT d_flags,
-    SelectedOutputIteratorT d_selected_out,
-    NumSelectedIteratorT d_num_selected_out,
+    _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+    _CCCL_GRID_CONSTANT const FlagsInputIteratorT d_flags,
+    _CCCL_GRID_CONSTANT const SelectedOutputIteratorT d_selected_out,
+    _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_status,
     SelectOpT select_op,
     EqualityOpT equality_op,
-    OffsetT num_items,
-    int num_tiles,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int num_tiles,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
     vsmem_t vsmem)
 {
diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh
index 543bceabcee..1c15e99b6a9 100644
--- a/cub/cub/device/dispatch/dispatch_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_topk.cuh
@@ -112,22 +112,22 @@ template <typename PolicySelector,
 #endif // _CCCL_HAS_CONCEPTS()
 __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceTopKKernel(
-    const KeyInputIteratorT d_keys_in,
-    KeyOutputIteratorT d_keys_out,
-    const ValueInputIteratorT d_values_in,
-    ValueOutputIteratorT d_values_out,
-    KeyInT* in_buf,
-    OffsetT* in_idx_buf,
-    KeyInT* out_buf,
-    OffsetT* out_idx_buf,
+    _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in,
+    _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out,
+    _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
+    _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
+    _CCCL_GRID_CONSTANT KeyInT* const in_buf,
+    _CCCL_GRID_CONSTANT OffsetT* const in_idx_buf,
+    _CCCL_GRID_CONSTANT KeyInT* const out_buf,
+    _CCCL_GRID_CONSTANT OffsetT* const out_idx_buf,
     Counter<it_value_t<KeyInputIteratorT>, OffsetT, OutOffsetT>* counter,
-    OffsetT* histogram,
-    OffsetT num_items,
-    OutOffsetT k,
-    OffsetT buffer_length,
+    _CCCL_GRID_CONSTANT OffsetT* const histogram,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const OutOffsetT k,
+    _CCCL_GRID_CONSTANT const OffsetT buffer_length,
     ExtractBinOpT extract_bin_op,
     IdentifyCandidatesOpT identify_candidates_op,
-    int pass)
+    _CCCL_GRID_CONSTANT const int pass)
 {
   static constexpr topk_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
   using agent_topk_policy_t =
@@ -177,18 +177,18 @@ template <typename PolicySelector,
 #endif // _CCCL_HAS_CONCEPTS()
 __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceTopKLastFilterKernel(
-    const KeyInputIteratorT d_keys_in,
-    KeyOutputIteratorT d_keys_out,
-    const ValueInputIteratorT d_values_in,
-    ValueOutputIteratorT d_values_out,
-    KeyInT* in_buf,
-    OffsetT* in_idx_buf,
+    _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in,
+    _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out,
+    _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
+    _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
+    _CCCL_GRID_CONSTANT KeyInT* const in_buf,
+    _CCCL_GRID_CONSTANT OffsetT* const in_idx_buf,
     Counter<it_value_t<KeyInputIteratorT>, OffsetT, OutOffsetT>* counter,
-    OffsetT num_items,
-    OutOffsetT k,
-    OffsetT buffer_length,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const OutOffsetT k,
+    _CCCL_GRID_CONSTANT const OffsetT buffer_length,
     IdentifyCandidatesOpT identify_candidates_op,
-    int pass)
+    _CCCL_GRID_CONSTANT const int pass)
 {
   static constexpr topk_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
   using agent_topk_policy_t =
diff --git a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
index d98683ca964..307f5275061 100644
--- a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh
@@ -90,7 +90,7 @@ template <class PolicySelector, class OffsetT, class OpT>
 #if _CCCL_HAS_CONCEPTS()
   requires for_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
-CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
+CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, OpT op)
 {
   static constexpr for_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
   using agent_policy_t               = policy_t<policy.block_threads, policy.items_per_thread>;
@@ -119,7 +119,7 @@ template <class PolicySelector, class OffsetT, class OpT>
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_DETAIL_KERNEL_ATTRIBUTES //
 __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads)) //
-  void static_kernel(OffsetT num_items, OpT op)
+  void static_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, OpT op)
 {
   static constexpr for_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
   using agent_policy_t               = policy_t<policy.block_threads, policy.items_per_thread>;
diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
index bc8f17f1e39..77df86d4259 100644
--- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
@@ -47,7 +47,7 @@ struct Transforms
 
     // Method for converting samples to bin-ids
     template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) const
     {
       /// Level iterator wrapper type
       // Wrap the native input pointer with CacheModifiedInputIterator
@@ -185,13 +185,13 @@ struct Transforms
 
     // All types but __half:
     template <typename T>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(T sample, T max_level, T min_level)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(T sample, T max_level, T min_level) const
     {
       return sample >= min_level && sample < max_level;
     }
 
 #if _CCCL_HAS_NVFP16()
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(__half sample, __half max_level, __half min_level)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(__half sample, __half max_level, __half min_level) const
     {
       NV_IF_TARGET(
         NV_PROVIDES_SM_53,
@@ -214,7 +214,7 @@ struct Transforms
     //! @brief Bin computation for floating point (and extended floating point) types
     template <typename T>
     _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int
-    ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::true_type /* is_fp */)
+    ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::true_type /* is_fp */) const
     {
       return static_cast<int>((sample - min_level) * scale.reciprocal);
     }
@@ -222,14 +222,14 @@ struct Transforms
     //! @brief Bin computation for custom types and __[u]int128
     template <typename T>
     _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int
-    ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::false_type /* is_fp */)
+    ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::false_type /* is_fp */) const
     {
       return static_cast<int>(((sample - min_level) * scale.fraction.bins) / scale.fraction.range);
     }
 
     //! @brief Bin computation for integral types of up to 64-bit types
     template <typename T, ::cuda::std::enable_if_t<is_integral_excl_int128<T>::value, int> = 0>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) const
     {
       return static_cast<int>(
         (static_cast<IntArithmeticT>(sample - min_level) * static_cast<IntArithmeticT>(scale.fraction.bins))
@@ -237,13 +237,13 @@ struct Transforms
     }
 
     template <typename T, ::cuda::std::enable_if_t<!is_integral_excl_int128<T>::value, int> = 0>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) const
     {
       return this->ComputeBin(sample, min_level, scale, ::cuda::std::is_floating_point<T>{});
     }
 
 #if _CCCL_HAS_NVFP16()
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(__half sample, __half min_level, ScaleT scale)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(__half sample, __half min_level, ScaleT scale) const
     {
       NV_IF_TARGET(
         NV_PROVIDES_SM_53,
@@ -264,7 +264,7 @@ struct Transforms
 
     // Method for converting samples to bin-ids
     template <CacheLoadModifier LOAD_MODIFIER>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid) const
     {
       const CommonT common_sample = static_cast<CommonT>(sample);
 
@@ -296,7 +296,7 @@ struct Transforms
 
     // Method for converting samples to bin-ids
     template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid)
+    _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) const
     {
       if (valid)
       {
@@ -444,17 +444,17 @@ template <typename ChainedPolicyT,
           typename OffsetT>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramSweepKernel(
-    SampleIteratorT d_samples,
-    ::cuda::std::array<int, NumActiveChannels> num_output_bins_wrapper,
-    ::cuda::std::array<int, NumActiveChannels> num_privatized_bins_wrapper,
+    _CCCL_GRID_CONSTANT const SampleIteratorT d_samples,
+    _CCCL_GRID_CONSTANT const ::cuda::std::array<int, NumActiveChannels> num_output_bins_wrapper,
+    _CCCL_GRID_CONSTANT const ::cuda::std::array<int, NumActiveChannels> num_privatized_bins_wrapper,
     ::cuda::std::array<CounterT*, NumActiveChannels> d_output_histograms_wrapper,
     ::cuda::std::array<CounterT*, NumActiveChannels> d_privatized_histograms_wrapper,
-    ::cuda::std::array<OutputDecodeOpT, NumActiveChannels> output_decode_op_wrapper,
-    ::cuda::std::array<PrivatizedDecodeOpT, NumActiveChannels> privatized_decode_op_wrapper,
-    OffsetT num_row_pixels,
-    OffsetT num_rows,
-    OffsetT row_stride_samples,
-    int tiles_per_row,
+    _CCCL_GRID_CONSTANT const ::cuda::std::array<OutputDecodeOpT, NumActiveChannels> output_decode_op_wrapper,
+    _CCCL_GRID_CONSTANT const ::cuda::std::array<PrivatizedDecodeOpT, NumActiveChannels> privatized_decode_op_wrapper,
+    _CCCL_GRID_CONSTANT const OffsetT num_row_pixels,
+    _CCCL_GRID_CONSTANT const OffsetT num_rows,
+    _CCCL_GRID_CONSTANT const OffsetT row_stride_samples,
+    _CCCL_GRID_CONSTANT const int tiles_per_row,
     GridQueue<int> tile_queue)
 {
   // Thread block type for compositing input tiles
diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
index a678ee22ff0..7c5f0d8ae47 100644
--- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh
@@ -167,14 +167,14 @@ __launch_bounds__(
     KeyT,
     ValueT>::policy_t::BLOCK_THREADS)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel(
-    bool ping,
-    KeyInputIteratorT keys_in,
-    ValueInputIteratorT items_in,
-    KeyIteratorT keys_out,
-    ValueIteratorT items_out,
-    OffsetT keys_count,
-    KeyT* tmp_keys_out,
-    ValueT* tmp_items_out,
+    _CCCL_GRID_CONSTANT const bool ping,
+    _CCCL_GRID_CONSTANT const KeyInputIteratorT keys_in,
+    _CCCL_GRID_CONSTANT const ValueInputIteratorT items_in,
+    _CCCL_GRID_CONSTANT const KeyIteratorT keys_out,
+    _CCCL_GRID_CONSTANT const ValueIteratorT items_out,
+    _CCCL_GRID_CONSTANT const OffsetT keys_count,
+    _CCCL_GRID_CONSTANT KeyT* const tmp_keys_out,
+    _CCCL_GRID_CONSTANT ValueT* const tmp_items_out,
     CompareOpT compare_op,
     vsmem_t vsmem)
 {
@@ -221,15 +221,15 @@ __launch_bounds__(
 
 template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename KeyT>
 CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(
-  bool ping,
-  KeyIteratorT keys_ping,
-  KeyT* keys_pong,
-  OffsetT keys_count,
-  OffsetT num_partitions,
-  OffsetT* merge_partitions,
+  _CCCL_GRID_CONSTANT const bool ping,
+  _CCCL_GRID_CONSTANT const KeyIteratorT keys_ping,
+  _CCCL_GRID_CONSTANT KeyT* const keys_pong,
+  _CCCL_GRID_CONSTANT const OffsetT keys_count,
+  _CCCL_GRID_CONSTANT const OffsetT num_partitions,
+  _CCCL_GRID_CONSTANT OffsetT* const merge_partitions,
   CompareOpT compare_op,
-  OffsetT target_merged_tiles_number,
-  int items_per_tile)
+  _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number,
+  _CCCL_GRID_CONSTANT const int items_per_tile)
 {
   const OffsetT partition_idx = static_cast<OffsetT>(blockDim.x * blockIdx.x + threadIdx.x);
   if (partition_idx < num_partitions)
@@ -271,15 +271,15 @@ __launch_bounds__(
     KeyT,
     ValueT>::policy_t::BLOCK_THREADS)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel(
-    bool ping,
-    KeyIteratorT keys_ping,
-    ValueIteratorT items_ping,
-    OffsetT keys_count,
-    KeyT* keys_pong,
-    ValueT* items_pong,
+    _CCCL_GRID_CONSTANT const bool ping,
+    _CCCL_GRID_CONSTANT const KeyIteratorT keys_ping,
+    _CCCL_GRID_CONSTANT const ValueIteratorT items_ping,
+    _CCCL_GRID_CONSTANT const OffsetT keys_count,
+    _CCCL_GRID_CONSTANT KeyT* const keys_pong,
+    _CCCL_GRID_CONSTANT ValueT* const items_pong,
     CompareOpT compare_op,
-    OffsetT* merge_partitions,
-    OffsetT target_merged_tiles_number,
+    _CCCL_GRID_CONSTANT OffsetT* const merge_partitions,
+    _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number,
     vsmem_t vsmem)
 {
   using MergeSortHelperT = typename VSMemHelperT::template MergeSortVSMemHelperT<
diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
index 0a5fcdc7125..7c32b0ffd21 100644
--- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh
@@ -81,13 +81,13 @@ template <typename PolicySelector,
 __launch_bounds__(int(ALT_DIGIT_BITS ? PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).alt_upsweep.block_threads
                                      : PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).upsweep.block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel(
-    const KeyT* d_keys,
-    OffsetT* d_spine,
-    OffsetT /*num_items*/,
-    int current_bit,
-    int num_bits,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys,
+    _CCCL_GRID_CONSTANT OffsetT* const d_spine,
+    _CCCL_GRID_CONSTANT const OffsetT /*num_items*/,
+    _CCCL_GRID_CONSTANT const int current_bit,
+    _CCCL_GRID_CONSTANT const int num_bits,
     GridEvenShare<OffsetT> even_share,
-    DecomposerT decomposer = {})
+    _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   static constexpr radix_sort_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
   static constexpr radix_sort_upsweep_policy active_upsweep_policy =
@@ -143,7 +143,8 @@ __launch_bounds__(int(ALT_DIGIT_BITS ? PolicySelector{}(::cuda::arch_id{CUB_PTX_
  */
 template <typename PolicySelector, typename OffsetT>
 __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).scan.block_threads, 1)
-  CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, int num_counts)
+  CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(
+    _CCCL_GRID_CONSTANT OffsetT* const d_spine, _CCCL_GRID_CONSTANT const int num_counts)
 {
   static constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).scan;
   using ScanPolicy                    = AgentScanPolicy<
@@ -241,16 +242,16 @@ template <typename PolicySelector,
 __launch_bounds__(int(ALT_DIGIT_BITS ? PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).alt_downsweep.block_threads
                                      : PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).downsweep.block_threads))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel(
-    const KeyT* d_keys_in,
-    KeyT* d_keys_out,
-    const ValueT* d_values_in,
-    ValueT* d_values_out,
-    OffsetT* d_spine,
-    OffsetT num_items,
-    int current_bit,
-    int num_bits,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_out,
+    _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
+    _CCCL_GRID_CONSTANT ValueT* const d_values_out,
+    _CCCL_GRID_CONSTANT OffsetT* const d_spine,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int current_bit,
+    _CCCL_GRID_CONSTANT const int num_bits,
     GridEvenShare<OffsetT> even_share,
-    DecomposerT decomposer = {})
+    _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   static constexpr radix_sort_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
 
@@ -335,14 +336,14 @@ template <typename PolicySelector,
           typename DecomposerT = identity_decomposer_t>
 __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile.block_threads, 1)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel(
-    const KeyT* d_keys_in,
-    KeyT* d_keys_out,
-    const ValueT* d_values_in,
-    ValueT* d_values_out,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_out,
+    _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
+    _CCCL_GRID_CONSTANT ValueT* const d_values_out,
     OffsetT num_items,
-    int current_bit,
-    int end_bit,
-    DecomposerT decomposer = {})
+    _CCCL_GRID_CONSTANT const int current_bit,
+    _CCCL_GRID_CONSTANT const int end_bit,
+    _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   // Constants
   static constexpr radix_sort_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
@@ -451,12 +452,12 @@ template <typename PolicySelector,
           typename DecomposerT = identity_decomposer_t>
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10})
-    .histogram.block_threads) void DeviceRadixSortHistogramKernel(OffsetT* d_bins_out,
-                                                                  const KeyT* d_keys_in,
-                                                                  OffsetT num_items,
-                                                                  int start_bit,
-                                                                  int end_bit,
-                                                                  DecomposerT decomposer = {})
+    .histogram.block_threads) void DeviceRadixSortHistogramKernel(_CCCL_GRID_CONSTANT OffsetT* const d_bins_out,
+                                                                  _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
+                                                                  _CCCL_GRID_CONSTANT const OffsetT num_items,
+                                                                  _CCCL_GRID_CONSTANT const int start_bit,
+                                                                  _CCCL_GRID_CONSTANT const int end_bit,
+                                                                  _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   static constexpr radix_sort_histogram_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).histogram;
 
@@ -479,18 +480,18 @@ template <typename PolicySelector,
 CUB_DETAIL_KERNEL_ATTRIBUTES void
 __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).onesweep.block_threads)
   DeviceRadixSortOnesweepKernel(
-    AtomicOffsetT* d_lookback,
-    AtomicOffsetT* d_ctrs,
-    OffsetT* d_bins_out,
-    const OffsetT* d_bins_in,
-    KeyT* d_keys_out,
-    const KeyT* d_keys_in,
-    ValueT* d_values_out,
-    const ValueT* d_values_in,
-    PortionOffsetT num_items,
-    int current_bit,
-    int num_bits,
-    DecomposerT decomposer = {})
+    _CCCL_GRID_CONSTANT AtomicOffsetT* const d_lookback,
+    _CCCL_GRID_CONSTANT AtomicOffsetT* const d_ctrs,
+    _CCCL_GRID_CONSTANT OffsetT* const d_bins_out,
+    _CCCL_GRID_CONSTANT const OffsetT* const d_bins_in,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_out,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
+    _CCCL_GRID_CONSTANT ValueT* const d_values_out,
+    _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
+    _CCCL_GRID_CONSTANT const PortionOffsetT num_items,
+    _CCCL_GRID_CONSTANT const int current_bit,
+    _CCCL_GRID_CONSTANT const int num_bits,
+    _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   static constexpr radix_sort_onesweep_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).onesweep;
   using OnesweepPolicyT                              = AgentRadixSortOnesweepPolicy<
@@ -535,7 +536,7 @@ __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).onesweep.
  * Exclusive sum kernel
  */
 template <typename PolicySelector, typename OffsetT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins)
+CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(_CCCL_GRID_CONSTANT OffsetT* const d_bins)
 {
   static constexpr radix_sort_exclusive_sum_policy policy =
     PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).exclusive_sum;
diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
index 217e0f82f43..752a326dd38 100644
--- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh
@@ -131,9 +131,9 @@ template <typename PolicySelector,
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).reduce.block_threads)) void DeviceReduceKernel(
-  InputIteratorT d_in,
-  AccumT* d_out,
-  OffsetT num_items,
+  _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+  _CCCL_GRID_CONSTANT AccumT* const d_out,
+  _CCCL_GRID_CONSTANT const OffsetT num_items,
   GridEvenShare<OffsetT> even_share,
   ReductionOpT reduction_op,
   TransformOpT transform_op)
@@ -224,11 +224,11 @@ template <typename PolicySelector,
 #endif // _CCCL_HAS_CONCEPTS()
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
   int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile.block_threads),
-  1) void DeviceReduceSingleTileKernel(InputIteratorT d_in,
+  1) void DeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
                                        OutputIteratorT d_out,
-                                       OffsetT num_items,
+                                       _CCCL_GRID_CONSTANT const OffsetT num_items,
                                        ReductionOpT reduction_op,
-                                       InitT init,
+                                       _CCCL_GRID_CONSTANT const InitT init,
                                        TransformOpT transform_op)
 {
   static constexpr agent_reduce_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile;
@@ -288,12 +288,12 @@ template <typename PolicySelector,
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
   PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10})
     .reduce_nondeterministic
-    .block_threads)) void NondeterministicDeviceReduceAtomicKernel(InputIteratorT d_in,
-                                                                   OutputIteratorT d_out,
-                                                                   OffsetT num_items,
+    .block_threads)) void NondeterministicDeviceReduceAtomicKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in,
+                                                                   _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
+                                                                   _CCCL_GRID_CONSTANT const OffsetT num_items,
                                                                    GridEvenShare<OffsetT> even_share,
                                                                    ReductionOpT reduction_op,
-                                                                   InitT init,
+                                                                   _CCCL_GRID_CONSTANT const InitT init,
                                                                    TransformOpT transform_op)
 {
   NV_IF_TARGET(NV_PROVIDES_SM_60,
diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
index 5346385f0ba..010424f231f 100644
--- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh
@@ -55,7 +55,7 @@ union tile_state_kernel_arg_t
  */
 template <typename ChainedPolicyT, typename InputIteratorT, typename OutputIteratorT, typename ScanTileState, typename AccumT>
 CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(128) void DeviceScanInitKernel(
-  tile_state_kernel_arg_t<ScanTileState, AccumT> tile_state, int num_tiles)
+  tile_state_kernel_arg_t<ScanTileState, AccumT> tile_state, _CCCL_GRID_CONSTANT const int num_tiles)
 {
   _CCCL_PDL_GRID_DEPENDENCY_SYNC();
   _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // beneficial for all problem sizes in cub.bench.scan.exclusive.sum.base
@@ -94,8 +94,8 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(128) void DeviceScanInitKernel(
  *   (i.e., length of `d_selected_out`)
  */
 template <typename ScanTileStateT, typename NumSelectedIteratorT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void
-DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out)
+CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceCompactInitKernel(
+  ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
index 0f3a3bc5a20..31afe3c71b7 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh
@@ -111,15 +111,15 @@ template <typename PolicySelector,
 #endif // _CCCL_HAS_CONCEPTS()
 __launch_bounds__(segmented_radix_sort_kernel_launch_bounds<PolicySelector, AltDigitBits>())
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel(
-    const KeyT* d_keys_in,
-    KeyT* d_keys_out,
-    const ValueT* d_values_in,
-    ValueT* d_values_out,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets,
-    int current_bit,
-    int pass_bits,
-    DecomposerT decomposer = {})
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_out,
+    _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
+    _CCCL_GRID_CONSTANT ValueT* const d_values_out,
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
+    _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
+    _CCCL_GRID_CONSTANT const int current_bit,
+    _CCCL_GRID_CONSTANT const int pass_bits,
+    _CCCL_GRID_CONSTANT const DecomposerT decomposer = {})
 {
   //
   // Constants
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
index c6855460292..d01794b1bdc 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh
@@ -112,14 +112,14 @@ template <typename PolicySelector,
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).large_reduce.block_threads) //
   void DeviceSegmentedReduceKernel(
-    InputIteratorT d_in,
-    OutputIteratorT d_out,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets,
-    int num_segments,
+    _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+    _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
+    _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
+    _CCCL_GRID_CONSTANT const int num_segments,
     ReductionOpT reduction_op,
-    InitT init,
-    size_t max_segment_size)
+    _CCCL_GRID_CONSTANT const InitT init,
+    _CCCL_GRID_CONSTANT const size_t max_segment_size)
 {
   static constexpr segmented_reduce_policy full_policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
 
@@ -314,15 +314,15 @@ template <typename ChainedPolicyT,
           typename AccumT>
 CUB_DETAIL_KERNEL_ATTRIBUTES
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceFixedSizeSegmentedReduceKernel(
-  InputIteratorT d_in,
-  OutputIteratorT d_out,
-  OffsetT segment_size,
-  int num_segments,
+  _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+  _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
+  _CCCL_GRID_CONSTANT const OffsetT segment_size,
+  _CCCL_GRID_CONSTANT const int num_segments,
   ReductionOpT reduction_op,
-  InitT init,
-  AccumT* d_partial_out,
-  int full_chunk_size,
-  int blocks_per_segment)
+  _CCCL_GRID_CONSTANT const InitT init,
+  _CCCL_GRID_CONSTANT AccumT* const d_partial_out,
+  _CCCL_GRID_CONSTANT const int full_chunk_size,
+  _CCCL_GRID_CONSTANT const int blocks_per_segment)
 {
   using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
 
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh
index cc0431b8b0b..373ad649a0b 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh
@@ -35,14 +35,14 @@ template <typename ChainedPolicyT,
           typename ActualInitValueT = typename InitValueT::value_type>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::segmented_scan_policy_t::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void device_segmented_scan_kernel(
-    InputIteratorT d_in,
-    OutputIteratorT d_out,
-    BeginOffsetIteratorInputT begin_offset_d_in,
-    EndOffsetIteratorInputT end_offset_d_in,
-    BeginOffsetIteratorOutputT begin_offset_d_out,
-    OffsetT n_segments,
-    ScanOpT scan_op,
-    InitValueT init_value)
+    _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+    _CCCL_GRID_CONSTANT const OutputIteratorT d_out,
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorInputT begin_offset_d_in,
+    _CCCL_GRID_CONSTANT const EndOffsetIteratorInputT end_offset_d_in,
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorOutputT begin_offset_d_out,
+    _CCCL_GRID_CONSTANT const OffsetT n_segments,
+    _CCCL_GRID_CONSTANT const ScanOpT scan_op,
+    _CCCL_GRID_CONSTANT const InitValueT init_value)
 {
   using segmented_scan_policy_t = typename ChainedPolicyT::ActivePolicy::segmented_scan_policy_t;
 
diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
index 7c8fc153698..dc9dfcd3d7c 100644
--- a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh
@@ -137,8 +137,8 @@ __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).large_seg
     const ValueT* d_values_in_orig,
     ValueT* d_values_out_orig,
     device_double_buffer<ValueT> d_values_double_buffer,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets)
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
+    _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets)
 {
   static constexpr segmented_sort_policy active_policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
   static constexpr auto large_policy                   = active_policy.large_segment;
@@ -328,17 +328,17 @@ template <SortOrder Order,
 #endif // _CCCL_HAS_CONCEPTS()
 __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).small_segment.block_threads)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall(
-    local_segment_index_t small_segments,
-    local_segment_index_t medium_segments,
-    local_segment_index_t medium_blocks,
-    const local_segment_index_t* d_small_segments_indices,
-    const local_segment_index_t* d_medium_segments_indices,
-    const KeyT* d_keys_in,
-    KeyT* d_keys_out,
-    const ValueT* d_values_in,
-    ValueT* d_values_out,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets)
+    _CCCL_GRID_CONSTANT const local_segment_index_t small_segments,
+    _CCCL_GRID_CONSTANT const local_segment_index_t medium_segments,
+    _CCCL_GRID_CONSTANT const local_segment_index_t medium_blocks,
+    _CCCL_GRID_CONSTANT const local_segment_index_t* const d_small_segments_indices,
+    _CCCL_GRID_CONSTANT const local_segment_index_t* const d_medium_segments_indices,
+    _CCCL_GRID_CONSTANT const KeyT* const d_keys_in,
+    _CCCL_GRID_CONSTANT KeyT* const d_keys_out,
+    _CCCL_GRID_CONSTANT const ValueT* const d_values_in,
+    _CCCL_GRID_CONSTANT ValueT* const d_values_out,
+    _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
+    _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets)
 {
   using local_segment_index_t = local_segment_index_t;
 
@@ -466,15 +466,15 @@ template <SortOrder Order,
 #endif // _CCCL_HAS_CONCEPTS()
 __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).large_segment.block_threads)
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelLarge(
-    const local_segment_index_t* d_segments_indices,
+    _CCCL_GRID_CONSTANT const local_segment_index_t* const d_segments_indices,
     const KeyT* d_keys_in_orig,
     KeyT* d_keys_out_orig,
     device_double_buffer<KeyT> d_keys_double_buffer,
     const ValueT* d_values_in_orig,
     ValueT* d_values_out_orig,
     device_double_buffer<ValueT> d_values_double_buffer,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets)
+    const _CCCL_GRID_CONSTANT BeginOffsetIteratorT d_begin_offsets,
+    const _CCCL_GRID_CONSTANT EndOffsetIteratorT d_end_offsets)
 {
   static constexpr segmented_radix_sort_policy large_policy =
     PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).large_segment;
diff --git a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
index e67b9acbea6..961f03587cf 100644
--- a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh
@@ -114,16 +114,16 @@ template <typename ChainedPolicyT,
           typename StreamingContextT>
 __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionKernel(
-    InputIteratorT d_in,
-    FirstOutputIteratorT d_first_part_out,
-    SecondOutputIteratorT d_second_part_out,
-    UnselectedOutputIteratorT d_unselected_out,
-    NumSelectedIteratorT d_num_selected_out,
+    _CCCL_GRID_CONSTANT const InputIteratorT d_in,
+    _CCCL_GRID_CONSTANT const FirstOutputIteratorT d_first_part_out,
+    _CCCL_GRID_CONSTANT const SecondOutputIteratorT d_second_part_out,
+    _CCCL_GRID_CONSTANT const UnselectedOutputIteratorT d_unselected_out,
+    _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_status,
     SelectFirstPartOp select_first_part_op,
     SelectSecondPartOp select_second_part_op,
-    OffsetT num_items,
-    int num_tiles,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int num_tiles,
     _CCCL_GRID_CONSTANT const StreamingContextT streaming_context)
 {
   using AgentThreeWayPartitionPolicyT = typename ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy;
@@ -180,8 +180,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO
  *   (i.e., length of @p d_selected_out)
  */
 template <typename ScanTileStateT, typename NumSelectedIteratorT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void
-DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out)
+CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionInitKernel(
+  ScanTileStateT tile_state,
+  _CCCL_GRID_CONSTANT const int num_tiles,
+  _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out)
 {
   // Initialize tile status
   tile_state.InitializeStatus(num_tiles);
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
index c2f46a2bff3..ac3d5ac020d 100644
--- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
@@ -1033,12 +1033,12 @@ template <typename PolicySelector,
   requires transform_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
 __launch_bounds__(get_block_threads<PolicySelector>) CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel(
-  Offset num_items,
-  int num_elem_per_thread,
-  [[maybe_unused]] bool can_vectorize,
-  Predicate pred,
-  F f,
-  RandomAccessIteratorOut out,
+  _CCCL_GRID_CONSTANT const Offset num_items,
+  _CCCL_GRID_CONSTANT const int num_elem_per_thread,
+  [[maybe_unused]] _CCCL_GRID_CONSTANT const bool can_vectorize,
+  _CCCL_GRID_CONSTANT const Predicate pred,
+  _CCCL_GRID_CONSTANT const F f,
+  _CCCL_GRID_CONSTANT const RandomAccessIteratorOut out,
   kernel_arg<RandomAccessIteratorsIn>... ins)
 {
   _CCCL_ASSERT(blockDim.y == 1 && blockDim.z == 1, "transform_kernel only supports 1D blocks");
diff --git a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
index 8255d50ba58..fea4f0cb18b 100644
--- a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh
@@ -134,15 +134,15 @@ __launch_bounds__(int(
     EqualityOpT,
     OffsetT>::agent_policy_t::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel(
-    KeyInputIteratorT d_keys_in,
-    ValueInputIteratorT d_values_in,
-    KeyOutputIteratorT d_keys_out,
-    ValueOutputIteratorT d_values_out,
-    NumSelectedIteratorT d_num_selected_out,
+    _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in,
+    _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
+    _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out,
+    _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
+    _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out,
     ScanTileStateT tile_state,
     EqualityOpT equality_op,
-    OffsetT num_items,
-    int num_tiles,
+    _CCCL_GRID_CONSTANT const OffsetT num_items,
+    _CCCL_GRID_CONSTANT const int num_tiles,
     vsmem_t vsmem)
 {
   using VsmemHelperT = typename VSMemHelperT::template VSMemHelperDefaultFallbackPolicyT<
diff --git a/libcudacxx/include/cuda/std/__cccl/execution_space.h b/libcudacxx/include/cuda/std/__cccl/execution_space.h
index eafc737b49d..96e37e5d372 100644
--- a/libcudacxx/include/cuda/std/__cccl/execution_space.h
+++ b/libcudacxx/include/cuda/std/__cccl/execution_space.h
@@ -48,7 +48,7 @@
 #  define _CCCL_PTX_ARCH() __CUDA_ARCH__
 #endif
 
-#if (_CCCL_CUDA_COMPILER(NVCC) || _CCCL_CUDA_COMPILER(NVRTC) || _CCCL_CUDA_COMPILER(CLANG, >=, 20)) \
+#if (_CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || _CCCL_CUDA_COMPILER(NVRTC) || _CCCL_CUDA_COMPILER(CLANG, >=, 20)) \
   && _CCCL_PTX_ARCH() >= 700
 #  define _CCCL_HAS_GRID_CONSTANT() 1
 #  define _CCCL_GRID_CONSTANT       __grid_constant__