From b7f08007846157c2e2c15f99aa25be2b6bf9632b Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Sat, 15 Nov 2025 18:39:56 +0530 Subject: [PATCH 01/14] FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters --- .../dispatch/kernels/kernel_for_each.cuh | 4 +- .../dispatch/kernels/kernel_histogram.cuh | 8 ++-- .../dispatch/kernels/kernel_merge_sort.cuh | 36 +++++++-------- .../dispatch/kernels/kernel_radix_sort.cuh | 28 +++++------ .../device/dispatch/kernels/kernel_reduce.cuh | 46 +++++++++---------- .../device/dispatch/kernels/kernel_scan.cuh | 12 ++--- .../kernels/kernel_segmented_radix_sort.cuh | 8 ++-- .../kernels/kernel_segmented_reduce.cuh | 20 ++++---- .../kernels/kernel_segmented_sort.cuh | 18 ++++---- .../kernels/kernel_three_way_partition.cuh | 12 ++--- .../dispatch/kernels/kernel_transform.cuh | 8 ++-- .../dispatch/kernels/kernel_unique_by_key.cuh | 10 ++-- 12 files changed, 105 insertions(+), 105 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh index 81ecf9de39c..7af08a50164 100644 --- a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh @@ -86,7 +86,7 @@ using can_regain_copy_freedom = // This kernel is used when the block size is not known at compile time template -CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op) +CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OpT op) { using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; using agent_t = agent_block_striped_t; @@ -111,7 +111,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op) template CUB_DETAIL_KERNEL_ATTRIBUTES // __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) // - void static_kernel(OffsetT num_items, OpT op) + void static_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OpT op) { using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; using agent_t = agent_block_striped_t; diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh index e4520813221..ee407165fa5 100644 --- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh @@ -452,10 +452,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK ::cuda::std::array d_privatized_histograms_wrapper, ::cuda::std::array output_decode_op_wrapper, ::cuda::std::array privatized_decode_op_wrapper, - OffsetT num_row_pixels, - OffsetT num_rows, - OffsetT row_stride_samples, - int tiles_per_row, + _CCCL_GRID_CONSTANT const OffsetT num_row_pixels, + _CCCL_GRID_CONSTANT const OffsetT num_rows, + _CCCL_GRID_CONSTANT const OffsetT row_stride_samples, + _CCCL_GRID_CONSTANT const int tiles_per_row, GridQueue tile_queue) { // Thread block type for compositing input tiles diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh index 115602e5080..ec5d111eeef 100644 --- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh @@ -167,15 +167,15 @@ __launch_bounds__( KeyT, ValueT>::policy_t::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel( - bool ping, - KeyInputIteratorT keys_in, - ValueInputIteratorT items_in, + _CCCL_GRID_CONSTANT const bool ping, + _CCCL_GRID_CONSTANT const KeyInputIteratorT keys_in, + _CCCL_GRID_CONSTANT const ValueInputIteratorT items_in, KeyIteratorT keys_out, ValueIteratorT items_out, - OffsetT keys_count, + _CCCL_GRID_CONSTANT const OffsetT keys_count, KeyT* tmp_keys_out, ValueT* tmp_items_out, - CompareOpT compare_op, + _CCCL_GRID_CONSTANT const CompareOpT compare_op, vsmem_t vsmem) { using MergeSortHelperT = typename VSMemHelperT::template MergeSortVSMemHelperT< @@ -221,15 +221,15 @@ __launch_bounds__( template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel( - bool ping, - KeyIteratorT keys_ping, + _CCCL_GRID_CONSTANT const bool ping, + _CCCL_GRID_CONSTANT const KeyIteratorT keys_ping, KeyT* keys_pong, - OffsetT keys_count, - OffsetT num_partitions, + _CCCL_GRID_CONSTANT const OffsetT keys_count, + _CCCL_GRID_CONSTANT const OffsetT num_partitions, OffsetT* merge_partitions, - CompareOpT compare_op, - OffsetT target_merged_tiles_number, - int items_per_tile) + _CCCL_GRID_CONSTANT const CompareOpT compare_op, + _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number, + _CCCL_GRID_CONSTANT const int items_per_tile) { OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x; @@ -272,15 +272,15 @@ __launch_bounds__( KeyT, ValueT>::policy_t::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel( - bool ping, - KeyIteratorT keys_ping, - ValueIteratorT items_ping, - OffsetT keys_count, + _CCCL_GRID_CONSTANT const bool ping, + _CCCL_GRID_CONSTANT const KeyIteratorT keys_ping, + _CCCL_GRID_CONSTANT const ValueIteratorT items_ping, + _CCCL_GRID_CONSTANT const OffsetT keys_count, KeyT* keys_pong, ValueT* items_pong, - CompareOpT compare_op, + _CCCL_GRID_CONSTANT const CompareOpT compare_op, OffsetT* merge_partitions, - OffsetT target_merged_tiles_number, + _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number, vsmem_t vsmem) { using MergeSortHelperT = typename VSMemHelperT::template MergeSortVSMemHelperT< diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh index cf09fb0bef4..bcbd19e3a6c 100644 --- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh @@ -85,9 +85,9 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel( const KeyT* d_keys, OffsetT* d_spine, - OffsetT /*num_items*/, - int current_bit, - int num_bits, + _CCCL_GRID_CONSTANT const OffsetT /*num_items*/, + _CCCL_GRID_CONSTANT const int current_bit, + _CCCL_GRID_CONSTANT const int num_bits, GridEvenShare even_share, DecomposerT decomposer = {}) { @@ -144,7 +144,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) - CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, int num_counts) + CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, _CCCL_GRID_CONSTANT const int num_counts) { // Parameterize the AgentScan type for the current configuration using AgentScanT = @@ -243,9 +243,9 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo const ValueT* d_values_in, ValueT* d_values_out, OffsetT* d_spine, - OffsetT num_items, - int current_bit, - int num_bits, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const int current_bit, + _CCCL_GRID_CONSTANT const int num_bits, GridEvenShare even_share, DecomposerT decomposer = {}) { @@ -332,8 +332,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE const ValueT* d_values_in, ValueT* d_values_out, OffsetT num_items, - int current_bit, - int end_bit, + _CCCL_GRID_CONSTANT const int current_bit, + _CCCL_GRID_CONSTANT const int end_bit, DecomposerT decomposer = {}) { // Constants @@ -444,7 +444,7 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel( - OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit, DecomposerT decomposer = {}) + OffsetT* d_bins_out, const KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, DecomposerT decomposer = {}) { using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy; using AgentT = AgentRadixSortHistogram; @@ -470,10 +470,10 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out, - const ValueT* d_values_in, - PortionOffsetT num_items, - int current_bit, - int num_bits, + ValueT* d_values_in, + _CCCL_GRID_CONSTANT const PortionOffsetT num_items, + _CCCL_GRID_CONSTANT const int current_bit, + _CCCL_GRID_CONSTANT const int num_bits, DecomposerT decomposer = {}) { using OnesweepPolicyT = typename ChainedPolicyT::ActivePolicy::OnesweepPolicy; diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index 602b33857a3..f28c103fa9c 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -129,12 +129,12 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceReduceKernel( - InputIteratorT d_in, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, AccumT* d_out, - OffsetT num_items, + _CCCL_GRID_CONSTANT const OffsetT num_items, GridEvenShare even_share, - ReductionOpT reduction_op, - TransformOpT transform_op) + _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + _CCCL_GRID_CONSTANT const TransformOpT transform_op) { // Thread block type for reducing input tiles using AgentReduceT = @@ -214,12 +214,12 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), - 1) void DeviceReduceSingleTileKernel(InputIteratorT d_in, + 1) void DeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in, OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - InitT init, - TransformOpT transform_op) + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + _CCCL_GRID_CONSTANT const InitT init, + _CCCL_GRID_CONSTANT const TransformOpT transform_op) { // Thread block type for reducing input tiles using AgentReduceT = @@ -298,11 +298,11 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeterministicDeviceReduceKernel( - InputIteratorT d_in, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, AccumT* d_out, - int num_items, - ReductionOpT reduction_op, - TransformOpT transform_op, + _CCCL_GRID_CONSTANT const int num_items, + _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + _CCCL_GRID_CONSTANT const TransformOpT transform_op, const int reduce_grid_size) { using reduce_policy_t = typename ChainedPolicyT::ActivePolicy::ReducePolicy; @@ -431,12 +431,12 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), - 1) void DeterministicDeviceReduceSingleTileKernel(InputIteratorT d_in, + 1) void DeterministicDeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in, OutputIteratorT d_out, - int num_items, - ReductionOpT reduction_op, - InitT init, - TransformOpT transform_op) + _CCCL_GRID_CONSTANT const int num_items, + _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + _CCCL_GRID_CONSTANT const InitT init, + _CCCL_GRID_CONSTANT const TransformOpT transform_op) { using single_tile_policy_t = typename ChainedPolicyT::ActivePolicy::SingleTilePolicy; @@ -498,13 +498,13 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int( ChainedPolicyT::ActivePolicy::ReduceNondeterministicPolicy:: - BLOCK_THREADS)) void NondeterministicDeviceReduceAtomicKernel(InputIteratorT d_in, + BLOCK_THREADS)) void NondeterministicDeviceReduceAtomicKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in, OutputIteratorT d_out, - OffsetT num_items, + _CCCL_GRID_CONSTANT const OffsetT num_items, GridEvenShare even_share, - ReductionOpT reduction_op, - InitT init, - TransformOpT transform_op) + _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + _CCCL_GRID_CONSTANT const InitT init, + _CCCL_GRID_CONSTANT const TransformOpT transform_op) { NV_IF_TARGET(NV_PROVIDES_SM_60, (), diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index df596b2d463..cfa32871c7a 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -37,7 +37,7 @@ namespace detail::scan * Number of tiles */ template -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles) { // Initialize tile status tile_state.InitializeStatus(num_tiles); @@ -64,7 +64,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state */ template CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out) +DeviceCompactInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); @@ -136,13 +136,13 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( - InputIteratorT d_in, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, OutputIteratorT d_out, ScanTileStateT tile_state, - int start_tile, - ScanOpT scan_op, + _CCCL_GRID_CONSTANT const int start_tile, + _CCCL_GRID_CONSTANT const ScanOpT scan_op, InitValueT init_value, - OffsetT num_items) + _CCCL_GRID_CONSTANT const OffsetT num_items) { using ScanPolicyT = typename ChainedPolicyT::ActivePolicy::ScanPolicyT; diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh index 4dbe0645d32..13088889941 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh @@ -102,10 +102,10 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmen KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int current_bit, - int pass_bits, + _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets, + _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets, + _CCCL_GRID_CONSTANT const int current_bit, + _CCCL_GRID_CONSTANT const int pass_bits, DecomposerT decomposer = {}) { // diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh index cc8182707c0..f5fd0f7fea6 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh @@ -101,12 +101,12 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceSegmentedReduceKernel( - InputIteratorT d_in, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, OutputIteratorT d_out, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - ReductionOpT reduction_op, - InitT init) + _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets, + _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets, + _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + _CCCL_GRID_CONSTANT const InitT init) { // Thread block type for reducing input tiles using AgentReduceT = @@ -188,12 +188,12 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceFixedSizeSegmentedReduceKernel( - InputIteratorT d_in, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, OutputIteratorT d_out, - OffsetT segment_size, - int num_segments, - ReductionOpT reduction_op, - InitT init) + _CCCL_GRID_CONSTANT const OffsetT segment_size, + _CCCL_GRID_CONSTANT const int num_segments, + _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + _CCCL_GRID_CONSTANT const InitT init) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh index 4133c7448b2..e13833ffa74 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh @@ -133,8 +133,8 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD const ValueT* d_values_in_orig, ValueT* d_values_out_orig, device_double_buffer d_values_double_buffer, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets) + _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets, + _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; @@ -304,17 +304,17 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallSegmentPolicy::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall( - local_segment_index_t small_segments, - local_segment_index_t medium_segments, - local_segment_index_t medium_blocks, + _CCCL_GRID_CONSTANT const local_segment_index_t small_segments, + _CCCL_GRID_CONSTANT const local_segment_index_t medium_segments, + _CCCL_GRID_CONSTANT const local_segment_index_t medium_blocks, const local_segment_index_t* d_small_segments_indices, const local_segment_index_t* d_medium_segments_indices, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets) + _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets, + _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets) { using local_segment_index_t = local_segment_index_t; @@ -432,8 +432,8 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD const ValueT* d_values_in_orig, ValueT* d_values_out_orig, device_double_buffer d_values_double_buffer, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets) + const _CCCL_GRID_CONSTANT BeginOffsetIteratorT d_begin_offsets, + const _CCCL_GRID_CONSTANT EndOffsetIteratorT d_end_offsets) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; diff --git a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh index e67b9acbea6..29d776a9a9c 100644 --- a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh @@ -114,16 +114,16 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionKernel( - InputIteratorT d_in, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_status, - SelectFirstPartOp select_first_part_op, - SelectSecondPartOp select_second_part_op, - OffsetT num_items, - int num_tiles, + _CCCL_GRID_CONSTANT const SelectFirstPartOp select_first_part_op, + _CCCL_GRID_CONSTANT const SelectSecondPartOp select_second_part_op, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context) { using AgentThreeWayPartitionPolicyT = typename ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy; @@ -181,7 +181,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO */ template CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out) +DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh index 21987e61578..1126427d5d6 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh @@ -962,11 +962,11 @@ template __launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads) CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( - Offset num_items, - int num_elem_per_thread, + _CCCL_GRID_CONSTANT const Offset num_items, + _CCCL_GRID_CONSTANT const int num_elem_per_thread, [[maybe_unused]] bool can_vectorize, - Predicate pred, - F f, + _CCCL_GRID_CONSTANT const Predicate pred, + _CCCL_GRID_CONSTANT const F f, RandomAccessIteratorOut out, kernel_arg... ins) { diff --git a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh index bf0e55bd45f..7ea2d0f2297 100644 --- a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh @@ -131,15 +131,15 @@ __launch_bounds__(int( EqualityOpT, OffsetT>::agent_policy_t::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel( - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, + _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in, + _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_state, - EqualityOpT equality_op, - OffsetT num_items, - int num_tiles, + _CCCL_GRID_CONSTANT const EqualityOpT equality_op, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const int num_tiles, vsmem_t vsmem) { using VsmemHelperT = typename VSMemHelperT::template VSMemHelperDefaultFallbackPolicyT< From 6f71b0969e6ae7da44c23384d5fc6c653a70e49f Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Sat, 15 Nov 2025 22:21:53 +0530 Subject: [PATCH 02/14] FEA: Added _CCCL_GRID_CONSTANT const to more kernels --- .../dispatch/dispatch_adjacent_difference.cuh | 8 ++-- .../device/dispatch/dispatch_batch_memcpy.cuh | 30 +++++++-------- cub/cub/device/dispatch/dispatch_merge.cuh | 30 +++++++-------- .../dispatch/dispatch_reduce_by_key.cuh | 18 ++++----- cub/cub/device/dispatch/dispatch_rle.cuh | 14 +++---- .../device/dispatch/dispatch_scan_by_key.cuh | 22 +++++------ .../device/dispatch/dispatch_select_if.cuh | 16 ++++---- cub/cub/device/dispatch/dispatch_topk.cuh | 38 +++++++++---------- 8 files changed, 88 insertions(+), 88 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index a38402ed6ac..f8e2d10fd4d 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -48,11 +48,11 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel( - InputIteratorT input, + _CCCL_GRID_CONSTANT const InputIteratorT input, InputT* first_tile_previous, - OutputIteratorT result, - DifferenceOpT difference_op, - OffsetT num_items) + _CCCL_GRID_CONSTANT const OutputIteratorT result, + _CCCL_GRID_CONSTANT const DifferenceOpT difference_op, + _CCCL_GRID_CONSTANT const OffsetT num_items) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 33f73fa6a56..6e1d6708712 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -83,12 +83,12 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void MultiBlockBatchMemcpyKernel( - InputBufferIt input_buffer_it, - OutputBufferIt output_buffer_it, - BufferSizeIteratorT buffer_sizes, - BufferTileOffsetItT buffer_tile_offsets, + _CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it, + _CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it, + _CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes, + _CCCL_GRID_CONSTANT const BufferTileOffsetItT buffer_tile_offsets, TileT buffer_offset_tile, - TileOffsetT last_tile_offset) + _CCCL_GRID_CONSTANT const TileOffsetT last_tile_offset) { using StatusWord = typename TileT::StatusWord; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT; @@ -210,16 +210,16 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void BatchMemcpyKernel( - InputBufferIt input_buffer_it, - OutputBufferIt output_buffer_it, - BufferSizeIteratorT buffer_sizes, - BufferOffsetT num_buffers, - BlevBufferSrcsOutItT blev_buffer_srcs, - BlevBufferDstsOutItT blev_buffer_dsts, - BlevBufferSizesOutItT blev_buffer_sizes, - BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets, - BLevBufferOffsetTileState blev_buffer_scan_state, - BLevBlockOffsetTileState blev_block_scan_state) + _CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it, + _CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it, + _CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes, + _CCCL_GRID_CONSTANT const BufferOffsetT num_buffers, + _CCCL_GRID_CONSTANT const BlevBufferSrcsOutItT blev_buffer_srcs, + _CCCL_GRID_CONSTANT const BlevBufferDstsOutItT blev_buffer_dsts, + _CCCL_GRID_CONSTANT const BlevBufferSizesOutItT blev_buffer_sizes, + _CCCL_GRID_CONSTANT const BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets, + _CCCL_GRID_CONSTANT const BLevBufferOffsetTileState blev_buffer_scan_state, + _CCCL_GRID_CONSTANT const BLevBlockOffsetTileState blev_block_scan_state) { // Internal type used for storing a buffer's size using BufferSizeT = it_value_t; diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index 6ed2facded6..3749aa4c842 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -73,13 +73,13 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( - KeyIt1 keys1, - Offset keys1_count, - KeyIt2 keys2, - Offset keys2_count, - Offset num_diagonals, + _CCCL_GRID_CONSTANT const KeyIt1 keys1, + _CCCL_GRID_CONSTANT const Offset keys1_count, + _CCCL_GRID_CONSTANT const KeyIt2 keys2, + _CCCL_GRID_CONSTANT const Offset keys2_count, + _CCCL_GRID_CONSTANT const Offset num_diagonals, Offset* key1_beg_offsets, - CompareOp compare_op) + _CCCL_GRID_CONSTANT const CompareOp compare_op) { // items_per_tile must be the same of the merge kernel later, so we have to consider whether a fallback agent will be // selected for the merge agent that changes the tile size @@ -122,15 +122,15 @@ __launch_bounds__( Offset, CompareOp>::type::policy::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void device_merge_kernel( - KeyIt1 keys1, - ValueIt1 items1, - Offset num_keys1, - KeyIt2 keys2, - ValueIt2 items2, - Offset num_keys2, - KeyIt3 keys_result, - ValueIt3 items_result, - CompareOp compare_op, + _CCCL_GRID_CONSTANT const KeyIt1 keys1, + _CCCL_GRID_CONSTANT const ValueIt1 items1, + _CCCL_GRID_CONSTANT const Offset num_keys1, + _CCCL_GRID_CONSTANT const KeyIt2 keys2, + _CCCL_GRID_CONSTANT const ValueIt2 items2, + _CCCL_GRID_CONSTANT const Offset num_keys2, + _CCCL_GRID_CONSTANT const KeyIt3 keys_result, + _CCCL_GRID_CONSTANT const ValueIt3 items_result, + _CCCL_GRID_CONSTANT const CompareOp compare_op, Offset* key1_beg_offsets, vsmem_t global_temp_storage) { diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 6e310446d26..651f73a04f4 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -181,16 +181,16 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel( - KeysInputIteratorT d_keys_in, - UniqueOutputIteratorT d_unique_out, - ValuesInputIteratorT d_values_in, - AggregatesOutputIteratorT d_aggregates_out, - NumRunsOutputIteratorT d_num_runs_out, + _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in, + _CCCL_GRID_CONSTANT const UniqueOutputIteratorT d_unique_out, + _CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in, + _CCCL_GRID_CONSTANT const AggregatesOutputIteratorT d_aggregates_out, + _CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out, ScanTileStateT tile_state, - int start_tile, - EqualityOpT equality_op, - ReductionOpT reduction_op, - OffsetT num_items, + _CCCL_GRID_CONSTANT const int start_tile, + _CCCL_GRID_CONSTANT const EqualityOpT equality_op, + _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context, vsmem_t vsmem) { diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index c757bcc3d6b..9cb567ca023 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -170,14 +170,14 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRleSweepKernel( - InputIteratorT d_in, - OffsetsOutputIteratorT d_offsets_out, - LengthsOutputIteratorT d_lengths_out, - NumRunsOutputIteratorT d_num_runs_out, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, + _CCCL_GRID_CONSTANT const OffsetsOutputIteratorT d_offsets_out, + _CCCL_GRID_CONSTANT const LengthsOutputIteratorT d_lengths_out, + _CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out, ScanTileStateT tile_status, - EqualityOpT equality_op, - OffsetT num_items, - int num_tiles, + _CCCL_GRID_CONSTANT const EqualityOpT equality_op, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context) { using AgentRlePolicyT = typename ChainedPolicyT::ActivePolicy::RleSweepPolicyT; diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 1617e3ecc7e..a005966b6a0 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -117,16 +117,16 @@ template > __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyKernel( - KeysInputIteratorT d_keys_in, + _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in, KeyT* d_keys_prev_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, + _CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in, + _CCCL_GRID_CONSTANT const ValuesOutputIteratorT d_values_out, ScanByKeyTileStateT tile_state, - int start_tile, - EqualityOp equality_op, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items) + _CCCL_GRID_CONSTANT const int start_tile, + _CCCL_GRID_CONSTANT const EqualityOp equality_op, + _CCCL_GRID_CONSTANT const ScanOpT scan_op, + _CCCL_GRID_CONSTANT const InitValueT init_value, + _CCCL_GRID_CONSTANT const OffsetT num_items) { using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT; @@ -153,10 +153,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THRE template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyInitKernel( ScanTileStateT tile_state, - KeysInputIteratorT d_keys_in, + _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in, cub::detail::it_value_t* d_keys_prev_in, - OffsetT items_per_tile, - int num_tiles) + _CCCL_GRID_CONSTANT const OffsetT items_per_tile, + _CCCL_GRID_CONSTANT const int num_tiles) { // Initialize tile status tile_state.InitializeStatus(num_tiles); diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 666cd3d5b85..51c393d775d 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -312,15 +312,15 @@ __launch_bounds__(int( OffsetT, StreamingContextT>::agent_policy_t::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSelectSweepKernel( - InputIteratorT d_in, - FlagsInputIteratorT d_flags, - SelectedOutputIteratorT d_selected_out, - NumSelectedIteratorT d_num_selected_out, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, + _CCCL_GRID_CONSTANT const FlagsInputIteratorT d_flags, + _CCCL_GRID_CONSTANT const SelectedOutputIteratorT d_selected_out, + _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_status, - SelectOpT select_op, - EqualityOpT equality_op, - OffsetT num_items, - int num_tiles, + _CCCL_GRID_CONSTANT const SelectOpT select_op, + _CCCL_GRID_CONSTANT const EqualityOpT equality_op, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context, vsmem_t vsmem) { diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh index 02ffcbc4c3f..ac5d8ac730a 100644 --- a/cub/cub/device/dispatch/dispatch_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_topk.cuh @@ -112,22 +112,22 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceTopKKernel( - const KeyInputIteratorT d_keys_in, - KeyOutputIteratorT d_keys_out, - const ValueInputIteratorT d_values_in, - ValueOutputIteratorT d_values_out, + _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in, + _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out, + _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in, + _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out, KeyInT* in_buf, OffsetT* in_idx_buf, KeyInT* out_buf, OffsetT* out_idx_buf, Counter, OffsetT, OutOffsetT>* counter, OffsetT* histogram, - OffsetT num_items, - OutOffsetT k, - OffsetT buffer_length, - ExtractBinOpT extract_bin_op, - IdentifyCandidatesOpT identify_candidates_op, - int pass) + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const OutOffsetT k, + _CCCL_GRID_CONSTANT const OffsetT buffer_length, + _CCCL_GRID_CONSTANT const ExtractBinOpT extract_bin_op, + _CCCL_GRID_CONSTANT const IdentifyCandidatesOpT identify_candidates_op, + _CCCL_GRID_CONSTANT const int pass) { using agent_topk_policy_t = typename ChainedPolicyT::ActivePolicy::topk_policy_t; using agent_topk_t = @@ -168,18 +168,18 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceTopKLastFilterKernel( - const KeyInputIteratorT d_keys_in, - KeyOutputIteratorT d_keys_out, - const ValueInputIteratorT d_values_in, - ValueOutputIteratorT d_values_out, + _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in, + _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out, + _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in, + _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out, KeyInT* in_buf, OffsetT* in_idx_buf, Counter, OffsetT, OutOffsetT>* counter, - OffsetT num_items, - OutOffsetT k, - OffsetT buffer_length, - IdentifyCandidatesOpT identify_candidates_op, - int pass) + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const OutOffsetT k, + _CCCL_GRID_CONSTANT const OffsetT buffer_length, + _CCCL_GRID_CONSTANT const IdentifyCandidatesOpT identify_candidates_op, + _CCCL_GRID_CONSTANT const int pass) { using agent_topk_policy_t = typename ChainedPolicyT::ActivePolicy::topk_policy_t; using extract_bin_op_t = NullType; From ffd18c160a50c6884e47976808b79de77a324c8f Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Mon, 17 Nov 2025 22:04:19 +0530 Subject: [PATCH 03/14] FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters --- .../device/dispatch/dispatch_adjacent_difference.cuh | 2 +- cub/cub/device/dispatch/dispatch_batch_memcpy.cuh | 2 +- cub/cub/device/dispatch/kernels/kernel_histogram.cuh | 2 +- cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh | 4 ++-- cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh | 10 +++++----- cub/cub/device/dispatch/kernels/kernel_reduce.cuh | 8 ++++---- cub/cub/device/dispatch/kernels/kernel_scan.cuh | 4 ++-- .../dispatch/kernels/kernel_segmented_radix_sort.cuh | 2 +- .../dispatch/kernels/kernel_segmented_reduce.cuh | 4 ++-- .../dispatch/kernels/kernel_three_way_partition.cuh | 10 +++++----- .../device/dispatch/kernels/kernel_unique_by_key.cuh | 6 +++--- 11 files changed, 27 insertions(+), 27 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index f8e2d10fd4d..c1adb05ede2 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -33,7 +33,7 @@ namespace detail::adjacent_difference { template CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile) +DeviceAdjacentDifferenceInitKernel(_CCCL_GRID_CONSTANT const InputIteratorT first, InputT* result, _CCCL_GRID_CONSTANT const OffsetT num_tiles, _CCCL_GRID_CONSTANT const int items_per_tile) { const int tile_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile); diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 6e1d6708712..312278ba33e 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -61,7 +61,7 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramSweepKernel( - SampleIteratorT d_samples, + _CCCL_GRID_CONSTANT const SampleIteratorT d_samples, ::cuda::std::array num_output_bins_wrapper, ::cuda::std::array num_privatized_bins_wrapper, ::cuda::std::array d_output_histograms_wrapper, diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh index ec5d111eeef..f1b1086f252 100644 --- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh @@ -170,8 +170,8 @@ __launch_bounds__( _CCCL_GRID_CONSTANT const bool ping, _CCCL_GRID_CONSTANT const KeyInputIteratorT keys_in, _CCCL_GRID_CONSTANT const ValueInputIteratorT items_in, - KeyIteratorT keys_out, - ValueIteratorT items_out, + _CCCL_GRID_CONSTANT const KeyIteratorT keys_out, + _CCCL_GRID_CONSTANT const ValueIteratorT items_out, _CCCL_GRID_CONSTANT const OffsetT keys_count, KeyT* tmp_keys_out, ValueT* tmp_items_out, diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh index bcbd19e3a6c..698cc8c4763 100644 --- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh @@ -89,7 +89,7 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUp _CCCL_GRID_CONSTANT const int current_bit, _CCCL_GRID_CONSTANT const int num_bits, GridEvenShare even_share, - DecomposerT decomposer = {}) + _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) { using ActiveUpsweepPolicyT = ::cuda::std::_If even_share, - DecomposerT decomposer = {}) + _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) { using ActiveUpsweepPolicyT = ::cuda::std::_If CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel( - OffsetT* d_bins_out, const KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, DecomposerT decomposer = {}) + OffsetT* d_bins_out, KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) { using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy; using AgentT = AgentRadixSortHistogram; @@ -474,7 +474,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy _CCCL_GRID_CONSTANT const PortionOffsetT num_items, _CCCL_GRID_CONSTANT const int current_bit, _CCCL_GRID_CONSTANT const int num_bits, - DecomposerT decomposer = {}) + _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) { using OnesweepPolicyT = typename ChainedPolicyT::ActivePolicy::OnesweepPolicy; using AgentT = diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index f28c103fa9c..2d681d4831c 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -215,7 +215,7 @@ template even_share, _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index cfa32871c7a..3ba866cdbe4 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -64,7 +64,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state */ template CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceCompactInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out) +DeviceCompactInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); @@ -137,7 +137,7 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out) +DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); diff --git a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh index 7ea2d0f2297..564d25b1e33 100644 --- a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh @@ -133,9 +133,9 @@ __launch_bounds__(int( CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel( _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in, _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, + _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out, + _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out, + _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const EqualityOpT equality_op, _CCCL_GRID_CONSTANT const OffsetT num_items, From 413d9bb81190382bf1bb24ed2caaa04de1a7757c Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Tue, 18 Nov 2025 14:04:49 +0530 Subject: [PATCH 04/14] Removed _CCCL_GRID_CONSTANT const from all operators --- .../dispatch/dispatch_adjacent_difference.cuh | 2 +- cub/cub/device/dispatch/dispatch_merge.cuh | 4 ++-- .../dispatch/dispatch_reduce_by_key.cuh | 4 ++-- cub/cub/device/dispatch/dispatch_rle.cuh | 2 +- .../device/dispatch/dispatch_scan_by_key.cuh | 2 +- .../device/dispatch/dispatch_select_if.cuh | 4 ++-- cub/cub/device/dispatch/dispatch_topk.cuh | 6 +++--- .../dispatch/kernels/kernel_for_each.cuh | 3 +-- .../dispatch/kernels/kernel_merge_sort.cuh | 4 ++-- .../device/dispatch/kernels/kernel_reduce.cuh | 20 +++++++++---------- .../device/dispatch/kernels/kernel_scan.cuh | 2 +- .../kernels/kernel_segmented_reduce.cuh | 4 ++-- .../kernels/kernel_three_way_partition.cuh | 4 ++-- .../dispatch/kernels/kernel_unique_by_key.cuh | 2 +- 14 files changed, 31 insertions(+), 32 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index e2da4dc3483..999f87945d3 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -54,7 +54,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel( _CCCL_GRID_CONSTANT const InputIteratorT input, InputT* first_tile_previous, _CCCL_GRID_CONSTANT const OutputIteratorT result, - _CCCL_GRID_CONSTANT const DifferenceOpT difference_op, + DifferenceOpT difference_op, _CCCL_GRID_CONSTANT const OffsetT num_items) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy; diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index 3749aa4c842..08ed4441143 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -79,7 +79,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( _CCCL_GRID_CONSTANT const Offset keys2_count, _CCCL_GRID_CONSTANT const Offset num_diagonals, Offset* key1_beg_offsets, - _CCCL_GRID_CONSTANT const CompareOp compare_op) + CompareOp compare_op) { // items_per_tile must be the same of the merge kernel later, so we have to consider whether a fallback agent will be // selected for the merge agent that changes the tile size @@ -130,7 +130,7 @@ __launch_bounds__( _CCCL_GRID_CONSTANT const Offset num_keys2, _CCCL_GRID_CONSTANT const KeyIt3 keys_result, _CCCL_GRID_CONSTANT const ValueIt3 items_result, - _CCCL_GRID_CONSTANT const CompareOp compare_op, + CompareOp compare_op, Offset* key1_beg_offsets, vsmem_t global_temp_storage) { diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 651f73a04f4..ed466646113 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -188,8 +188,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH _CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out, ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int start_tile, - _CCCL_GRID_CONSTANT const EqualityOpT equality_op, - _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + EqualityOpT equality_op, + ReductionOpT reduction_op, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context, vsmem_t vsmem) diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 9cb567ca023..78c24152740 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -175,7 +175,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA _CCCL_GRID_CONSTANT const LengthsOutputIteratorT d_lengths_out, _CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out, ScanTileStateT tile_status, - _CCCL_GRID_CONSTANT const EqualityOpT equality_op, + EqualityOpT equality_op, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context) diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index a005966b6a0..59499f4c096 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -123,7 +123,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THRE _CCCL_GRID_CONSTANT const ValuesOutputIteratorT d_values_out, ScanByKeyTileStateT tile_state, _CCCL_GRID_CONSTANT const int start_tile, - _CCCL_GRID_CONSTANT const EqualityOp equality_op, + EqualityOp equality_op, _CCCL_GRID_CONSTANT const ScanOpT scan_op, _CCCL_GRID_CONSTANT const InitValueT init_value, _CCCL_GRID_CONSTANT const OffsetT num_items) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 51c393d775d..66f0afaeb4e 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -317,8 +317,8 @@ __launch_bounds__(int( _CCCL_GRID_CONSTANT const SelectedOutputIteratorT d_selected_out, _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_status, - _CCCL_GRID_CONSTANT const SelectOpT select_op, - _CCCL_GRID_CONSTANT const EqualityOpT equality_op, + SelectOpT select_op, + EqualityOpT equality_op, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context, diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh index ac5d8ac730a..624c5d598ad 100644 --- a/cub/cub/device/dispatch/dispatch_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_topk.cuh @@ -125,8 +125,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OutOffsetT k, _CCCL_GRID_CONSTANT const OffsetT buffer_length, - _CCCL_GRID_CONSTANT const ExtractBinOpT extract_bin_op, - _CCCL_GRID_CONSTANT const IdentifyCandidatesOpT identify_candidates_op, + ExtractBinOpT extract_bin_op, + IdentifyCandidatesOpT identify_candidates_op, _CCCL_GRID_CONSTANT const int pass) { using agent_topk_policy_t = typename ChainedPolicyT::ActivePolicy::topk_policy_t; @@ -178,7 +178,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OutOffsetT k, _CCCL_GRID_CONSTANT const OffsetT buffer_length, - _CCCL_GRID_CONSTANT const IdentifyCandidatesOpT identify_candidates_op, + IdentifyCandidatesOpT identify_candidates_op, _CCCL_GRID_CONSTANT const int pass) { using agent_topk_policy_t = typename ChainedPolicyT::ActivePolicy::topk_policy_t; diff --git a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh index 648f4535ac8..9e924848a1e 100644 --- a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh @@ -86,8 +86,7 @@ using can_regain_copy_freedom = // This kernel is used when the block size is not known at compile time template -CUB_DETAIL_KERNEL_ATTRIBUTES void -dynamic_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OpT op) +CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, OpT op) { using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; using agent_t = agent_block_striped_t; diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh index f1b1086f252..659c48e99bf 100644 --- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh @@ -227,7 +227,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel( _CCCL_GRID_CONSTANT const OffsetT keys_count, _CCCL_GRID_CONSTANT const OffsetT num_partitions, OffsetT* merge_partitions, - _CCCL_GRID_CONSTANT const CompareOpT compare_op, + CompareOpT compare_op, _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number, _CCCL_GRID_CONSTANT const int items_per_tile) { @@ -278,7 +278,7 @@ __launch_bounds__( _CCCL_GRID_CONSTANT const OffsetT keys_count, KeyT* keys_pong, ValueT* items_pong, - _CCCL_GRID_CONSTANT const CompareOpT compare_op, + CompareOpT compare_op, OffsetT* merge_partitions, _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number, vsmem_t vsmem) diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index 2d681d4831c..3df150d7e27 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -133,8 +133,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) AccumT* d_out, _CCCL_GRID_CONSTANT const OffsetT num_items, GridEvenShare even_share, - _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, - _CCCL_GRID_CONSTANT const TransformOpT transform_op) + ReductionOpT reduction_op, + TransformOpT transform_op) { // Thread block type for reducing input tiles using AgentReduceT = @@ -217,9 +217,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( 1) void DeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in, _CCCL_GRID_CONSTANT const OutputIteratorT d_out, _CCCL_GRID_CONSTANT const OffsetT num_items, - _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + ReductionOpT reduction_op, _CCCL_GRID_CONSTANT const InitT init, - _CCCL_GRID_CONSTANT const TransformOpT transform_op) + TransformOpT transform_op) { // Thread block type for reducing input tiles using AgentReduceT = @@ -301,8 +301,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) _CCCL_GRID_CONSTANT const InputIteratorT d_in, AccumT* d_out, _CCCL_GRID_CONSTANT const int num_items, - _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, - _CCCL_GRID_CONSTANT const TransformOpT transform_op, + ReductionOpT reduction_op, + TransformOpT transform_op, _CCCL_GRID_CONSTANT const int reduce_grid_size) { using reduce_policy_t = typename ChainedPolicyT::ActivePolicy::ReducePolicy; @@ -434,9 +434,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( 1) void DeterministicDeviceReduceSingleTileKernel(_CCCL_GRID_CONSTANT const InputIteratorT d_in, _CCCL_GRID_CONSTANT const OutputIteratorT d_out, _CCCL_GRID_CONSTANT const int num_items, - _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + ReductionOpT reduction_op, _CCCL_GRID_CONSTANT const InitT init, - _CCCL_GRID_CONSTANT const TransformOpT transform_op) + TransformOpT transform_op) { using single_tile_policy_t = typename ChainedPolicyT::ActivePolicy::SingleTilePolicy; @@ -502,9 +502,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int( _CCCL_GRID_CONSTANT const OutputIteratorT d_out, _CCCL_GRID_CONSTANT const OffsetT num_items, GridEvenShare even_share, - _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + ReductionOpT reduction_op, _CCCL_GRID_CONSTANT const InitT init, - _CCCL_GRID_CONSTANT const TransformOpT transform_op) + TransformOpT transform_op) { NV_IF_TARGET(NV_PROVIDES_SM_60, (), diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index 76f515faddb..3da879d750b 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -143,7 +143,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) _CCCL_GRID_CONSTANT const OutputIteratorT d_out, ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int start_tile, - _CCCL_GRID_CONSTANT const ScanOpT scan_op, + ScanOpT scan_op, InitValueT init_value, _CCCL_GRID_CONSTANT const OffsetT num_items) { diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh index 8a859631f05..954b1508282 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh @@ -105,7 +105,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) _CCCL_GRID_CONSTANT const OutputIteratorT d_out, _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets, _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets, - _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + ReductionOpT reduction_op, _CCCL_GRID_CONSTANT const InitT init) { // Thread block type for reducing input tiles @@ -192,7 +192,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) _CCCL_GRID_CONSTANT const OutputIteratorT d_out, _CCCL_GRID_CONSTANT const OffsetT segment_size, _CCCL_GRID_CONSTANT const int num_segments, - _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + ReductionOpT reduction_op, _CCCL_GRID_CONSTANT const InitT init) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; diff --git a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh index d0533bf516a..961f03587cf 100644 --- a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh @@ -120,8 +120,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO _CCCL_GRID_CONSTANT const UnselectedOutputIteratorT d_unselected_out, _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_status, - _CCCL_GRID_CONSTANT const SelectFirstPartOp select_first_part_op, - _CCCL_GRID_CONSTANT const SelectSecondPartOp select_second_part_op, + SelectFirstPartOp select_first_part_op, + SelectSecondPartOp select_second_part_op, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context) diff --git a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh index 564d25b1e33..3c87c90f040 100644 --- a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh @@ -137,7 +137,7 @@ __launch_bounds__(int( _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out, _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_state, - _CCCL_GRID_CONSTANT const EqualityOpT equality_op, + EqualityOpT equality_op, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int num_tiles, vsmem_t vsmem) From 5b6236082e2aa3fb49800af36ac8ff692378fb43 Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Sat, 15 Nov 2025 18:39:56 +0530 Subject: [PATCH 05/14] FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters Fixes --- cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh | 2 +- cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh | 8 ++++++++ cub/cub/device/dispatch/kernels/kernel_reduce.cuh | 5 +++-- cub/cub/device/dispatch/kernels/kernel_scan.cuh | 9 +++------ 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh index 659c48e99bf..dc3773d02b0 100644 --- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh @@ -278,7 +278,7 @@ __launch_bounds__( _CCCL_GRID_CONSTANT const OffsetT keys_count, KeyT* keys_pong, ValueT* items_pong, - CompareOpT compare_op, + _CCCL_GRID_CONSTANT const CompareOpT compare_op, OffsetT* merge_partitions, _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number, vsmem_t vsmem) diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh index 01a8d1fdd57..d1c7063d0fb 100644 --- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh @@ -444,12 +444,16 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel( +<<<<<<< HEAD OffsetT* d_bins_out, KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) +======= + OffsetT* d_bins_out, const KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, DecomposerT decomposer = {}) +>>>>>>> 4b1217559 (FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters) { using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy; using AgentT = AgentRadixSortHistogram; @@ -479,7 +483,11 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy _CCCL_GRID_CONSTANT const PortionOffsetT num_items, _CCCL_GRID_CONSTANT const int current_bit, _CCCL_GRID_CONSTANT const int num_bits, +<<<<<<< HEAD _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) +======= + DecomposerT decomposer = {}) +>>>>>>> 4b1217559 (FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters) { using OnesweepPolicyT = typename ChainedPolicyT::ActivePolicy::OnesweepPolicy; using AgentT = diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index 3df150d7e27..27f081801a9 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -133,8 +133,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) AccumT* d_out, _CCCL_GRID_CONSTANT const OffsetT num_items, GridEvenShare even_share, - ReductionOpT reduction_op, - TransformOpT transform_op) + _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, + _CCCL_GRID_CONSTANT const TransformOpT transform_op) { // Thread block type for reducing input tiles using AgentReduceT = @@ -215,6 +215,7 @@ template -CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles) { _CCCL_PDL_GRID_DEPENDENCY_SYNC(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // beneficial for all problem sizes in cub.bench.scan.exclusive.sum.base @@ -68,10 +67,8 @@ DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int nu * (i.e., length of `d_selected_out`) */ template -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceCompactInitKernel( - ScanTileStateT tile_state, - _CCCL_GRID_CONSTANT const int num_tiles, - _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out) +CUB_DETAIL_KERNEL_ATTRIBUTES void +DeviceCompactInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); From 3240916503c27f7830beb2d6ff897cb295434768 Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Tue, 18 Nov 2025 14:04:49 +0530 Subject: [PATCH 06/14] Removed _CCCL_GRID_CONSTANT const from all operators Fixes --- cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh | 2 +- cub/cub/device/dispatch/kernels/kernel_reduce.cuh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh index dc3773d02b0..659c48e99bf 100644 --- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh @@ -278,7 +278,7 @@ __launch_bounds__( _CCCL_GRID_CONSTANT const OffsetT keys_count, KeyT* keys_pong, ValueT* items_pong, - _CCCL_GRID_CONSTANT const CompareOpT compare_op, + CompareOpT compare_op, OffsetT* merge_partitions, _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number, vsmem_t vsmem) diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index 27f081801a9..cd2ccab79dc 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -133,8 +133,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) AccumT* d_out, _CCCL_GRID_CONSTANT const OffsetT num_items, GridEvenShare even_share, - _CCCL_GRID_CONSTANT const ReductionOpT reduction_op, - _CCCL_GRID_CONSTANT const TransformOpT transform_op) + ReductionOpT reduction_op, + TransformOpT transform_op) { // Thread block type for reducing input tiles using AgentReduceT = From 5a888029474eb24f5d5be143c1bccf5546f9acd1 Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Sun, 23 Nov 2025 00:10:54 +0530 Subject: [PATCH 07/14] Fixes Fixes More Fixes --- .../dispatch/dispatch_adjacent_difference.cuh | 2 +- .../dispatch/kernels/kernel_for_each.cuh | 2 +- .../dispatch/kernels/kernel_merge_sort.cuh | 2 +- .../dispatch/kernels/kernel_radix_sort.cuh | 26 +++++++------------ .../kernels/kernel_segmented_radix_sort.cuh | 4 +-- .../kernels/kernel_segmented_scan.cuh | 16 ++++++------ .../kernels/kernel_segmented_sort.cuh | 10 +++---- .../dispatch/kernels/kernel_transform.cuh | 2 +- 8 files changed, 29 insertions(+), 35 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 999f87945d3..eb0502b66f4 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -34,7 +34,7 @@ namespace detail::adjacent_difference template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel( _CCCL_GRID_CONSTANT const InputIteratorT first, - InputT* result, + _CCCL_GRID_CONSTANT const InputT* result, _CCCL_GRID_CONSTANT const OffsetT num_tiles, _CCCL_GRID_CONSTANT const int items_per_tile) { diff --git a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh index 9e924848a1e..760f221ee9f 100644 --- a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh @@ -111,7 +111,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(_CCCL_GRID_CONSTANT const Offse template CUB_DETAIL_KERNEL_ATTRIBUTES // __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) // - void static_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OpT op) + void static_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, OpT op) { using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; using agent_t = agent_block_striped_t; diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh index 659c48e99bf..e21c6a2a8bb 100644 --- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh @@ -175,7 +175,7 @@ __launch_bounds__( _CCCL_GRID_CONSTANT const OffsetT keys_count, KeyT* tmp_keys_out, ValueT* tmp_items_out, - _CCCL_GRID_CONSTANT const CompareOpT compare_op, + CompareOpT compare_op, vsmem_t vsmem) { using MergeSortHelperT = typename VSMemHelperT::template MergeSortVSMemHelperT< diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh index d1c7063d0fb..f18a3b0d633 100644 --- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh @@ -83,7 +83,7 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel( - const KeyT* d_keys_in, + _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, KeyT* d_keys_out, - const ValueT* d_values_in, + _CCCL_GRID_CONSTANT const ValueT* const d_values_in, ValueT* d_values_out, OffsetT num_items, _CCCL_GRID_CONSTANT const int current_bit, @@ -444,16 +444,14 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel( -<<<<<<< HEAD OffsetT* d_bins_out, KeyT* d_keys_in, + OffsetT* d_bins_out, + _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) -======= - OffsetT* d_bins_out, const KeyT* d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, _CCCL_GRID_CONSTANT const int end_bit, DecomposerT decomposer = {}) ->>>>>>> 4b1217559 (FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters) { using HistogramPolicyT = typename ChainedPolicyT::ActivePolicy::HistogramPolicy; using AgentT = AgentRadixSortHistogram; @@ -475,19 +473,15 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out, - const OffsetT* d_bins_in, + _CCCL_GRID_CONSTANT const OffsetT* const d_bins_in, KeyT* d_keys_out, - const KeyT* d_keys_in, + _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, ValueT* d_values_out, - ValueT* d_values_in, + _CCCL_GRID_CONSTANT const ValueT* const d_values_in, _CCCL_GRID_CONSTANT const PortionOffsetT num_items, _CCCL_GRID_CONSTANT const int current_bit, _CCCL_GRID_CONSTANT const int num_bits, -<<<<<<< HEAD _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) -======= - DecomposerT decomposer = {}) ->>>>>>> 4b1217559 (FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters) { using OnesweepPolicyT = typename ChainedPolicyT::ActivePolicy::OnesweepPolicy; using AgentT = diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh index 8856956f254..25ffa659347 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh @@ -98,9 +98,9 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::segmented_scan_policy_t::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void device_segmented_scan_kernel( - InputIteratorT d_in, - OutputIteratorT d_out, - BeginOffsetIteratorInputT begin_offset_d_in, - EndOffsetIteratorInputT end_offset_d_in, - BeginOffsetIteratorOutputT begin_offset_d_out, - OffsetT n_segments, - ScanOpT scan_op, - InitValueT init_value) + _CCCL_GRID_CONSTANT const InputIteratorT d_in, + _CCCL_GRID_CONSTANT const OutputIteratorT d_out, + _CCCL_GRID_CONSTANT const BeginOffsetIteratorInputT begin_offset_d_in, + _CCCL_GRID_CONSTANT const EndOffsetIteratorInputT end_offset_d_in, + _CCCL_GRID_CONSTANT const BeginOffsetIteratorOutputT begin_offset_d_out, + _CCCL_GRID_CONSTANT const OffsetT n_segments, + _CCCL_GRID_CONSTANT const ScanOpT scan_op, + _CCCL_GRID_CONSTANT const InitValueT init_value) { using segmented_scan_policy_t = typename ChainedPolicyT::ActivePolicy::segmented_scan_policy_t; diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh index e13833ffa74..99135d9bb33 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh @@ -307,11 +307,11 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallSegmentPolicy::BLOCK_THREAD _CCCL_GRID_CONSTANT const local_segment_index_t small_segments, _CCCL_GRID_CONSTANT const local_segment_index_t medium_segments, _CCCL_GRID_CONSTANT const local_segment_index_t medium_blocks, - const local_segment_index_t* d_small_segments_indices, - const local_segment_index_t* d_medium_segments_indices, - const KeyT* d_keys_in, + _CCCL_GRID_CONSTANT const local_segment_index_t* const d_small_segments_indices, + _CCCL_GRID_CONSTANT const local_segment_index_t* const d_medium_segments_indices, + _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, KeyT* d_keys_out, - const ValueT* d_values_in, + _CCCL_GRID_CONSTANT const ValueT* const d_values_in, ValueT* d_values_out, _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets, _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets) @@ -425,7 +425,7 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelLarge( - const local_segment_index_t* d_segments_indices, + _CCCL_GRID_CONSTANT const local_segment_index_t* const d_segments_indices, const KeyT* d_keys_in_orig, KeyT* d_keys_out_orig, device_double_buffer d_keys_double_buffer, diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh index 27fddae480f..59fbb3b30a4 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh @@ -989,7 +989,7 @@ __launch_bounds__(get_block_threads) [[maybe_unused]] bool can_vectorize, _CCCL_GRID_CONSTANT const Predicate pred, _CCCL_GRID_CONSTANT const F f, - RandomAccessIteratorOut out, + _CCCL_GRID_CONSTANT const RandomAccessIteratorOut out, kernel_arg... ins) { _CCCL_ASSERT(blockDim.y == 1 && blockDim.z == 1, "transform_kernel only supports 1D blocks"); From bd65528257aae69a43b742d9f2b6de4fa9588fcc Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Sun, 23 Nov 2025 13:15:18 +0530 Subject: [PATCH 08/14] More Fixes --- cub/cub/device/dispatch/kernels/kernel_scan.cuh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index 2ab114f6743..33f2d344625 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -39,7 +39,8 @@ namespace detail::scan * Number of tiles */ template -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles) +CUB_DETAIL_KERNEL_ATTRIBUTES void +DeviceScanInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles) { _CCCL_PDL_GRID_DEPENDENCY_SYNC(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // beneficial for all problem sizes in cub.bench.scan.exclusive.sum.base @@ -67,8 +68,8 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state * (i.e., length of `d_selected_out`) */ template -CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceCompactInitKernel(ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceCompactInitKernel( + ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); From 0e50ba35ba34ba9a29d3bf87fe57e099381f9a01 Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Wed, 26 Nov 2025 19:49:32 +0530 Subject: [PATCH 09/14] Minor Fixes --- .../dispatch/dispatch_adjacent_difference.cuh | 2 +- .../device/dispatch/dispatch_scan_by_key.cuh | 2 +- .../dispatch/dispatch_segmented_sort.cuh | 28 ++++++++--------- cub/cub/device/dispatch/dispatch_topk.cuh | 14 ++++----- .../dispatch/kernels/kernel_merge_sort.cuh | 14 ++++----- .../dispatch/kernels/kernel_radix_sort.cuh | 31 +++++++++---------- .../device/dispatch/kernels/kernel_reduce.cuh | 5 ++- .../kernels/kernel_segmented_radix_sort.cuh | 4 +-- .../kernels/kernel_segmented_sort.cuh | 4 +-- .../dispatch/kernels/kernel_transform.cuh | 2 +- 10 files changed, 52 insertions(+), 54 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index eb0502b66f4..5f592133924 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -52,7 +52,7 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel( _CCCL_GRID_CONSTANT const InputIteratorT input, - InputT* first_tile_previous, + _CCCL_GRID_CONSTANT InputT* const first_tile_previous, _CCCL_GRID_CONSTANT const OutputIteratorT result, DifferenceOpT difference_op, _CCCL_GRID_CONSTANT const OffsetT num_items) diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 59499f4c096..6a15a808927 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -118,7 +118,7 @@ template __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContinuationKernel( - LargeKernelT large_kernel, - SmallKernelT small_kernel, - local_segment_index_t num_segments, - KeyT* d_current_keys, - KeyT* d_final_keys, + _CCCL_GRID_CONSTANT const LargeKernelT large_kernel, + _CCCL_GRID_CONSTANT const SmallKernelT small_kernel, + _CCCL_GRID_CONSTANT const local_segment_index_t num_segments, + _CCCL_GRID_CONSTANT KeyT* const d_current_keys, + _CCCL_GRID_CONSTANT KeyT* const d_final_keys, device_double_buffer d_keys_double_buffer, - ValueT* d_current_values, - ValueT* d_final_values, + _CCCL_GRID_CONSTANT ValueT* const d_current_values, + _CCCL_GRID_CONSTANT ValueT* const d_final_values, device_double_buffer d_values_double_buffer, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - local_segment_index_t* group_sizes, - local_segment_index_t* large_and_medium_segments_indices, - local_segment_index_t* small_segments_indices, - KernelLauncherFactory launcher_factory, - WrappedPolicyT wrapped_policy) + _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets, + _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets, + _CCCL_GRID_CONSTANT local_segment_index_t* const group_sizes, + _CCCL_GRID_CONSTANT local_segment_index_t* const large_and_medium_segments_indices, + _CCCL_GRID_CONSTANT local_segment_index_t* const small_segments_indices, + _CCCL_GRID_CONSTANT const KernelLauncherFactory launcher_factory, + _CCCL_GRID_CONSTANT const WrappedPolicyT wrapped_policy) { // In case of CDP: // 1. each CTA has a different main stream diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh index 624c5d598ad..7fc26327c65 100644 --- a/cub/cub/device/dispatch/dispatch_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_topk.cuh @@ -116,12 +116,12 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out, _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in, _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out, - KeyInT* in_buf, - OffsetT* in_idx_buf, - KeyInT* out_buf, - OffsetT* out_idx_buf, + _CCCL_GRID_CONSTANT KeyInT* const in_buf, + _CCCL_GRID_CONSTANT OffsetT* const in_idx_buf, + _CCCL_GRID_CONSTANT KeyInT* const out_buf, + _CCCL_GRID_CONSTANT OffsetT* const out_idx_buf, Counter, OffsetT, OutOffsetT>* counter, - OffsetT* histogram, + _CCCL_GRID_CONSTANT OffsetT* const histogram, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OutOffsetT k, _CCCL_GRID_CONSTANT const OffsetT buffer_length, @@ -172,8 +172,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::topk_policy_t::block_threads _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out, _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in, _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out, - KeyInT* in_buf, - OffsetT* in_idx_buf, + _CCCL_GRID_CONSTANT KeyInT* const in_buf, + _CCCL_GRID_CONSTANT OffsetT* const in_idx_buf, Counter, OffsetT, OutOffsetT>* counter, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const OutOffsetT k, diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh index e21c6a2a8bb..3f428f7c7ba 100644 --- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh @@ -173,8 +173,8 @@ __launch_bounds__( _CCCL_GRID_CONSTANT const KeyIteratorT keys_out, _CCCL_GRID_CONSTANT const ValueIteratorT items_out, _CCCL_GRID_CONSTANT const OffsetT keys_count, - KeyT* tmp_keys_out, - ValueT* tmp_items_out, + _CCCL_GRID_CONSTANT KeyT* const tmp_keys_out, + _CCCL_GRID_CONSTANT ValueT* const tmp_items_out, CompareOpT compare_op, vsmem_t vsmem) { @@ -223,10 +223,10 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) - CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, _CCCL_GRID_CONSTANT const int num_counts) + CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel( + _CCCL_GRID_CONSTANT OffsetT* const d_spine, _CCCL_GRID_CONSTANT const int num_counts) { // Parameterize the AgentScan type for the current configuration using AgentScanT = @@ -239,10 +240,10 @@ __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDo : int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel( _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, - KeyT* d_keys_out, + _CCCL_GRID_CONSTANT KeyT* const d_keys_out, _CCCL_GRID_CONSTANT const ValueT* const d_values_in, - ValueT* d_values_out, - OffsetT* d_spine, + _CCCL_GRID_CONSTANT ValueT* const d_values_out, + _CCCL_GRID_CONSTANT OffsetT* const d_spine, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int current_bit, _CCCL_GRID_CONSTANT const int num_bits, @@ -328,9 +329,9 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel( - OffsetT* d_bins_out, - KeyT* d_keys_in, - OffsetT* d_bins_out, + _CCCL_GRID_CONSTANT OffsetT* const d_bins_out, _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, _CCCL_GRID_CONSTANT const OffsetT num_items, _CCCL_GRID_CONSTANT const int start_bit, @@ -470,13 +469,13 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS) DeviceRadixSortOnesweepKernel( - AtomicOffsetT* d_lookback, - AtomicOffsetT* d_ctrs, - OffsetT* d_bins_out, + _CCCL_GRID_CONSTANT AtomicOffsetT* const d_lookback, + _CCCL_GRID_CONSTANT AtomicOffsetT* const d_ctrs, + _CCCL_GRID_CONSTANT OffsetT* const d_bins_out, _CCCL_GRID_CONSTANT const OffsetT* const d_bins_in, - KeyT* d_keys_out, + _CCCL_GRID_CONSTANT KeyT* const d_keys_out, _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, - ValueT* d_values_out, + _CCCL_GRID_CONSTANT ValueT* const d_values_out, _CCCL_GRID_CONSTANT const ValueT* const d_values_in, _CCCL_GRID_CONSTANT const PortionOffsetT num_items, _CCCL_GRID_CONSTANT const int current_bit, @@ -515,7 +514,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy * Exclusive sum kernel */ template -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(_CCCL_GRID_CONSTANT OffsetT* const d_bins) { using ExclusiveSumPolicyT = typename ChainedPolicyT::ActivePolicy::ExclusiveSumPolicy; constexpr int RADIX_BITS = ExclusiveSumPolicyT::RADIX_BITS; diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index cd2ccab79dc..5dc935c547e 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -130,7 +130,7 @@ template even_share, ReductionOpT reduction_op, @@ -215,7 +215,6 @@ template ) CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( _CCCL_GRID_CONSTANT const Offset num_items, _CCCL_GRID_CONSTANT const int num_elem_per_thread, - [[maybe_unused]] bool can_vectorize, + _CCCL_GRID_CONSTANT [[maybe_unused]] const bool can_vectorize, _CCCL_GRID_CONSTANT const Predicate pred, _CCCL_GRID_CONSTANT const F f, _CCCL_GRID_CONSTANT const RandomAccessIteratorOut out, From 76c4b1094dc2671dd01b555531e95c0c91b0881e Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Thu, 4 Dec 2025 14:24:31 +0530 Subject: [PATCH 10/14] Minor Fixes --- .../device/dispatch/kernels/kernel_reduce.cuh | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index 99655ef1826..6b6ee775642 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -134,9 +134,9 @@ template even_share, ReductionOpT reduction_op, TransformOpT transform_op) @@ -227,9 +227,9 @@ template even_share, ReductionOpT reduction_op, - InitT init, + _CCCL_GRID_CONSTANT const InitT init, TransformOpT transform_op) { NV_IF_TARGET(NV_PROVIDES_SM_60, From b2bf44a506c539ef2abd861c6dd4a7c03bf156a3 Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Sat, 13 Dec 2025 11:33:06 +0530 Subject: [PATCH 11/14] Add _CCCL_GRID_CONSTANT const to ::cuda::std::array for Histogram --- cub/cub/agent/agent_histogram.cuh | 16 +++++------ .../dispatch/kernels/kernel_histogram.cuh | 28 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index d49aecdc75d..7e2112e5e2b 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -203,12 +203,12 @@ struct AgentHistogram _TempStorage& temp_storage; WrappedSampleIteratorT d_wrapped_samples; // with cache modifier applied, if possible SampleT* d_native_samples; // possibly nullptr if unavailable - int* num_output_bins; // one for each channel - int* num_privatized_bins; // one for each channel + const int* num_output_bins; // one for each channel (read-only) + const int* num_privatized_bins; // one for each channel (read-only) CounterT* d_privatized_histograms[NumActiveChannels]; // one for each channel CounterT** d_output_histograms; // in global memory - OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel - PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel + const OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel (read-only) + const PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel (read-only) bool prefer_smem; // for privatized counterss template @@ -564,12 +564,12 @@ struct AgentHistogram _CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram( TempStorage& temp_storage, SampleIteratorT d_samples, - int* num_output_bins, - int* num_privatized_bins, + const int* num_output_bins, + const int* num_privatized_bins, CounterT** d_output_histograms, CounterT** d_privatized_histograms, - OutputDecodeOpT* output_decode_op, - PrivatizedDecodeOpT* privatized_decode_op) + const OutputDecodeOpT* output_decode_op, + const PrivatizedDecodeOpT* privatized_decode_op) : temp_storage(temp_storage.Alias()) , d_wrapped_samples(d_samples) , d_native_samples(NativePointer(d_wrapped_samples)) diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh index bf86c9be4b2..77df86d4259 100644 --- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh @@ -47,7 +47,7 @@ struct Transforms // Method for converting samples to bin-ids template - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) const { /// Level iterator wrapper type // Wrap the native input pointer with CacheModifiedInputIterator @@ -185,13 +185,13 @@ struct Transforms // All types but __half: template - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(T sample, T max_level, T min_level) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(T sample, T max_level, T min_level) const { return sample >= min_level && sample < max_level; } #if _CCCL_HAS_NVFP16() - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(__half sample, __half max_level, __half min_level) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(__half sample, __half max_level, __half min_level) const { NV_IF_TARGET( NV_PROVIDES_SM_53, @@ -214,7 +214,7 @@ struct Transforms //! @brief Bin computation for floating point (and extended floating point) types template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int - ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::true_type /* is_fp */) + ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::true_type /* is_fp */) const { return static_cast((sample - min_level) * scale.reciprocal); } @@ -222,14 +222,14 @@ struct Transforms //! @brief Bin computation for custom types and __[u]int128 template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int - ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::false_type /* is_fp */) + ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::false_type /* is_fp */) const { return static_cast(((sample - min_level) * scale.fraction.bins) / scale.fraction.range); } //! @brief Bin computation for integral types of up to 64-bit types template ::value, int> = 0> - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) const { return static_cast( (static_cast(sample - min_level) * static_cast(scale.fraction.bins)) @@ -237,13 +237,13 @@ struct Transforms } template ::value, int> = 0> - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) const { return this->ComputeBin(sample, min_level, scale, ::cuda::std::is_floating_point{}); } #if _CCCL_HAS_NVFP16() - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(__half sample, __half min_level, ScaleT scale) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(__half sample, __half min_level, ScaleT scale) const { NV_IF_TARGET( NV_PROVIDES_SM_53, @@ -264,7 +264,7 @@ struct Transforms // Method for converting samples to bin-ids template - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid) const { const CommonT common_sample = static_cast(sample); @@ -296,7 +296,7 @@ struct Transforms // Method for converting samples to bin-ids template - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) const { if (valid) { @@ -445,12 +445,12 @@ template num_output_bins_wrapper, - ::cuda::std::array num_privatized_bins_wrapper, + _CCCL_GRID_CONSTANT const ::cuda::std::array num_output_bins_wrapper, + _CCCL_GRID_CONSTANT const ::cuda::std::array num_privatized_bins_wrapper, ::cuda::std::array d_output_histograms_wrapper, ::cuda::std::array d_privatized_histograms_wrapper, - ::cuda::std::array output_decode_op_wrapper, - ::cuda::std::array privatized_decode_op_wrapper, + _CCCL_GRID_CONSTANT const ::cuda::std::array output_decode_op_wrapper, + _CCCL_GRID_CONSTANT const ::cuda::std::array privatized_decode_op_wrapper, _CCCL_GRID_CONSTANT const OffsetT num_row_pixels, _CCCL_GRID_CONSTANT const OffsetT num_rows, _CCCL_GRID_CONSTANT const OffsetT row_stride_samples, From 77363bf59f87b375c823ee3d725dfaca2fc1b68a Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Sat, 13 Dec 2025 11:34:00 +0530 Subject: [PATCH 12/14] Add _CCCL_GRID_CONSTANT const to ::cuda::std::array for Histogram --- cub/cub/agent/agent_histogram.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index 7e2112e5e2b..683693821f7 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -207,8 +207,10 @@ struct AgentHistogram const int* num_privatized_bins; // one for each channel (read-only) CounterT* d_privatized_histograms[NumActiveChannels]; // one for each channel CounterT** d_output_histograms; // in global memory - const OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel (read-only) - const PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel (read-only) + const OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each + // channel (read-only) + const PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each + // channel (read-only) bool prefer_smem; // for privatized counterss template From a234d17b9953501342e525e13fb7f557ee086018 Mon Sep 17 00:00:00 2001 From: Aryamaan Singh Date: Mon, 2 Feb 2026 17:22:46 +0530 Subject: [PATCH 13/14] Fixes and Updates --- .../dispatch/kernels/kernel_radix_sort.cuh | 17 +++++++++-------- .../device/dispatch/kernels/kernel_reduce.cuh | 18 +++++++++--------- .../dispatch/kernels/kernel_transform.cuh | 12 ++++++------ 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh index 9d888eccd45..7c32b0ffd21 100644 --- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh @@ -143,7 +143,8 @@ __launch_bounds__(int(ALT_DIGIT_BITS ? PolicySelector{}(::cuda::arch_id{CUB_PTX_ */ template __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).scan.block_threads, 1) - CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, int num_counts) + CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel( + _CCCL_GRID_CONSTANT OffsetT* const d_spine, _CCCL_GRID_CONSTANT const int num_counts) { static constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).scan; using ScanPolicy = AgentScanPolicy< @@ -451,12 +452,12 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}) - .histogram.block_threads) void DeviceRadixSortHistogramKernel(OffsetT* d_bins_out, - const KeyT* d_keys_in, - OffsetT num_items, - int start_bit, - int end_bit, - DecomposerT decomposer = {}) + .histogram.block_threads) void DeviceRadixSortHistogramKernel(_CCCL_GRID_CONSTANT OffsetT* const d_bins_out, + _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const int start_bit, + _CCCL_GRID_CONSTANT const int end_bit, + _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) { static constexpr radix_sort_histogram_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).histogram; @@ -535,7 +536,7 @@ __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).onesweep. * Exclusive sum kernel */ template -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(_CCCL_GRID_CONSTANT OffsetT* const d_bins) { static constexpr radix_sort_exclusive_sum_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).exclusive_sum; diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index 50b0f1c0d5c..6ca08016d7a 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -134,9 +134,9 @@ template even_share, ReductionOpT reduction_op, TransformOpT transform_op) @@ -227,9 +227,9 @@ template even_share, ReductionOpT reduction_op, _CCCL_GRID_CONSTANT const InitT init, diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh index 30b78d491b0..0a4b1958c5e 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh @@ -1013,12 +1013,12 @@ template #endif // _CCCL_HAS_CONCEPTS() __launch_bounds__(get_block_threads) CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( - Offset num_items, - int num_elem_per_thread, - [[maybe_unused]] bool can_vectorize, - Predicate pred, - F f, - RandomAccessIteratorOut out, + _CCCL_GRID_CONSTANT const Offset num_items, + _CCCL_GRID_CONSTANT const int num_elem_per_thread, + _CCCL_GRID_CONSTANT [[maybe_unused]] const bool can_vectorize, + _CCCL_GRID_CONSTANT const Predicate pred, + _CCCL_GRID_CONSTANT const F f, + _CCCL_GRID_CONSTANT const RandomAccessIteratorOut out, kernel_arg... ins) { _CCCL_ASSERT(blockDim.y == 1 && blockDim.z == 1, "transform_kernel only supports 1D blocks"); From 967ce0b5be7cc105744c40ea47278993993849b9 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 6 Feb 2026 16:23:52 +0100 Subject: [PATCH 14/14] FIx --- cub/cub/device/dispatch/dispatch_adjacent_difference.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index f792d59723d..6e885c59e99 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -34,7 +34,7 @@ namespace detail::adjacent_difference template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel( _CCCL_GRID_CONSTANT const InputIteratorT first, - _CCCL_GRID_CONSTANT const InputT* result, + _CCCL_GRID_CONSTANT InputT* const result, _CCCL_GRID_CONSTANT const OffsetT num_tiles, _CCCL_GRID_CONSTANT const int items_per_tile) {