diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index 4c4ba87ed92..568297418ee 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -226,12 +226,14 @@ struct AgentHistogram _TempStorage& temp_storage; WrappedSampleIteratorT d_wrapped_samples; // with cache modifier applied, if possible SampleT* d_native_samples; // possibly nullptr if unavailable - int* num_output_bins; // one for each channel - int* num_privatized_bins; // one for each channel + const int* num_output_bins; // one for each channel + const int* num_privatized_bins; // one for each channel CounterT* d_privatized_histograms[NumActiveChannels]; // one for each channel CounterT** d_output_histograms; // in global memory - OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel - PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel + const OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each + // channel + const PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each + // channel bool prefer_smem; // for privatized counterss template @@ -587,12 +589,12 @@ struct AgentHistogram _CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram( TempStorage& temp_storage, SampleIteratorT d_samples, - int* num_output_bins, - int* num_privatized_bins, + const int* num_output_bins, + const int* num_privatized_bins, CounterT** d_output_histograms, CounterT** d_privatized_histograms, - OutputDecodeOpT* output_decode_op, - PrivatizedDecodeOpT* privatized_decode_op) + const OutputDecodeOpT* output_decode_op, + const PrivatizedDecodeOpT* privatized_decode_op) : temp_storage(temp_storage.Alias()) , d_wrapped_samples(d_samples) , d_native_samples(NativePointer(d_wrapped_samples)) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 95e88e71050..ccdc9231c68 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -39,8 +39,11 @@ CUB_NAMESPACE_BEGIN namespace detail::adjacent_difference { template -CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel( + _CCCL_GRID_CONSTANT const InputIteratorT first, + _CCCL_GRID_CONSTANT InputT* const result, + _CCCL_GRID_CONSTANT const OffsetT num_tiles, + _CCCL_GRID_CONSTANT const int items_per_tile) { const int tile_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile); @@ -55,11 +58,11 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel( - InputIteratorT input, - InputT* first_tile_previous, - OutputIteratorT result, + _CCCL_GRID_CONSTANT const InputIteratorT input, + _CCCL_GRID_CONSTANT InputT* const first_tile_previous, + _CCCL_GRID_CONSTANT const OutputIteratorT result, DifferenceOpT difference_op, - OffsetT num_items) + _CCCL_GRID_CONSTANT const OffsetT num_items) { static_assert(::cuda::std::is_empty_v); static constexpr adjacent_difference_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 33f73fa6a56..312278ba33e 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -61,7 +61,7 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void MultiBlockBatchMemcpyKernel( - InputBufferIt input_buffer_it, - OutputBufferIt output_buffer_it, - BufferSizeIteratorT buffer_sizes, - BufferTileOffsetItT buffer_tile_offsets, + _CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it, + _CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it, + _CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes, + _CCCL_GRID_CONSTANT const BufferTileOffsetItT buffer_tile_offsets, TileT buffer_offset_tile, - TileOffsetT last_tile_offset) + _CCCL_GRID_CONSTANT const TileOffsetT last_tile_offset) { using StatusWord = typename TileT::StatusWord; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT; @@ -210,16 +210,16 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void BatchMemcpyKernel( - InputBufferIt input_buffer_it, - OutputBufferIt output_buffer_it, - BufferSizeIteratorT buffer_sizes, - BufferOffsetT num_buffers, - BlevBufferSrcsOutItT blev_buffer_srcs, - BlevBufferDstsOutItT blev_buffer_dsts, - BlevBufferSizesOutItT blev_buffer_sizes, - BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets, - BLevBufferOffsetTileState blev_buffer_scan_state, - BLevBlockOffsetTileState blev_block_scan_state) + _CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it, + _CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it, + _CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes, + _CCCL_GRID_CONSTANT const BufferOffsetT num_buffers, + _CCCL_GRID_CONSTANT const BlevBufferSrcsOutItT blev_buffer_srcs, + _CCCL_GRID_CONSTANT const BlevBufferDstsOutItT blev_buffer_dsts, + _CCCL_GRID_CONSTANT const BlevBufferSizesOutItT blev_buffer_sizes, + _CCCL_GRID_CONSTANT const BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets, + _CCCL_GRID_CONSTANT const BLevBufferOffsetTileState blev_buffer_scan_state, + _CCCL_GRID_CONSTANT const BLevBlockOffsetTileState blev_block_scan_state) { // Internal type used for storing a buffer's size using BufferSizeT = it_value_t; diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index 7f5a13550b9..2fc376d674e 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -89,11 +89,11 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( - KeyIt1 keys1, - Offset keys1_count, - KeyIt2 keys2, - Offset keys2_count, - Offset num_diagonals, + _CCCL_GRID_CONSTANT const KeyIt1 keys1, + _CCCL_GRID_CONSTANT const Offset keys1_count, + _CCCL_GRID_CONSTANT const KeyIt2 keys2, + _CCCL_GRID_CONSTANT const Offset keys2_count, + _CCCL_GRID_CONSTANT const Offset num_diagonals, Offset* key1_beg_offsets, CompareOp compare_op) { @@ -137,14 +137,14 @@ __launch_bounds__( Offset, CompareOp>::type::block_threads) CUB_DETAIL_KERNEL_ATTRIBUTES void device_merge_kernel( - KeyIt1 keys1, - ValueIt1 items1, - Offset num_keys1, - KeyIt2 keys2, - ValueIt2 items2, - Offset num_keys2, - KeyIt3 keys_result, - ValueIt3 items_result, + _CCCL_GRID_CONSTANT const KeyIt1 keys1, + _CCCL_GRID_CONSTANT const ValueIt1 items1, + _CCCL_GRID_CONSTANT const Offset num_keys1, + _CCCL_GRID_CONSTANT const KeyIt2 keys2, + _CCCL_GRID_CONSTANT const ValueIt2 items2, + _CCCL_GRID_CONSTANT const Offset num_keys2, + _CCCL_GRID_CONSTANT const KeyIt3 keys_result, + _CCCL_GRID_CONSTANT const ValueIt3 items_result, CompareOp compare_op, Offset* key1_beg_offsets, vsmem_t global_temp_storage) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 14fbf3ae15c..e884a5472ed 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -190,16 +190,16 @@ template > __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyKernel( - KeysInputIteratorT d_keys_in, - KeyT* d_keys_prev_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, + _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in, + _CCCL_GRID_CONSTANT KeyT* const d_keys_prev_in, + _CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in, + _CCCL_GRID_CONSTANT const ValuesOutputIteratorT d_values_out, ScanByKeyTileStateT tile_state, - int start_tile, + _CCCL_GRID_CONSTANT const int start_tile, EqualityOp equality_op, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items) + _CCCL_GRID_CONSTANT const ScanOpT scan_op, + _CCCL_GRID_CONSTANT const InitValueT init_value, + _CCCL_GRID_CONSTANT const OffsetT num_items) { using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT; @@ -156,10 +156,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THRE template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyInitKernel( ScanTileStateT tile_state, - KeysInputIteratorT d_keys_in, + _CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in, cub::detail::it_value_t* d_keys_prev_in, - OffsetT items_per_tile, - int num_tiles) + _CCCL_GRID_CONSTANT const OffsetT items_per_tile, + _CCCL_GRID_CONSTANT const int num_tiles) { // Initialize tile status tile_state.InitializeStatus(num_tiles); diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 728ec456f88..6404054aa02 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -183,25 +183,25 @@ template __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContinuationKernel( - LargeKernelT large_kernel, - SmallKernelT small_kernel, - local_segment_index_t num_segments, - KeyT* d_current_keys, - KeyT* d_final_keys, + _CCCL_GRID_CONSTANT const LargeKernelT large_kernel, + _CCCL_GRID_CONSTANT const SmallKernelT small_kernel, + _CCCL_GRID_CONSTANT const local_segment_index_t num_segments, + _CCCL_GRID_CONSTANT KeyT* const d_current_keys, + _CCCL_GRID_CONSTANT KeyT* const d_final_keys, device_double_buffer d_keys_double_buffer, - ValueT* d_current_values, - ValueT* d_final_values, + _CCCL_GRID_CONSTANT ValueT* const d_current_values, + _CCCL_GRID_CONSTANT ValueT* const d_final_values, device_double_buffer d_values_double_buffer, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - local_segment_index_t* group_sizes, - local_segment_index_t* large_and_medium_segments_indices, - local_segment_index_t* small_segments_indices, - KernelLauncherFactory launcher_factory, - int large_block_threads, - int small_block_threads, - int medium_segments_per_block, - int small_segments_per_block) + _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets, + _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets, + _CCCL_GRID_CONSTANT local_segment_index_t* const group_sizes, + _CCCL_GRID_CONSTANT local_segment_index_t* const large_and_medium_segments_indices, + _CCCL_GRID_CONSTANT local_segment_index_t* const small_segments_indices, + _CCCL_GRID_CONSTANT const KernelLauncherFactory launcher_factory, + _CCCL_GRID_CONSTANT const int large_block_threads, + _CCCL_GRID_CONSTANT const int small_block_threads, + _CCCL_GRID_CONSTANT const int medium_segments_per_block, + _CCCL_GRID_CONSTANT const int small_segments_per_block) { // In case of CDP: // 1. each CTA has a different main stream diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index f70c3e2189a..2f2d611ba8d 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -315,15 +315,15 @@ __launch_bounds__(int( OffsetT, StreamingContextT>::agent_policy_t::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSelectSweepKernel( - InputIteratorT d_in, - FlagsInputIteratorT d_flags, - SelectedOutputIteratorT d_selected_out, - NumSelectedIteratorT d_num_selected_out, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, + _CCCL_GRID_CONSTANT const FlagsInputIteratorT d_flags, + _CCCL_GRID_CONSTANT const SelectedOutputIteratorT d_selected_out, + _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_status, SelectOpT select_op, EqualityOpT equality_op, - OffsetT num_items, - int num_tiles, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context, vsmem_t vsmem) { diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh index 543bceabcee..1c15e99b6a9 100644 --- a/cub/cub/device/dispatch/dispatch_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_topk.cuh @@ -112,22 +112,22 @@ template , OffsetT, OutOffsetT>* counter, - OffsetT* histogram, - OffsetT num_items, - OutOffsetT k, - OffsetT buffer_length, + _CCCL_GRID_CONSTANT OffsetT* const histogram, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const OutOffsetT k, + _CCCL_GRID_CONSTANT const OffsetT buffer_length, ExtractBinOpT extract_bin_op, IdentifyCandidatesOpT identify_candidates_op, - int pass) + _CCCL_GRID_CONSTANT const int pass) { static constexpr topk_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); using agent_topk_policy_t = @@ -177,18 +177,18 @@ template , OffsetT, OutOffsetT>* counter, - OffsetT num_items, - OutOffsetT k, - OffsetT buffer_length, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const OutOffsetT k, + _CCCL_GRID_CONSTANT const OffsetT buffer_length, IdentifyCandidatesOpT identify_candidates_op, - int pass) + _CCCL_GRID_CONSTANT const int pass) { static constexpr topk_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); using agent_topk_policy_t = diff --git a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh index d98683ca964..307f5275061 100644 --- a/cub/cub/device/dispatch/kernels/kernel_for_each.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_for_each.cuh @@ -90,7 +90,7 @@ template #if _CCCL_HAS_CONCEPTS() requires for_policy_selector #endif // _CCCL_HAS_CONCEPTS() -CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op) +CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, OpT op) { static constexpr for_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); using agent_policy_t = policy_t; @@ -119,7 +119,7 @@ template #endif // _CCCL_HAS_CONCEPTS() CUB_DETAIL_KERNEL_ATTRIBUTES // __launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads)) // - void static_kernel(OffsetT num_items, OpT op) + void static_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, OpT op) { static constexpr for_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); using agent_policy_t = policy_t; diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh index bc8f17f1e39..77df86d4259 100644 --- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh @@ -47,7 +47,7 @@ struct Transforms // Method for converting samples to bin-ids template - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) const { /// Level iterator wrapper type // Wrap the native input pointer with CacheModifiedInputIterator @@ -185,13 +185,13 @@ struct Transforms // All types but __half: template - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(T sample, T max_level, T min_level) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(T sample, T max_level, T min_level) const { return sample >= min_level && sample < max_level; } #if _CCCL_HAS_NVFP16() - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(__half sample, __half max_level, __half min_level) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(__half sample, __half max_level, __half min_level) const { NV_IF_TARGET( NV_PROVIDES_SM_53, @@ -214,7 +214,7 @@ struct Transforms //! @brief Bin computation for floating point (and extended floating point) types template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int - ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::true_type /* is_fp */) + ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::true_type /* is_fp */) const { return static_cast((sample - min_level) * scale.reciprocal); } @@ -222,14 +222,14 @@ struct Transforms //! @brief Bin computation for custom types and __[u]int128 template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int - ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::false_type /* is_fp */) + ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::false_type /* is_fp */) const { return static_cast(((sample - min_level) * scale.fraction.bins) / scale.fraction.range); } //! @brief Bin computation for integral types of up to 64-bit types template ::value, int> = 0> - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) const { return static_cast( (static_cast(sample - min_level) * static_cast(scale.fraction.bins)) @@ -237,13 +237,13 @@ struct Transforms } template ::value, int> = 0> - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) const { return this->ComputeBin(sample, min_level, scale, ::cuda::std::is_floating_point{}); } #if _CCCL_HAS_NVFP16() - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(__half sample, __half min_level, ScaleT scale) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(__half sample, __half min_level, ScaleT scale) const { NV_IF_TARGET( NV_PROVIDES_SM_53, @@ -264,7 +264,7 @@ struct Transforms // Method for converting samples to bin-ids template - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid) const { const CommonT common_sample = static_cast(sample); @@ -296,7 +296,7 @@ struct Transforms // Method for converting samples to bin-ids template - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) const { if (valid) { @@ -444,17 +444,17 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramSweepKernel( - SampleIteratorT d_samples, - ::cuda::std::array num_output_bins_wrapper, - ::cuda::std::array num_privatized_bins_wrapper, + _CCCL_GRID_CONSTANT const SampleIteratorT d_samples, + _CCCL_GRID_CONSTANT const ::cuda::std::array num_output_bins_wrapper, + _CCCL_GRID_CONSTANT const ::cuda::std::array num_privatized_bins_wrapper, ::cuda::std::array d_output_histograms_wrapper, ::cuda::std::array d_privatized_histograms_wrapper, - ::cuda::std::array output_decode_op_wrapper, - ::cuda::std::array privatized_decode_op_wrapper, - OffsetT num_row_pixels, - OffsetT num_rows, - OffsetT row_stride_samples, - int tiles_per_row, + _CCCL_GRID_CONSTANT const ::cuda::std::array output_decode_op_wrapper, + _CCCL_GRID_CONSTANT const ::cuda::std::array privatized_decode_op_wrapper, + _CCCL_GRID_CONSTANT const OffsetT num_row_pixels, + _CCCL_GRID_CONSTANT const OffsetT num_rows, + _CCCL_GRID_CONSTANT const OffsetT row_stride_samples, + _CCCL_GRID_CONSTANT const int tiles_per_row, GridQueue tile_queue) { // Thread block type for compositing input tiles diff --git a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh index a678ee22ff0..7c5f0d8ae47 100644 --- a/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_merge_sort.cuh @@ -167,14 +167,14 @@ __launch_bounds__( KeyT, ValueT>::policy_t::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel( - bool ping, - KeyInputIteratorT keys_in, - ValueInputIteratorT items_in, - KeyIteratorT keys_out, - ValueIteratorT items_out, - OffsetT keys_count, - KeyT* tmp_keys_out, - ValueT* tmp_items_out, + _CCCL_GRID_CONSTANT const bool ping, + _CCCL_GRID_CONSTANT const KeyInputIteratorT keys_in, + _CCCL_GRID_CONSTANT const ValueInputIteratorT items_in, + _CCCL_GRID_CONSTANT const KeyIteratorT keys_out, + _CCCL_GRID_CONSTANT const ValueIteratorT items_out, + _CCCL_GRID_CONSTANT const OffsetT keys_count, + _CCCL_GRID_CONSTANT KeyT* const tmp_keys_out, + _CCCL_GRID_CONSTANT ValueT* const tmp_items_out, CompareOpT compare_op, vsmem_t vsmem) { @@ -221,15 +221,15 @@ __launch_bounds__( template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel( - bool ping, - KeyIteratorT keys_ping, - KeyT* keys_pong, - OffsetT keys_count, - OffsetT num_partitions, - OffsetT* merge_partitions, + _CCCL_GRID_CONSTANT const bool ping, + _CCCL_GRID_CONSTANT const KeyIteratorT keys_ping, + _CCCL_GRID_CONSTANT KeyT* const keys_pong, + _CCCL_GRID_CONSTANT const OffsetT keys_count, + _CCCL_GRID_CONSTANT const OffsetT num_partitions, + _CCCL_GRID_CONSTANT OffsetT* const merge_partitions, CompareOpT compare_op, - OffsetT target_merged_tiles_number, - int items_per_tile) + _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number, + _CCCL_GRID_CONSTANT const int items_per_tile) { const OffsetT partition_idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); if (partition_idx < num_partitions) @@ -271,15 +271,15 @@ __launch_bounds__( KeyT, ValueT>::policy_t::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel( - bool ping, - KeyIteratorT keys_ping, - ValueIteratorT items_ping, - OffsetT keys_count, - KeyT* keys_pong, - ValueT* items_pong, + _CCCL_GRID_CONSTANT const bool ping, + _CCCL_GRID_CONSTANT const KeyIteratorT keys_ping, + _CCCL_GRID_CONSTANT const ValueIteratorT items_ping, + _CCCL_GRID_CONSTANT const OffsetT keys_count, + _CCCL_GRID_CONSTANT KeyT* const keys_pong, + _CCCL_GRID_CONSTANT ValueT* const items_pong, CompareOpT compare_op, - OffsetT* merge_partitions, - OffsetT target_merged_tiles_number, + _CCCL_GRID_CONSTANT OffsetT* const merge_partitions, + _CCCL_GRID_CONSTANT const OffsetT target_merged_tiles_number, vsmem_t vsmem) { using MergeSortHelperT = typename VSMemHelperT::template MergeSortVSMemHelperT< diff --git a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh index 0a5fcdc7125..7c32b0ffd21 100644 --- a/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_radix_sort.cuh @@ -81,13 +81,13 @@ template even_share, - DecomposerT decomposer = {}) + _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) { static constexpr radix_sort_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); static constexpr radix_sort_upsweep_policy active_upsweep_policy = @@ -143,7 +143,8 @@ __launch_bounds__(int(ALT_DIGIT_BITS ? PolicySelector{}(::cuda::arch_id{CUB_PTX_ */ template __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).scan.block_threads, 1) - CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, int num_counts) + CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel( + _CCCL_GRID_CONSTANT OffsetT* const d_spine, _CCCL_GRID_CONSTANT const int num_counts) { static constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).scan; using ScanPolicy = AgentScanPolicy< @@ -241,16 +242,16 @@ template even_share, - DecomposerT decomposer = {}) + _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) { static constexpr radix_sort_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); @@ -335,14 +336,14 @@ template __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).single_tile.block_threads, 1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel( - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, + _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, + _CCCL_GRID_CONSTANT KeyT* const d_keys_out, + _CCCL_GRID_CONSTANT const ValueT* const d_values_in, + _CCCL_GRID_CONSTANT ValueT* const d_values_out, OffsetT num_items, - int current_bit, - int end_bit, - DecomposerT decomposer = {}) + _CCCL_GRID_CONSTANT const int current_bit, + _CCCL_GRID_CONSTANT const int end_bit, + _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) { // Constants static constexpr radix_sort_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); @@ -451,12 +452,12 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}) - .histogram.block_threads) void DeviceRadixSortHistogramKernel(OffsetT* d_bins_out, - const KeyT* d_keys_in, - OffsetT num_items, - int start_bit, - int end_bit, - DecomposerT decomposer = {}) + .histogram.block_threads) void DeviceRadixSortHistogramKernel(_CCCL_GRID_CONSTANT OffsetT* const d_bins_out, + _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const int start_bit, + _CCCL_GRID_CONSTANT const int end_bit, + _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) { static constexpr radix_sort_histogram_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).histogram; @@ -479,18 +480,18 @@ template -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(_CCCL_GRID_CONSTANT OffsetT* const d_bins) { static constexpr radix_sort_exclusive_sum_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).exclusive_sum; diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index 217e0f82f43..752a326dd38 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -131,9 +131,9 @@ template even_share, ReductionOpT reduction_op, TransformOpT transform_op) @@ -224,11 +224,11 @@ template even_share, ReductionOpT reduction_op, - InitT init, + _CCCL_GRID_CONSTANT const InitT init, TransformOpT transform_op) { NV_IF_TARGET(NV_PROVIDES_SM_60, diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index 5346385f0ba..010424f231f 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -55,7 +55,7 @@ union tile_state_kernel_arg_t */ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(128) void DeviceScanInitKernel( - tile_state_kernel_arg_t tile_state, int num_tiles) + tile_state_kernel_arg_t tile_state, _CCCL_GRID_CONSTANT const int num_tiles) { _CCCL_PDL_GRID_DEPENDENCY_SYNC(); _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // beneficial for all problem sizes in cub.bench.scan.exclusive.sum.base @@ -94,8 +94,8 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(128) void DeviceScanInitKernel( * (i.e., length of `d_selected_out`) */ template -CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceCompactInitKernel( + ScanTileStateT tile_state, _CCCL_GRID_CONSTANT const int num_tiles, NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh index 0f3a3bc5a20..31afe3c71b7 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh @@ -111,15 +111,15 @@ template ()) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel( - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int current_bit, - int pass_bits, - DecomposerT decomposer = {}) + _CCCL_GRID_CONSTANT const KeyT* const d_keys_in, + _CCCL_GRID_CONSTANT KeyT* const d_keys_out, + _CCCL_GRID_CONSTANT const ValueT* const d_values_in, + _CCCL_GRID_CONSTANT ValueT* const d_values_out, + _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets, + _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets, + _CCCL_GRID_CONSTANT const int current_bit, + _CCCL_GRID_CONSTANT const int pass_bits, + _CCCL_GRID_CONSTANT const DecomposerT decomposer = {}) { // // Constants diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh index c6855460292..d01794b1bdc 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh @@ -112,14 +112,14 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceFixedSizeSegmentedReduceKernel( - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT segment_size, - int num_segments, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, + _CCCL_GRID_CONSTANT const OutputIteratorT d_out, + _CCCL_GRID_CONSTANT const OffsetT segment_size, + _CCCL_GRID_CONSTANT const int num_segments, ReductionOpT reduction_op, - InitT init, - AccumT* d_partial_out, - int full_chunk_size, - int blocks_per_segment) + _CCCL_GRID_CONSTANT const InitT init, + _CCCL_GRID_CONSTANT AccumT* const d_partial_out, + _CCCL_GRID_CONSTANT const int full_chunk_size, + _CCCL_GRID_CONSTANT const int blocks_per_segment) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh index cc0431b8b0b..373ad649a0b 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_scan.cuh @@ -35,14 +35,14 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::segmented_scan_policy_t::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void device_segmented_scan_kernel( - InputIteratorT d_in, - OutputIteratorT d_out, - BeginOffsetIteratorInputT begin_offset_d_in, - EndOffsetIteratorInputT end_offset_d_in, - BeginOffsetIteratorOutputT begin_offset_d_out, - OffsetT n_segments, - ScanOpT scan_op, - InitValueT init_value) + _CCCL_GRID_CONSTANT const InputIteratorT d_in, + _CCCL_GRID_CONSTANT const OutputIteratorT d_out, + _CCCL_GRID_CONSTANT const BeginOffsetIteratorInputT begin_offset_d_in, + _CCCL_GRID_CONSTANT const EndOffsetIteratorInputT end_offset_d_in, + _CCCL_GRID_CONSTANT const BeginOffsetIteratorOutputT begin_offset_d_out, + _CCCL_GRID_CONSTANT const OffsetT n_segments, + _CCCL_GRID_CONSTANT const ScanOpT scan_op, + _CCCL_GRID_CONSTANT const InitValueT init_value) { using segmented_scan_policy_t = typename ChainedPolicyT::ActivePolicy::segmented_scan_policy_t; diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh index 7c8fc153698..dc9dfcd3d7c 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_sort.cuh @@ -137,8 +137,8 @@ __launch_bounds__(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).large_seg const ValueT* d_values_in_orig, ValueT* d_values_out_orig, device_double_buffer d_values_double_buffer, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets) + _CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets, + _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets) { static constexpr segmented_sort_policy active_policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); static constexpr auto large_policy = active_policy.large_segment; @@ -328,17 +328,17 @@ template d_keys_double_buffer, const ValueT* d_values_in_orig, ValueT* d_values_out_orig, device_double_buffer d_values_double_buffer, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets) + const _CCCL_GRID_CONSTANT BeginOffsetIteratorT d_begin_offsets, + const _CCCL_GRID_CONSTANT EndOffsetIteratorT d_end_offsets) { static constexpr segmented_radix_sort_policy large_policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).large_segment; diff --git a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh index e67b9acbea6..961f03587cf 100644 --- a/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_three_way_partition.cuh @@ -114,16 +114,16 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionKernel( - InputIteratorT d_in, - FirstOutputIteratorT d_first_part_out, - SecondOutputIteratorT d_second_part_out, - UnselectedOutputIteratorT d_unselected_out, - NumSelectedIteratorT d_num_selected_out, + _CCCL_GRID_CONSTANT const InputIteratorT d_in, + _CCCL_GRID_CONSTANT const FirstOutputIteratorT d_first_part_out, + _CCCL_GRID_CONSTANT const SecondOutputIteratorT d_second_part_out, + _CCCL_GRID_CONSTANT const UnselectedOutputIteratorT d_unselected_out, + _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_status, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, - OffsetT num_items, - int num_tiles, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const int num_tiles, _CCCL_GRID_CONSTANT const StreamingContextT streaming_context) { using AgentThreeWayPartitionPolicyT = typename ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy; @@ -180,8 +180,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO * (i.e., length of @p d_selected_out) */ template -CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionInitKernel( + ScanTileStateT tile_state, + _CCCL_GRID_CONSTANT const int num_tiles, + _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh index c2f46a2bff3..ac3d5ac020d 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh @@ -1033,12 +1033,12 @@ template #endif // _CCCL_HAS_CONCEPTS() __launch_bounds__(get_block_threads) CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( - Offset num_items, - int num_elem_per_thread, - [[maybe_unused]] bool can_vectorize, - Predicate pred, - F f, - RandomAccessIteratorOut out, + _CCCL_GRID_CONSTANT const Offset num_items, + _CCCL_GRID_CONSTANT const int num_elem_per_thread, + [[maybe_unused]] _CCCL_GRID_CONSTANT const bool can_vectorize, + _CCCL_GRID_CONSTANT const Predicate pred, + _CCCL_GRID_CONSTANT const F f, + _CCCL_GRID_CONSTANT const RandomAccessIteratorOut out, kernel_arg... ins) { _CCCL_ASSERT(blockDim.y == 1 && blockDim.z == 1, "transform_kernel only supports 1D blocks"); diff --git a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh index 8255d50ba58..fea4f0cb18b 100644 --- a/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_unique_by_key.cuh @@ -134,15 +134,15 @@ __launch_bounds__(int( EqualityOpT, OffsetT>::agent_policy_t::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel( - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, + _CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in, + _CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in, + _CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out, + _CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out, + _CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_state, EqualityOpT equality_op, - OffsetT num_items, - int num_tiles, + _CCCL_GRID_CONSTANT const OffsetT num_items, + _CCCL_GRID_CONSTANT const int num_tiles, vsmem_t vsmem) { using VsmemHelperT = typename VSMemHelperT::template VSMemHelperDefaultFallbackPolicyT< diff --git a/libcudacxx/include/cuda/std/__cccl/execution_space.h b/libcudacxx/include/cuda/std/__cccl/execution_space.h index eafc737b49d..96e37e5d372 100644 --- a/libcudacxx/include/cuda/std/__cccl/execution_space.h +++ b/libcudacxx/include/cuda/std/__cccl/execution_space.h @@ -48,7 +48,7 @@ # define _CCCL_PTX_ARCH() __CUDA_ARCH__ #endif -#if (_CCCL_CUDA_COMPILER(NVCC) || _CCCL_CUDA_COMPILER(NVRTC) || _CCCL_CUDA_COMPILER(CLANG, >=, 20)) \ +#if (_CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || _CCCL_CUDA_COMPILER(NVRTC) || _CCCL_CUDA_COMPILER(CLANG, >=, 20)) \ && _CCCL_PTX_ARCH() >= 700 # define _CCCL_HAS_GRID_CONSTANT() 1 # define _CCCL_GRID_CONSTANT __grid_constant__