Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
b7f0800
FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters
toxicteddy00077 Nov 15, 2025
6f71b09
FEA: Added _CCCL_GRID_CONSTANT const to more kernels
toxicteddy00077 Nov 15, 2025
ffd18c1
FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters
toxicteddy00077 Nov 17, 2025
bf617f5
Merge branch 'main' of https://github.com/toxicteddy00077/cccl into f…
toxicteddy00077 Nov 17, 2025
413d9bb
Removed _CCCL_GRID_CONSTANT const from all operators
toxicteddy00077 Nov 18, 2025
f8ac0ed
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Nov 20, 2025
5d51587
Merge branch 'NVIDIA:main' into fea/6482-enable-grid-constant
toxicteddy00077 Nov 22, 2025
5b62360
FEA: Added const _CCCL_GRID_CONSTANT to non-mutable kernel parameters
toxicteddy00077 Nov 15, 2025
3240916
Removed _CCCL_GRID_CONSTANT const from all operators
toxicteddy00077 Nov 18, 2025
5a88802
Fixes
toxicteddy00077 Nov 22, 2025
bd65528
More Fixes
toxicteddy00077 Nov 23, 2025
0e50ba3
Minor Fixes
toxicteddy00077 Nov 26, 2025
5c43ea2
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Dec 4, 2025
76c4b10
Minor Fixes
toxicteddy00077 Dec 4, 2025
df65f6f
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Dec 4, 2025
6c427e4
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Dec 13, 2025
b2bf44a
Add _CCCL_GRID_CONSTANT const to ::cuda::std::array for Histogram
toxicteddy00077 Dec 13, 2025
77363bf
Add _CCCL_GRID_CONSTANT const to ::cuda::std::array for Histogram
toxicteddy00077 Dec 13, 2025
e27640f
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Dec 25, 2025
22ca4f4
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Jan 28, 2026
a234d17
Fixes and Updates
toxicteddy00077 Feb 2, 2026
0c887b4
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Feb 2, 2026
967ce0b
FIx
bernhardmgruber Feb 6, 2026
a6533d4
Merge branch 'main' into fea/6482-enable-grid-constant
fbusato Feb 6, 2026
dfb66b1
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Feb 11, 2026
53dbe0b
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Feb 22, 2026
0835e6d
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Mar 4, 2026
fb8b5cb
Merge branch 'main' into fea/6482-enable-grid-constant
toxicteddy00077 Mar 11, 2026
4a7004e
Merge branch 'main' into fea/6482-enable-grid-constant
fbusato Mar 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -226,12 +226,14 @@ struct AgentHistogram
_TempStorage& temp_storage;
WrappedSampleIteratorT d_wrapped_samples; // with cache modifier applied, if possible
SampleT* d_native_samples; // possibly nullptr if unavailable
int* num_output_bins; // one for each channel
int* num_privatized_bins; // one for each channel
const int* num_output_bins; // one for each channel
const int* num_privatized_bins; // one for each channel
CounterT* d_privatized_histograms[NumActiveChannels]; // one for each channel
CounterT** d_output_histograms; // in global memory
OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel
PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel
const OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each
// channel
const PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each
// channel
bool prefer_smem; // for privatized counters

template <typename TwoDimSubscriptableCounterT>
Expand Down Expand Up @@ -587,12 +589,12 @@ struct AgentHistogram
_CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram(
TempStorage& temp_storage,
SampleIteratorT d_samples,
int* num_output_bins,
int* num_privatized_bins,
const int* num_output_bins,
const int* num_privatized_bins,
CounterT** d_output_histograms,
CounterT** d_privatized_histograms,
OutputDecodeOpT* output_decode_op,
PrivatizedDecodeOpT* privatized_decode_op)
const OutputDecodeOpT* output_decode_op,
const PrivatizedDecodeOpT* privatized_decode_op)
: temp_storage(temp_storage.Alias())
, d_wrapped_samples(d_samples)
, d_native_samples(NativePointer(d_wrapped_samples))
Expand Down
15 changes: 9 additions & 6 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ CUB_NAMESPACE_BEGIN
namespace detail::adjacent_difference
{
template <typename AgentDifferenceInitT, typename InputIteratorT, typename InputT, typename OffsetT>
CUB_DETAIL_KERNEL_ATTRIBUTES void
DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile)
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(
_CCCL_GRID_CONSTANT const InputIteratorT first,
_CCCL_GRID_CONSTANT InputT* const result,
_CCCL_GRID_CONSTANT const OffsetT num_tiles,
_CCCL_GRID_CONSTANT const int items_per_tile)
{
const int tile_idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile);
Expand All @@ -55,11 +58,11 @@ template <typename PolicySelector,
bool MayAlias,
bool ReadLeft>
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(
InputIteratorT input,
InputT* first_tile_previous,
OutputIteratorT result,
_CCCL_GRID_CONSTANT const InputIteratorT input,
_CCCL_GRID_CONSTANT InputT* const first_tile_previous,
_CCCL_GRID_CONSTANT const OutputIteratorT result,
DifferenceOpT difference_op,
OffsetT num_items)
_CCCL_GRID_CONSTANT const OffsetT num_items)
{
static_assert(::cuda::std::is_empty_v<PolicySelector>);
static constexpr adjacent_difference_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
Expand Down
32 changes: 16 additions & 16 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ template <typename BufferOffsetScanTileStateT, typename BlockOffsetScanTileState
CUB_DETAIL_KERNEL_ATTRIBUTES void InitTileStateKernel(
BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
BlockOffsetScanTileStateT block_offset_scan_tile_state,
TileOffsetT num_tiles)
_CCCL_GRID_CONSTANT const TileOffsetT num_tiles)
{
// Initialize tile status
buffer_offset_scan_tile_state.InitializeStatus(num_tiles);
Expand All @@ -83,12 +83,12 @@ template <typename ChainedPolicyT,
CopyAlg MemcpyOpt>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS))
CUB_DETAIL_KERNEL_ATTRIBUTES void MultiBlockBatchMemcpyKernel(
InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
BufferTileOffsetItT buffer_tile_offsets,
_CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it,
_CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it,
_CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes,
_CCCL_GRID_CONSTANT const BufferTileOffsetItT buffer_tile_offsets,
TileT buffer_offset_tile,
TileOffsetT last_tile_offset)
_CCCL_GRID_CONSTANT const TileOffsetT last_tile_offset)
{
using StatusWord = typename TileT::StatusWord;
using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT;
Expand Down Expand Up @@ -210,16 +210,16 @@ template <typename ChainedPolicyT,
CopyAlg MemcpyOpt>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS))
CUB_DETAIL_KERNEL_ATTRIBUTES void BatchMemcpyKernel(
InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
BufferOffsetT num_buffers,
BlevBufferSrcsOutItT blev_buffer_srcs,
BlevBufferDstsOutItT blev_buffer_dsts,
BlevBufferSizesOutItT blev_buffer_sizes,
BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
BLevBufferOffsetTileState blev_buffer_scan_state,
BLevBlockOffsetTileState blev_block_scan_state)
_CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it,
_CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it,
_CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes,
_CCCL_GRID_CONSTANT const BufferOffsetT num_buffers,
_CCCL_GRID_CONSTANT const BlevBufferSrcsOutItT blev_buffer_srcs,
_CCCL_GRID_CONSTANT const BlevBufferDstsOutItT blev_buffer_dsts,
_CCCL_GRID_CONSTANT const BlevBufferSizesOutItT blev_buffer_sizes,
_CCCL_GRID_CONSTANT const BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
_CCCL_GRID_CONSTANT const BLevBufferOffsetTileState blev_buffer_scan_state,
_CCCL_GRID_CONSTANT const BLevBlockOffsetTileState blev_block_scan_state)
{
// Internal type used for storing a buffer's size
using BufferSizeT = it_value_t<BufferSizeIteratorT>;
Expand Down
26 changes: 13 additions & 13 deletions cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,11 @@ template <typename PolicySelector,
typename Offset,
typename CompareOp>
CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel(
KeyIt1 keys1,
Offset keys1_count,
KeyIt2 keys2,
Offset keys2_count,
Offset num_diagonals,
_CCCL_GRID_CONSTANT const KeyIt1 keys1,
_CCCL_GRID_CONSTANT const Offset keys1_count,
_CCCL_GRID_CONSTANT const KeyIt2 keys2,
_CCCL_GRID_CONSTANT const Offset keys2_count,
_CCCL_GRID_CONSTANT const Offset num_diagonals,
Offset* key1_beg_offsets,
CompareOp compare_op)
{
Expand Down Expand Up @@ -137,14 +137,14 @@ __launch_bounds__(
Offset,
CompareOp>::type::block_threads)
CUB_DETAIL_KERNEL_ATTRIBUTES void device_merge_kernel(
KeyIt1 keys1,
ValueIt1 items1,
Offset num_keys1,
KeyIt2 keys2,
ValueIt2 items2,
Offset num_keys2,
KeyIt3 keys_result,
ValueIt3 items_result,
_CCCL_GRID_CONSTANT const KeyIt1 keys1,
_CCCL_GRID_CONSTANT const ValueIt1 items1,
_CCCL_GRID_CONSTANT const Offset num_keys1,
_CCCL_GRID_CONSTANT const KeyIt2 keys2,
_CCCL_GRID_CONSTANT const ValueIt2 items2,
_CCCL_GRID_CONSTANT const Offset num_keys2,
_CCCL_GRID_CONSTANT const KeyIt3 keys_result,
_CCCL_GRID_CONSTANT const ValueIt3 items_result,
CompareOp compare_op,
Offset* key1_beg_offsets,
vsmem_t global_temp_storage)
Expand Down
14 changes: 7 additions & 7 deletions cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -190,16 +190,16 @@ template <typename PolicySelector,
#endif
__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel(
KeysInputIteratorT d_keys_in,
UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
_CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
_CCCL_GRID_CONSTANT const UniqueOutputIteratorT d_unique_out,
_CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in,
_CCCL_GRID_CONSTANT const AggregatesOutputIteratorT d_aggregates_out,
_CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out,
ScanTileStateT tile_state,
int start_tile,
_CCCL_GRID_CONSTANT const int start_tile,
EqualityOpT equality_op,
ReductionOpT reduction_op,
OffsetT num_items,
_CCCL_GRID_CONSTANT const OffsetT num_items,
_CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
vsmem_t vsmem)
{
Expand Down
12 changes: 6 additions & 6 deletions cub/cub/device/dispatch/dispatch_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -178,14 +178,14 @@ template <typename PolicySelector,
#endif // _CCCL_HAS_CONCEPTS()
__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRleSweepKernel(
InputIteratorT d_in,
OffsetsOutputIteratorT d_offsets_out,
LengthsOutputIteratorT d_lengths_out,
NumRunsOutputIteratorT d_num_runs_out,
_CCCL_GRID_CONSTANT const InputIteratorT d_in,
_CCCL_GRID_CONSTANT const OffsetsOutputIteratorT d_offsets_out,
_CCCL_GRID_CONSTANT const LengthsOutputIteratorT d_lengths_out,
_CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out,
ScanTileStateT tile_status,
EqualityOpT equality_op,
OffsetT num_items,
int num_tiles,
_CCCL_GRID_CONSTANT const OffsetT num_items,
_CCCL_GRID_CONSTANT const int num_tiles,
_CCCL_GRID_CONSTANT const StreamingContextT streaming_context)
{
static constexpr non_trivial_runs::rle_non_trivial_runs_policy policy =
Expand Down
22 changes: 11 additions & 11 deletions cub/cub/device/dispatch/dispatch_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -120,16 +120,16 @@ template <typename ChainedPolicyT,
typename KeyT = cub::detail::it_value_t<KeysInputIteratorT>>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyKernel(
KeysInputIteratorT d_keys_in,
KeyT* d_keys_prev_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
_CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
_CCCL_GRID_CONSTANT KeyT* const d_keys_prev_in,
_CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in,
_CCCL_GRID_CONSTANT const ValuesOutputIteratorT d_values_out,
ScanByKeyTileStateT tile_state,
int start_tile,
_CCCL_GRID_CONSTANT const int start_tile,
EqualityOp equality_op,
ScanOpT scan_op,
InitValueT init_value,
OffsetT num_items)
_CCCL_GRID_CONSTANT const ScanOpT scan_op,
_CCCL_GRID_CONSTANT const InitValueT init_value,
_CCCL_GRID_CONSTANT const OffsetT num_items)
{
using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT;

Expand All @@ -156,10 +156,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THRE
template <typename ScanTileStateT, typename KeysInputIteratorT, typename OffsetT>
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyInitKernel(
ScanTileStateT tile_state,
KeysInputIteratorT d_keys_in,
_CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
cub::detail::it_value_t<KeysInputIteratorT>* d_keys_prev_in,
OffsetT items_per_tile,
int num_tiles)
_CCCL_GRID_CONSTANT const OffsetT items_per_tile,
_CCCL_GRID_CONSTANT const int num_tiles)
{
// Initialize tile status
tile_state.InitializeStatus(num_tiles);
Expand Down
34 changes: 17 additions & 17 deletions cub/cub/device/dispatch/dispatch_segmented_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -183,25 +183,25 @@ template <typename LargeKernelT,
typename EndOffsetIteratorT,
typename KernelLauncherFactory>
__launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContinuationKernel(
LargeKernelT large_kernel,
SmallKernelT small_kernel,
local_segment_index_t num_segments,
KeyT* d_current_keys,
KeyT* d_final_keys,
_CCCL_GRID_CONSTANT const LargeKernelT large_kernel,
_CCCL_GRID_CONSTANT const SmallKernelT small_kernel,
_CCCL_GRID_CONSTANT const local_segment_index_t num_segments,
_CCCL_GRID_CONSTANT KeyT* const d_current_keys,
_CCCL_GRID_CONSTANT KeyT* const d_final_keys,
device_double_buffer<KeyT> d_keys_double_buffer,
ValueT* d_current_values,
ValueT* d_final_values,
_CCCL_GRID_CONSTANT ValueT* const d_current_values,
_CCCL_GRID_CONSTANT ValueT* const d_final_values,
device_double_buffer<ValueT> d_values_double_buffer,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
local_segment_index_t* group_sizes,
local_segment_index_t* large_and_medium_segments_indices,
local_segment_index_t* small_segments_indices,
KernelLauncherFactory launcher_factory,
int large_block_threads,
int small_block_threads,
int medium_segments_per_block,
int small_segments_per_block)
_CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
_CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
_CCCL_GRID_CONSTANT local_segment_index_t* const group_sizes,
_CCCL_GRID_CONSTANT local_segment_index_t* const large_and_medium_segments_indices,
_CCCL_GRID_CONSTANT local_segment_index_t* const small_segments_indices,
_CCCL_GRID_CONSTANT const KernelLauncherFactory launcher_factory,
_CCCL_GRID_CONSTANT const int large_block_threads,
_CCCL_GRID_CONSTANT const int small_block_threads,
_CCCL_GRID_CONSTANT const int medium_segments_per_block,
_CCCL_GRID_CONSTANT const int small_segments_per_block)
{
// In case of CDP:
// 1. each CTA has a different main stream
Expand Down
12 changes: 6 additions & 6 deletions cub/cub/device/dispatch/dispatch_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -315,15 +315,15 @@ __launch_bounds__(int(
OffsetT,
StreamingContextT>::agent_policy_t::BLOCK_THREADS))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSelectSweepKernel(
InputIteratorT d_in,
FlagsInputIteratorT d_flags,
SelectedOutputIteratorT d_selected_out,
NumSelectedIteratorT d_num_selected_out,
_CCCL_GRID_CONSTANT const InputIteratorT d_in,
_CCCL_GRID_CONSTANT const FlagsInputIteratorT d_flags,
_CCCL_GRID_CONSTANT const SelectedOutputIteratorT d_selected_out,
_CCCL_GRID_CONSTANT const NumSelectedIteratorT d_num_selected_out,
ScanTileStateT tile_status,
SelectOpT select_op,
EqualityOpT equality_op,
OffsetT num_items,
int num_tiles,
_CCCL_GRID_CONSTANT const OffsetT num_items,
_CCCL_GRID_CONSTANT const int num_tiles,
_CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
vsmem_t vsmem)
{
Expand Down
46 changes: 23 additions & 23 deletions cub/cub/device/dispatch/dispatch_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -112,22 +112,22 @@ template <typename PolicySelector,
#endif // _CCCL_HAS_CONCEPTS()
__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceTopKKernel(
const KeyInputIteratorT d_keys_in,
KeyOutputIteratorT d_keys_out,
const ValueInputIteratorT d_values_in,
ValueOutputIteratorT d_values_out,
KeyInT* in_buf,
OffsetT* in_idx_buf,
KeyInT* out_buf,
OffsetT* out_idx_buf,
_CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in,
_CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out,
_CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
_CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
_CCCL_GRID_CONSTANT KeyInT* const in_buf,
_CCCL_GRID_CONSTANT OffsetT* const in_idx_buf,
_CCCL_GRID_CONSTANT KeyInT* const out_buf,
_CCCL_GRID_CONSTANT OffsetT* const out_idx_buf,
Counter<it_value_t<KeyInputIteratorT>, OffsetT, OutOffsetT>* counter,
OffsetT* histogram,
OffsetT num_items,
OutOffsetT k,
OffsetT buffer_length,
_CCCL_GRID_CONSTANT OffsetT* const histogram,
_CCCL_GRID_CONSTANT const OffsetT num_items,
_CCCL_GRID_CONSTANT const OutOffsetT k,
_CCCL_GRID_CONSTANT const OffsetT buffer_length,
ExtractBinOpT extract_bin_op,
IdentifyCandidatesOpT identify_candidates_op,
int pass)
_CCCL_GRID_CONSTANT const int pass)
{
static constexpr topk_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
using agent_topk_policy_t =
Expand Down Expand Up @@ -177,18 +177,18 @@ template <typename PolicySelector,
#endif // _CCCL_HAS_CONCEPTS()
__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceTopKLastFilterKernel(
const KeyInputIteratorT d_keys_in,
KeyOutputIteratorT d_keys_out,
const ValueInputIteratorT d_values_in,
ValueOutputIteratorT d_values_out,
KeyInT* in_buf,
OffsetT* in_idx_buf,
_CCCL_GRID_CONSTANT const KeyInputIteratorT d_keys_in,
_CCCL_GRID_CONSTANT const KeyOutputIteratorT d_keys_out,
_CCCL_GRID_CONSTANT const ValueInputIteratorT d_values_in,
_CCCL_GRID_CONSTANT const ValueOutputIteratorT d_values_out,
_CCCL_GRID_CONSTANT KeyInT* const in_buf,
_CCCL_GRID_CONSTANT OffsetT* const in_idx_buf,
Counter<it_value_t<KeyInputIteratorT>, OffsetT, OutOffsetT>* counter,
OffsetT num_items,
OutOffsetT k,
OffsetT buffer_length,
_CCCL_GRID_CONSTANT const OffsetT num_items,
_CCCL_GRID_CONSTANT const OutOffsetT k,
_CCCL_GRID_CONSTANT const OffsetT buffer_length,
IdentifyCandidatesOpT identify_candidates_op,
int pass)
_CCCL_GRID_CONSTANT const int pass)
{
static constexpr topk_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
using agent_topk_policy_t =
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/kernels/kernel_for_each.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ template <class PolicySelector, class OffsetT, class OpT>
#if _CCCL_HAS_CONCEPTS()
requires for_policy_selector<PolicySelector>
#endif // _CCCL_HAS_CONCEPTS()
CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, OpT op)
{
static constexpr for_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
using agent_policy_t = policy_t<policy.block_threads, policy.items_per_thread>;
Expand Down Expand Up @@ -119,7 +119,7 @@ template <class PolicySelector, class OffsetT, class OpT>
#endif // _CCCL_HAS_CONCEPTS()
CUB_DETAIL_KERNEL_ATTRIBUTES //
__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads)) //
void static_kernel(OffsetT num_items, OpT op)
void static_kernel(_CCCL_GRID_CONSTANT const OffsetT num_items, OpT op)
{
static constexpr for_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});
using agent_policy_t = policy_t<policy.block_threads, policy.items_per_thread>;
Expand Down
Loading
Loading