Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 90 additions & 37 deletions cub/benchmarks/bench/scan/exclusive/base.cuh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

#pragma once
Expand All @@ -7,47 +7,98 @@

#include <cuda/std/__functional/invoke.h>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>

#if !TUNE_BASE
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT
# else // TUNE_TRANSPOSE == 1
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE
# endif // TUNE_TRANSPOSE

# if TUNE_LOAD == 0
# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
# elif TUNE_LOAD == 1
# define TUNE_LOAD_MODIFIER cub::LOAD_CA
# endif // TUNE_LOAD
# if !USES_WARPSPEED()
# include <look_back_helper.cuh>
# endif // !USES_WARPSPEED()

template <typename AccumT>
struct policy_hub_t
{
template <int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
cub::BlockLoadAlgorithm LOAD_ALGORITHM,
cub::CacheLoadModifier LOAD_MODIFIER,
cub::BlockStoreAlgorithm STORE_ALGORITHM,
cub::BlockScanAlgorithm SCAN_ALGORITHM>
using agent_policy_t = cub::AgentScanPolicy<
NOMINAL_BLOCK_THREADS_4B,
NOMINAL_ITEMS_PER_THREAD_4B,
ComputeT,
LOAD_ALGORITHM,
LOAD_MODIFIER,
STORE_ALGORITHM,
SCAN_ALGORITHM,
cub::detail::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
delay_constructor_t>;

struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
struct MaxPolicy : cub::ChainedPolicy<300, MaxPolicy, MaxPolicy>
{
# if USES_WARPSPEED()
struct WarpspeedPolicy
{
static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS;
static constexpr int num_look_ahead_items = TUNE_NUM_LOOKBACK_ITEMS;
static constexpr int items_per_thread = TUNE_ITEMS_PLUS_ONE - 1;

// the rest are fixed or derived definitions

static constexpr int num_squads = 5;
static constexpr int num_threads_per_warp = 32;
static constexpr int num_load_warps = 1;
static constexpr int num_sched_warps = 1;
static constexpr int num_look_ahead_warps = 1;

static constexpr int num_total_warps =
2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
static constexpr int num_total_threads = num_total_warps * num_threads_per_warp;

static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp;

static constexpr int tile_size = items_per_thread * squad_reduce_thread_count;

using SquadDesc = cub::detail::warpspeed::SquadDesc;

// The squads cannot be static constexpr variables, as those are not device accessible
[[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadReduce() noexcept
{
return SquadDesc{0, num_reduce_and_scan_warps};
}
[[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadScanStore() noexcept
{
return SquadDesc{1, num_reduce_and_scan_warps};
}
[[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLoad() noexcept
{
return SquadDesc{2, num_load_warps};
}
[[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadSched() noexcept
{
return SquadDesc{3, num_sched_warps};
}
[[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLookback() noexcept
{
return SquadDesc{4, num_look_ahead_warps};
}
};
# else // USES_WARPSPEED()
# if TUNE_TRANSPOSE == 0
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT
# else // TUNE_TRANSPOSE == 1
# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE
# endif // TUNE_TRANSPOSE

# if TUNE_LOAD == 0
# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
# elif TUNE_LOAD == 1
# define TUNE_LOAD_MODIFIER cub::LOAD_CA
# endif // TUNE_LOAD

template <int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
cub::BlockLoadAlgorithm LOAD_ALGORITHM,
cub::CacheLoadModifier LOAD_MODIFIER,
cub::BlockStoreAlgorithm STORE_ALGORITHM,
cub::BlockScanAlgorithm SCAN_ALGORITHM>
using agent_policy_t = cub::AgentScanPolicy<
NOMINAL_BLOCK_THREADS_4B,
NOMINAL_ITEMS_PER_THREAD_4B,
ComputeT,
LOAD_ALGORITHM,
LOAD_MODIFIER,
STORE_ALGORITHM,
SCAN_ALGORITHM,
cub::detail::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
delay_constructor_t>;

using ScanPolicyT =
agent_policy_t<TUNE_THREADS,
TUNE_ITEMS,
Expand All @@ -56,9 +107,8 @@ struct policy_hub_t
TUNE_LOAD_MODIFIER,
TUNE_STORE_ALGORITHM,
cub::BLOCK_SCAN_WARP_SCANS>;
# endif // USES_WARPSPEED()
};

using MaxPolicy = policy_t;
};
#endif // TUNE_BASE

Expand All @@ -72,6 +122,9 @@ try
using input_it_t = const T*;
using output_it_t = T*;
using offset_t = cub::detail::choose_offset_t<OffsetT>;
#if USES_WARPSPEED()
static_assert(sizeof(offset_t) == sizeof(size_t)); // warpspeed scan uses size_t internally
#endif // USES_WARPSPEED()

#if !TUNE_BASE
using policy_t = policy_hub_t<accum_t>;
Expand Down Expand Up @@ -128,7 +181,7 @@ catch (const std::bad_alloc&)
state.skip("Skipping: out of memory.");
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, offset_types))
NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, scan_offset_types))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 32, 4));
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/scan/exclusive/custom.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,7 @@

#include <nvbench_helper.cuh>

using op_t = max_t;
#define USES_WARPSPEED() 0
using op_t = max_t;
using scan_offset_types = offset_types;
#include "base.cuh"
7 changes: 6 additions & 1 deletion cub/benchmarks/bench/scan/exclusive/sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@

#include <nvbench_helper.cuh>

// This benchmark tunes the old, non-warpspeed scan implementation. Using it for benchmarking, will pick the warpspeed
// implementation on SM100+, but it's better to use the sum.warpspeed.cu benchmark instead, which uses a single OffsetT.

// %RANGE% TUNE_ITEMS ipt 7:24:1
// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% TUNE_MAGIC_NS ns 0:2048:4
Expand All @@ -13,5 +16,7 @@
// %RANGE% TUNE_TRANSPOSE trp 0:1:1
// %RANGE% TUNE_LOAD ld 0:1:1

using op_t = ::cuda::std::plus<>;
#define USES_WARPSPEED() 0
using op_t = ::cuda::std::plus<>;
using scan_offset_types = offset_types;
#include "base.cuh"
38 changes: 38 additions & 0 deletions cub/benchmarks/bench/scan/exclusive/sum.warpspeed.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// This tunes the warpspeed implementation of scan, which is only available on SM100+. It has entirely different tuning
// parameters and is agnostic of the offset type. It is thus in a separate file, so we can continue to tune the old scan
// implementation on older hardware architectures.

#include <cuda/__cccl_config>

#if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
# warning "This benchmark does not support being compiled for multiple architectures. Disabling it."
#else // _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1

# if __CUDA_ARCH_LIST__ < 1000
# warning "Warpspeed scan requires at least sm_100. Disabling it."
# else // __CUDA_ARCH_LIST__ < 1000

# if __cccl_ptx_isa < 860
# warning "Warpspeed scan requires at least PTX ISA 8.6. Disabling it."
# else // if __cccl_ptx_isa < 860

# include <nvbench_helper.cuh>

// %RANGE% TUNE_NUM_REDUCE_SCAN_WARPS wrps 1:8:1
// %RANGE% TUNE_NUM_LOOKBACK_ITEMS lbi 1:8:1

// TODO(bgruber): find a good range and step width, items per thread should be coprime with 32 to avoid SMEM conflicts.
// Should we specify nominal items per thread instead?
// %RANGE% TUNE_ITEMS_PLUS_ONE ipt 8:256:8

# define USES_WARPSPEED() 1
using op_t = ::cuda::std::plus<>;
using scan_offset_types = nvbench::type_list<int64_t>;
# include "base.cuh"

# endif // __cccl_ptx_isa < 860
# endif // __CUDA_ARCH_LIST__ < 1000
#endif // _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
23 changes: 11 additions & 12 deletions cub/cub/device/dispatch/tuning/tuning_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,8 @@ struct policy_hub
#if __cccl_ptx_isa >= 860
struct WarpspeedPolicy
{
// TODO(bgruber): remove everything but num_reduce_and_scan_warps, num_look_ahead_items and items_per_thread

// Squad definitions
static constexpr int num_squads = 5;
static constexpr int num_threads_per_warp = 32;
Expand All @@ -576,29 +578,26 @@ struct policy_hub
# if _CCCL_COMPILER(NVHPC)
// need to reduce the number of threads to <= 256, so each thread can use up to 255 registers. This avoids an
// error in ptxas, see also: https://github.com/NVIDIA/cccl/issues/7700.
static constexpr int num_reduce_warps = 2;
static constexpr int num_scan_stor_warps = 2;
static constexpr int num_reduce_and_scan_warps = 2;
# else // _CCCL_COMPILER(NVHPC)
static constexpr int num_reduce_warps = 4;
static constexpr int num_scan_stor_warps = 4;
static constexpr int num_reduce_and_scan_warps = 4; // this can be tuned
# endif // _CCCL_COMPILER(NVHPC)
static constexpr int num_load_warps = 1;
static constexpr int num_sched_warps = 1;
static constexpr int num_look_ahead_warps = 1;

static constexpr int num_load_warps = 1; // no point in being more than 1
static constexpr int num_sched_warps = 1; // no point in being more than 1
static constexpr int num_look_ahead_warps = 1; // must be 1
// TODO(bgruber): 5 is a bit better for complex<float>
static constexpr int num_look_ahead_items = sizeof(AccumT) == 2 ? 3 : 4;

// Deduced definitions
static constexpr int num_total_warps =
num_reduce_warps + num_scan_stor_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
static constexpr int num_total_threads = num_total_warps * num_threads_per_warp;

# if _CCCL_COMPILER(NVHPC)
static_assert(num_total_threads <= 256);
# endif // _CCCL_COMPILER(NVHPC)

static constexpr int squad_reduce_thread_count = num_reduce_warps * num_threads_per_warp;
static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp;

// manual tuning based on cub.bench.scan.exclusive.sum.base
// 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM)
Expand Down Expand Up @@ -627,11 +626,11 @@ struct policy_hub
// The squads cannot be static constexpr variables, as those are not device accessible
[[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadReduce() noexcept
{
return warpspeed::SquadDesc{0, num_reduce_warps};
return warpspeed::SquadDesc{0, num_reduce_and_scan_warps};
}
[[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadScanStore() noexcept
{
return warpspeed::SquadDesc{1, num_scan_stor_warps};
return warpspeed::SquadDesc{1, num_reduce_and_scan_warps};
}
[[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr warpspeed::SquadDesc squadLoad() noexcept
{
Expand Down
Loading