diff --git a/c/parallel/include/cccl/c/scan.h b/c/parallel/include/cccl/c/scan.h index a9db31c764a..5b80668e123 100644 --- a/c/parallel/include/cccl/c/scan.h +++ b/c/parallel/include/cccl/c/scan.h @@ -34,6 +34,7 @@ typedef struct cccl_device_scan_build_result_t CUkernel scan_kernel; bool force_inclusive; cccl_init_kind_t init_kind; + bool use_warpspeed; size_t description_bytes_per_tile; size_t payload_bytes_per_tile; void* runtime_policy; diff --git a/c/parallel/src/radix_sort.cu b/c/parallel/src/radix_sort.cu index 975281478d2..2033c2d937e 100644 --- a/c/parallel/src/radix_sort.cu +++ b/c/parallel/src/radix_sort.cu @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -223,18 +223,7 @@ try std::string offset_t; check(cccl_type_name_from_nvrtc(&offset_t)); - // TODO(bgruber): generalize this somewhere - const auto key_type = [&] { - switch (input_keys_it.value_type.type) - { - case CCCL_FLOAT32: - return cub::detail::type_t::float32; - case CCCL_FLOAT64: - return cub::detail::type_t::float64; - default: - return cub::detail::type_t::other; - } - }(); + const auto key_type = cccl_type_enum_to_cub_type(input_keys_it.value_type.type); const auto policy_sel = cub::detail::radix_sort::policy_selector{ static_cast(input_keys_it.value_type.size), @@ -268,6 +257,8 @@ using device_radix_sort_policy = {5}; using namespace cub; using namespace cub::detail; using namespace cub::detail::radix_sort; +using cub::detail::delay_constructor_policy; +using cub::detail::delay_constructor_kind; static_assert(device_radix_sort_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6}, "Host generated and JIT 
compiled policy mismatch"); )XXX", input_keys_it.value_type.size, // 0 diff --git a/c/parallel/src/reduce.cu b/c/parallel/src/reduce.cu index 8df0aeb5d8a..c673d3453c3 100644 --- a/c/parallel/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -223,31 +223,8 @@ try const auto policy_sel = [&] { using namespace cub::detail; - auto accum_type = type_t::other; - if (accum_t.type == CCCL_FLOAT32) - { - accum_type = type_t::float32; - } - else if (accum_t.type == CCCL_FLOAT64) - { - accum_type = type_t::float64; - } - - auto operation_t = op_kind_t::other; - switch (op.type) - { - case CCCL_PLUS: - operation_t = op_kind_t::plus; - break; - case CCCL_MINIMUM: - operation_t = op_kind_t::min; - break; - case CCCL_MAXIMUM: - operation_t = op_kind_t::max; - break; - default: - break; - } + const auto accum_type = cccl_type_enum_to_cub_type(accum_t.type); + const auto operation_t = cccl_op_kind_to_cub_op(op.type); const int offset_size = int{sizeof(OffsetT)}; return cub::detail::reduce::policy_selector{accum_type, operation_t, offset_size, static_cast(accum_t.size)}; diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 4c039525cb5..cba53e453e0 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -4,13 +4,12 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// #include #include -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -56,33 +56,6 @@ enum class InitKind NoInit, }; -struct scan_runtime_tuning_policy -{ - cub::detail::RuntimeScanAgentPolicy scan; - - auto Scan() const - { - return scan; - } - - void CheckLoadModifier() const - { - if (scan.LoadModifier() == cub::CacheLoadModifier::LOAD_LDG) - { - throw std::runtime_error("The memory consistency model does not apply to texture " - "accesses"); - } - } - - using MaxPolicy = scan_runtime_tuning_policy; - - template - cudaError_t Invoke(int, F& op) - { - return op.template Invoke(*this); - } -}; - static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_type_info init) { // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet @@ -135,8 +108,8 @@ std::string get_scan_kernel_name( bool force_inclusive, cccl_init_kind_t init_kind) { - std::string chained_policy_t; - check(cccl_type_name_from_nvrtc(&chained_policy_t)); + std::string policy_selector_t; + check(cccl_type_name_from_nvrtc(&policy_selector_t)); const cccl_type_info accum_t = scan::get_accumulator_type(op, input_it, init); const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); @@ -177,7 +150,7 @@ std::string get_scan_kernel_name( auto tile_state_t = std::format("cub::ScanTileState<{0}>", accum_cpp_t); return std::format( "cub::detail::scan::DeviceScanKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}>", - chained_policy_t, // 0 + policy_selector_t, // 0 input_iterator_t, // 1 output_iterator_t, // 2 tile_state_t, // 3 @@ -189,20 +162,6 @@ std::string get_scan_kernel_name( init_t); // 9 } -template -struct dynamic_scan_policy_t -{ - using MaxPolicy = dynamic_scan_policy_t; - - 
template - cudaError_t Invoke(int device_ptx_version, F& op) - { - return op.template Invoke(GetPolicy(device_ptx_version, accumulator_type)); - } - - cccl_type_info accumulator_type; -}; - struct scan_kernel_source { cccl_device_scan_build_result_t& build; @@ -219,11 +178,16 @@ struct scan_kernel_source { return build.scan_kernel; } - scan_tile_state TileState() + scan_tile_state TileState() const { return {build.description_bytes_per_tile, build.payload_bytes_per_tile}; } + bool use_warpspeed(const cub::detail::scan::scan_policy& /*policy*/) const + { + return build.use_warpspeed; + } + std::size_t look_ahead_tile_state_size() const { return look_ahead_tile_state_alignment(); @@ -287,8 +251,77 @@ try const auto output_it_value_t = cccl_type_enum_to_name(output_it.value_type.type); - std::string policy_hub_expr = std::format( - "cub::detail::scan::policy_hub<{}, {}, {}, {}, {}>", + const auto policy_sel = [&] { + using cub::detail::scan::policy_selector; + using cub::detail::scan::primitive_accum; + using cub::detail::scan::primitive_op; + + const auto accum_type = cccl_type_enum_to_cub_type(accum_t.type); + const auto operation_t = cccl_op_kind_to_cub_op(op.type); + const auto input_type = input_it.value_type.type; + const auto input_type_t = cccl_type_enum_to_cub_type(input_type); + + const auto output_type = output_it.value_type.type; + const bool types_match = input_type == output_type && input_type == accum_t.type; + const bool benchmark_match = + operation_t != cub::detail::op_kind_t::other && types_match && input_type != CCCL_STORAGE; + const bool accum_is_primitive_or_trivially_copy_constructible = true; + + return policy_selector{ + static_cast(input_it.value_type.size), + static_cast(input_it.value_type.alignment), + static_cast(output_it.value_type.size), + static_cast(output_it.value_type.alignment), + static_cast(accum_t.size), + static_cast(accum_t.alignment), + int{sizeof(OffsetT)}, + input_type_t, + accum_type, + operation_t, + 
accum_is_primitive_or_trivially_copy_constructible, + benchmark_match}; + }(); + + const auto arch_id = cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor}); + const auto active_policy = policy_sel(arch_id); + +#if _CCCL_CUDACC_AT_LEAST(12, 8) + const auto is_trivial_type = [](cccl_type_enum /* type */) { + // TODO: implement actual logic here when nontrivial custom types become supported + return true; + }; + + const bool input_contiguous = input_it.type == cccl_iterator_kind_t::CCCL_POINTER; + const bool output_contiguous = output_it.type == cccl_iterator_kind_t::CCCL_POINTER; + const bool input_trivially_copyable = is_trivial_type(input_it.value_type.type); + const bool output_trivially_copyable = is_trivial_type(output_it.value_type.type); + const bool output_default_constructible = output_trivially_copyable; + + const bool use_warpspeed = + active_policy.warpspeed + && cub::detail::scan::use_warpspeed( + active_policy.warpspeed, + static_cast(input_it.value_type.size), + static_cast(input_it.value_type.alignment), + static_cast(output_it.value_type.size), + static_cast(output_it.value_type.alignment), + static_cast(accum_t.size), + static_cast(accum_t.alignment), + input_contiguous, + output_contiguous, + input_trivially_copyable, + output_trivially_copyable, + output_default_constructible); +#else + const bool use_warpspeed = false; +#endif + + // TODO(bgruber): drop this if tuning policies become formattable + std::stringstream policy_sel_str; + policy_sel_str << active_policy; + + std::string policy_selector_expr = std::format( + "cub::detail::scan::policy_selector_from_types<{}, {}, {}, {}, {}>", input_it_value_t, output_it_value_t, accum_cpp, @@ -307,20 +340,20 @@ struct __align__({1}) storage_t {{ {2} {3} {4} -using device_scan_policy = {5}::MaxPolicy; - -#include -__device__ consteval auto& policy_generator() {{ - return ptx_json::id() - = cub::detail::scan::ScanPolicyWrapper::EncodedPolicy(); -}} +using device_scan_policy = {5}; +using 
namespace cub; +using namespace cub::detail::scan; +using cub::detail::delay_constructor_policy; +using cub::detail::delay_constructor_kind; +static_assert(device_scan_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6}, "Host generated and JIT compiled policy mismatch"); )XXX", input_it.value_type.size, // 0 input_it.value_type.alignment, // 1 input_iterator_src, // 2 output_iterator_src, // 3 op_src, // 4 - policy_hub_expr); // 5 + policy_selector_expr, // 5 + policy_sel_str.view()); // 6 #if false // CCCL_DEBUGGING_SWITCH fflush(stderr); @@ -344,7 +377,6 @@ __device__ consteval auto& policy_generator() {{ "-rdc=true", "-dlto", "-DCUB_DISABLE_CDP", - "-DCUB_ENABLE_POLICY_PTX_JSON", "-std=c++20"}; cccl::detail::extend_args_with_build_config(args, config); @@ -379,11 +411,6 @@ __device__ consteval auto& policy_generator() {{ auto [description_bytes_per_tile, payload_bytes_per_tile] = get_tile_state_bytes_per_tile(accum_t, accum_cpp, args.data(), args.size(), arch); - nlohmann::json runtime_policy = cub::detail::ptx_json::parse("device_scan_policy", {result.data.get(), result.size}); - - using cub::detail::RuntimeScanAgentPolicy; - auto scan_policy = RuntimeScanAgentPolicy::from_json(runtime_policy, "ScanPolicyT"); - build_ptr->cc = cc; build_ptr->cubin = (void*) result.data.release(); build_ptr->cubin_size = result.size; @@ -392,7 +419,8 @@ __device__ consteval auto& policy_generator() {{ build_ptr->init_kind = init_kind; build_ptr->description_bytes_per_tile = description_bytes_per_tile; build_ptr->payload_bytes_per_tile = payload_bytes_per_tile; - build_ptr->runtime_policy = new scan::scan_runtime_tuning_policy{scan_policy}; + build_ptr->runtime_policy = new cub::detail::scan::policy_selector{policy_sel}; + build_ptr->use_warpspeed = use_warpspeed; return CUDA_SUCCESS; } @@ -426,30 +454,18 @@ CUresult cccl_device_scan( CUdevice cu_device; check(cuCtxGetDevice(&cu_device)); - auto exec_status = cub::DispatchScan< - indirect_arg_t, - indirect_arg_t, - 
indirect_arg_t, - std::conditional_t, cub::NullType, indirect_arg_t>, - cuda::std::size_t, - void, - EnforceInclusive, - scan::scan_runtime_tuning_policy, - scan::scan_kernel_source, - cub::detail::CudaDriverLauncherFactory>:: - Dispatch( - d_temp_storage, - *temp_storage_bytes, - d_in, - d_out, - op, - init, - num_items, - stream, - {build}, - cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - *reinterpret_cast(build.runtime_policy)); - + auto exec_status = cub::detail::scan::dispatch_with_accum( + d_temp_storage, + *temp_storage_bytes, + indirect_arg_t{d_in}, + indirect_arg_t{d_out}, + indirect_arg_t{op}, + std::conditional_t, cub::NullType, indirect_arg_t>{init}, + static_cast(num_items), + stream, + *static_cast(build.runtime_policy), + scan::scan_kernel_source{build}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}); error = static_cast(exec_status); } catch (const std::exception& exc) @@ -591,7 +607,8 @@ try return CUDA_ERROR_INVALID_VALUE; } std::unique_ptr cubin(reinterpret_cast(build_ptr->cubin)); - std::unique_ptr policy(reinterpret_cast(build_ptr->runtime_policy)); + std::unique_ptr policy( + static_cast(build_ptr->runtime_policy)); check(cuLibraryUnload(build_ptr->library)); return CUDA_SUCCESS; diff --git a/c/parallel/src/segmented_reduce.cu b/c/parallel/src/segmented_reduce.cu index 2c61508e606..f81201e1902 100644 --- a/c/parallel/src/segmented_reduce.cu +++ b/c/parallel/src/segmented_reduce.cu @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// @@ -154,35 +154,11 @@ try // OffsetT is checked to match have 64-bit size const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64); - // TODO(bgruber): share this with reduce.cu const auto policy_sel = [&] { using namespace cub::detail; - auto accum_type = type_t::other; - if (accum_t.type == CCCL_FLOAT32) - { - accum_type = type_t::float32; - } - else if (accum_t.type == CCCL_FLOAT64) - { - accum_type = type_t::float64; - } - - auto operation_t = op_kind_t::other; - switch (op.type) - { - case CCCL_PLUS: - operation_t = op_kind_t::plus; - break; - case CCCL_MINIMUM: - operation_t = op_kind_t::min; - break; - case CCCL_MAXIMUM: - operation_t = op_kind_t::max; - break; - default: - break; - } + const auto accum_type = cccl_type_enum_to_cub_type(accum_t.type); + const auto operation_t = cccl_op_kind_to_cub_op(op.type); const int offset_size = int{sizeof(OffsetT)}; return cub::detail::segmented_reduce::policy_selector{ diff --git a/c/parallel/src/util/types.h b/c/parallel/src/util/types.h index 9d2acb62659..837025a69c2 100644 --- a/c/parallel/src/util/types.h +++ b/c/parallel/src/util/types.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// @@ -114,6 +114,8 @@ inline constexpr cub::detail::type_t cccl_type_enum_to_cub_type(cccl_type_enum t { switch (type) { + case CCCL_BOOLEAN: + return cub::detail::type_t::boolean; case CCCL_INT8: return cub::detail::type_t::int8; case CCCL_INT16: @@ -140,3 +142,18 @@ inline constexpr cub::detail::type_t cccl_type_enum_to_cub_type(cccl_type_enum t return cub::detail::type_t::other; } } + +inline constexpr cub::detail::op_kind_t cccl_op_kind_to_cub_op(cccl_op_kind_t type) +{ + switch (type) + { + case CCCL_PLUS: + return cub::detail::op_kind_t::plus; + case CCCL_MINIMUM: + return cub::detail::op_kind_t::min; + case CCCL_MAXIMUM: + return cub::detail::op_kind_t::max; + default: + return cub::detail::op_kind_t::other; + } +} diff --git a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu index f6acf8ae6c0..c3f5ebcb299 100644 --- a/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu +++ b/cub/benchmarks/bench/scan/applications/P1/log-cdf-from-log-pdfs.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include @@ -34,26 +34,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* @@ -125,15 +109,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list; - using dispatch_t = - cub::DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); auto mu = static_cast(state.get_float64("Mu{io}")); @@ -161,14 +136,39 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list(elements); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); // for validation, use diff --git 
a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu index 98da18d7106..7aa9d79c2e1 100644 --- a/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu +++ b/cub/benchmarks/bench/scan/applications/P1/non-commutative-bicyclic-monoid.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include @@ -30,26 +30,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* Consider free monoid with two generators, ``q`` and ``p``, modulo defining relationship (``p * q == 1``). 
@@ -112,15 +96,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list using output_it_t = pair_t*; using offset_t = cub::detail::choose_offset_t; -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector output(elements); @@ -148,14 +123,39 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list cudaStream_t bench_stream = state.get_cuda_stream(); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); } diff --git a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu index 182277add7c..acb7da72051 100644 --- a/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu +++ b/cub/benchmarks/bench/scan/applications/P1/rabin-karp-second-fingerprinting.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include @@ -36,26 +36,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* Denote epsilon, the identity element, be an empty sequence, and consider @@ -308,15 +292,6 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector input = generate(elements); @@ -338,13 +313,26 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list( + nullptr, + tmp_size, + inp_it, + out_it, + op_t{p}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( + cub::detail::scan::dispatch_with_accum( d_tmp, tmp_size, inp_it, @@ -352,7 +340,12 @@ static void inclusive_scan(nvbench::state& state, nvbench::type_list{} +#endif // !TUNE_BASE + ); }); // for validation uncomment these two lines diff --git a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu index 
cf37c7f6312..82a158c0940 100644 --- a/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu +++ b/cub/benchmarks/bench/scan/applications/P1/running-min-max.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include @@ -37,26 +37,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { /* Given input sequence of values, compute sequence of @@ -215,15 +199,6 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) using output_it_t = pair_t*; using offset_t = cub::detail::choose_offset_t; -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector output(elements); @@ -241,13 +216,39 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) cudaStream_t bench_stream = state.get_cuda_stream().get_stream(); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, inp_it, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + inp_it, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector 
tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch(d_tmp, tmp_size, inp_it, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + inp_it, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); // for verification use diff --git a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu index 7135ed7cd5a..92811c32dc4 100644 --- a/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu +++ b/cub/benchmarks/bench/scan/applications/P1/scan-over-unitriangular-group.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include @@ -35,26 +35,10 @@ # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD - -struct policy_hub_t -{ - struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> - { - using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< - TUNE_THREADS, - TUNE_ITEMS, - // TODO Tune - TUNE_LOAD_ALGORITHM, - TUNE_LOAD_MODIFIER, - cub::BLOCK_SCAN_WARP_SCANS, - TUNE_STORE_ALGORITHM, - delay_constructor_t>; - }; - - using MaxPolicy = policy_t; -}; #endif // !TUNE_BASE +#include "../../policy_selector.h" + namespace impl { template @@ -126,15 +110,6 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) using output_it_t = tuple_t*; using offset_t = cub::detail::choose_offset_t; -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); cudaStream_t bench_stream = state.get_cuda_stream().get_stream(); @@ -159,14 +134,39 @@ void benchmark_impl(nvbench::state& state, nvbench::type_list) auto d_output = thrust::raw_pointer_cast(output.data()); size_t tmp_size; - dispatch_t::Dispatch(nullptr, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), bench_stream); + cub::detail::scan::dispatch_with_accum( + nullptr, + tmp_size, + d_input, + d_output, + op_t{}, + wrapped_init_t{}, + input.size(), + bench_stream +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( - d_tmp, tmp_size, d_input, d_output, op_t{}, wrapped_init_t{}, input.size(), launch.get_stream()); + cub::detail::scan::dispatch_with_accum( + d_tmp, + tmp_size, + d_input, + d_output, + op_t{}, + 
wrapped_init_t{}, + input.size(), + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); // for validation use (recommended for integral types and smallish input sizes) diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index e5242d5f2aa..3a3eb99d0a1 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -13,105 +13,10 @@ # if !USES_WARPSPEED() # include # endif // !USES_WARPSPEED() - -template -struct policy_hub_t -{ - struct MaxPolicy : cub::ChainedPolicy<300, MaxPolicy, MaxPolicy> - { -# if USES_WARPSPEED() - struct WarpspeedPolicy - { - static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS; - static constexpr int num_look_ahead_items = TUNE_NUM_LOOKBACK_ITEMS; - static constexpr int items_per_thread = TUNE_ITEMS_PLUS_ONE - 1; - - // the rest are fixed or derived definitions - - static constexpr int num_squads = 5; - static constexpr int num_threads_per_warp = 32; - static constexpr int num_load_warps = 1; - static constexpr int num_sched_warps = 1; - static constexpr int num_look_ahead_warps = 1; - - static constexpr int num_total_warps = - 2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; - static constexpr int num_total_threads = num_total_warps * num_threads_per_warp; - - static constexpr int squad_reduce_thread_count = num_reduce_and_scan_warps * num_threads_per_warp; - - static constexpr int tile_size = items_per_thread * squad_reduce_thread_count; - - using SquadDesc = cub::detail::warpspeed::SquadDesc; - - // The squads cannot be static constexpr variables, as those are not device accessible - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadReduce() noexcept - { - return SquadDesc{0, num_reduce_and_scan_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadScanStore() noexcept - { - return 
SquadDesc{1, num_reduce_and_scan_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLoad() noexcept - { - return SquadDesc{2, num_load_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadSched() noexcept - { - return SquadDesc{3, num_sched_warps}; - } - [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE static constexpr SquadDesc squadLookback() noexcept - { - return SquadDesc{4, num_look_ahead_warps}; - } - }; -# else // USES_WARPSPEED() -# if TUNE_TRANSPOSE == 0 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT -# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT -# else // TUNE_TRANSPOSE == 1 -# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE -# define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE -# endif // TUNE_TRANSPOSE - -# if TUNE_LOAD == 0 -# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT -# elif TUNE_LOAD == 1 -# define TUNE_LOAD_MODIFIER cub::LOAD_CA -# endif // TUNE_LOAD - - template - using agent_policy_t = cub::AgentScanPolicy< - NOMINAL_BLOCK_THREADS_4B, - NOMINAL_ITEMS_PER_THREAD_4B, - ComputeT, - LOAD_ALGORITHM, - LOAD_MODIFIER, - STORE_ALGORITHM, - SCAN_ALGORITHM, - cub::detail::MemBoundScaling, - delay_constructor_t>; - - using ScanPolicyT = - agent_policy_t; -# endif // USES_WARPSPEED() - }; -}; #endif // TUNE_BASE +#include "../policy_selector.h" + template static void basic(nvbench::state& state, nvbench::type_list) try @@ -126,15 +31,6 @@ try static_assert(sizeof(offset_t) == sizeof(size_t)); // warpspeed scan uses size_t internally #endif // USES_WARPSPEED() -#if !TUNE_BASE - using policy_t = policy_hub_t; - using dispatch_t = cub:: - DispatchScan; -#else - using dispatch_t = - cub::DispatchScan; -#endif - const auto elements = static_cast(state.get_int64("Elements{io}")); if (sizeof(offset_t) == 4 && elements > std::numeric_limits::max()) { @@ -145,15 +41,15 @@ try thrust::device_vector input = generate(elements); thrust::device_vector output(elements); - T* 
d_input = thrust::raw_pointer_cast(input.data()); - T* d_output = thrust::raw_pointer_cast(output.data()); + const T* d_input = thrust::raw_pointer_cast(input.data()); + T* d_output = thrust::raw_pointer_cast(output.data()); state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); state.add_global_memory_writes(elements); size_t tmp_size; - dispatch_t::Dispatch( + cub::detail::scan::dispatch_with_accum( nullptr, tmp_size, d_input, @@ -161,11 +57,16 @@ try op_t{}, wrapped_init_t{T{}}, static_cast(input.size()), - 0 /* stream */); + 0 /* stream */ +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); - thrust::device_vector tmp(tmp_size); + thrust::device_vector tmp(tmp_size, thrust::no_init); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { - dispatch_t::Dispatch( + cub::detail::scan::dispatch_with_accum( thrust::raw_pointer_cast(tmp.data()), tmp_size, d_input, @@ -173,7 +74,12 @@ try op_t{}, wrapped_init_t{T{}}, static_cast(input.size()), - launch.get_stream()); + launch.get_stream() +#if !TUNE_BASE + , + policy_selector{} +#endif // !TUNE_BASE + ); }); } catch (const std::bad_alloc&) diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu index 57d7b873f35..2d5c53d6b99 100644 --- a/cub/benchmarks/bench/scan/exclusive/by_key.cu +++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. 
// SPDX-License-Identifier: BSD-3 #include @@ -106,7 +106,7 @@ static void scan(nvbench::state& state, nvbench::type_list(elements), 0 /* stream */); - thrust::device_vector tmp(tmp_size); + thrust::device_vector tmp(tmp_size, thrust::no_init); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { diff --git a/cub/benchmarks/bench/scan/policy_selector.h b/cub/benchmarks/bench/scan/policy_selector.h new file mode 100644 index 00000000000..f1f30d93a90 --- /dev/null +++ b/cub/benchmarks/bench/scan/policy_selector.h @@ -0,0 +1,73 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#if !TUNE_BASE +# include "look_back_helper.cuh" +#endif // !TUNE_BASE + +#ifndef USES_WARPSPEED +# define USES_WARPSPEED() 0 +#endif + +#if !TUNE_BASE +template +struct policy_selector +{ +# if USES_WARPSPEED() + _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::scan::scan_policy + { + static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS; + static constexpr int num_look_ahead_items = TUNE_NUM_LOOKBACK_ITEMS; + static constexpr int items_per_thread = TUNE_ITEMS_PLUS_ONE - 1; + + static constexpr int num_threads_per_warp = 32; + static constexpr int num_load_warps = 1; + static constexpr int num_sched_warps = 1; + static constexpr int num_look_ahead_warps = 1; + + static constexpr int num_total_warps = + 2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps; + static constexpr int num_total_threads = num_total_warps * num_threads_per_warp; + static constexpr int squad_reduce_threads = num_reduce_and_scan_warps * num_threads_per_warp; + static constexpr int tile_size = items_per_thread * squad_reduce_threads; + + auto warpspeed_policy = cub::detail::scan::scan_warpspeed_policy{ + true, 
+ num_reduce_and_scan_warps, + num_reduce_and_scan_warps, + num_load_warps, + num_sched_warps, + num_look_ahead_warps, + num_look_ahead_items, + num_total_threads, + items_per_thread, + tile_size}; + + return cub::detail::scan::scan_policy{ + num_total_threads, + items_per_thread, + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_STORE_WARP_TRANSPOSE, + cub::BLOCK_SCAN_WARP_SCANS, + cub::detail::delay_constructor_policy{cub::detail::delay_constructor_kind::fixed_delay, 350, 450}, + warpspeed_policy}; + } +# else + _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::scan::scan_policy + { + return cub::detail::scan::make_mem_scaled_scan_policy( + TUNE_THREADS, + TUNE_ITEMS, + int{sizeof(AccumT)}, + TUNE_LOAD_ALGORITHM, + TUNE_LOAD_MODIFIER, + TUNE_STORE_ALGORITHM, + cub::BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy); + } +# endif +}; +#endif // !TUNE_BASE diff --git a/cub/cub/detail/launcher/cuda_driver.cuh b/cub/cub/detail/launcher/cuda_driver.cuh index 61e5f3d2e89..a4dc9782458 100644 --- a/cub/cub/detail/launcher/cuda_driver.cuh +++ b/cub/cub/detail/launcher/cuda_driver.cuh @@ -141,6 +141,60 @@ struct CudaDriverLauncherFactory cuDeviceGetAttribute(&max_shared_memory, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device)); } + _CCCL_HIDE_FROM_ABI CUB_RUNTIME_FUNCTION ::cudaError_t + max_dynamic_smem_size_for(int& max_dynamic_smem_size, ::CUkernel kernel_ptr) const + { + max_dynamic_smem_size = -1; + + ::CUfunction kernel_fn; + auto status = static_cast<::cudaError_t>(::cuKernelGetFunction(&kernel_fn, kernel_ptr)); + if (status != cudaSuccess) + { + return status; + } + + int static_smem_size = 0; + status = static_cast<::cudaError_t>( + ::cuFuncGetAttribute(&static_smem_size, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_fn)); + if (status != cudaSuccess) + { + return status; + } + + int reserved_smem_size = 0; + status = static_cast<::cudaError_t>( + ::cuDeviceGetAttribute(&reserved_smem_size, 
CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK, device)); + if (status != cudaSuccess) + { + return status; + } + + int max_smem_size_optin = 0; + status = static_cast<::cudaError_t>( + ::cuDeviceGetAttribute(&max_smem_size_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device)); + if (status != cudaSuccess) + { + return status; + } + + max_dynamic_smem_size = max_smem_size_optin - reserved_smem_size - static_smem_size; + return cudaSuccess; + } + + _CCCL_HIDE_FROM_ABI CUB_RUNTIME_FUNCTION ::cudaError_t + set_max_dynamic_smem_size_for(::CUkernel kernel_ptr, int smem_size) const + { + ::CUfunction kernel_fn; + auto status = static_cast<::cudaError_t>(::cuKernelGetFunction(&kernel_fn, kernel_ptr)); + if (status != cudaSuccess) + { + return status; + } + + return static_cast<::cudaError_t>( + ::cuFuncSetAttribute(kernel_fn, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem_size)); + } + CUdevice device; int cc; }; diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 6a1e6577ade..bfc2fc83413 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -52,12 +52,6 @@ struct tuning return static_cast(*this); } }; - -struct default_tuning : tuning -{ - template - using fn = policy_hub; -}; } // namespace detail::scan //! 
@rst @@ -127,9 +121,6 @@ struct DeviceScan ::cuda::execution::determinism::__determinism_holder_t, cudaStream_t stream) { - using scan_tuning_t = ::cuda::std::execution:: - __query_result_or_t; - // Unsigned integer type for global offsets using offset_t = detail::choose_offset_t; @@ -140,14 +131,26 @@ struct DeviceScan cub::detail::it_value_t, typename InitValueT::value_type>>; - using policy_t = typename scan_tuning_t:: - template fn, detail::it_value_t, accum_t, offset_t, ScanOpT>; + using default_policy_selector_t = detail::scan::policy_selector_from_types< + detail::it_value_t, + detail::it_value_t, + accum_t, + offset_t, + ScanOpT>; - using dispatch_t = - DispatchScan; + using policy_selector_t = ::cuda::std::execution:: + __query_result_or_t; - return dispatch_t::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, static_cast(num_items), stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init, + static_cast(num_items), + stream, + policy_selector_t{}); } template , detail::InputValue, OffsetT>:: - Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - ::cuda::std::plus<>{}, - detail::InputValue(init_value), - num_items, - stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + ::cuda::std::plus<>{}, + detail::InputValue(init_value), + static_cast(num_items), + stream); } //! 
@rst @@ -597,14 +600,14 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan, OffsetT>::Dispatch( + return detail::scan::dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, detail::InputValue(init_value), - num_items, + static_cast(num_items), stream); } @@ -939,14 +942,14 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan, OffsetT>::Dispatch( + return detail::scan::dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, detail::InputValue(init_value), - num_items, + static_cast(num_items), stream); } @@ -1168,8 +1171,15 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan, NullType, OffsetT>::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + ::cuda::std::plus<>{}, + NullType{}, + static_cast(num_items), + stream); } //! @rst @@ -1365,8 +1375,8 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - return DispatchScan::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); + return detail::scan::dispatch( + d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), static_cast(num_items), stream); } //! 
@rst @@ -1456,23 +1466,16 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - using AccumT = ::cuda::std::__accumulator_t, InitValueT>; - return DispatchScan< - InputIteratorT, - OutputIteratorT, - ScanOpT, - detail::InputValue, - OffsetT, - AccumT, - ForceInclusive::Yes>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - detail::InputValue(init_value), - num_items, - stream); + return detail::scan::dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + detail::InputValue(init_value), + static_cast(num_items), + stream); } //! @rst diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 4662b798153..096242314ec 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -23,6 +23,7 @@ #endif // no system header #include +#include #include #include #include @@ -42,18 +43,24 @@ #include #include #include +#include #include #include +#include #include #include #include +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::scan { -template ) + DeviceScanInitKernel) CUB_DEFINE_KERNEL_GETTER( ScanKernel, - DeviceScanKernel*>(ts)); return arg; } + + CUB_RUNTIME_FUNCTION static constexpr bool use_warpspeed(const scan_policy& policy) + { +#if _CCCL_CUDACC_AT_LEAST(12, 8) + if (policy.warpspeed) + { + return detail::scan::use_warpspeed(policy.warpspeed); + } +#else + (void) policy; +#endif + return false; + } +}; + +// TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API +template +_CCCL_API constexpr auto convert_policy() -> scan_policy +{ + using scan_policy_t = typename LegacyActivePolicy::ScanPolicyT; + return scan_policy{ + scan_policy_t::BLOCK_THREADS, + scan_policy_t::ITEMS_PER_THREAD, + scan_policy_t::LOAD_ALGORITHM, + scan_policy_t::LOAD_MODIFIER, + 
scan_policy_t::STORE_ALGORITHM, + scan_policy_t::SCAN_ALGORITHM, + detail::delay_constructor_policy_from_type}; +} + +// TODO(griwes): remove in CCCL 4.0 when we drop the scan dispatcher after publishing the tuning API +template +struct policy_selector_from_hub +{ + // Called from device code during dispatch, and from host code when clang-cuda evaluates + // scan_policy_selector concept checks. + _CCCL_API constexpr auto operator()(::cuda::arch_id /*arch*/) const -> scan_policy + { + return convert_policy(); + } }; } // namespace detail::scan @@ -145,6 +192,7 @@ struct DeviceScanKernelSource * Enum flag to specify whether to enforce inclusive scan. * */ +// TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0 template < typename InputIteratorT, typename OutputIteratorT, @@ -160,7 +208,8 @@ template < typename PolicyHub = detail::scan:: policy_hub, detail::it_value_t, AccumT, OffsetT, ScanOpT>, typename KernelSource = detail::scan::DeviceScanKernelSource< - typename PolicyHub::MaxPolicy, + detail::scan:: + policy_selector_from_hub, detail::it_value_t, AccumT>, THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, ScanOpT, @@ -250,6 +299,7 @@ struct DispatchScan * @param[in] max_policy * Struct encoding chain of algorithm tuning policies */ + // TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0 CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -408,8 +458,9 @@ struct DispatchScan } #if __cccl_ptx_isa >= 860 - template - CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm(ActivePolicyT) + template + CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t + __invoke_lookahead_algorithm(PolicyGetter policy_getter, const PolicySelectorT& policy_selector) { if (num_items == 0) { @@ -417,16 +468,20 @@ struct DispatchScan return cudaSuccess; } - using InputT = 
::cuda::std::iter_value_t; - using OutputT = ::cuda::std::iter_value_t; - using WarpspeedPolicy = typename ActivePolicyT::WarpspeedPolicy; + CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter(); + CUB_DETAIL_CONSTEXPR_ISH const auto warpspeed_policy = active_policy.warpspeed; const int grid_dim = - static_cast(::cuda::ceil_div(num_items, static_cast(WarpspeedPolicy::tile_size))); + static_cast(::cuda::ceil_div(num_items, static_cast(warpspeed_policy.tile_size))); if (d_temp_storage == nullptr) { - temp_storage_bytes = grid_dim * kernel_source.look_ahead_tile_state_size(); + temp_storage_bytes = static_cast(grid_dim) * kernel_source.look_ahead_tile_state_size(); + return cudaSuccess; + } + + if (num_items == 0) + { return cudaSuccess; } @@ -435,12 +490,6 @@ struct DispatchScan { return error; } - // number of stages to have an even workload across all SMs (improves small problem sizes), assuming 1 CTA per SM - // +1 since it tends to improve performance - // TODO(bgruber): make the +1 a tuning parameter - [[maybe_unused]] const int max_stages_for_even_workload = - static_cast(::cuda::ceil_div(num_items, static_cast(sm_count * WarpspeedPolicy::tile_size)) + 1); - // Maximum dynamic shared memory size that we can use for temporary storage. 
int max_dynamic_smem_size{}; if (const auto error = @@ -452,19 +501,77 @@ struct DispatchScan // TODO(bgruber): we probably need to ensure alignment of d_temp_storage _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), ""); - constexpr int smem_size_1_stage = detail::scan::smem_for_stages(1); - static_assert(smem_size_1_stage <= detail::max_smem_per_block); // this is ensured by scan_use_warpspeed + using selector_smem_info_t = detail::scan::selector_smem_info; - auto scan_kernel = kernel_source.ScanKernel(); - int num_stages = 1; - int smem_size = smem_size_1_stage; + auto scan_kernel = kernel_source.ScanKernel(); + int smem_size_1_stage = 0; + if constexpr (selector_smem_info_t::has_static_layout) + { + CUB_DETAIL_CONSTEXPR_ISH int static_smem_size_1_stage = detail::scan::smem_for_stages( + warpspeed_policy, + 1, + selector_smem_info_t::input_value_size, + selector_smem_info_t::input_value_alignment, + selector_smem_info_t::output_value_size, + selector_smem_info_t::output_value_alignment, + selector_smem_info_t::accum_size, + selector_smem_info_t::accum_alignment); + CUB_DETAIL_STATIC_ISH_ASSERT(static_smem_size_1_stage <= detail::max_smem_per_block, + "Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)"); + smem_size_1_stage = static_smem_size_1_stage; + } + else + { + smem_size_1_stage = detail::scan::smem_for_stages( + warpspeed_policy, + 1, + policy_selector.input_value_size, + policy_selector.input_value_alignment, + policy_selector.output_value_size, + policy_selector.output_value_alignment, + policy_selector.accum_size, + policy_selector.accum_alignment); + } + + int num_stages = 1; + int smem_size = smem_size_1_stage; // When launched from the host, maximize the number of stages that we can fit inside the shared memory. 
NV_IF_TARGET(NV_IS_HOST, ({ + // number of stages to have an even workload across all SMs (improves small problem sizes), assuming + // 1 CTA per SM +1 since it tends to improve performance + // TODO(bgruber): make the +1 a tuning parameter + const int max_stages_for_even_workload = static_cast( + ::cuda::ceil_div(num_items, static_cast(sm_count * warpspeed_policy.tile_size)) + 1); + while (num_stages <= max_stages_for_even_workload) { - const auto next_smem_size = - detail::scan::smem_for_stages(num_stages + 1); + const auto next_smem_size = [&] { + if constexpr (selector_smem_info_t::has_static_layout) + { + return detail::scan::smem_for_stages( + warpspeed_policy, + num_stages + 1, + selector_smem_info_t::input_value_size, + selector_smem_info_t::input_value_alignment, + selector_smem_info_t::output_value_size, + selector_smem_info_t::output_value_alignment, + selector_smem_info_t::accum_size, + selector_smem_info_t::accum_alignment); + } + else + { + return detail::scan::smem_for_stages( + warpspeed_policy, + num_stages + 1, + policy_selector.input_value_size, + policy_selector.input_value_alignment, + policy_selector.output_value_size, + policy_selector.output_value_alignment, + policy_selector.accum_size, + policy_selector.accum_alignment); + } + }(); if (next_smem_size > max_dynamic_smem_size) { // This number of stages failed, so stay at the current settings @@ -487,7 +594,7 @@ struct DispatchScan const auto init_grid_size = ::cuda::ceil_div(grid_dim, init_kernel_threads); # ifdef CUB_DEBUG_LOG - _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, , %lld>>>()\n", + _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); @@ -517,7 +624,7 @@ struct DispatchScan // Invoke scan kernel { - constexpr int block_dim = WarpspeedPolicy::num_total_threads; + const int block_dim = warpspeed_policy.num_total_threads; # ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", 
grid_dim, block_dim, smem_size, (long long) stream); @@ -555,24 +662,179 @@ struct DispatchScan } #endif // __cccl_ptx_isa >= 860 - template - CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {}) + // On Windows, the `if CUB_DETAIL_CONSTEXPR_ISH` results in `warning C4702: unreachable code`. + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4702) + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + __invoke(PolicyGetter policy_getter, [[maybe_unused]] const PolicySelectorT& policy_selector) { + CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter(); + + CUB_DETAIL_STATIC_ISH_ASSERT(active_policy.load_modifier != CacheLoadModifier::LOAD_LDG, + "The memory consistency model does not apply to texture accesses"); + #if __cccl_ptx_isa >= 860 - if constexpr (detail::scan::scan_use_warpspeed< - ActivePolicyT, - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, - AccumT>) +# if defined(CUB_DEFINE_RUNTIME_POLICIES) + if (kernel_source.use_warpspeed(active_policy)) { - return __invoke_lookahead_algorithm(active_policy); + return __invoke_lookahead_algorithm(policy_getter, policy_selector); } - else +# else + if CUB_DETAIL_CONSTEXPR_ISH (KernelSource::use_warpspeed(active_policy)) + { + return __invoke_lookahead_algorithm(policy_getter, policy_selector); + } +# endif #endif // __cccl_ptx_isa >= 860 + + // Number of input tiles + const int tile_size = active_policy.block_threads * active_policy.items_per_thread; + const int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + + auto tile_state = kernel_source.TileState(); + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (const auto error = CubDebug(tile_state.AllocationSize(num_tiles, allocation_sizes[0]))) + { + return error; // bytes needed for tile status descriptors + } + + // Compute allocation pointers into the single storage blob (or 
compute + // the necessary size of the blob) + void* allocations[1] = {}; + if (const auto error = + CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + { + return error; + } + + // Return if the caller is simply requesting the size of the storage allocation, or the problem is empty + if (d_temp_storage == nullptr || num_items == 0) + { + return cudaSuccess; + } + + // Construct the tile status interface + if (const auto error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) { - return Invoke( - kernel_source.InitKernel(), kernel_source.ScanKernel(), detail::scan::MakeScanPolicyWrapper(active_policy)); + return error; } + + // Log init_kernel configuration + constexpr int init_kernel_threads = 128; + const int init_grid_size = ::cuda::ceil_div(num_tiles, init_kernel_threads); + +#ifdef CUB_DEBUG_LOG + _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); +#endif // CUB_DEBUG_LOG + + // Invoke init_kernel to initialize tile descriptors + if (const auto error = CubDebug( + launcher_factory(init_grid_size, init_kernel_threads, 0, stream, /* use_pdl */ true) + .doit(kernel_source.InitKernel(), kernel_source.make_tile_state_kernel_arg(tile_state), num_tiles))) + { + return error; + } + + // Check for failure to launch + if (const auto error = CubDebug(cudaPeekAtLastError())) + { + return error; + } + + // Sync the stream if specified to flush runtime errors + if (const auto error = CubDebug(detail::DebugSyncStream(stream))) + { + return error; + } + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (const auto error = CubDebug( + launcher_factory.MaxSmOccupancy(scan_sm_occupancy, kernel_source.ScanKernel(), active_policy.block_threads))) + { + return error; + } + + // Get max x-dimension of grid + int max_dim_x; + if (const auto error = CubDebug(launcher_factory.MaxGridDimX(max_dim_x))) + { + return error; + } + 
+ // Run grids in epochs (in case the number of tiles exceeds the max x-dimension)
Struct encoding chain of algorithm tuning policies */ + // TODO(griwes): deprecate when we make the tuning API public and remove in CCCL 4.0 template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -654,4 +917,145 @@ struct DispatchScan } }; +namespace detail::scan +{ +template < + ForceInclusive EnforceInclusive = ForceInclusive::No, + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename InitValueT, + typename OffsetT, + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If<::cuda::std::is_same_v, + cub::detail::it_value_t, + typename InitValueT::value_type>>, + typename PolicySelector = policy_selector_from_types, + detail::it_value_t, + AccumT, + OffsetT, + ScanOpT>, + typename KernelSource = DeviceScanKernelSource< + PolicySelector, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + ScanOpT, + InitValueT, + OffsetT, + AccumT, + EnforceInclusive>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +#if _CCCL_HAS_CONCEPTS() + requires scan_policy_selector +#endif // _CCCL_HAS_CONCEPTS() +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitValueT init_value, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}) -> cudaError_t +{ + static_assert(::cuda::std::is_unsigned_v && sizeof(OffsetT) >= 4, + "DispatchScan only supports unsigned offset types of at least 4-bytes"); + + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(launcher_factory.PtxArchId(arch_id))) + { + return error; + } + +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET(NV_IS_HOST, + (std::stringstream ss; ss << policy_selector(arch_id); + 
_CubLog("Dispatching DeviceScan to arch %d with tuning: %s\n", (int) arch_id, ss.str().c_str());)) +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + + struct fake_policy + { + using MaxPolicy = void; + }; + + return dispatch_arch(policy_selector, arch_id, [&](auto policy_getter) { + return DispatchScan{ + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + scan_op, + init_value, + stream, + -1 /* ptx_version, not used actually */, + kernel_source, + launcher_factory} + .__invoke(policy_getter, policy_selector); + }); +} + +template , + detail::it_value_t, + AccumT, + OffsetT, + ScanOpT>, + typename KernelSource = DeviceScanKernelSource< + PolicySelector, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t, + ScanOpT, + InitValueT, + OffsetT, + AccumT, + EnforceInclusive>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_with_accum( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitValueT init_value, + OffsetT num_items, + cudaStream_t stream, + PolicySelector policy_selector = {}, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}) -> cudaError_t +{ + return dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + policy_selector, + kernel_source, + launcher_factory); +} +} // namespace detail::scan + CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/kernel_scan.cuh b/cub/cub/device/dispatch/kernels/kernel_scan.cuh index 5346385f0ba..d703d035d2b 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan.cuh @@ -53,7 +53,11 @@ union tile_state_kernel_arg_t * @param[in] num_tiles * Number of tiles */ -template +template CUB_DETAIL_KERNEL_ATTRIBUTES 
__launch_bounds__(128) void DeviceScanInitKernel( tile_state_kernel_arg_t tile_state, int num_tiles) { @@ -61,8 +65,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(128) void DeviceScanInitKernel( _CCCL_PDL_TRIGGER_NEXT_LAUNCH(); // beneficial for all problem sizes in cub.bench.scan.exclusive.sum.base #if _CCCL_CUDACC_AT_LEAST(12, 8) - if constexpr (detail::scan:: - scan_use_warpspeed) + constexpr scan_policy policy = PolicySelectorT{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + if constexpr (policy.warpspeed + && detail::scan::use_warpspeed(policy.warpspeed)) { device_scan_init_lookahead_body(tile_state.lookahead, num_tiles); } @@ -106,28 +111,26 @@ DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIte *d_num_selected_out = 0; } } -template +template [[nodiscard]] _CCCL_DEVICE_API _CCCL_CONSTEVAL int get_device_scan_launch_bounds() noexcept { + constexpr scan_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); #if _CCCL_CUDACC_AT_LEAST(12, 8) - if constexpr (detail::scan:: - scan_use_warpspeed) + if constexpr (policy.warpspeed + && detail::scan::use_warpspeed(policy.warpspeed)) { - return get_scan_block_threads; + return policy.warpspeed.num_total_threads; } - else #endif // _CCCL_CUDACC_AT_LEAST(12, 8) - { - return static_cast(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS); - } + return policy.block_threads; } /** * @brief Scan kernel entry point (multi-block) * * - * @tparam ChainedPolicyT - * Chained tuning policy + * @tparam PolicySelector + * Policy selector for tuning * * @tparam InputIteratorT * Random-access input iterator type for reading scan inputs @iterator @@ -167,7 +170,7 @@ template -__launch_bounds__(get_device_scan_launch_bounds(), 1) +__launch_bounds__(get_device_scan_launch_bounds()) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( _CCCL_GRID_CONSTANT const InputIteratorT d_in, _CCCL_GRID_CONSTANT const OutputIteratorT d_out, @@ -192,23 +195,41 @@ 
__launch_bounds__(get_device_scan_launch_bounds) + if constexpr (policy.warpspeed + && detail::scan::use_warpspeed(policy.warpspeed)) { - NV_IF_TARGET(NV_PROVIDES_SM_100, ({ - auto scan_params = scanKernelParams, it_value_t, AccumT>{ - d_in, d_out, tile_state.lookahead, num_items, num_stages}; - device_scan_lookahead_body( - scan_params, scan_op, init_value); - })); + NV_IF_TARGET( + NV_PROVIDES_SM_100, ({ + auto scan_params = scanKernelParams, it_value_t, AccumT>{ + d_in, d_out, tile_state.lookahead, num_items, num_stages}; + device_scan_lookahead_body(scan_params, scan_op, init_value); + })); } else #endif // _CCCL_CUDACC_AT_LEAST(12, 8) { + using ScanPolicyT = AgentScanPolicy< + policy.block_threads, + policy.items_per_thread, + AccumT, + policy.load_algorithm, + policy.load_modifier, + policy.store_algorithm, + policy.scan_algorithm, + NoScaling, + delay_constructor_t>; + // Thread block type for scanning input tiles using AgentScanT = detail::scan::AgentScan< - typename ActivePolicy::ScanPolicyT, + ScanPolicyT, InputIteratorT, OutputIteratorT, ScanOpT, diff --git a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh index 9134b70e4dc..92b84623932 100644 --- a/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_scan_warpspeed.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -42,6 +44,68 @@ CUB_NAMESPACE_BEGIN namespace detail::scan { +template +_CCCL_API constexpr scan_warpspeed_policy get_warpspeed_policy() noexcept +{ + return PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).warpspeed; +} + +template +struct static_warpspeed_policy_adapter +{ + _CCCL_API static constexpr scan_warpspeed_policy policy() noexcept + { + return get_warpspeed_policy(); + } + + _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const + { + return 
policy().squadReduce(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const + { + return policy().squadScanStore(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const + { + return policy().squadLoad(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadSched() const + { + return policy().squadSched(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const + { + return policy().squadLookback(); + } +}; + +struct runtime_warpspeed_policy_adapter +{ + const scan_warpspeed_policy& policy; + + _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const + { + return policy.squadReduce(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const + { + return policy.squadScanStore(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const + { + return policy.squadLoad(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadSched() const + { + return policy.squadSched(); + } + _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const + { + return policy.squadLookback(); + } +}; + template struct scanKernelParams { @@ -53,76 +117,117 @@ struct scanKernelParams }; // Struct holding all scan kernel resources -template +template struct ScanResources { + _CCCL_API static constexpr scan_warpspeed_policy warpspeed_policy() noexcept + { + return get_warpspeed_policy(); + } + // align to at least 16 bytes (InputT/OutputT may be aligned higher) so each stage starts correctly aligned struct alignas(::cuda::std::max({::cuda::std::size_t{16}, alignof(InputT), alignof(OutputT)})) InOutT { // the tile_size size is a multiple of the warp size, and thus for sure a multiple of 16 - static_assert(WarpspeedPolicy::tile_size % 16 == 0, "tile_size must be multiple of 16"); + static_assert(ScanResources::warpspeed_policy().tile_size % 16 == 0, "tile_size must be multiple of 16"); // therefore, unaligned inputs need exactly 16 bytes extra for overcopying (tail padding = 16 - head padding) - ::cuda::std::byte 
inout[WarpspeedPolicy::tile_size * sizeof(InputT) + 16]; + ::cuda::std::byte inout[ScanResources::warpspeed_policy().tile_size * sizeof(InputT) + 16]; }; static_assert(alignof(InOutT) >= alignof(InputT)); static_assert(alignof(InOutT) >= alignof(OutputT)); - using SumThreadAndWarpT = - AccumT[WarpspeedPolicy::squadReduce().threadCount() + WarpspeedPolicy::squadReduce().warpCount()]; + using SumThreadAndWarpT = AccumT[ScanResources::warpspeed_policy().squadReduce().threadCount() + + ScanResources::warpspeed_policy().squadReduce().warpCount()]; warpspeed::SmemResource smemInOut; // will also be used to stage the output (as OutputT) for the bulk copy warpspeed::SmemResource smemNextBlockIdx; warpspeed::SmemResource smemSumExclusiveCta; warpspeed::SmemResource smemSumThreadAndWarp; }; + +struct ScanResourcesRaw +{ + warpspeed::SmemResourceRaw smemInOut; + warpspeed::SmemResourceRaw smemNextBlockIdx; + warpspeed::SmemResourceRaw smemSumExclusiveCta; + warpspeed::SmemResourceRaw smemSumThreadAndWarp; +}; + +struct scan_stage_counts +{ + int num_block_idx_stages; + int num_sum_exclusive_cta_stages; +}; + +_CCCL_API constexpr scan_stage_counts make_scan_stage_counts(int num_stages) +{ + int num_block_idx_stages = num_stages - 1; + num_block_idx_stages = num_block_idx_stages < 1 ? 
1 : num_block_idx_stages; + return {num_block_idx_stages, 2}; +} + +template +_CCCL_API constexpr void setup_scan_resources( + const PolicyAdapter& policy, + warpspeed::SyncHandler& syncHandler, + warpspeed::SmemAllocator& smemAllocator, + SmemInOutT& smemInOut, + SmemNextBlockIdxT& smemNextBlockIdx, + SmemSumExclusiveCtaT& smemSumExclusiveCta, + SmemSumThreadAndWarpT& smemSumThreadAndWarp) +{ + const warpspeed::SquadDesc scanSquads[scan_warpspeed_policy::num_squads] = { + policy.squadReduce(), + policy.squadScanStore(), + policy.squadLoad(), + policy.squadSched(), + policy.squadLookback(), + }; + + smemInOut.addPhase(syncHandler, smemAllocator, policy.squadLoad()); + smemInOut.addPhase(syncHandler, smemAllocator, {policy.squadReduce(), policy.squadScanStore()}); + + smemNextBlockIdx.addPhase(syncHandler, smemAllocator, policy.squadSched()); + smemNextBlockIdx.addPhase(syncHandler, smemAllocator, scanSquads); + + smemSumExclusiveCta.addPhase(syncHandler, smemAllocator, policy.squadLookback()); + smemSumExclusiveCta.addPhase(syncHandler, smemAllocator, policy.squadScanStore()); + + smemSumThreadAndWarp.addPhase(syncHandler, smemAllocator, policy.squadReduce()); + smemSumThreadAndWarp.addPhase(syncHandler, smemAllocator, policy.squadScanStore()); +} + // Function to allocate resources. -template -[[nodiscard]] _CCCL_API constexpr ScanResources +template +[[nodiscard]] _CCCL_API constexpr ScanResources allocResources(warpspeed::SyncHandler& syncHandler, warpspeed::SmemAllocator& smemAllocator, int numStages) { - using ScanResourcesT = ScanResources; + using ScanResourcesT = ScanResources; using InOutT = typename ScanResourcesT::InOutT; using SumThreadAndWarpT = typename ScanResourcesT::SumThreadAndWarpT; - // If numBlockIdxStages is one less than the number of stages, we find a small - // speedup compared to setting it equal to num_stages. Not sure why. 
- int numBlockIdxStages = numStages - 1; - // Ensure we have at least 1 stage - numBlockIdxStages = numBlockIdxStages < 1 ? 1 : numBlockIdxStages; - - // We do not need too many sumExclusiveCta stages. The lookback warp is the - // bottleneck. As soon as it produces a new value, it will be consumed by the - // scanStore squad, releasing the stage. - int numSumExclusiveCtaStages = 2; + const auto [num_block_idx_stages, num_sum_exclusive_cta_stages] = make_scan_stage_counts(numStages); ScanResourcesT res = { warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{numStages}), - warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{numBlockIdxStages}), - warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{numSumExclusiveCtaStages}), + warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{num_block_idx_stages}), + warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{num_sum_exclusive_cta_stages}), warpspeed::SmemResource(syncHandler, smemAllocator, warpspeed::Stages{numStages}), }; - // asdfasdf - constexpr warpspeed::SquadDesc scanSquads[WarpspeedPolicy::num_squads] = { - WarpspeedPolicy::squadReduce(), - WarpspeedPolicy::squadScanStore(), - WarpspeedPolicy::squadLoad(), - WarpspeedPolicy::squadSched(), - WarpspeedPolicy::squadLookback(), - }; - res.smemInOut.addPhase(syncHandler, smemAllocator, WarpspeedPolicy::squadLoad()); - res.smemInOut.addPhase( - syncHandler, smemAllocator, {WarpspeedPolicy::squadReduce(), WarpspeedPolicy::squadScanStore()}); - - res.smemNextBlockIdx.addPhase(syncHandler, smemAllocator, WarpspeedPolicy::squadSched()); - res.smemNextBlockIdx.addPhase(syncHandler, smemAllocator, scanSquads); - - res.smemSumExclusiveCta.addPhase(syncHandler, smemAllocator, WarpspeedPolicy::squadLookback()); - res.smemSumExclusiveCta.addPhase(syncHandler, smemAllocator, WarpspeedPolicy::squadScanStore()); - - res.smemSumThreadAndWarp.addPhase(syncHandler, smemAllocator, 
WarpspeedPolicy::squadReduce()); - res.smemSumThreadAndWarp.addPhase(syncHandler, smemAllocator, WarpspeedPolicy::squadScanStore()); + setup_scan_resources( + static_warpspeed_policy_adapter{}, + syncHandler, + smemAllocator, + res.smemInOut, + res.smemNextBlockIdx, + res.smemSumExclusiveCta, + res.smemSumThreadAndWarp); return res; } @@ -192,7 +297,7 @@ _CCCL_DEVICE_API Tp warpScanExclusive(const Tp regInput, ScanOpT& scan_op) // warp-specialization dispatch is performed once at the start of the kernel and // not in any of the hot loops (even if that may seem the case from a first // glance at the code). -template (); + static constexpr warpspeed::SquadDesc squadReduce = policy.squadReduce(); + static constexpr warpspeed::SquadDesc squadScanStore = policy.squadScanStore(); + static constexpr warpspeed::SquadDesc squadLoad = policy.squadLoad(); + static constexpr warpspeed::SquadDesc squadSched = policy.squadSched(); + static constexpr warpspeed::SquadDesc squadLookback = policy.squadLookback(); - constexpr int tile_size = WarpspeedPolicy::tile_size; - constexpr int num_look_ahead_items = WarpspeedPolicy::num_look_ahead_items; + constexpr int tile_size = policy.tile_size; + constexpr int num_look_ahead_items = policy.num_look_ahead_items; // We might try to instantiate the kernel with hughe types which would lead to a small tile size. 
Ensure its never 0 - constexpr int elemPerThread = WarpspeedPolicy::items_per_thread; + constexpr int elemPerThread = policy.items_per_thread; static_assert(elemPerThread * squadReduce.threadCount() == tile_size, "Invalid tuning policy"); //////////////////////////////////////////////////////////////////////////////// @@ -228,8 +334,8 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody( warpspeed::SyncHandler syncHandler{}; warpspeed::SmemAllocator smemAllocator{}; - ScanResources res = - allocResources(syncHandler, smemAllocator, params.numStages); + ScanResources res = + allocResources(syncHandler, smemAllocator, params.numStages); syncHandler.clusterInitSync(specialRegisters); @@ -617,8 +723,7 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody( { // otherwise, issue multiple bulk copies in chunks of the input tile size // TODO(bgruber): I am sure this could be implemented a lot more efficiently - static constexpr int elem_per_chunk = - static_cast(WarpspeedPolicy::tile_size * sizeof(InputT) / sizeof(OutputT)); + static constexpr int elem_per_chunk = static_cast(policy.tile_size * sizeof(InputT) / sizeof(OutputT)); for (int chunk_offset = 0; chunk_offset < valid_items; chunk_offset += elem_per_chunk) { const int chunk_size = ::cuda::std::min(valid_items - chunk_offset, elem_per_chunk); @@ -673,14 +778,7 @@ _CCCL_DEVICE_API _CCCL_FORCEINLINE void kernelBody( #endif // __cccl_ptx_isa >= 860 -template -inline constexpr int get_scan_block_threads = 1; - -template -inline constexpr int get_scan_block_threads> = - ActivePolicy::WarpspeedPolicy::num_total_threads; - -template (); + // Dispatch for warp-specialization - static constexpr warpspeed::SquadDesc scanSquads[WarpspeedPolicy::num_squads] = { - WarpspeedPolicy::squadReduce(), - WarpspeedPolicy::squadScanStore(), - WarpspeedPolicy::squadLoad(), - WarpspeedPolicy::squadSched(), - WarpspeedPolicy::squadLookback(), + static constexpr warpspeed::SquadDesc scanSquads[scan_warpspeed_policy::num_squads] = { + 
policy.squadReduce(), + policy.squadScanStore(), + policy.squadLoad(), + policy.squadSched(), + policy.squadLookback(), }; // we need to force inline the lambda, but clang in CUDA mode only likes the GNU syntax warpspeed::squadDispatch(specialRegisters, scanSquads, [&](warpspeed::Squad squad) _CCCL_FORCEINLINE_LAMBDA { - kernelBody( + kernelBody( squad, specialRegisters, params, ::cuda::std::move(scan_op), static_cast(init_value)); }); #endif // __cccl_ptx_isa >= 860 @@ -746,66 +846,137 @@ device_scan_init_lookahead_body(warpspeed::tile_state_t* tile_states, co } } -template -_CCCL_API constexpr auto smem_for_stages(int num_stages) -> int +_CCCL_API constexpr auto smem_for_stages( + const scan_warpspeed_policy& policy, + int num_stages, + int input_size, + int input_align, + int output_size, + int output_align, + int accum_size, + int accum_align) -> int { warpspeed::SyncHandler syncHandler{}; warpspeed::SmemAllocator smemAllocator{}; - (void) scan::allocResources(syncHandler, smemAllocator, num_stages); + (void) output_size; + const auto counts = make_scan_stage_counts(num_stages); + + const int align_inout = ::cuda::std::max({16, input_align, output_align}); + const int inout_bytes = policy.tile_size * input_size + 16; + // Match sizeof(InOutT): round up to the alignment so each stage matches SmemResource. 
+ const int inout_stride = (inout_bytes + align_inout - 1) & ~(align_inout - 1); + const auto reduce_squad = policy.squadReduce(); + const int sum_thread_warp = (reduce_squad.threadCount() + reduce_squad.warpCount()) * accum_size; + + void* inout_base = smemAllocator.alloc(static_cast<::cuda::std::uint32_t>(inout_stride * num_stages), align_inout); + void* next_block_idx_base = smemAllocator.alloc( + static_cast<::cuda::std::uint32_t>(sizeof(uint4) * counts.num_block_idx_stages), alignof(uint4)); + void* sum_exclusive_base = smemAllocator.alloc( + static_cast<::cuda::std::uint32_t>(accum_size * counts.num_sum_exclusive_cta_stages), accum_align); + void* sum_thread_warp_base = + smemAllocator.alloc(static_cast<::cuda::std::uint32_t>(sum_thread_warp * num_stages), accum_align); + + ScanResourcesRaw res = { + warpspeed::SmemResourceRaw{syncHandler, inout_base, inout_stride, inout_stride, num_stages}, + warpspeed::SmemResourceRaw{ + syncHandler, + next_block_idx_base, + static_cast(sizeof(uint4)), + static_cast(sizeof(uint4)), + counts.num_block_idx_stages}, + warpspeed::SmemResourceRaw{ + syncHandler, sum_exclusive_base, accum_size, accum_size, counts.num_sum_exclusive_cta_stages}, + warpspeed::SmemResourceRaw{syncHandler, sum_thread_warp_base, sum_thread_warp, sum_thread_warp, num_stages}, + }; + + setup_scan_resources( + runtime_warpspeed_policy_adapter{policy}, + syncHandler, + smemAllocator, + res.smemInOut, + res.smemNextBlockIdx, + res.smemSumExclusiveCta, + res.smemSumThreadAndWarp); syncHandler.mHasInitialized = true; // avoid assertion in destructor return static_cast(smemAllocator.sizeBytes()); } -#if 0 -// we check the required shared memory inside a template, so the error message shows the amount in case of failure -template -_CCCL_API constexpr void verify_smem() +template +_CCCL_API constexpr auto smem_for_stages(const scan_warpspeed_policy& policy, int num_stages) -> int { - static_assert(RequiredSharedMemory <= max_smem_per_block, - "Single stage 
configuration exceeds architecture independent SMEM (48KiB)"); + return smem_for_stages( + policy, + num_stages, + static_cast(sizeof(InputT)), + static_cast(alignof(InputT)), + static_cast(sizeof(OutputT)), + static_cast(alignof(OutputT)), + static_cast(sizeof(AccumT)), + static_cast(alignof(AccumT))); } -#endif -template -_CCCL_API constexpr auto one_stage_fits_48KiB_SMEM() -> bool +_CCCL_API constexpr bool use_warpspeed( + const scan_warpspeed_policy& policy, + int input_size, + int input_align, + int output_size, + int output_align, + int accum_size, + int accum_align, + bool input_contiguous, + bool output_contiguous, + bool input_trivially_copyable, + bool output_trivially_copyable, + bool output_default_constructible) { - using InputT = it_value_t; - using OutputT = it_value_t; - constexpr int smem_size_1_stage = smem_for_stages(1); -// We can turn this on to report if a single stage of the warpspeed scan would exceed 48KiB if SMEM. -#if 0 - verify_smem(); +// We need `cuda::std::is_constant_evaluated` for the compile-time SMEM computation. And we need PTX ISA 8.6. +// MSVC + nvcc < 13.1 just fails to compile `cub.test.device.scan.lid_1.types_0` with `Internal error` and nothing else. 
+#if (defined(__CUDA_ARCH__) && __cccl_ptx_isa < 860) || !defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) \ + || ((_CCCL_COMPILER(MSVC) && _CCCL_CUDA_COMPILER(NVCC, <, 13, 1))) + (void) policy; + (void) input_size; + (void) input_align; + (void) output_size; + (void) output_align; + (void) accum_size; + (void) accum_align; + (void) input_contiguous; + (void) output_contiguous; + (void) input_trivially_copyable; + (void) output_trivially_copyable; + (void) output_default_constructible; + return false; +#else + if (!input_contiguous || !output_contiguous || !input_trivially_copyable || !output_trivially_copyable + || !output_default_constructible) + { + return false; + } + + return smem_for_stages(policy, 1, input_size, input_align, output_size, output_align, accum_size, accum_align) + <= static_cast(max_smem_per_block); #endif - return smem_size_1_stage <= max_smem_per_block; } -template -inline constexpr bool scan_use_warpspeed = false; - -// detect the use via CCCL.C (pre-compiled dispatch and JIT pass) and disable the new kernel. -// See https://github.com/NVIDIA/cccl/issues/6821 for more details. -// We also need `cuda::std::is_constant_evaluated` for the compile-time SMEM computation. And we need PTX ISA 8.6. -// MSVC + nvcc < 13.1 just fails to compile `cub.test.device.scan.lid_1.types_0` with `Internal error` and nothing else. 
-#if !defined(CUB_ENABLE_POLICY_PTX_JSON) && !defined(CUB_DEFINE_RUNTIME_POLICIES) \ - && defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && __cccl_ptx_isa >= 860 \ - && !(_CCCL_COMPILER(MSVC) && _CCCL_CUDA_COMPILER(NVCC, <, 13, 1)) -template -inline constexpr bool scan_use_warpspeed> = - // for bulk copy: input and output iterators must be contiguous and their value types must be trivially copyable - THRUST_NS_QUALIFIER::is_contiguous_iterator_v - && ::cuda::std::is_trivially_copyable_v> - && THRUST_NS_QUALIFIER::is_contiguous_iterator_v - && ::cuda::std::is_trivially_copyable_v> - // for bulk copy store: we need to prepare a buffer of output types in SMEM - && ::cuda::std::is_default_constructible_v> - // need to fit one stage into 48KiB so binaries stay forward compatible with future GPUs - && one_stage_fits_48KiB_SMEM(); -#endif // !defined(CUB_ENABLE_POLICY_PTX_JSON) && !defined(CUB_DEFINE_RUNTIME_POLICIES) && - // defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && __cccl_ptx_isa >= 860 +template +_CCCL_API constexpr bool use_warpspeed(const scan_warpspeed_policy& policy) +{ + using InputT = it_value_t; + using OutputT = it_value_t; + return use_warpspeed( + policy, + static_cast(sizeof(InputT)), + static_cast(alignof(InputT)), + static_cast(sizeof(OutputT)), + static_cast(alignof(OutputT)), + static_cast(sizeof(AccumT)), + static_cast(alignof(AccumT)), + THRUST_NS_QUALIFIER::is_contiguous_iterator_v, + THRUST_NS_QUALIFIER::is_contiguous_iterator_v, + ::cuda::std::is_trivially_copyable_v, + ::cuda::std::is_trivially_copyable_v, + ::cuda::std::is_default_constructible_v); +} } // namespace detail::scan CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh new file mode 100644 index 00000000000..e4f254dcd6b --- /dev/null +++ b/cub/cub/device/dispatch/kernels/scan_warpspeed_policy.cuh @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#include + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + +CUB_NAMESPACE_BEGIN + +namespace detail::scan +{ +struct scan_warpspeed_policy +{ + static constexpr int num_squads = 5; + + bool valid = false; + int num_reduce_warps; + int num_scan_stor_warps; + int num_load_warps; + int num_sched_warps; + int num_look_ahead_warps; + + int num_look_ahead_items; + int num_total_threads; + int items_per_thread; + int tile_size; + + _CCCL_API constexpr explicit operator bool() const noexcept + { + return valid; + } + + _CCCL_API constexpr warpspeed::SquadDesc squadReduce() const + { + return warpspeed::SquadDesc{0, num_reduce_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadScanStore() const + { + return warpspeed::SquadDesc{1, num_scan_stor_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadLoad() const + { + return warpspeed::SquadDesc{2, num_load_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadSched() const + { + return warpspeed::SquadDesc{3, num_sched_warps}; + } + _CCCL_API constexpr warpspeed::SquadDesc squadLookback() const + { + return warpspeed::SquadDesc{4, num_look_ahead_warps}; + } + + _CCCL_API constexpr friend bool operator==(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs) + { + return lhs.valid == rhs.valid && lhs.num_reduce_warps == rhs.num_reduce_warps + && lhs.num_scan_stor_warps == rhs.num_scan_stor_warps && lhs.num_load_warps == rhs.num_load_warps + && lhs.num_sched_warps == rhs.num_sched_warps && lhs.num_look_ahead_warps == rhs.num_look_ahead_warps + && lhs.num_look_ahead_items == rhs.num_look_ahead_items && lhs.num_total_threads == rhs.num_total_threads + && lhs.items_per_thread == rhs.items_per_thread && lhs.tile_size == rhs.tile_size; + } + + _CCCL_API constexpr friend bool operator!=(const scan_warpspeed_policy& lhs, const scan_warpspeed_policy& rhs) + { + return !(lhs == rhs); + } + 
+#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const scan_warpspeed_policy& p) + { + return os + << "scan_warpspeed_policy { .valid = " << p.valid << ", .num_reduce_warps = " << p.num_reduce_warps + << ", .num_scan_stor_warps = " << p.num_scan_stor_warps << ", .num_load_warps = " << p.num_load_warps + << ", .num_sched_warps = " << p.num_sched_warps << ", .num_look_ahead_warps = " << p.num_look_ahead_warps + << ", .num_look_ahead_items = " << p.num_look_ahead_items << ", .num_total_threads = " << p.num_total_threads + << ", .items_per_thread = " << p.items_per_thread << ", .tile_size = " << p.tile_size << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) +}; +} // namespace detail::scan + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/common.cuh b/cub/cub/device/dispatch/tuning/common.cuh index 273eedfbd82..8811afa012c 100644 --- a/cub/cub/device/dispatch/tuning/common.cuh +++ b/cub/cub/device/dispatch/tuning/common.cuh @@ -18,6 +18,8 @@ #include #include +#include +#include #include #include @@ -29,6 +31,7 @@ namespace detail // libcu++ enum class type_t { + boolean, int8, int16, int32, @@ -50,6 +53,8 @@ inline constexpr auto classify_type = type_t::other; template <> inline constexpr auto classify_type = ::cuda::std::is_signed_v ? 
type_t::int8 : type_t::uint8; template <> +inline constexpr auto classify_type = type_t::boolean; +template <> inline constexpr auto classify_type = type_t::int8; template <> inline constexpr auto classify_type = type_t::uint8; @@ -101,6 +106,12 @@ inline constexpr auto classify_op = op_kind_t::other; template inline constexpr auto classify_op<::cuda::std::plus> = op_kind_t::plus; +template +inline constexpr auto classify_op<::cuda::minimum> = op_kind_t::min; + +template +inline constexpr auto classify_op<::cuda::maximum> = op_kind_t::max; + struct iterator_info { int value_type_size; diff --git a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh index 3957c6165c3..1b5d3a61259 100644 --- a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved. 
// SPDX-License-Identifier: BSD-3 #pragma once @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,9 @@ CUB_NAMESPACE_BEGIN namespace detail::radix_sort { +using detail::scan::make_mem_scaled_scan_policy; +using detail::scan::scan_policy; + struct radix_sort_histogram_policy { int block_threads; @@ -145,63 +149,6 @@ _CCCL_API constexpr auto make_reg_scaled_radix_sort_onesweep_policy( store_algorithm}; } -// TODO(bgruber): move this into the scan tuning header -struct scan_policy -{ - int block_threads; - int items_per_thread; - BlockLoadAlgorithm load_algorithm; - CacheLoadModifier load_modifier; - BlockStoreAlgorithm store_algorithm; - BlockScanAlgorithm scan_algorithm; - delay_constructor_policy delay_constructor; - - _CCCL_API constexpr friend bool operator==(const scan_policy& lhs, const scan_policy& rhs) - { - return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread - && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier - && lhs.store_algorithm == rhs.store_algorithm && lhs.scan_algorithm == rhs.scan_algorithm - && lhs.delay_constructor == rhs.delay_constructor; - } - - _CCCL_API constexpr friend bool operator!=(const scan_policy& lhs, const scan_policy& rhs) - { - return !(lhs == rhs); - } - -#if !_CCCL_COMPILER(NVRTC) - friend ::std::ostream& operator<<(::std::ostream& os, const scan_policy& p) - { - return os - << "scan_policy { .block_threads = " << p.block_threads << ", .items_per_thread = " << p.items_per_thread - << ", .load_algorithm = " << p.load_algorithm << ", .load_modifier = " << p.load_modifier - << ", .store_algorithm = " << p.store_algorithm << ", .scan_algorithm = " << p.scan_algorithm - << ", .delay_constructor = " << p.delay_constructor << " }"; - } -#endif // !_CCCL_COMPILER(NVRTC) -}; - -_CCCL_API constexpr auto make_mem_scaled_scan_policy( - int nominal_4b_block_threads, - int nominal_4b_items_per_thread, - int compute_t_size, - 
BlockLoadAlgorithm load_algorithm, - CacheLoadModifier load_modifier, - BlockStoreAlgorithm store_algorithm, - BlockScanAlgorithm scan_algorithm, - delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}) -> scan_policy -{ - const auto scaled = scale_mem_bound(nominal_4b_block_threads, nominal_4b_items_per_thread, compute_t_size); - return scan_policy{ - scaled.block_threads, - scaled.items_per_thread, - load_algorithm, - load_modifier, - store_algorithm, - scan_algorithm, - delay_constructor}; -} - struct radix_sort_downsweep_policy { int block_threads; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index a83d2bdf6ee..44d43d942db 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -18,18 +18,26 @@ #include #include #include -#include +#include +#include #include #include #include #include +#include #include #include #include #include +#include +#include #include +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::scan @@ -116,6 +124,69 @@ constexpr _CCCL_HOST_DEVICE offset_size classify_offset_size() return sizeof(OffsetT) == 4 ? offset_size::_4 : sizeof(OffsetT) == 8 ? 
offset_size::_8 : offset_size::unknown; } +struct scan_policy +{ + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_algorithm; + CacheLoadModifier load_modifier; + BlockStoreAlgorithm store_algorithm; + BlockScanAlgorithm scan_algorithm; + delay_constructor_policy delay_constructor; + scan_warpspeed_policy warpspeed = {}; + + _CCCL_API constexpr friend bool operator==(const scan_policy& lhs, const scan_policy& rhs) + { + return lhs.block_threads == rhs.block_threads && lhs.items_per_thread == rhs.items_per_thread + && lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier + && lhs.store_algorithm == rhs.store_algorithm && lhs.scan_algorithm == rhs.scan_algorithm + && lhs.delay_constructor == rhs.delay_constructor && lhs.warpspeed == rhs.warpspeed; + } + + _CCCL_API constexpr friend bool operator!=(const scan_policy& lhs, const scan_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const scan_policy& p) + { + os << "scan_policy { .block_threads = " << p.block_threads << ", .items_per_thread = " << p.items_per_thread + << ", .load_algorithm = " << p.load_algorithm << ", .load_modifier = " << p.load_modifier + << ", .store_algorithm = " << p.store_algorithm << ", .scan_algorithm = " << p.scan_algorithm + << ", .delay_constructor = " << p.delay_constructor; + if (p.warpspeed) + { + os << ", .warpspeed = " << p.warpspeed; + } + return os << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) +}; + +_CCCL_API constexpr auto make_mem_scaled_scan_policy( + int nominal_4b_block_threads, + int nominal_4b_items_per_thread, + int compute_t_size, + BlockLoadAlgorithm load_algorithm, + CacheLoadModifier load_modifier, + BlockStoreAlgorithm store_algorithm, + BlockScanAlgorithm scan_algorithm, + delay_constructor_policy delay_constructor = {delay_constructor_kind::fixed_delay, 350, 450}, + scan_warpspeed_policy warpspeed = {}) -> scan_policy +{ + const auto 
scaled = scale_mem_bound(nominal_4b_block_threads, nominal_4b_items_per_thread, compute_t_size); + return scan_policy{ + scaled.block_threads, + scaled.items_per_thread, + load_algorithm, + load_modifier, + store_algorithm, + scan_algorithm, + delay_constructor, + warpspeed}; +} + template (), @@ -257,6 +329,7 @@ struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_si #endif // clang-format on +// TODO(griwes): remove for CCCL 4.0 when we drop the public scan dispatcher template MakeScanPolicyWrapper(PolicyT polic return ScanPolicyWrapper{policy}; } +// TODO(griwes): remove this in CCCL 4.0 when we remove the public scan dispatcher template struct policy_hub { @@ -680,6 +754,616 @@ struct policy_hub using MaxPolicy = Policy1200; }; + +#if _CCCL_HAS_CONCEPTS() +template +concept scan_policy_selector = policy_selector; +#endif // _CCCL_HAS_CONCEPTS() + +constexpr _CCCL_HOST_DEVICE bool is_arithmetic_type(type_t type) +{ + switch (type) + { + case type_t::boolean: + case type_t::int8: + case type_t::int16: + case type_t::int32: + case type_t::int64: + case type_t::int128: + case type_t::uint8: + case type_t::uint16: + case type_t::uint32: + case type_t::uint64: + case type_t::uint128: + case type_t::float32: + case type_t::float64: + return true; + case type_t::other: + return false; + } + + return false; +} + +constexpr _CCCL_HOST_DEVICE scan_warpspeed_policy get_warpspeed_policy( + ::cuda::arch_id arch, int input_value_size, int accum_size, type_t input_type, op_kind_t operation_t) +{ + if (arch >= ::cuda::arch_id::sm_100) + { + scan_warpspeed_policy warpspeed_policy{}; + warpspeed_policy.valid = true; + + // TODO(bgruber): tune this +#if _CCCL_COMPILER(NVHPC) + warpspeed_policy.num_reduce_warps = 2; + warpspeed_policy.num_scan_stor_warps = 2; +#else // _CCCL_COMPILER(NVHPC) + warpspeed_policy.num_reduce_warps = 4; + warpspeed_policy.num_scan_stor_warps = 4; +#endif // _CCCL_COMPILER(NVHPC) + warpspeed_policy.num_load_warps = 1; + 
warpspeed_policy.num_sched_warps = 1; + warpspeed_policy.num_look_ahead_warps = 1; + + // TODO(bgruber): 5 is a bit better for complex + warpspeed_policy.num_look_ahead_items = accum_size == 2 ? 3 : 4; + + // Deduced definitions + const auto num_threads_per_warp = 32; + const auto num_total_warps = + warpspeed_policy.num_reduce_warps + warpspeed_policy.num_scan_stor_warps + warpspeed_policy.num_load_warps + + warpspeed_policy.num_sched_warps + warpspeed_policy.num_look_ahead_warps; + warpspeed_policy.num_total_threads = num_total_warps * num_threads_per_warp; + + const auto squad_reduce_thread_count = warpspeed_policy.num_reduce_warps * num_threads_per_warp; + + // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM) + // 2-byte types and double needed special handling + auto items_per_thread = ::cuda::std::max(256 / (input_value_size == 2 ? 2 : accum_size) - 1, 1); + + if (arch >= ::cuda::arch_id::sm_120 && operation_t == op_kind_t::other && is_arithmetic_type(input_type)) + { + if (input_value_size == 4 || input_value_size == 8) + { + items_per_thread = 127; + } + else + { + items_per_thread = ::cuda::std::min(items_per_thread, input_value_size <= 2 ? 
63 : 127); + } + } + + warpspeed_policy.items_per_thread = items_per_thread; + // TODO(bgruber): the special handling of double below is a LOT faster, but exceeds 48KiB SMEM + // clang-format off + // | F64 | I32 | 72576 | 11.295 us | 2.44% | 11.917 us | 8.02% | 0.622 us | 5.50% | SLOW | + // | F64 | I32 | 1056384 | 16.162 us | 6.24% | 15.847 us | 5.57% | -0.315 us | -1.95% | SAME | + // | F64 | I32 | 16781184 | 65.696 us | 1.64% | 60.650 us | 3.37% | -5.046 us | -7.68% | FAST | + // | F64 | I32 | 268442496 | 863.896 us | 0.22% | 679.100 us | 0.93% | -184.796 us | -21.39% | FAST | + // | F64 | I32 | 1073745792 | 3.418 ms | 0.12% | 2.662 ms | 0.46% | -755.740 us | -22.11% | FAST | + // | F64 | I64 | 72576 | 12.301 us | 8.18% | 12.987 us | 5.75% | 0.686 us | 5.58% | SAME | + // | F64 | I64 | 1056384 | 16.775 us | 5.70% | 16.091 us | 6.14% | -0.684 us | -4.08% | SAME | + // | F64 | I64 | 16781184 | 66.970 us | 1.41% | 58.024 us | 3.17% | -8.946 us | -13.36% | FAST | + // | F64 | I64 | 268442496 | 863.826 us | 0.23% | 676.465 us | 0.98% | -187.360 us | -21.69% | FAST | + // | F64 | I64 | 1073745792 | 3.419 ms | 0.11% | 2.664 ms | 0.48% | -755.409 us | -22.09% | FAST | + // | F64 | I64 | 4294975104 | 13.641 ms | 0.05% | 10.575 ms | 0.24% | -3065.815 us | -22.48% | FAST | + // clang-format on + // (256 / (sizeof(InputValueT) == 2 ? 2 : (::cuda::std::is_same_v ? 
4 : sizeof(AccumT))) - + // 1); + + warpspeed_policy.tile_size = warpspeed_policy.items_per_thread * squad_reduce_thread_count; + + return warpspeed_policy; + } + + return {}; +} + +struct policy_selector +{ + int input_value_size; + int input_value_alignment; + int output_value_size; + int output_value_alignment; + int accum_size; + int accum_alignment; + int offset_size; + type_t input_type; + type_t accum_type; + op_kind_t operation_t; + bool accum_is_primitive_or_trivially_copy_constructible; + // TODO(griwes): remove this field before policy_selector is publicly exposed + bool benchmark_match; + + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy + { + const primitive_accum primitive_accum_t = + accum_type != type_t::other && accum_type != type_t::int128 ? primitive_accum::yes : primitive_accum::no; + const primitive_op primitive_op_t = operation_t != op_kind_t::other ? primitive_op::yes : primitive_op::no; + + const bool large_values = accum_size > 128; + const BlockLoadAlgorithm scan_transposed_load = + large_values ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; + const BlockStoreAlgorithm scan_transposed_store = + large_values ? 
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; + const auto default_delay = default_delay_constructor_policy(accum_is_primitive_or_trivially_copy_constructible); + + const auto warpspeed_policy = get_warpspeed_policy(arch, input_value_size, accum_size, input_type, operation_t); + + if (arch >= ::cuda::arch_id::sm_100) + { + if (benchmark_match && operation_t == op_kind_t::plus && primitive_accum_t == primitive_accum::yes) + { + if (offset_size == 4) + { + switch (input_value_size) + { + case 1: + // ipt_18.tpb_512.ns_768.dcid_7.l2w_820.trp_1.ld_0 1.188818 1.005682 1.173041 1.305288 + return make_mem_scaled_scan_policy( + 512, + 18, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 768, 820}, + warpspeed_policy); + case 2: + // ipt_13.tpb_512.ns_1384.dcid_7.l2w_720.trp_1.ld_0 1.128443 1.002841 1.119688 1.307692 + return make_mem_scaled_scan_policy( + 512, + 13, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 1384, 720}, + warpspeed_policy); + case 4: + // ipt_22.tpb_384.ns_1904.dcid_6.l2w_830.trp_1.ld_0 1.148442 0.997167 1.139902 1.462651 + return make_mem_scaled_scan_policy( + 384, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter, 1904, 830}, + warpspeed_policy); + case 8: + // ipt_23.tpb_416.ns_772.dcid_5.l2w_710.trp_1.ld_0 1.089468 1.015581 1.085630 1.264583 + return make_mem_scaled_scan_policy( + 416, + 23, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon_jitter_window, 772, 710}, + 
warpspeed_policy); + default: + break; + } + } + else if (offset_size == 8) + { + switch (input_value_size) + { + case 1: + // ipt_14.tpb_384.ns_228.dcid_7.l2w_775.trp_1.ld_1 1.107210 1.000000 1.100637 1.307692 + return make_mem_scaled_scan_policy( + 384, + 14, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 228, 775}, + warpspeed_policy); + case 2: + // todo(gonidelis): Regresses for large inputs. Find better tuning. + // ipt_13.tpb_288.ns_1520.dcid_5.l2w_895.trp_1.ld_1 1.080934 0.983509 1.077724 1.305288 + break; + case 4: + // ipt_19.tpb_416.ns_956.dcid_7.l2w_550.trp_1.ld_1 1.146142 0.994350 1.137459 1.455636 + return make_mem_scaled_scan_policy( + 416, + 19, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_CA, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backon, 956, 550}, + warpspeed_policy); + case 8: + if (accum_type == type_t::float64) + { + break; + } + // ipt_22.tpb_320.ns_328.dcid_2.l2w_965.trp_1.ld_0 1.080133 1.000000 1.075577 1.248963 + return make_mem_scaled_scan_policy( + 320, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::exponential_backoff, 328, 965}, + warpspeed_policy); + default: + break; + } + } + } + } + + if (arch >= ::cuda::arch_id::sm_90) + { + if (primitive_op_t == primitive_op::yes) + { + if (primitive_accum_t == primitive_accum::yes) + { + switch (accum_size) + { + case 1: + return make_mem_scaled_scan_policy( + 192, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 168, 1140}, + warpspeed_policy); + case 2: + return make_mem_scaled_scan_policy( + 512, + 12, + accum_size, + 
BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 376, 1125}, + warpspeed_policy); + case 4: + if (accum_type == type_t::float32) + { + return make_mem_scaled_scan_policy( + 128, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 688, 1140}, + warpspeed_policy); + } + return make_mem_scaled_scan_policy( + 128, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 648, 1245}, + warpspeed_policy); + case 8: + if (accum_type == type_t::float64) + { + return make_mem_scaled_scan_policy( + 224, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 576, 1215}, + warpspeed_policy); + } + return make_mem_scaled_scan_policy( + 224, + 24, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 632, 1290}, + warpspeed_policy); + default: + break; + } + } + +#if _CCCL_HAS_INT128() + if (primitive_accum_t == primitive_accum::no && accum_size == 16 + && (accum_type == type_t::int128 || accum_type == type_t::uint128)) + { + return make_mem_scaled_scan_policy( + 576, + 21, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 860, 630}, + warpspeed_policy); + } +#endif + } + } + + // Keep sm_86 aligned with legacy policy_hub behavior: policy_hub resets to default policy for 86. 
+ if (arch >= ::cuda::arch_id::sm_86) + { + return make_mem_scaled_scan_policy( + 128, + 15, + accum_size, + scan_transposed_load, + LOAD_DEFAULT, + scan_transposed_store, + BLOCK_SCAN_WARP_SCANS, + default_delay, + warpspeed_policy); + } + + if (arch >= ::cuda::arch_id::sm_80) + { + if (primitive_op_t == primitive_op::yes) + { + if (primitive_accum_t == primitive_accum::yes) + { + switch (accum_size) + { + case 1: + return make_mem_scaled_scan_policy( + 320, + 14, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 368, 725}, + warpspeed_policy); + case 2: + return make_mem_scaled_scan_policy( + 352, + 16, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 488, 1040}, + warpspeed_policy); + case 4: + if (accum_type == type_t::float32) + { + return make_mem_scaled_scan_policy( + 288, + 8, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 724, 1050}, + warpspeed_policy); + } + return make_mem_scaled_scan_policy( + 320, + 12, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 268, 1180}, + warpspeed_policy); + case 8: + if (accum_type == type_t::float64) + { + return make_mem_scaled_scan_policy( + 384, + 12, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 388, 1100}, + warpspeed_policy); + } + return make_mem_scaled_scan_policy( + 288, + 22, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + 
BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 716, 785}, + warpspeed_policy); + default: + break; + } + } + +#if _CCCL_HAS_INT128() + if (primitive_accum_t == primitive_accum::no && accum_size == 16 + && (accum_type == type_t::int128 || accum_type == type_t::uint128)) + { + return make_mem_scaled_scan_policy( + 640, + 24, + accum_size, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + BLOCK_STORE_DIRECT, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::no_delay, 0, 1200}, + warpspeed_policy); + } +#endif + } + } + + if (arch >= ::cuda::arch_id::sm_75) + { + if (benchmark_match && operation_t == op_kind_t::plus && primitive_accum_t == primitive_accum::yes + && offset_size == 8 && input_value_size == 4) + { + // ipt_7.tpb_128.ns_628.dcid_1.l2w_520.trp_1.ld_0 + return make_mem_scaled_scan_policy( + 128, + 7, + accum_size, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS, + delay_constructor_policy{delay_constructor_kind::fixed_delay, 628, 520}, + warpspeed_policy); + } + + return make_mem_scaled_scan_policy( + 128, + 15, + accum_size, + scan_transposed_load, + LOAD_DEFAULT, + scan_transposed_store, + BLOCK_SCAN_WARP_SCANS, + default_delay, + warpspeed_policy); + } + + if (arch >= ::cuda::arch_id::sm_60) + { + return make_mem_scaled_scan_policy( + 128, + 15, + accum_size, + scan_transposed_load, + LOAD_DEFAULT, + scan_transposed_store, + BLOCK_SCAN_WARP_SCANS, + default_delay, + warpspeed_policy); + } + + return make_mem_scaled_scan_policy( + 128, + 12, + accum_size, + BLOCK_LOAD_DIRECT, + LOAD_CA, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING, + default_delay, + warpspeed_policy); + } +}; + +#if _CCCL_HAS_CONCEPTS() +static_assert(scan_policy_selector); +#endif // _CCCL_HAS_CONCEPTS() + +template +struct benchmark_match_for_policy_selector +{ + static constexpr bool value = false; +}; + +template +struct benchmark_match_for_policy_selector< + 
ScanOpT, + InputValueT, + OutputValueT, + AccumT, + ::cuda::std::void_t<::cuda::std::__accumulator_t>> +{ + static constexpr bool value = + sizeof(AccumT) == sizeof(::cuda::std::__accumulator_t) + && sizeof(InputValueT) == sizeof(OutputValueT); +}; + +// stateless version which can be passed to kernels +template +struct policy_selector_from_types +{ + static constexpr int input_value_size = int{sizeof(InputValueT)}; + static constexpr int input_value_alignment = int{alignof(InputValueT)}; + static constexpr int output_value_size = int{sizeof(OutputValueT)}; + static constexpr int output_value_alignment = int{alignof(OutputValueT)}; + static constexpr int accum_size = int{sizeof(AccumT)}; + static constexpr int accum_alignment = int{alignof(AccumT)}; + static constexpr type_t input_type = classify_type; + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> scan_policy + { + constexpr bool benchmark_match = + benchmark_match_for_policy_selector::value; + + constexpr bool accum_is_primitive_or_trivially_copy_constructible = + is_primitive::value || ::cuda::std::is_trivially_copy_constructible_v; + + constexpr auto policies = policy_selector{ + input_value_size, + input_value_alignment, + output_value_size, + output_value_alignment, + accum_size, + accum_alignment, + int{sizeof(OffsetT)}, + input_type, + classify_type, + classify_op, + accum_is_primitive_or_trivially_copy_constructible, + benchmark_match}; + return policies(arch); + } +}; + +template +struct selector_smem_info +{ + static constexpr bool has_static_layout = false; +}; + +template +struct selector_smem_info< + PolicySelectorT, + ::cuda::std::void_t{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{}), + decltype(::cuda::std::integral_constant{})>> +{ + static constexpr bool has_static_layout = true; + static constexpr int input_value_size = 
PolicySelectorT::input_value_size; + static constexpr int input_value_alignment = PolicySelectorT::input_value_alignment; + static constexpr int output_value_size = PolicySelectorT::output_value_size; + static constexpr int output_value_alignment = PolicySelectorT::output_value_alignment; + static constexpr int accum_size = PolicySelectorT::accum_size; + static constexpr int accum_alignment = PolicySelectorT::accum_alignment; +}; } // namespace detail::scan CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu index e8a692b6051..2ccdb97587b 100644 --- a/cub/test/catch2_test_device_scan_env.cu +++ b/cub/test/catch2_test_device_scan_env.cu @@ -14,6 +14,7 @@ struct stream_registry_factory_t; #include +#include #include #include "catch2_test_env_launch_helper.h" @@ -47,36 +48,23 @@ struct block_size_check_t } }; -struct block_size_retreiver_t -{ - int* ptr; - - template - cudaError_t Invoke() - { - *ptr = ActivePolicyT::ScanPolicyT::BLOCK_THREADS; - return cudaSuccess; - } -}; - TEST_CASE("Device scan exclusive scan works with default environment", "[scan][device]") { using num_items_t = int; using value_t = int; using offset_t = cub::detail::choose_offset_t; - using policy_t = - cub::detail::scan::default_tuning::fn::MaxPolicy; + using selector_t = + cub::detail::scan::policy_selector_from_types; int current_device{}; REQUIRE(cudaSuccess == cudaGetDevice(¤t_device)); - int ptx_version{}; - REQUIRE(cudaSuccess == cub::PtxVersion(ptx_version, current_device)); + cudaDeviceProp device_props{}; + REQUIRE(cudaSuccess == cudaGetDeviceProperties(&device_props, current_device)); - int target_block_size{}; - block_size_retreiver_t block_size_retreiver{&target_block_size}; - REQUIRE(cudaSuccess == policy_t::Invoke(ptx_version, block_size_retreiver)); + const auto target_block_size = + selector_t{}(cuda::to_arch_id(cuda::compute_capability{device_props.major, device_props.minor})).block_threads; num_items_t num_items = 
1; c2h::device_vector d_block_size(1); @@ -116,31 +104,16 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev template struct scan_tuning : cub::detail::scan::tuning> { - template - struct fn + _CCCL_API constexpr auto operator()(cuda::arch_id /*arch*/) const -> cub::detail::scan::scan_policy { - struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> - { - struct ScanPolicyT - { - static constexpr int BLOCK_THREADS = BlockThreads; - static constexpr int ITEMS_PER_THREAD = 1; - static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE; - - static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::CacheLoadModifier::LOAD_DEFAULT; - static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = - cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE; - static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING; - - struct detail - { - using delay_constructor_t = cub::detail::default_delay_constructor_t; - }; - }; - }; - - using MaxPolicy = Policy500; - }; + return {BlockThreads, + 1, + cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE, + cub::CacheLoadModifier::LOAD_DEFAULT, + cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE, + cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING, + cub::detail::delay_constructor_policy{cub::detail::delay_constructor_kind::fixed_delay, 350, 450}}; + } }; struct get_reduce_tuning_query_t @@ -212,18 +185,17 @@ TEST_CASE("Device scan inclusive-scan works with default environment", "[scan][d using value_t = int; using offset_t = cub::detail::choose_offset_t; - using policy_t = - cub::detail::scan::default_tuning::fn::MaxPolicy; + using selector_t = + cub::detail::scan::policy_selector_from_types; int current_device{}; REQUIRE(cudaSuccess == cudaGetDevice(¤t_device)); - int ptx_version{}; - REQUIRE(cudaSuccess == cub::PtxVersion(ptx_version, current_device)); + cudaDeviceProp device_props{}; + REQUIRE(cudaSuccess 
== cudaGetDeviceProperties(&device_props, current_device)); - int target_block_size{}; - block_size_retreiver_t block_size_retreiver{&target_block_size}; - REQUIRE(cudaSuccess == policy_t::Invoke(ptx_version, block_size_retreiver)); + const auto target_block_size = + selector_t{}(cuda::to_arch_id(cuda::compute_capability{device_props.major, device_props.minor})).block_threads; num_items_t num_items = 1; c2h::device_vector d_block_size(1); diff --git a/cub/test/catch2_test_env_launch_helper.h b/cub/test/catch2_test_env_launch_helper.h index 886e53f35ed..454a29b28e1 100644 --- a/cub/test/catch2_test_env_launch_helper.h +++ b/cub/test/catch2_test_env_launch_helper.h @@ -3,6 +3,8 @@ #pragma once +#include + #include #include @@ -149,6 +151,51 @@ struct stream_registry_factory_t // Get max grid dimension return cudaDeviceGetAttribute(&max_grid_dim_x, cudaDevAttrMaxGridDimX, device_ordinal); } + + CUB_RUNTIME_FUNCTION cudaError_t MemsetAsync(void* dst, unsigned char value, size_t num_bytes, cudaStream_t stream) + { + return cudaMemsetAsync(dst, value, num_bytes, stream); + } + + CUB_RUNTIME_FUNCTION cudaError_t + MemcpyAsync(void* dst, const void* src, size_t num_bytes, cudaMemcpyKind kind, cudaStream_t stream) + { + return cudaMemcpyAsync(dst, src, num_bytes, kind, stream); + } + + CUB_RUNTIME_FUNCTION cudaError_t MaxSharedMemory(int& max_shared_memory) const + { + int device = 0; + auto error = cudaGetDevice(&device); + if (error != cudaSuccess) + { + return error; + } + + return cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, device); + } + + template + CUB_RUNTIME_FUNCTION cudaError_t max_dynamic_smem_size_for(int& max_dynamic_smem_size, Kernel kernel_ptr) + { + NV_IF_ELSE_TARGET(NV_IS_HOST, // + ({ return cub::MaxPotentialDynamicSmemBytes(max_dynamic_smem_size, kernel_ptr); }), + ({ + cudaFuncAttributes func_attrs{}; + if (const auto error = cudaFuncGetAttributes(&func_attrs, kernel_ptr)) + { + return error; + } + max_dynamic_smem_size 
= func_attrs.maxDynamicSharedSizeBytes; + return cudaSuccess; + })) + } + + template + CUB_RUNTIME_FUNCTION cudaError_t set_max_dynamic_smem_size_for(Kernel kernel_ptr, int smem_size) + { + return cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + } }; struct stream_scope