From 49150057170d2911ccdb02e0f31ef110df0c0e2c Mon Sep 17 00:00:00 2001 From: Chih-Chen Kao Date: Thu, 7 Sep 2023 13:31:10 +0200 Subject: [PATCH 01/68] Add kernelPath and includeDir to the ctor Signed-off-by: Chih-Chen Kao --- ParallelPrimitives/RadixSort.cpp | 8 ++++---- ParallelPrimitives/RadixSort.h | 6 +++--- ParallelPrimitives/RadixSort.inl | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 2a80264..2edc763 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -56,7 +56,7 @@ void printKernelInfo( const std::string& name, oroFunction func ) namespace Oro { -RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils ) : m_device{ device }, m_oroutils{ oroutils } +RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils, const std::string& kernelPath, const std::string& includeDir ) : m_device{ device }, m_oroutils{ oroutils } { oroGetDeviceProperties( &m_props, device ); @@ -72,10 +72,10 @@ RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils ) : m_device{ devi assert( m_num_threads_per_block_for_scan % warp_size == 0 ); assert( m_num_threads_per_block_for_sort % warp_size == 0 ); - configure(); + configure( kernelPath, includeDir ); } -void RadixSort::exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu, oroStream stream ) const noexcept +void RadixSort::exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept { const auto buffer_size = countsGpu.size(); @@ -210,7 +210,7 @@ int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept return number_of_blocks; } -void RadixSort::configure( const std::string& kernelPath, const std::string& includeDir, oroStream stream ) noexcept +void RadixSort::configure( const std::string& kernelPath, const std::string& includeDir ) noexcept { compileKernels( kernelPath, includeDir ); diff --git 
a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index d529feb..6df72f5 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -31,7 +31,7 @@ class RadixSort final LOG, }; - RadixSort( oroDevice device, OrochiUtils& oroutils ); + RadixSort( oroDevice device, OrochiUtils& oroutils, const std::string& kernelPath = "", const std::string& includeDir = "" ); // Allow move but disallow copy. RadixSort( RadixSort&& ) noexcept = default; @@ -61,12 +61,12 @@ class RadixSort final /// It copies the count result from the Device to Host before computation, and then copies the offsets back from Host to Device afterward. /// @param countsGpu The count result in GPU memory. Otuput: The offset. /// @param offsetsGpu The offsets. - void exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu, oroStream stream ) const noexcept; + void exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept; /// @brief Configure the settings, compile the kernels and allocate the memory. /// @param kernelPath The kernel path. /// @param includeDir The include directory. 
- void configure( const std::string& kernelPath = "", const std::string& includeDir = "", oroStream stream = 0 ) noexcept; + void configure( const std::string& kernelPath, const std::string& includeDir ) noexcept; private: // GPU blocks for the count kernel diff --git a/ParallelPrimitives/RadixSort.inl b/ParallelPrimitives/RadixSort.inl index 0577132..77df463 100644 --- a/ParallelPrimitives/RadixSort.inl +++ b/ParallelPrimitives/RadixSort.inl @@ -95,7 +95,7 @@ void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int en { case ScanAlgo::SCAN_CPU: { - exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer, stream ); + exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); } break; @@ -116,7 +116,7 @@ void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int en break; default: - exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer, stream ); + exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); break; } }; From b725f75045b852d6632fa2ec428ad9f4f67c9bba Mon Sep 17 00:00:00 2001 From: Chih-Chen Kao Date: Fri, 8 Sep 2023 07:58:20 +0200 Subject: [PATCH 02/68] Add missing header Signed-off-by: Chih-Chen Kao --- ParallelPrimitives/RadixSort.h | 1 + 1 file changed, 1 insertion(+) diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index 6df72f5..d3e89bf 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include From 117a318d443ae9b8cc315c3738b97fdab531e46a Mon Sep 17 00:00:00 2001 From: Chih-Chen Kao Date: Fri, 8 Sep 2023 12:23:44 +0200 Subject: [PATCH 03/68] fix template constexpr rule Signed-off-by: Chih-Chen Kao --- ParallelPrimitives/RadixSort.inl | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/ParallelPrimitives/RadixSort.inl b/ParallelPrimitives/RadixSort.inl index 77df463..ad9ecdb 100644 --- a/ParallelPrimitives/RadixSort.inl +++ b/ParallelPrimitives/RadixSort.inl @@ -36,6 +36,25 @@ constexpr 
void execute( CallableType&& callable, RecordType& time_record, const time_record[index] = stopwatch.getMs(); } } + +template +void resize_record( T& t ) noexcept +{ + if constexpr( enable_profile ) + { + t.resize( 3 ); + } +} + +template +void print_record( const T& t ) noexcept +{ + if constexpr( enable_profile ) + { + printf( "%3.2f, %3.2f, %3.2f\n", t[0], t[1], t[2] ); + } +} + } // namespace template @@ -73,10 +92,7 @@ void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int en using RecordType = std::conditional_t, Empty>; RecordType t; - if constexpr( enable_profile ) - { - t.resize( 3 ); - } + resize_record( t ); const auto launch_count_kernel = [&]() noexcept { @@ -143,8 +159,5 @@ void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int en execute( launch_sort_kernel, t, 2, stream ); - if constexpr( enable_profile ) - { - printf( "%3.2f, %3.2f, %3.2f\n", t[0], t[1], t[2] ); - } + print_record( t ); } From 571111b402b5502d8a34204b4b44e6b188ab6a93 Mon Sep 17 00:00:00 2001 From: Chih-Chen Kao Date: Fri, 8 Sep 2023 16:40:08 +0200 Subject: [PATCH 04/68] Use default values for bitcode Signed-off-by: Chih-Chen Kao --- ParallelPrimitives/RadixSort.cpp | 43 ++++++++++++++++----------- ParallelPrimitives/RadixSortKernels.h | 17 +++++++++++ 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 2edc763..fd93056 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -59,19 +59,6 @@ namespace Oro RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils, const std::string& kernelPath, const std::string& includeDir ) : m_device{ device }, m_oroutils{ oroutils } { oroGetDeviceProperties( &m_props, device ); - - m_num_threads_per_block_for_count = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_COUNT_BLOCK_SIZE; - m_num_threads_per_block_for_scan = m_props.maxThreadsPerBlock > 0 ? 
m_props.maxThreadsPerBlock : DEFAULT_SCAN_BLOCK_SIZE; - m_num_threads_per_block_for_sort = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SORT_BLOCK_SIZE; - - const auto warp_size = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; - - m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; - - assert( m_num_threads_per_block_for_count % warp_size == 0 ); - assert( m_num_threads_per_block_for_scan % warp_size == 0 ); - assert( m_num_threads_per_block_for_sort % warp_size == 0 ); - configure( kernelPath, includeDir ); } @@ -123,10 +110,30 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string binaryPath = getCurrentDir(); binaryPath += isAmd ? "oro_compiled_kernels.hipfb" : "oro_compiled_kernels.fatbin"; log = "loading pre-compiled kernels at path : " + binaryPath; + + m_num_threads_per_block_for_count = DEFAULT_COUNT_BLOCK_SIZE; + m_num_threads_per_block_for_scan = DEFAULT_SCAN_BLOCK_SIZE; + m_num_threads_per_block_for_sort = DEFAULT_SORT_BLOCK_SIZE; + + const auto warp_size = DEFAULT_WARP_SIZE; + + m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; } else { log = "compiling kernels at path : " + currentKernelPath + " in : " + currentIncludeDir; + + m_num_threads_per_block_for_count = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_COUNT_BLOCK_SIZE; + m_num_threads_per_block_for_scan = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SCAN_BLOCK_SIZE; + m_num_threads_per_block_for_sort = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SORT_BLOCK_SIZE; + + const auto warp_size = ( m_props.warpSize != 0 ) ? 
m_props.warpSize : DEFAULT_WARP_SIZE; + + m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; + + assert( m_num_threads_per_block_for_count % warp_size == 0 ); + assert( m_num_threads_per_block_for_scan % warp_size == 0 ); + assert( m_num_threads_per_block_for_sort % warp_size == 0 ); } if( m_flags == Flag::LOG ) @@ -135,13 +142,15 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string } const auto includeArg{ "-I" + currentIncludeDir }; - const auto count_block_size_param = "-DCOUNT_WG_SIZE=" + std::to_string( m_num_threads_per_block_for_count ); - const auto scan_block_size_param = "-DSCAN_WG_SIZE=" + std::to_string( m_num_threads_per_block_for_scan ); - const auto sort_block_size_param = "-DSORT_WG_SIZE=" + std::to_string( m_num_threads_per_block_for_sort ); - const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK=" + std::to_string( m_num_warps_per_block_for_sort ); + const auto overwrite_flag = "-DOVERWRITE"; + const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); + const auto scan_block_size_param = "-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); + const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); + const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( m_num_warps_per_block_for_sort ); std::vector opts; opts.push_back( includeArg.c_str() ); + opts.push_back( overwrite_flag ); opts.push_back( count_block_size_param.c_str() ); opts.push_back( scan_block_size_param.c_str() ); opts.push_back( sort_block_size_param.c_str() ); diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 435569f..a529452 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -14,6 +14,23 @@ using u64 = unsigned long long; // #define NV_WORKAROUND 1 +// default 
values +#if defined( OVERWRITE ) + +constexpr auto COUNT_WG_SIZE{ COUNT_WG_SIZE_VAL }; +constexpr auto SCAN_WG_SIZE{ SCAN_WG_SIZE_VAL }; +constexpr auto SORT_WG_SIZE{ SORT_WG_SIZE_VAL }; +constexpr auto SORT_NUM_WARPS_PER_BLOCK{ SORT_NUM_WARPS_PER_BLOCK_VAL }; + +#else + +constexpr auto COUNT_WG_SIZE{ DEFAULT_COUNT_BLOCK_SIZE }; +constexpr auto SCAN_WG_SIZE{ DEFAULT_SCAN_BLOCK_SIZE }; +constexpr auto SORT_WG_SIZE{ DEFAULT_SORT_BLOCK_SIZE }; +constexpr auto SORT_NUM_WARPS_PER_BLOCK{ DEFAULT_NUM_WARPS_PER_BLOCK }; + +#endif + __device__ constexpr u32 getMaskedBits( const u32 value, const u32 shift ) noexcept { return ( value >> shift ) & RADIX_MASK; } extern "C" __global__ void CountKernel( int* gSrc, int* gDst, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) From c9bb91b3242504651d249e0ca847d3a2705350c8 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 14 Sep 2023 14:04:53 +0900 Subject: [PATCH 05/68] [ORO-0] simple porting --- ParallelPrimitives/RadixSort.cpp | 289 ++++--- ParallelPrimitives/RadixSort.h | 84 +- ParallelPrimitives/RadixSort.inl | 208 ++--- ParallelPrimitives/RadixSortConfigs.h | 13 + ParallelPrimitives/RadixSortKernels.h | 1057 +++++++++++++++++-------- Test/RadixSort/main.cpp | 7 +- 6 files changed, 1088 insertions(+), 570 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index fd93056..a33ff47 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -18,6 +18,9 @@ #include #endif +inline uint64_t div_round_up64( uint64_t val, uint64_t divisor ) { return ( val + divisor - 1 ) / divisor; } +inline uint64_t next_multiple64( uint64_t val, uint64_t divisor ) { return div_round_up64( val, divisor ) * divisor; } + namespace { #if defined( ORO_PRECOMPILED ) @@ -58,26 +61,26 @@ namespace Oro RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils, const std::string& kernelPath, const std::string& includeDir ) : m_device{ device }, 
m_oroutils{ oroutils } { - oroGetDeviceProperties( &m_props, device ); + //oroGetDeviceProperties( &m_props, device ); configure( kernelPath, includeDir ); } -void RadixSort::exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept -{ - const auto buffer_size = countsGpu.size(); - - std::vector counts = countsGpu.getData(); - std::vector offsets( buffer_size ); - - int sum = 0; - for( int i = 0; i < counts.size(); ++i ) - { - offsets[i] = sum; - sum += counts[i]; - } - - offsetsGpu.copyFromHost( offsets.data(), std::size( offsets ) ); -} +//void RadixSort::exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept +//{ +// const auto buffer_size = countsGpu.size(); +// +// std::vector counts = countsGpu.getData(); +// std::vector offsets( buffer_size ); +// +// int sum = 0; +// for( int i = 0; i < counts.size(); ++i ) +// { +// offsets[i] = sum; +// sum += counts[i]; +// } +// +// offsetsGpu.copyFromHost( offsets.data(), std::size( offsets ) ); +//} void RadixSort::compileKernels( const std::string& kernelPath, const std::string& includeDir ) noexcept { @@ -111,29 +114,29 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string binaryPath += isAmd ? 
"oro_compiled_kernels.hipfb" : "oro_compiled_kernels.fatbin"; log = "loading pre-compiled kernels at path : " + binaryPath; - m_num_threads_per_block_for_count = DEFAULT_COUNT_BLOCK_SIZE; - m_num_threads_per_block_for_scan = DEFAULT_SCAN_BLOCK_SIZE; - m_num_threads_per_block_for_sort = DEFAULT_SORT_BLOCK_SIZE; + //m_num_threads_per_block_for_count = DEFAULT_COUNT_BLOCK_SIZE; + //m_num_threads_per_block_for_scan = DEFAULT_SCAN_BLOCK_SIZE; + //m_num_threads_per_block_for_sort = DEFAULT_SORT_BLOCK_SIZE; - const auto warp_size = DEFAULT_WARP_SIZE; + //const auto warp_size = DEFAULT_WARP_SIZE; - m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; + //m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; } else { log = "compiling kernels at path : " + currentKernelPath + " in : " + currentIncludeDir; - m_num_threads_per_block_for_count = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_COUNT_BLOCK_SIZE; - m_num_threads_per_block_for_scan = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SCAN_BLOCK_SIZE; - m_num_threads_per_block_for_sort = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SORT_BLOCK_SIZE; + //m_num_threads_per_block_for_count = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_COUNT_BLOCK_SIZE; + //m_num_threads_per_block_for_scan = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SCAN_BLOCK_SIZE; + //m_num_threads_per_block_for_sort = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SORT_BLOCK_SIZE; - const auto warp_size = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; + //const auto warp_size = ( m_props.warpSize != 0 ) ? 
m_props.warpSize : DEFAULT_WARP_SIZE; - m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; + //m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; - assert( m_num_threads_per_block_for_count % warp_size == 0 ); - assert( m_num_threads_per_block_for_scan % warp_size == 0 ); - assert( m_num_threads_per_block_for_sort % warp_size == 0 ); + //assert( m_num_threads_per_block_for_count % warp_size == 0 ); + //assert( m_num_threads_per_block_for_scan % warp_size == 0 ); + //assert( m_num_threads_per_block_for_sort % warp_size == 0 ); } if( m_flags == Flag::LOG ) @@ -142,19 +145,19 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string } const auto includeArg{ "-I" + currentIncludeDir }; - const auto overwrite_flag = "-DOVERWRITE"; - const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); - const auto scan_block_size_param = "-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); - const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); - const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( m_num_warps_per_block_for_sort ); + //const auto overwrite_flag = "-DOVERWRITE"; + //const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); + //const auto scan_block_size_param = "-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); + //const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); + //const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( m_num_warps_per_block_for_sort ); std::vector opts; opts.push_back( includeArg.c_str() ); - opts.push_back( overwrite_flag ); - opts.push_back( count_block_size_param.c_str() ); - opts.push_back( scan_block_size_param.c_str() ); - 
opts.push_back( sort_block_size_param.c_str() ); - opts.push_back( sort_num_warps_param.c_str() ); + //opts.push_back( overwrite_flag ); + //opts.push_back( count_block_size_param.c_str() ); + //opts.push_back( scan_block_size_param.c_str() ); + //opts.push_back( sort_block_size_param.c_str() ); + //opts.push_back( sort_num_warps_param.c_str() ); struct Record { @@ -162,11 +165,15 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string Kernel kernelType; }; + //const std::vector records{ + // { "CountKernel", Kernel::COUNT }, { "ParallelExclusiveScanSingleWG", Kernel::SCAN_SINGLE_WG }, { "ParallelExclusiveScanAllWG", Kernel::SCAN_PARALLEL }, { "SortKernel", Kernel::SORT }, + // { "SortKVKernel", Kernel::SORT_KV }, { "SortSinglePassKernel", Kernel::SORT_SINGLE_PASS }, { "SortSinglePassKVKernel", Kernel::SORT_SINGLE_PASS_KV }, + //}; const std::vector records{ - { "CountKernel", Kernel::COUNT }, { "ParallelExclusiveScanSingleWG", Kernel::SCAN_SINGLE_WG }, { "ParallelExclusiveScanAllWG", Kernel::SCAN_PARALLEL }, { "SortKernel", Kernel::SORT }, - { "SortKVKernel", Kernel::SORT_KV }, { "SortSinglePassKernel", Kernel::SORT_SINGLE_PASS }, { "SortSinglePassKVKernel", Kernel::SORT_SINGLE_PASS_KV }, + { "SortSinglePassKernel", Kernel::SORT_SINGLE_PASS }, { "SortSinglePassKVKernel", Kernel::SORT_SINGLE_PASS_KV }, }; + for( const auto& record : records ) { #if defined( ORO_PP_LOAD_FROM_STRING ) @@ -188,118 +195,170 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string printKernelInfo( record.kernelName, oroFunctions[record.kernelType] ); } } -} - -int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept -{ - const int warpSize = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; - const int warpPerWG = blockSize / warpSize; - const int warpPerWGP = m_props.maxThreadsPerMultiProcessor / warpSize; - const int occupancyFromWarp = ( warpPerWGP > 0 ) ? 
( warpPerWGP / warpPerWG ) : 1; - const int occupancy = std::max( 1, occupancyFromWarp ); - - if( m_flags == Flag::LOG ) - { - std::cout << "Occupancy: " << occupancy << '\n'; - } - - static constexpr auto min_num_blocks = 16; - auto number_of_blocks = m_props.multiProcessorCount > 0 ? m_props.multiProcessorCount * occupancy : min_num_blocks; - - if( m_num_threads_per_block_for_scan > BIN_SIZE ) - { - // Note: both are divisible by 2 - const auto base = m_num_threads_per_block_for_scan / BIN_SIZE; - - // Floor - number_of_blocks = ( number_of_blocks / base ) * base; - } - - return number_of_blocks; + // TODO: bit code support? +#define LOAD_FUNC( var, kernel ) var = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), kernel, &opts ); + LOAD_FUNC( m_gHistogram, "gHistogram" ); + LOAD_FUNC( m_gPrefixSum, "gPrefixSum" ); + LOAD_FUNC( m_onesweep_reorderKey, "onesweep_reorderKey" ); + LOAD_FUNC( m_onesweep_reorderKeyPair, "onesweep_reorderKeyPair" ); + LOAD_FUNC( m_onesweep_reorderKey64, "onesweep_reorderKey64" ); + LOAD_FUNC( m_onesweep_reorderKeyPair64, "onesweep_reorderKeyPair64" ); +#undef LOAD_FUNC } +//int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept +//{ +// const int warpSize = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; +// const int warpPerWG = blockSize / warpSize; +// const int warpPerWGP = m_props.maxThreadsPerMultiProcessor / warpSize; +// const int occupancyFromWarp = ( warpPerWGP > 0 ) ? ( warpPerWGP / warpPerWG ) : 1; +// +// const int occupancy = std::max( 1, occupancyFromWarp ); +// +// if( m_flags == Flag::LOG ) +// { +// std::cout << "Occupancy: " << occupancy << '\n'; +// } +// +// static constexpr auto min_num_blocks = 16; +// auto number_of_blocks = m_props.multiProcessorCount > 0 ? 
m_props.multiProcessorCount * occupancy : min_num_blocks; +// +// if( m_num_threads_per_block_for_scan > BIN_SIZE ) +// { +// // Note: both are divisible by 2 +// const auto base = m_num_threads_per_block_for_scan / BIN_SIZE; +// +// // Floor +// number_of_blocks = ( number_of_blocks / base ) * base; +// } +// +// return number_of_blocks; +//} + void RadixSort::configure( const std::string& kernelPath, const std::string& includeDir ) noexcept { compileKernels( kernelPath, includeDir ); - m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); + //m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); - /// The tmp buffer size of the count kernel and the scan kernel. + ///// The tmp buffer size of the count kernel and the scan kernel. - const auto tmp_buffer_size = BIN_SIZE * m_num_blocks_for_count; + //const auto tmp_buffer_size = BIN_SIZE * m_num_blocks_for_count; - /// @c tmp_buffer_size must be divisible by @c m_num_threads_per_block_for_scan - /// This is guaranteed since @c m_num_blocks_for_count will be adjusted accordingly + ///// @c tmp_buffer_size must be divisible by @c m_num_threads_per_block_for_scan + ///// This is guaranteed since @c m_num_blocks_for_count will be adjusted accordingly - m_num_blocks_for_scan = tmp_buffer_size / m_num_threads_per_block_for_scan; + //m_num_blocks_for_scan = tmp_buffer_size / m_num_threads_per_block_for_scan; - m_tmp_buffer.resize( tmp_buffer_size ); + //m_tmp_buffer.resize( tmp_buffer_size ); - if( selectedScanAlgo == ScanAlgo::SCAN_GPU_PARALLEL ) - { - // These are for the scan kernel - m_partial_sum.resize( m_num_blocks_for_scan ); - m_is_ready.resize( m_num_blocks_for_scan ); - } + //if( selectedScanAlgo == ScanAlgo::SCAN_GPU_PARALLEL ) + //{ + // // These are for the scan kernel + // m_partial_sum.resize( m_num_blocks_for_scan ); + // m_is_ready.resize( m_num_blocks_for_scan ); + //} } void RadixSort::setFlag( Flag flag ) noexcept { m_flags = flag; } -void 
RadixSort::sort( const KeyValueSoA src, const KeyValueSoA dst, int n, int startBit, int endBit, oroStream stream ) noexcept +void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream ) noexcept { + bool keyPair = src.value != nullptr; + // todo. better to compute SINGLE_SORT_N_ITEMS_PER_WI which we use in the kernel dynamically rather than hard coding it to distribute the work evenly // right now, setting this as large as possible is faster than multi pass sorting if( n < SINGLE_SORT_WG_SIZE * SINGLE_SORT_N_ITEMS_PER_WI ) { - const auto func = oroFunctions[Kernel::SORT_SINGLE_PASS_KV]; - const void* args[] = { &src.key, &src.value, &dst.key, &dst.value, &n, &startBit, &endBit }; - OrochiUtils::launch1D( func, SINGLE_SORT_WG_SIZE, args, SINGLE_SORT_WG_SIZE, 0, stream ); + if( keyPair ) + { + const auto func = oroFunctions[Kernel::SORT_SINGLE_PASS_KV]; + const void* args[] = { &src.key, &src.value, &dst.key, &dst.value, &n, &startBit, &endBit }; + OrochiUtils::launch1D( func, SINGLE_SORT_WG_SIZE, args, SINGLE_SORT_WG_SIZE, 0, stream ); + } + else + { + const auto func = oroFunctions[Kernel::SORT_SINGLE_PASS]; + const void* args[] = { &src, &dst, &n, &startBit, &endBit }; + OrochiUtils::launch1D( func, SINGLE_SORT_WG_SIZE, args, SINGLE_SORT_WG_SIZE, 0, stream ); + } return; } - auto* s{ &src }; - auto* d{ &dst }; + int nIteration = div_round_up64( endBit - startBit, 8 ); + bool use64bitCounter = +#if defined( ENFORCE_64BIT_COUNTER ) + true; +#else + MAX_ELEMENTS_WITH_32BIT_COUNTER < n; +#endif + uint64_t numberOfBlocks = div_round_up64( n, RADIX_SORT_BLOCK_SIZE ); + + // Buffers + void* gpSumBuffer = tempStorage; + void* lookBackBuffer = (void*)( (char*)tempStorage + sizeof( uint32_t ) * 256 * sizeof( u32 /* key */ ) ); - for( int i = startBit; i < endBit; i += N_RADIX ) { - sort1pass( *s, *d, n, i, i + std::min( N_RADIX, endBit - i ), stream ); + oroMemsetD32Async( (oroDeviceptr)gpSumBuffer, 0, 
256 * sizeof( u32 /* key */ ), stream ); + oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, 1, stream ); - std::swap( s, d ); - } + const int nBlocks = 2048; - if( s == &src ) - { - OrochiUtils::copyDtoDAsync( dst.key, src.key, n, stream ); - OrochiUtils::copyDtoDAsync( dst.value, src.value, n, stream ); + const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &lookBackBuffer }; + OrochiUtils::launch1D( m_gHistogram, nBlocks * GHISTOGRAM_THREADS_PER_BLOCK, args, GHISTOGRAM_THREADS_PER_BLOCK, 0, stream ); } -} - -void RadixSort::sort( const u32* src, const u32* dst, int n, int startBit, int endBit, oroStream stream ) noexcept -{ - // todo. better to compute SINGLE_SORT_N_ITEMS_PER_WI which we use in the kernel dynamically rather than hard coding it to distribute the work evenly - // right now, setting this as large as possible is faster than multi pass sorting - if( n < SINGLE_SORT_WG_SIZE * SINGLE_SORT_N_ITEMS_PER_WI ) { - const auto func = oroFunctions[Kernel::SORT_SINGLE_PASS]; - const void* args[] = { &src, &dst, &n, &startBit, &endBit }; - OrochiUtils::launch1D( func, SINGLE_SORT_WG_SIZE, args, SINGLE_SORT_WG_SIZE, 0, stream ); - return; + const void* args[] = { &gpSumBuffer }; + OrochiUtils::launch1D( m_gPrefixSum, nIteration * 256, args, 256, 0, stream ); } - auto* s{ &src }; - auto* d{ &dst }; - - for( int i = startBit; i < endBit; i += N_RADIX ) + auto s = src; + auto d = dst; + for( int i = 0; i < nIteration; i++ ) { - sort1pass( *s, *d, n, i, i + std::min( N_RADIX, endBit - i ), stream ); + oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, 256 * numberOfBlocks * ( use64bitCounter ? 2 : 1 ), stream ); + if( keyPair ) + { + const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; + OrochiUtils::launch1D( use64bitCounter ? 
m_onesweep_reorderKeyPair64 : m_onesweep_reorderKeyPair, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); + } + else + { + const void* args[] = { &s.key, &d.key, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; + OrochiUtils::launch1D( use64bitCounter ? m_onesweep_reorderKey64 : m_onesweep_reorderKey, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); + } std::swap( s, d ); } - if( s == &src ) + if( s.key == src.key ) { - OrochiUtils::copyDtoDAsync( dst, src, n, stream ); + oroMemcpyDtoDAsync( (oroDeviceptr)dst.key, (oroDeviceptr)src.key, sizeof( uint32_t ) * n, stream ); + + if( keyPair ) + { + oroMemcpyDtoDAsync( (oroDeviceptr)dst.value, (oroDeviceptr)src.value, sizeof( uint32_t ) * n, stream ); + } } } +void RadixSort::sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream ) noexcept +{ + sort( KeyValueSoA{ src, nullptr }, KeyValueSoA{ dst, nullptr }, n, startBit, endBit, tempStorage, stream ); +} + +uint64_t RadixSort::getRequiredTemporalStorageBytes( u32 numberOfMaxInputs ) const +{ + static_assert( BIN_SIZE == 256, "check alignment of the buffers" ); + uint64_t numberOfBlocks = div_round_up64( numberOfMaxInputs, RADIX_SORT_BLOCK_SIZE ); + uint64_t gpSumBuffer = sizeof( uint32_t ) * 256 * sizeof( u32 /* key */ ); + uint64_t lookBackBuffer = sizeof( uint32_t ) * 256 * numberOfBlocks; +#if !defined( ENFORCE_64BIT_COUNTER ) + if( MAX_ELEMENTS_WITH_32BIT_COUNTER < numberOfMaxInputs ) +#endif + { + lookBackBuffer *= 2; // to 64bit counter + } + return gpSumBuffer + lookBackBuffer; +} }; // namespace Oro diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index d3e89bf..d30ee32 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -43,26 +43,27 @@ class RadixSort final void setFlag( Flag flag ) noexcept; - void sort( const KeyValueSoA src, const 
KeyValueSoA dst, int n, int startBit, int endBit, oroStream stream = 0 ) noexcept; + void sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream = 0 ) noexcept; - void sort( const u32* src, const u32* dst, int n, int startBit, int endBit, oroStream stream = 0 ) noexcept; + void sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream = 0 ) noexcept; + uint64_t getRequiredTemporalStorageBytes( u32 numberOfMaxInputs ) const; private: - template - void sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept; + //template + //void sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept; - /// @brief Compile the kernels for radix sort. - /// @param kernelPath The kernel path. - /// @param includeDir The include directory. + ///// @brief Compile the kernels for radix sort. + ///// @param kernelPath The kernel path. + ///// @param includeDir The include directory. void compileKernels( const std::string& kernelPath, const std::string& includeDir ) noexcept; - [[nodiscard]] int calculateWGsToExecute( const int blockSize ) const noexcept; + //[[nodiscard]] int calculateWGsToExecute( const int blockSize ) const noexcept; - /// @brief Exclusive scan algorithm on CPU for testing. - /// It copies the count result from the Device to Host before computation, and then copies the offsets back from Host to Device afterward. - /// @param countsGpu The count result in GPU memory. Otuput: The offset. - /// @param offsetsGpu The offsets. - void exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept; + ///// @brief Exclusive scan algorithm on CPU for testing. + ///// It copies the count result from the Device to Host before computation, and then copies the offsets back from Host to Device afterward. + ///// @param countsGpu The count result in GPU memory. 
Otuput: The offset. + ///// @param offsetsGpu The offsets. + //void exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept; /// @brief Configure the settings, compile the kernels and allocate the memory. /// @param kernelPath The kernel path. @@ -70,21 +71,21 @@ class RadixSort final void configure( const std::string& kernelPath, const std::string& includeDir ) noexcept; private: - // GPU blocks for the count kernel - int m_num_blocks_for_count{}; + //// GPU blocks for the count kernel + //int m_num_blocks_for_count{}; - // GPU blocks for the scan kernel - int m_num_blocks_for_scan{}; + //// GPU blocks for the scan kernel + //int m_num_blocks_for_scan{}; Flag m_flags{ Flag::NO_LOG }; enum class Kernel { - COUNT, - SCAN_SINGLE_WG, - SCAN_PARALLEL, - SORT, - SORT_KV, + //COUNT, + //SCAN_SINGLE_WG, + //SCAN_PARALLEL, + //SORT, + //SORT_KV, SORT_SINGLE_PASS, SORT_SINGLE_PASS_KV, }; @@ -92,33 +93,40 @@ class RadixSort final std::unordered_map oroFunctions; /// @brief The enum class which indicates the selected algorithm of prefix scan. - enum class ScanAlgo - { - SCAN_CPU, - SCAN_GPU_SINGLE_WG, - SCAN_GPU_PARALLEL, - }; + //enum class ScanAlgo + //{ + // SCAN_CPU, + // SCAN_GPU_SINGLE_WG, + // SCAN_GPU_PARALLEL, + //}; - constexpr static auto selectedScanAlgo{ ScanAlgo::SCAN_GPU_PARALLEL }; + //constexpr static auto selectedScanAlgo{ ScanAlgo::SCAN_GPU_PARALLEL }; - GpuMemory m_partial_sum; - GpuMemory m_is_ready; + //GpuMemory m_partial_sum; + //GpuMemory m_is_ready; oroDevice m_device{}; - oroDeviceProp m_props{}; + //oroDeviceProp m_props{}; OrochiUtils& m_oroutils; // This buffer holds the "bucket" table from all GPU blocks. 
- GpuMemory m_tmp_buffer; + //GpuMemory m_tmp_buffer; + + //int m_num_threads_per_block_for_count{}; + //int m_num_threads_per_block_for_scan{}; + //int m_num_threads_per_block_for_sort{}; - int m_num_threads_per_block_for_count{}; - int m_num_threads_per_block_for_scan{}; - int m_num_threads_per_block_for_sort{}; + //int m_num_warps_per_block_for_sort{}; - int m_num_warps_per_block_for_sort{}; + oroFunction m_gHistogram; + oroFunction m_gPrefixSum; + oroFunction m_onesweep_reorderKey; + oroFunction m_onesweep_reorderKeyPair; + oroFunction m_onesweep_reorderKey64; + oroFunction m_onesweep_reorderKeyPair64; }; -#include +//#include }; // namespace Oro diff --git a/ParallelPrimitives/RadixSort.inl b/ParallelPrimitives/RadixSort.inl index ad9ecdb..fd42633 100644 --- a/ParallelPrimitives/RadixSort.inl +++ b/ParallelPrimitives/RadixSort.inl @@ -57,107 +57,107 @@ void print_record( const T& t ) noexcept } // namespace -template -void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept -{ - static constexpr auto enable_profile = false; - - const u32* srcKey{ nullptr }; - const u32* dstKey{ nullptr }; - - const u32* srcVal{ nullptr }; - const u32* dstVal{ nullptr }; - - static constexpr auto enable_key_value_pair_sorting{ std::is_same_v }; - - if constexpr( enable_key_value_pair_sorting ) - { - srcKey = src.key; - dstKey = dst.key; - - srcVal = src.value; - dstVal = dst.value; - } - else - { - static_assert( std::is_same_v || std::is_same_v ); - srcKey = src; - dstKey = dst; - } - - const int nItemPerWG = ( n + m_num_blocks_for_count - 1 ) / m_num_blocks_for_count; - - // Timer records - - using RecordType = std::conditional_t, Empty>; - RecordType t; - - resize_record( t ); - - const auto launch_count_kernel = [&]() noexcept - { - const auto num_total_thread_for_count = m_num_threads_per_block_for_count * m_num_blocks_for_count; - - const auto func{ oroFunctions[Kernel::COUNT] }; - const void* args[] = { &srcKey, 
arg_cast( m_tmp_buffer.address() ), &n, &nItemPerWG, &startBit, &m_num_blocks_for_count }; - OrochiUtils::launch1D( func, num_total_thread_for_count, args, m_num_threads_per_block_for_count, 0, stream ); - }; - - execute( launch_count_kernel, t, 0, stream ); - - const auto launch_scan_kernel = [&]() noexcept - { - switch( selectedScanAlgo ) - { - case ScanAlgo::SCAN_CPU: - { - exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); - } - break; - - case ScanAlgo::SCAN_GPU_SINGLE_WG: - { - const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), &m_num_blocks_for_count }; - OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_SINGLE_WG], WG_SIZE * m_num_blocks_for_count, args, WG_SIZE, 0, stream ); - } - break; - - case ScanAlgo::SCAN_GPU_PARALLEL: - { - const auto num_total_thread_for_scan = m_num_threads_per_block_for_scan * m_num_blocks_for_scan; - - const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), arg_cast( m_partial_sum.address() ), arg_cast( m_is_ready.address() ) }; - OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_PARALLEL], num_total_thread_for_scan, args, m_num_threads_per_block_for_scan, 0, stream ); - } - break; - - default: - exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); - break; - } - }; - - execute( launch_scan_kernel, t, 1, stream ); - - const auto launch_sort_kernel = [&]() noexcept - { - const auto num_blocks_for_sort = m_num_blocks_for_count; - const auto num_total_thread_for_sort = m_num_threads_per_block_for_sort * num_blocks_for_sort; - const auto num_items_per_block = nItemPerWG; - - if constexpr( enable_key_value_pair_sorting ) - { - const void* args[] = { &srcKey, &srcVal, &dstKey, &dstVal, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; - OrochiUtils::launch1D( oroFunctions[Kernel::SORT_KV], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); - } - else - { - const void* args[] = { &srcKey, 
&dstKey, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; - OrochiUtils::launch1D( oroFunctions[Kernel::SORT], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); - } - }; - - execute( launch_sort_kernel, t, 2, stream ); - - print_record( t ); -} +//template +//void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept +//{ +// static constexpr auto enable_profile = false; +// +// const u32* srcKey{ nullptr }; +// const u32* dstKey{ nullptr }; +// +// const u32* srcVal{ nullptr }; +// const u32* dstVal{ nullptr }; +// +// static constexpr auto enable_key_value_pair_sorting{ std::is_same_v }; +// +// if constexpr( enable_key_value_pair_sorting ) +// { +// srcKey = src.key; +// dstKey = dst.key; +// +// srcVal = src.value; +// dstVal = dst.value; +// } +// else +// { +// static_assert( std::is_same_v || std::is_same_v ); +// srcKey = src; +// dstKey = dst; +// } +// +// const int nItemPerWG = ( n + m_num_blocks_for_count - 1 ) / m_num_blocks_for_count; +// +// // Timer records +// +// using RecordType = std::conditional_t, Empty>; +// RecordType t; +// +// resize_record( t ); +// +// const auto launch_count_kernel = [&]() noexcept +// { +// const auto num_total_thread_for_count = m_num_threads_per_block_for_count * m_num_blocks_for_count; +// +// const auto func{ oroFunctions[Kernel::COUNT] }; +// const void* args[] = { &srcKey, arg_cast( m_tmp_buffer.address() ), &n, &nItemPerWG, &startBit, &m_num_blocks_for_count }; +// OrochiUtils::launch1D( func, num_total_thread_for_count, args, m_num_threads_per_block_for_count, 0, stream ); +// }; +// +// execute( launch_count_kernel, t, 0, stream ); +// +// const auto launch_scan_kernel = [&]() noexcept +// { +// switch( selectedScanAlgo ) +// { +// case ScanAlgo::SCAN_CPU: +// { +// exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); +// } +// break; +// +// case ScanAlgo::SCAN_GPU_SINGLE_WG: +// { 
+// const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), &m_num_blocks_for_count }; +// OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_SINGLE_WG], WG_SIZE * m_num_blocks_for_count, args, WG_SIZE, 0, stream ); +// } +// break; +// +// case ScanAlgo::SCAN_GPU_PARALLEL: +// { +// const auto num_total_thread_for_scan = m_num_threads_per_block_for_scan * m_num_blocks_for_scan; +// +// const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), arg_cast( m_partial_sum.address() ), arg_cast( m_is_ready.address() ) }; +// OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_PARALLEL], num_total_thread_for_scan, args, m_num_threads_per_block_for_scan, 0, stream ); +// } +// break; +// +// default: +// exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); +// break; +// } +// }; +// +// execute( launch_scan_kernel, t, 1, stream ); +// +// const auto launch_sort_kernel = [&]() noexcept +// { +// const auto num_blocks_for_sort = m_num_blocks_for_count; +// const auto num_total_thread_for_sort = m_num_threads_per_block_for_sort * num_blocks_for_sort; +// const auto num_items_per_block = nItemPerWG; +// +// if constexpr( enable_key_value_pair_sorting ) +// { +// const void* args[] = { &srcKey, &srcVal, &dstKey, &dstVal, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; +// OrochiUtils::launch1D( oroFunctions[Kernel::SORT_KV], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); +// } +// else +// { +// const void* args[] = { &srcKey, &dstKey, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; +// OrochiUtils::launch1D( oroFunctions[Kernel::SORT], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); +// } +// }; +// +// execute( launch_sort_kernel, t, 2, stream ); +// +// print_record( t ); +//} diff --git a/ParallelPrimitives/RadixSortConfigs.h 
b/ParallelPrimitives/RadixSortConfigs.h index 1cc6f2b..c597238 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -42,4 +42,17 @@ static_assert( BIN_SIZE % 2 == 0 ); static_assert( DEFAULT_COUNT_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); static_assert( DEFAULT_SCAN_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); +#define RADIX_SORT_BLOCK_SIZE 2048 + +#define GHISTOGRAM_ITEM_PER_BLOCK 2048 +#define GHISTOGRAM_THREADS_PER_BLOCK 256 + +#define REORDER_NUMBER_OF_WARPS 8 +#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) + +#define MAX_ELEMENTS_WITH_32BIT_COUNTER 0x3FFFFFFF + +// Please uncomment this enforce 64bit counter for lookback counter to measure performance impact. +// #define ENFORCE_64BIT_COUNTER 1 + }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index a529452..56bca91 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -15,52 +15,52 @@ using u64 = unsigned long long; // #define NV_WORKAROUND 1 // default values -#if defined( OVERWRITE ) - -constexpr auto COUNT_WG_SIZE{ COUNT_WG_SIZE_VAL }; -constexpr auto SCAN_WG_SIZE{ SCAN_WG_SIZE_VAL }; -constexpr auto SORT_WG_SIZE{ SORT_WG_SIZE_VAL }; -constexpr auto SORT_NUM_WARPS_PER_BLOCK{ SORT_NUM_WARPS_PER_BLOCK_VAL }; - -#else - -constexpr auto COUNT_WG_SIZE{ DEFAULT_COUNT_BLOCK_SIZE }; -constexpr auto SCAN_WG_SIZE{ DEFAULT_SCAN_BLOCK_SIZE }; -constexpr auto SORT_WG_SIZE{ DEFAULT_SORT_BLOCK_SIZE }; -constexpr auto SORT_NUM_WARPS_PER_BLOCK{ DEFAULT_NUM_WARPS_PER_BLOCK }; - -#endif - -__device__ constexpr u32 getMaskedBits( const u32 value, const u32 shift ) noexcept { return ( value >> shift ) & RADIX_MASK; } - -extern "C" __global__ void CountKernel( int* gSrc, int* gDst, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) -{ - __shared__ int table[BIN_SIZE]; - - for( int i = threadIdx.x; i < 
BIN_SIZE; i += COUNT_WG_SIZE ) - { - table[i] = 0; - } - - __syncthreads(); - - const int offset = blockIdx.x * gNItemsPerWG; - const int upperBound = ( offset + gNItemsPerWG > gN ) ? gN - offset : gNItemsPerWG; - - for( int i = threadIdx.x; i < upperBound; i += COUNT_WG_SIZE ) - { - const int idx = offset + i; - const int tableIdx = getMaskedBits( gSrc[idx], START_BIT ); - atomicAdd( &table[tableIdx], 1 ); - } - - __syncthreads(); - - for( int i = threadIdx.x; i < BIN_SIZE; i += COUNT_WG_SIZE ) - { - gDst[i * N_WGS_EXECUTED + blockIdx.x] = table[i]; - } -} +//#if defined( OVERWRITE ) +// +//constexpr auto COUNT_WG_SIZE{ COUNT_WG_SIZE_VAL }; +//constexpr auto SCAN_WG_SIZE{ SCAN_WG_SIZE_VAL }; +//constexpr auto SORT_WG_SIZE{ SORT_WG_SIZE_VAL }; +//constexpr auto SORT_NUM_WARPS_PER_BLOCK{ SORT_NUM_WARPS_PER_BLOCK_VAL }; +// +//#else +// +//constexpr auto COUNT_WG_SIZE{ DEFAULT_COUNT_BLOCK_SIZE }; +//constexpr auto SCAN_WG_SIZE{ DEFAULT_SCAN_BLOCK_SIZE }; +//constexpr auto SORT_WG_SIZE{ DEFAULT_SORT_BLOCK_SIZE }; +//constexpr auto SORT_NUM_WARPS_PER_BLOCK{ DEFAULT_NUM_WARPS_PER_BLOCK }; +// +//#endif + +//__device__ constexpr u32 getMaskedBits( const u32 value, const u32 shift ) noexcept { return ( value >> shift ) & RADIX_MASK; } +// +//extern "C" __global__ void CountKernel( int* gSrc, int* gDst, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +//{ +// __shared__ int table[BIN_SIZE]; +// +// for( int i = threadIdx.x; i < BIN_SIZE; i += COUNT_WG_SIZE ) +// { +// table[i] = 0; +// } +// +// __syncthreads(); +// +// const int offset = blockIdx.x * gNItemsPerWG; +// const int upperBound = ( offset + gNItemsPerWG > gN ) ? 
gN - offset : gNItemsPerWG; +// +// for( int i = threadIdx.x; i < upperBound; i += COUNT_WG_SIZE ) +// { +// const int idx = offset + i; +// const int tableIdx = getMaskedBits( gSrc[idx], START_BIT ); +// atomicAdd( &table[tableIdx], 1 ); +// } +// +// __syncthreads(); +// +// for( int i = threadIdx.x; i < BIN_SIZE; i += COUNT_WG_SIZE ) +// { +// gDst[i * N_WGS_EXECUTED + blockIdx.x] = table[i]; +// } +//} template struct ScanImpl @@ -326,139 +326,139 @@ __device__ void localSort4bitMulti( int* keys, u32* ldsKeys, int* values, u32* l } } -__device__ void localSort8bitMulti_shared_bin( int* keys, u32* ldsKeys, const int START_BIT ) -{ - __shared__ unsigned table[BIN_SIZE]; - - for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) - { - table[i] = 0U; - } - - LDS_BARRIER; - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) - { - const int tableIdx = ( keys[i] >> START_BIT ) & RADIX_MASK; - atomicAdd( &table[tableIdx], 1 ); - } - - LDS_BARRIER; - - int globalSum = 0; - for( int binId = 0; binId < BIN_SIZE; binId += SORT_WG_SIZE * 2 ) - { - unsigned* globalOffset = &table[binId]; - const unsigned currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); - globalOffset[threadIdx.x * 2] += globalSum; - globalOffset[threadIdx.x * 2 + 1] += globalSum; - globalSum += currentGlobalSum; - } - - LDS_BARRIER; - - __shared__ u32 keyBuffer[SORT_WG_SIZE * SORT_N_ITEMS_PER_WI]; - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) - { - keyBuffer[threadIdx.x * SORT_N_ITEMS_PER_WI + i] = keys[i]; - } - - LDS_BARRIER; - - if( threadIdx.x == 0 ) - { - for( int i = 0; i < SORT_WG_SIZE * SORT_N_ITEMS_PER_WI; ++i ) - { - const int tableIdx = ( keyBuffer[i] >> START_BIT ) & RADIX_MASK; - const int writeIndex = table[tableIdx]; - - ldsKeys[writeIndex] = keyBuffer[i]; - - ++table[tableIdx]; - } - } - - LDS_BARRIER; - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) - { - keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; - } -} - -__device__ void 
localSort8bitMulti_group( int* keys, u32* ldsKeys, const int START_BIT ) -{ - constexpr auto N_GROUP_SIZE{ N_BINS_8BIT / ( sizeof( u64 ) / sizeof( u16 ) ) }; - - __shared__ union - { - u16 m_ungrouped[SORT_WG_SIZE + 1][N_BINS_8BIT]; - u64 m_grouped[SORT_WG_SIZE + 1][N_GROUP_SIZE]; - } lds; - - for( int i = 0; i < N_GROUP_SIZE; ++i ) - { - lds.m_grouped[threadIdx.x][i] = 0U; - } - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) - { - const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; - ++lds.m_ungrouped[threadIdx.x][in8bit]; - } - - LDS_BARRIER; - - for( int groupId = threadIdx.x; groupId < N_GROUP_SIZE; groupId += SORT_WG_SIZE ) - { - u64 sum = 0U; - for( int i = 0; i < SORT_WG_SIZE; i++ ) - { - const auto current = lds.m_grouped[i][groupId]; - lds.m_grouped[i][groupId] = sum; - sum += current; - } - lds.m_grouped[SORT_WG_SIZE][groupId] = sum; - } - - LDS_BARRIER; - - int globalSum = 0; - for( int binId = 0; binId < N_BINS_8BIT; binId += SORT_WG_SIZE * 2 ) - { - auto* globalOffset = &lds.m_ungrouped[SORT_WG_SIZE][binId]; - const int currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); - globalOffset[threadIdx.x * 2] += globalSum; - globalOffset[threadIdx.x * 2 + 1] += globalSum; - globalSum += currentGlobalSum; - } - - LDS_BARRIER; - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) - { - const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; - const auto offset = lds.m_ungrouped[SORT_WG_SIZE][in8bit]; - const auto rank = lds.m_ungrouped[threadIdx.x][in8bit]++; - - ldsKeys[offset + rank] = keys[i]; - } - - LDS_BARRIER; - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) - { - keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; - } -} - -template -__device__ void localSort8bitMulti( int* keys, u32* ldsKeys, int* values, u32* ldsValues, const int START_BIT ) -{ - localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT ); - if( N_RADIX > 4 ) localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT + 4 ); -} 
+//__device__ void localSort8bitMulti_shared_bin( int* keys, u32* ldsKeys, const int START_BIT ) +//{ +// __shared__ unsigned table[BIN_SIZE]; +// +// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) +// { +// table[i] = 0U; +// } +// +// LDS_BARRIER; +// +// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) +// { +// const int tableIdx = ( keys[i] >> START_BIT ) & RADIX_MASK; +// atomicAdd( &table[tableIdx], 1 ); +// } +// +// LDS_BARRIER; +// +// int globalSum = 0; +// for( int binId = 0; binId < BIN_SIZE; binId += SORT_WG_SIZE * 2 ) +// { +// unsigned* globalOffset = &table[binId]; +// const unsigned currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); +// globalOffset[threadIdx.x * 2] += globalSum; +// globalOffset[threadIdx.x * 2 + 1] += globalSum; +// globalSum += currentGlobalSum; +// } +// +// LDS_BARRIER; +// +// __shared__ u32 keyBuffer[SORT_WG_SIZE * SORT_N_ITEMS_PER_WI]; +// +// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) +// { +// keyBuffer[threadIdx.x * SORT_N_ITEMS_PER_WI + i] = keys[i]; +// } +// +// LDS_BARRIER; +// +// if( threadIdx.x == 0 ) +// { +// for( int i = 0; i < SORT_WG_SIZE * SORT_N_ITEMS_PER_WI; ++i ) +// { +// const int tableIdx = ( keyBuffer[i] >> START_BIT ) & RADIX_MASK; +// const int writeIndex = table[tableIdx]; +// +// ldsKeys[writeIndex] = keyBuffer[i]; +// +// ++table[tableIdx]; +// } +// } +// +// LDS_BARRIER; +// +// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) +// { +// keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; +// } +//} +// +//__device__ void localSort8bitMulti_group( int* keys, u32* ldsKeys, const int START_BIT ) +//{ +// constexpr auto N_GROUP_SIZE{ N_BINS_8BIT / ( sizeof( u64 ) / sizeof( u16 ) ) }; +// +// __shared__ union +// { +// u16 m_ungrouped[SORT_WG_SIZE + 1][N_BINS_8BIT]; +// u64 m_grouped[SORT_WG_SIZE + 1][N_GROUP_SIZE]; +// } lds; +// +// for( int i = 0; i < N_GROUP_SIZE; ++i ) +// { +// lds.m_grouped[threadIdx.x][i] = 0U; +// } +// +// for( int i = 0; i < 
SORT_N_ITEMS_PER_WI; i++ ) +// { +// const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; +// ++lds.m_ungrouped[threadIdx.x][in8bit]; +// } +// +// LDS_BARRIER; +// +// for( int groupId = threadIdx.x; groupId < N_GROUP_SIZE; groupId += SORT_WG_SIZE ) +// { +// u64 sum = 0U; +// for( int i = 0; i < SORT_WG_SIZE; i++ ) +// { +// const auto current = lds.m_grouped[i][groupId]; +// lds.m_grouped[i][groupId] = sum; +// sum += current; +// } +// lds.m_grouped[SORT_WG_SIZE][groupId] = sum; +// } +// +// LDS_BARRIER; +// +// int globalSum = 0; +// for( int binId = 0; binId < N_BINS_8BIT; binId += SORT_WG_SIZE * 2 ) +// { +// auto* globalOffset = &lds.m_ungrouped[SORT_WG_SIZE][binId]; +// const int currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); +// globalOffset[threadIdx.x * 2] += globalSum; +// globalOffset[threadIdx.x * 2 + 1] += globalSum; +// globalSum += currentGlobalSum; +// } +// +// LDS_BARRIER; +// +// for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) +// { +// const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; +// const auto offset = lds.m_ungrouped[SORT_WG_SIZE][in8bit]; +// const auto rank = lds.m_ungrouped[threadIdx.x][in8bit]++; +// +// ldsKeys[offset + rank] = keys[i]; +// } +// +// LDS_BARRIER; +// +// for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) +// { +// keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; +// } +//} + +//template +//__device__ void localSort8bitMulti( int* keys, u32* ldsKeys, int* values, u32* ldsValues, const int START_BIT ) +//{ +// localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT ); +// if( N_RADIX > 4 ) localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT + 4 ); +//} template __device__ void SortSinglePass( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int gN, const int START_BIT, const int END_BIT ) @@ -514,185 +514,594 @@ extern "C" __global__ void SortSinglePassKernel( int* gSrcKey, int* gDstKey, int extern "C" __global__ void SortSinglePassKVKernel( int* 
gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int gN, const int START_BIT, const int END_BIT ) { SortSinglePass( gSrcKey, gSrcVal, gDstKey, gDstVal, gN, START_BIT, END_BIT ); } -extern "C" __global__ void ParallelExclusiveScanSingleWG( int* gCount, int* gHistogram, const int N_WGS_EXECUTED ) +//extern "C" __global__ void ParallelExclusiveScanSingleWG( int* gCount, int* gHistogram, const int N_WGS_EXECUTED ) +//{ +// // Use a single WG. +// if( blockIdx.x != 0 ) +// { +// return; +// } +// +// // LDS for the parallel scan of the global sum: +// // First we store the sum of the counters of each number to it, +// // then we compute the global offset using parallel exclusive scan. +// __shared__ int blockBuffer[BIN_SIZE]; +// +// // fill the LDS with the local sum +// +// for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) +// { +// // Do exclusive scan for each segment handled by each WI in a WG +// +// int localThreadSum = 0; +// for( int i = 0; i < N_WGS_EXECUTED; ++i ) +// { +// int current = gCount[binId * N_WGS_EXECUTED + i]; +// gCount[binId * N_WGS_EXECUTED + i] = localThreadSum; +// +// localThreadSum += current; +// } +// +// // Store the thread local sum to LDS. +// +// blockBuffer[binId] = localThreadSum; +// } +// +// LDS_BARRIER; +// +// // Do parallel exclusive scan on the LDS +// +// int globalSum = 0; +// for( int binId = 0; binId < BIN_SIZE; binId += WG_SIZE * 2 ) +// { +// int* globalOffset = &blockBuffer[binId]; +// int currentGlobalSum = ldsScanExclusive( globalOffset, WG_SIZE * 2 ); +// globalOffset[threadIdx.x * 2] += globalSum; +// globalOffset[threadIdx.x * 2 + 1] += globalSum; +// globalSum += currentGlobalSum; +// } +// +// LDS_BARRIER; +// +// // Add the global offset to the global histogram. 
+// +// for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) +// { +// for( int i = 0; i < N_WGS_EXECUTED; ++i ) +// { +// gHistogram[binId * N_WGS_EXECUTED + i] += blockBuffer[binId]; +// } +// } +//} +// +//extern "C" __device__ void WorkgroupSync( int threadId, int blockId, int currentSegmentSum, int* currentGlobalOffset, volatile int* gPartialSum, volatile bool* gIsReady ) +//{ +// if( threadId == 0 ) +// { +// int offset = 0; +// +// if( blockId != 0 ) +// { +// while( !gIsReady[blockId - 1] ) +// { +// } +// +// offset = gPartialSum[blockId - 1]; +// +// __threadfence(); +// +// // Reset the value +// gIsReady[blockId - 1] = false; +// } +// +// gPartialSum[blockId] = offset + currentSegmentSum; +// +// // Ensure that the gIsReady is only modified after the gPartialSum is written. +// __threadfence(); +// +// gIsReady[blockId] = true; +// +// *currentGlobalOffset = offset; +// } +// +// __syncthreads(); +//} +// +//extern "C" __global__ void ParallelExclusiveScanAllWG( int* gCount, int* gHistogram, volatile int* gPartialSum, volatile bool* gIsReady ) +//{ +// // Fill the LDS with the partial sum of each segment +// __shared__ int blockBuffer[SCAN_WG_SIZE]; +// +// blockBuffer[threadIdx.x] = gCount[blockIdx.x * blockDim.x + threadIdx.x]; +// +// __syncthreads(); +// +// // Do parallel exclusive scan on the LDS +// +// int currentSegmentSum = ldsScanExclusive( blockBuffer, SCAN_WG_SIZE ); +// +// __syncthreads(); +// +// // Sync all the Workgroups to calculate the global offset. +// +// __shared__ int currentGlobalOffset; +// WorkgroupSync( threadIdx.x, blockIdx.x, currentSegmentSum, ¤tGlobalOffset, gPartialSum, gIsReady ); +// +// // Write back the result. 
+// +// gHistogram[blockIdx.x * blockDim.x + threadIdx.x] = blockBuffer[threadIdx.x] + currentGlobalOffset; +//} +// +//template +//__device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int numberOfInputs, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +//{ +// __shared__ u32 globalOffset[BIN_SIZE]; +// __shared__ u32 localPrefixSum[BIN_SIZE]; +// __shared__ u32 counters[BIN_SIZE]; +// +// __shared__ u32 matchMasks[SORT_NUM_WARPS_PER_BLOCK][BIN_SIZE]; +// +// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) +// { +// // Note: The size of gHistogram is always BIN_SIZE * N_WGS_EXECUTED +// globalOffset[i] = gHistogram[i * N_WGS_EXECUTED + blockIdx.x]; +// +// counters[i] = 0; +// localPrefixSum[i] = 0; +// } +// +// for( int w = 0; w < SORT_NUM_WARPS_PER_BLOCK; ++w ) +// { +// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) +// { +// matchMasks[w][i] = 0; +// } +// } +// +// __syncthreads(); +// +// for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) +// { +// const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; +// if( itemIndex < numberOfInputs ) +// { +// const auto item = gSrcKey[itemIndex]; +// const u32 bucketIndex = getMaskedBits( item, START_BIT ); +// atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); +// } +// } +// +// __syncthreads(); +// +// // Compute Prefix Sum +// +// ldsScanExclusive( localPrefixSum, BIN_SIZE ); +// +// __syncthreads(); +// +// // Reorder +// +// for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) +// { +// const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; +// +// const auto item = gSrcKey[itemIndex]; +// const u32 bucketIndex = getMaskedBits( item, START_BIT ); +// +// const int warp = threadIdx.x / 32; +// const int lane = threadIdx.x % 32; +// +// __syncthreads(); +// +// if( itemIndex < numberOfInputs ) +// { +// atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); +// } +// +// __syncthreads(); +// +// bool 
flushMask = false; +// +// u32 localOffset = 0; +// u32 localSrcIndex = 0; +// +// if( itemIndex < numberOfInputs ) +// { +// const u32 matchMask = matchMasks[warp][bucketIndex]; +// const u32 lowerMask = ( 1u << lane ) - 1; +// u32 offset = __popc( matchMask & lowerMask ); +// +// flushMask = ( offset == 0 ); +// +// for( int w = 0; w < warp; ++w ) +// { +// offset += __popc( matchMasks[w][bucketIndex] ); +// } +// +// localOffset = counters[bucketIndex] + offset; +// localSrcIndex = i; +// } +// +// __syncthreads(); +// +// if( itemIndex < numberOfInputs ) +// { +// atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); +// } +// +// if( flushMask ) +// { +// matchMasks[warp][bucketIndex] = 0; +// } +// +// // Swap +// +// if( itemIndex < numberOfInputs ) +// { +// const u32 srcIndex = blockIdx.x * gNItemsPerWG + localSrcIndex; +// const u32 dstIndex = globalOffset[bucketIndex] + localOffset; +// gDstKey[dstIndex] = gSrcKey[srcIndex]; +// +// if constexpr( KEY_VALUE_PAIR ) +// { +// gDstVal[dstIndex] = gSrcVal[srcIndex]; +// } +// } +// } +//} +// +//extern "C" __global__ void SortKernel( int* gSrcKey, int* gDstKey, int* gHistogram, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +//{ +// SortImpl( gSrcKey, nullptr, gDstKey, nullptr, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); +//} +// +//extern "C" __global__ void SortKVKernel( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +//{ +// SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); +//} + +#define RADIX_SORT_KEY_TYPE u32 +#define RADIX_SORT_VALUE_TYPE u32 +#define KEY_IS_16BYTE_ALIGNED 1 + + +typedef unsigned long long uint64_t; +typedef unsigned int uint32_t; +typedef unsigned short uint16_t; +typedef unsigned char uint8_t; + +//#define RADIX_SORT_BLOCK_SIZE 2048 +// +//#define GHISTOGRAM_ITEM_PER_BLOCK 2048 +//#define 
GHISTOGRAM_THREADS_PER_BLOCK 256 +// +//#define REORDER_NUMBER_OF_WARPS 8 +//#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) + +#define PARTITIOIN_BIT_A 0x80000000 +#define PARTITIOIN_BIT_P 0x40000000 +#define PARTITIOIN_FLAG_MASK ( PARTITIOIN_BIT_A | PARTITIOIN_BIT_P ) +#define PARTITIOIN_VALUE_MASK 0x3FFFFFFF + +#define PARTITIOIN_BIT_A_64 0x8000000000000000llu +#define PARTITIOIN_BIT_P_64 0x4000000000000000llu +#define PARTITIOIN_FLAG_MASK_64 ( PARTITIOIN_BIT_A_64 | PARTITIOIN_BIT_P_64 ) +#define PARTITIOIN_VALUE_MASK_64 0x3FFFFFFFFFFFFFFFllu + +__device__ inline void partitionStoreA( volatile uint32_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_A | x; } +__device__ inline void partitionStoreA( volatile uint64_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_A_64 | x; } +__device__ inline void partitionStoreP( volatile uint32_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_P | x; } +__device__ inline void partitionStoreP( volatile uint64_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_P_64 | x; } +__device__ inline bool partitionIsX( uint32_t x ) { return ( x & PARTITIOIN_FLAG_MASK ) == 0; } +__device__ inline bool partitionIsX( uint64_t x ) { return ( x & PARTITIOIN_FLAG_MASK_64 ) == 0; } +__device__ inline bool partitionIsP( uint32_t x ) { return ( x & PARTITIOIN_BIT_P ) != 0; } +__device__ inline bool partitionIsP( uint64_t x ) { return ( x & PARTITIOIN_BIT_P_64 ) != 0; } + +__device__ inline uint32_t partitionGetValue( uint32_t x ) { return x & PARTITIOIN_VALUE_MASK; } +__device__ inline uint32_t partitionGetValue( uint64_t x ) { return static_cast( x & PARTITIOIN_VALUE_MASK_64 ); } + +#if defined( DESCENDING_ORDER ) +#define ORDER_MASK_32 0xFFFFFFFF +#define ORDER_MASK_64 0xFFFFFFFFFFFFFFFFllu +#else +#define ORDER_MASK_32 0 +#define ORDER_MASK_64 0llu +#endif + +#if defined( CUDART_VERSION ) && CUDART_VERSION >= 9000 +#define ITS 1 +#endif + +__device__ inline uint32_t div_round_up( uint32_t val, uint32_t divisor ) { return ( val + divisor - 1 ) 
/ divisor; } +template +__device__ void clearShared( T* sMem, T value ) { - // Use a single WG. - if( blockIdx.x != 0 ) + for( int i = 0; i < NElement; i += NThread ) { - return; + if( i < NElement ) + { + sMem[i + threadIdx.x] = value; + } } +} + +__device__ inline uint32_t getKeyBits( uint32_t x ) { return x ^ ORDER_MASK_32; } +__device__ inline uint64_t getKeyBits( uint64_t x ) { return x ^ ORDER_MASK_64; } +__device__ inline uint32_t getKeyBits( float x ) +{ + if( x == 0.0f ) x = 0.0f; + + uint32_t flip = uint32_t( __float_as_int( x ) >> 31 ) | 0x80000000; + return __float_as_uint( x ) ^ flip ^ ORDER_MASK_32; +} +__device__ inline uint64_t getKeyBits( double x ) +{ + if( x == 0.0 ) x = 0.0; - // LDS for the parallel scan of the global sum: - // First we store the sum of the counters of each number to it, - // then we compute the global offset using parallel exclusive scan. - __shared__ int blockBuffer[BIN_SIZE]; + uint64_t flip = uint64_t( __double_as_longlong( x ) >> 63 ) | 0x8000000000000000llu; + return (uint64_t)__double_as_longlong( x ) ^ flip ^ ORDER_MASK_64; +} - // fill the LDS with the local sum +template +__device__ inline uint32_t prefixSumExclusive( uint32_t prefix, uint32_t* sMemIO ) +{ + uint32_t value = sMemIO[threadIdx.x]; - for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) + for( uint32_t offset = 1; offset < NThreads; offset <<= 1 ) { - // Do exclusive scan for each segment handled by each WI in a WG + uint32_t x = sMemIO[threadIdx.x]; - int localThreadSum = 0; - for( int i = 0; i < N_WGS_EXECUTED; ++i ) + if( offset <= threadIdx.x ) { - int current = gCount[binId * N_WGS_EXECUTED + i]; - gCount[binId * N_WGS_EXECUTED + i] = localThreadSum; - - localThreadSum += current; + x += sMemIO[threadIdx.x - offset]; } - // Store the thread local sum to LDS. 
+ __syncthreads(); + + sMemIO[threadIdx.x] = x; - blockBuffer[binId] = localThreadSum; + __syncthreads(); } + uint32_t sum = sMemIO[NThreads - 1]; - LDS_BARRIER; + __syncthreads(); - // Do parallel exclusive scan on the LDS + sMemIO[threadIdx.x] += prefix - value; - int globalSum = 0; - for( int binId = 0; binId < BIN_SIZE; binId += WG_SIZE * 2 ) - { - int* globalOffset = &blockBuffer[binId]; - int currentGlobalSum = ldsScanExclusive( globalOffset, WG_SIZE * 2 ); - globalOffset[threadIdx.x * 2] += globalSum; - globalOffset[threadIdx.x * 2 + 1] += globalSum; - globalSum += currentGlobalSum; - } + __syncthreads(); - LDS_BARRIER; + return sum; +} - // Add the global offset to the global histogram. +extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t numberOfInputs, uint32_t* gpSumBuffer, uint32_t startBits, uint32_t* counter ) +{ + __shared__ uint32_t localCounters[sizeof( RADIX_SORT_KEY_TYPE )][256]; - for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - for( int i = 0; i < N_WGS_EXECUTED; ++i ) + for( int j = threadIdx.x; j < 256; j += GHISTOGRAM_THREADS_PER_BLOCK ) { - gHistogram[binId * N_WGS_EXECUTED + i] += blockBuffer[binId]; + localCounters[i][j] = 0; } } -} -extern "C" __device__ void WorkgroupSync( int threadId, int blockId, int currentSegmentSum, int* currentGlobalOffset, volatile int* gPartialSum, volatile bool* gIsReady ) -{ - if( threadId == 0 ) + __syncthreads(); + + uint32_t numberOfBlocks = div_round_up( numberOfInputs, GHISTOGRAM_ITEM_PER_BLOCK ); + __shared__ uint32_t iBlock; + if( threadIdx.x == 0 ) + { + iBlock = atomicInc( counter, 0xFFFFFFFF ); + } + + __syncthreads(); + + bool hasData = false; + + while( iBlock < numberOfBlocks ) { - int offset = 0; + hasData = true; - if( blockId != 0 ) +#if defined( KEY_IS_16BYTE_ALIGNED ) + if( ( iBlock + 1 ) * GHISTOGRAM_ITEM_PER_BLOCK <= numberOfInputs ) { - while( !gIsReady[blockId - 1] ) + for( int i = 
0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK * 4 ) { + uint32_t itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i + threadIdx.x * 4; + struct alignas( 16 ) Key4 + { + RADIX_SORT_KEY_TYPE xs[4]; + }; + Key4 key4 = *(Key4*)&inputs[itemIndex]; + for( int k = 0; k < 4; k++ ) + { + auto item = key4.xs[k]; + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + { + uint32_t bitLocation = startBits + i * 8; + uint32_t bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); + } + } + } + } + else +#endif + for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK ) + { + uint32_t itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x + i; + if( itemIndex < numberOfInputs ) + { + auto item = inputs[itemIndex]; + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + { + uint32_t bitLocation = startBits + i * 8; + uint32_t bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); + } + } } - offset = gPartialSum[blockId - 1]; - - __threadfence(); + __syncthreads(); - // Reset the value - gIsReady[blockId - 1] = false; + if( threadIdx.x == 0 ) + { + iBlock = atomicInc( counter, 0xFFFFFFFF ); } - gPartialSum[blockId] = offset + currentSegmentSum; - - // Ensure that the gIsReady is only modified after the gPartialSum is written. 
- __threadfence(); + __syncthreads(); + } - gIsReady[blockId] = true; + if( hasData ) + { + __syncthreads(); - *currentGlobalOffset = offset; + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + { + for( int j = threadIdx.x; j < 256; j += GHISTOGRAM_THREADS_PER_BLOCK ) + { + atomicAdd( &gpSumBuffer[256 * i + j], localCounters[i][j] ); + } + } } - - __syncthreads(); } -extern "C" __global__ void ParallelExclusiveScanAllWG( int* gCount, int* gHistogram, volatile int* gPartialSum, volatile bool* gIsReady ) +extern "C" __global__ void gPrefixSum( uint32_t* gpSumBuffer ) { - // Fill the LDS with the partial sum of each segment - __shared__ int blockBuffer[SCAN_WG_SIZE]; - - blockBuffer[threadIdx.x] = gCount[blockIdx.x * blockDim.x + threadIdx.x]; + __shared__ uint32_t smem[256]; - __syncthreads(); - - // Do parallel exclusive scan on the LDS - - int currentSegmentSum = ldsScanExclusive( blockBuffer, SCAN_WG_SIZE ); + smem[threadIdx.x] = gpSumBuffer[blockIdx.x * 256 + threadIdx.x]; __syncthreads(); - // Sync all the Workgroups to calculate the global offset. - - __shared__ int currentGlobalOffset; - WorkgroupSync( threadIdx.x, blockIdx.x, currentSegmentSum, &currentGlobalOffset, gPartialSum, gIsReady ); + prefixSumExclusive<256>( 0, smem ); - // Write back the result.
- - gHistogram[blockIdx.x * blockDim.x + threadIdx.x] = blockBuffer[threadIdx.x] + currentGlobalOffset; + gpSumBuffer[blockIdx.x * 256 + threadIdx.x] = smem[threadIdx.x]; } -template -__device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int numberOfInputs, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +template +__device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, uint32_t numberOfInputs, uint32_t* gpSumBuffer, + volatile TLookBack* lookBackBuffer, uint32_t startBits, uint32_t iteration ) { - __shared__ u32 globalOffset[BIN_SIZE]; - __shared__ u32 localPrefixSum[BIN_SIZE]; - __shared__ u32 counters[BIN_SIZE]; - - __shared__ u32 matchMasks[SORT_NUM_WARPS_PER_BLOCK][BIN_SIZE]; - - for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) + struct ElementLocation { - // Note: The size of gHistogram is always BIN_SIZE * N_WGS_EXECUTED - globalOffset[i] = gHistogram[i * N_WGS_EXECUTED + blockIdx.x]; - - counters[i] = 0; - localPrefixSum[i] = 0; - } - - for( int w = 0; w < SORT_NUM_WARPS_PER_BLOCK; ++w ) + uint32_t localSrcIndex : 12; + uint32_t localOffset : 12; + uint32_t bucket : 8; + }; + + __shared__ uint32_t pSum[256]; + __shared__ uint32_t localPrefixSum[256]; + __shared__ uint32_t counters[256]; + __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + __shared__ uint8_t elementBuckets[RADIX_SORT_BLOCK_SIZE]; + __shared__ uint32_t matchMasks[REORDER_NUMBER_OF_WARPS][256]; + + uint32_t bitLocation = startBits + 8 * iteration; + uint32_t blockIndex = blockIdx.x; + uint32_t numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); + + clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, uint32_t>( localPrefixSum, 0 ); + clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, uint32_t>( counters, 0 ); + + for( int w = 0; w < 
REORDER_NUMBER_OF_WARPS; w++ ) { - for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) + for( int i = 0; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - matchMasks[w][i] = 0; + matchMasks[w][i + threadIdx.x] = 0; } } __syncthreads(); - for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) + // count +#if defined( KEY_IS_16BYTE_ALIGNED ) + if( ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs ) { - const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; - if( itemIndex < numberOfInputs ) + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK * 4 ) + { + uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x * 4; + struct alignas( 16 ) Key4 + { + RADIX_SORT_KEY_TYPE xs[4]; + }; + Key4 key4 = *(Key4*)&inputKeys[itemIndex]; + for( int k = 0; k < 4; k++ ) + { + auto item = key4.xs[k]; + uint32_t bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); + elementBuckets[i + threadIdx.x * 4 + k] = bucketIndex; + } + } + } + else +#endif + { + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - const auto item = gSrcKey[itemIndex]; - const u32 bucketIndex = getMaskedBits( item, START_BIT ); - atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); + uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + if( itemIndex < numberOfInputs ) + { + auto item = inputKeys[itemIndex]; + uint32_t bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); + + elementBuckets[i + threadIdx.x] = bucketIndex; + } } } __syncthreads(); - // Compute Prefix Sum + // Look back + for( int i = threadIdx.x; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + uint32_t s = localPrefixSum[i]; + partitionStoreA( &lookBackBuffer[256 * blockIdx.x + i], s ); + uint32_t gp = gpSumBuffer[iteration * 256 + i]; - ldsScanExclusive( localPrefixSum, 
BIN_SIZE ); + uint32_t p = 0; - __syncthreads(); + for( int iBlock = (int)blockIdx.x - 1; 0 <= iBlock; iBlock-- ) + { + TLookBack counter = lookBackBuffer[256 * iBlock + i]; + while( partitionIsX( counter ) ) + { + counter = lookBackBuffer[256 * iBlock + i]; + } - // Reorder + uint32_t value = partitionGetValue( counter ); + p += value; + if( partitionIsP( counter ) ) + { + break; + } + } - for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) - { - const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; + partitionStoreP( &lookBackBuffer[256 * blockIdx.x + i], p + s ); - const auto item = gSrcKey[itemIndex]; - const u32 bucketIndex = getMaskedBits( item, START_BIT ); + // complete global output location + pSum[i] = gp + p; + } + + uint32_t prefix = 0; + for( int i = 0; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + prefix += prefixSumExclusive( prefix, &localPrefixSum[i] ); + } - const int warp = threadIdx.x / 32; - const int lane = threadIdx.x % 32; + // reorder + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + uint32_t bucketIndex = elementBuckets[i + threadIdx.x]; __syncthreads(); + int warp = threadIdx.x / 32; + int lane = threadIdx.x % 32; + if( itemIndex < numberOfInputs ) { atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); @@ -702,24 +1111,27 @@ __device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal bool flushMask = false; - u32 localOffset = 0; - u32 localSrcIndex = 0; - if( itemIndex < numberOfInputs ) { - const u32 matchMask = matchMasks[warp][bucketIndex]; - const u32 lowerMask = ( 1u << lane ) - 1; - u32 offset = __popc( matchMask & lowerMask ); + uint32_t matchMask = matchMasks[warp][bucketIndex]; + uint32_t lowerMask = ( 1u << lane ) - 1; + uint32_t offset = __popc( matchMask & lowerMask ); - flushMask = ( offset == 0 ); + flushMask = offset == 0; - for( int w = 0; w < warp; ++w ) + 
for( int w = 0; w < warp; w++ ) { offset += __popc( matchMasks[w][bucketIndex] ); } - localOffset = counters[bucketIndex] + offset; - localSrcIndex = i; + uint32_t localOffset = counters[bucketIndex] + offset; + uint32_t to = localOffset + localPrefixSum[bucketIndex]; + + ElementLocation el; + el.localSrcIndex = i + threadIdx.x; + el.localOffset = localOffset; + el.bucket = bucketIndex; + elementLocations[to] = el; } __syncthreads(); @@ -728,34 +1140,57 @@ __device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal { atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); } - if( flushMask ) { matchMasks[warp][bucketIndex] = 0; } + } - // Swap - + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; if( itemIndex < numberOfInputs ) { - const u32 srcIndex = blockIdx.x * gNItemsPerWG + localSrcIndex; - const u32 dstIndex = globalOffset[bucketIndex] + localOffset; - gDstKey[dstIndex] = gSrcKey[srcIndex]; + ElementLocation el = elementLocations[i + threadIdx.x]; + uint32_t srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + uint8_t bucketIndex = el.bucket; - if constexpr( KEY_VALUE_PAIR ) + uint32_t dstIndex = pSum[bucketIndex] + el.localOffset; + outputKeys[dstIndex] = inputKeys[srcIndex]; + } + } + if( keyPair ) + { + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + if( itemIndex < numberOfInputs ) { - gDstVal[dstIndex] = gSrcVal[srcIndex]; + ElementLocation el = elementLocations[i + threadIdx.x]; + uint32_t srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + uint8_t bucketIndex = el.bucket; + + uint32_t dstIndex = pSum[bucketIndex] + el.localOffset; + outputValues[dstIndex] = inputValues[srcIndex]; } } } } - -extern "C" __global__ void SortKernel( int* gSrcKey, int* gDstKey, int* gHistogram, 
int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +extern "C" __global__ void onesweep_reorderKey( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint32_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) { - SortImpl( gSrcKey, nullptr, gDstKey, nullptr, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); + onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); } - -extern "C" __global__ void SortKVKernel( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +extern "C" __global__ void onesweep_reorderKeyPair( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, uint32_t numberOfInputs, uint32_t* gpSumBuffer, + volatile uint32_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) { - SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); + onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); } +extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) +{ + onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); +} +extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, uint32_t numberOfInputs, uint32_t* gpSumBuffer, + volatile uint64_t* lookBackBuffer, uint32_t 
startBits, uint32_t iteration ) +{ + onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); +} \ No newline at end of file diff --git a/Test/RadixSort/main.cpp b/Test/RadixSort/main.cpp index fae9f55..57eb4e4 100644 --- a/Test/RadixSort/main.cpp +++ b/Test/RadixSort/main.cpp @@ -67,6 +67,8 @@ class SortTest OrochiUtils::malloc( srcGpu.key, testSize ); OrochiUtils::malloc( dstGpu.key, testSize ); + void* temp; + oroMalloc( (oroDeviceptr*)&temp, m_sort.getRequiredTemporalStorageBytes( testSize ) ); std::vector srcKey( testSize ); for( int i = 0; i < testSize; i++ ) @@ -102,11 +104,11 @@ class SortTest if constexpr( KEY_VALUE_PAIR ) { - m_sort.sort( srcGpu, dstGpu, testSize, 0, testBits); + m_sort.sort( srcGpu, dstGpu, testSize, 0, testBits, temp ); } else { - m_sort.sort( srcGpu.key, dstGpu.key, testSize, 0, testBits ); + m_sort.sort( srcGpu.key, dstGpu.key, testSize, 0, testBits, temp ); } OrochiUtils::waitForCompletion(); @@ -178,6 +180,7 @@ class SortTest OrochiUtils::free( srcGpu.key ); OrochiUtils::free( dstGpu.key ); + oroFree( (oroDeviceptr)temp ); printf( "passed: %3.2fK keys\n", testSize / 1000.f ); } From 12fd48c6624785f927f34f21879df60a9bd406e3 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 21 Sep 2023 16:54:07 +0900 Subject: [PATCH 06/68] [ORO-0] fixed storage ver --- ParallelPrimitives/RadixSort.cpp | 43 +++------- ParallelPrimitives/RadixSort.h | 9 +-- ParallelPrimitives/RadixSortConfigs.h | 8 +- ParallelPrimitives/RadixSortKernels.h | 110 ++++++++++++++++---------- Test/RadixSort/main.cpp | 7 +- 5 files changed, 89 insertions(+), 88 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index a33ff47..864d756 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -200,8 +200,6 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string #define LOAD_FUNC( 
var, kernel ) var = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), kernel, &opts ); LOAD_FUNC( m_gHistogram, "gHistogram" ); LOAD_FUNC( m_gPrefixSum, "gPrefixSum" ); - LOAD_FUNC( m_onesweep_reorderKey, "onesweep_reorderKey" ); - LOAD_FUNC( m_onesweep_reorderKeyPair, "onesweep_reorderKeyPair" ); LOAD_FUNC( m_onesweep_reorderKey64, "onesweep_reorderKey64" ); LOAD_FUNC( m_onesweep_reorderKeyPair64, "onesweep_reorderKeyPair64" ); #undef LOAD_FUNC @@ -240,6 +238,10 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc { compileKernels( kernelPath, includeDir ); + u64 gpSumBuffer = sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ); + u64 lookBackBuffer = next_multiple64( sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE + 1 ), 16 ); + m_tmpBuffer.resize( gpSumBuffer + lookBackBuffer ); + //m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); ///// The tmp buffer size of the count kernel and the scan kernel. @@ -262,7 +264,7 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc } void RadixSort::setFlag( Flag flag ) noexcept { m_flags = flag; } -void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream ) noexcept +void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, oroStream stream ) noexcept { bool keyPair = src.value != nullptr; @@ -286,17 +288,11 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit } int nIteration = div_round_up64( endBit - startBit, 8 ); - bool use64bitCounter = -#if defined( ENFORCE_64BIT_COUNTER ) - true; -#else - MAX_ELEMENTS_WITH_32BIT_COUNTER < n; -#endif uint64_t numberOfBlocks = div_round_up64( n, RADIX_SORT_BLOCK_SIZE ); // Buffers - void* gpSumBuffer = tempStorage; - void* lookBackBuffer = (void*)( (char*)tempStorage + sizeof( uint32_t ) * 256 * sizeof( u32 /* key */ ) ); + void* gpSumBuffer 
= m_tmpBuffer.ptr(); + void* lookBackBuffer = (void*)( m_tmpBuffer.ptr() + sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ) ); { oroMemsetD32Async( (oroDeviceptr)gpSumBuffer, 0, 256 * sizeof( u32 /* key */ ), stream ); @@ -316,17 +312,17 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit auto d = dst; for( int i = 0; i < nIteration; i++ ) { - oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, 256 * numberOfBlocks * ( use64bitCounter ? 2 : 1 ), stream ); + oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, ( 256 * LOOKBACK_TABLE_SIZE + 1 ) * sizeof( uint64_t ) / 4, stream ); if( keyPair ) { const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; - OrochiUtils::launch1D( use64bitCounter ? m_onesweep_reorderKeyPair64 : m_onesweep_reorderKeyPair, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); + OrochiUtils::launch1D( m_onesweep_reorderKeyPair64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } else { const void* args[] = { &s.key, &d.key, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; - OrochiUtils::launch1D( use64bitCounter ? 
m_onesweep_reorderKey64 : m_onesweep_reorderKey, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); + OrochiUtils::launch1D( m_onesweep_reorderKey64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } std::swap( s, d ); } @@ -342,23 +338,8 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit } } -void RadixSort::sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream ) noexcept +void RadixSort::sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, oroStream stream ) noexcept { - sort( KeyValueSoA{ src, nullptr }, KeyValueSoA{ dst, nullptr }, n, startBit, endBit, tempStorage, stream ); -} - -uint64_t RadixSort::getRequiredTemporalStorageBytes( u32 numberOfMaxInputs ) const -{ - static_assert( BIN_SIZE == 256, "check alignment of the buffers" ); - uint64_t numberOfBlocks = div_round_up64( numberOfMaxInputs, RADIX_SORT_BLOCK_SIZE ); - uint64_t gpSumBuffer = sizeof( uint32_t ) * 256 * sizeof( u32 /* key */ ); - uint64_t lookBackBuffer = sizeof( uint32_t ) * 256 * numberOfBlocks; -#if !defined( ENFORCE_64BIT_COUNTER ) - if( MAX_ELEMENTS_WITH_32BIT_COUNTER < numberOfMaxInputs ) -#endif - { - lookBackBuffer *= 2; // to 64bit counter - } - return gpSumBuffer + lookBackBuffer; + sort( KeyValueSoA{ src, nullptr }, KeyValueSoA{ dst, nullptr }, n, startBit, endBit, stream ); } }; // namespace Oro diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index d30ee32..e78a978 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -43,11 +43,10 @@ class RadixSort final void setFlag( Flag flag ) noexcept; - void sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream = 0 ) noexcept; + void sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, 
oroStream stream = 0 ) noexcept; - void sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream = 0 ) noexcept; + void sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, oroStream stream = 0 ) noexcept; - uint64_t getRequiredTemporalStorageBytes( u32 numberOfMaxInputs ) const; private: //template //void sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept; @@ -121,10 +120,10 @@ class RadixSort final oroFunction m_gHistogram; oroFunction m_gPrefixSum; - oroFunction m_onesweep_reorderKey; - oroFunction m_onesweep_reorderKeyPair; oroFunction m_onesweep_reorderKey64; oroFunction m_onesweep_reorderKeyPair64; + + GpuMemory m_tmpBuffer; }; //#include diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index c597238..ed64110 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -50,9 +50,9 @@ static_assert( DEFAULT_SCAN_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); #define REORDER_NUMBER_OF_WARPS 8 #define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) -#define MAX_ELEMENTS_WITH_32BIT_COUNTER 0x3FFFFFFF - -// Please uncomment this enforce 64bit counter for lookback counter to measure performance impact. 
-// #define ENFORCE_64BIT_COUNTER 1 +#define LOOKBACK_TABLE_SIZE ( 1024 ) +#define MAX_LOOK_BACK 32 +#define TAIL_BITS 4 +#define TAIL_COUNT ( 1u << TAIL_BITS ) }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 56bca91..c04a6f1 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -777,28 +777,11 @@ typedef unsigned char uint8_t; // //#define REORDER_NUMBER_OF_WARPS 8 //#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) - -#define PARTITIOIN_BIT_A 0x80000000 -#define PARTITIOIN_BIT_P 0x40000000 -#define PARTITIOIN_FLAG_MASK ( PARTITIOIN_BIT_A | PARTITIOIN_BIT_P ) -#define PARTITIOIN_VALUE_MASK 0x3FFFFFFF - -#define PARTITIOIN_BIT_A_64 0x8000000000000000llu -#define PARTITIOIN_BIT_P_64 0x4000000000000000llu -#define PARTITIOIN_FLAG_MASK_64 ( PARTITIOIN_BIT_A_64 | PARTITIOIN_BIT_P_64 ) -#define PARTITIOIN_VALUE_MASK_64 0x3FFFFFFFFFFFFFFFllu - -__device__ inline void partitionStoreA( volatile uint32_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_A | x; } -__device__ inline void partitionStoreA( volatile uint64_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_A_64 | x; } -__device__ inline void partitionStoreP( volatile uint32_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_P | x; } -__device__ inline void partitionStoreP( volatile uint64_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_P_64 | x; } -__device__ inline bool partitionIsX( uint32_t x ) { return ( x & PARTITIOIN_FLAG_MASK ) == 0; } -__device__ inline bool partitionIsX( uint64_t x ) { return ( x & PARTITIOIN_FLAG_MASK_64 ) == 0; } -__device__ inline bool partitionIsP( uint32_t x ) { return ( x & PARTITIOIN_BIT_P ) != 0; } -__device__ inline bool partitionIsP( uint64_t x ) { return ( x & PARTITIOIN_BIT_P_64 ) != 0; } - -__device__ inline uint32_t partitionGetValue( uint32_t x ) { return x & PARTITIOIN_VALUE_MASK; } -__device__ inline uint32_t partitionGetValue( 
uint64_t x ) { return static_cast( x & PARTITIOIN_VALUE_MASK_64 ); } +// +//#define LOOKBACK_TABLE_SIZE ( 1024 ) +//#define MAX_LOOK_BACK 32 +//#define TAIL_BITS 4 +//#define TAIL_COUNT ( 1u << TAIL_BITS ) #if defined( DESCENDING_ORDER ) #define ORDER_MASK_32 0xFFFFFFFF @@ -979,9 +962,8 @@ extern "C" __global__ void gPrefixSum( uint32_t* gpSumBuffer ) gpSumBuffer[blockIdx.x * 256 + threadIdx.x] = smem[threadIdx.x]; } -template __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile TLookBack* lookBackBuffer, uint32_t startBits, uint32_t iteration ) + volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) { struct ElementLocation { @@ -1052,34 +1034,77 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } + struct ParitionID + { + uint64_t value : 32; + uint64_t block : 30; + uint64_t flag : 2; + }; + auto asPartition = []( uint64_t x ) + { + ParitionID pa; + memcpy( &pa, &x, sizeof( ParitionID ) ); + return pa; + }; + auto asU64 = []( ParitionID pa ) + { + uint64_t x; + memcpy( &x, &pa, sizeof( uint64_t ) ); + return x; + }; + + uint32_t* gTailIterator = (uint32_t*)( lookBackBuffer + LOOKBACK_TABLE_SIZE * 256 ); + + if( threadIdx.x == 0 && LOOKBACK_TABLE_SIZE <= blockIndex ) + { + uint32_t mustBeDone = blockIndex - LOOKBACK_TABLE_SIZE + MAX_LOOK_BACK; + while( ( atomicAdd( gTailIterator, 0 ) >> TAIL_BITS ) * TAIL_COUNT <= mustBeDone ) + ; + } __syncthreads(); - // Look back for( int i = threadIdx.x; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { uint32_t s = localPrefixSum[i]; - partitionStoreA( &lookBackBuffer[256 * blockIdx.x + i], s ); + int pIndex = 256 * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; + + { + ParitionID pa; + pa.value = s; + pa.block = blockIndex; + pa.flag = 1; + lookBackBuffer[pIndex] 
= asU64( pa ); + } + uint32_t gp = gpSumBuffer[iteration * 256 + i]; uint32_t p = 0; - for( int iBlock = (int)blockIdx.x - 1; 0 <= iBlock; iBlock-- ) + for( int iBlock = (int)blockIndex - 1; 0 <= iBlock; iBlock-- ) { - TLookBack counter = lookBackBuffer[256 * iBlock + i]; - while( partitionIsX( counter ) ) + int lookbackIndex = 256 * ( iBlock % LOOKBACK_TABLE_SIZE ) + i; + ParitionID pa; + do { - counter = lookBackBuffer[256 * iBlock + i]; - } + pa = asPartition( lookBackBuffer[lookbackIndex] ); - uint32_t value = partitionGetValue( counter ); + // when you reach to the maximum, flag must be 2 + if( MAX_LOOK_BACK == blockIndex - iBlock && pa.flag != 2 ) continue; + } while( pa.flag == 0 || pa.block != iBlock ); + + uint32_t value = pa.value; p += value; - if( partitionIsP( counter ) ) + if( pa.flag == 2 ) { break; } } - partitionStoreP( &lookBackBuffer[256 * blockIdx.x + i], p + s ); + ParitionID pa; + pa.value = p + s; + pa.block = blockIndex; + pa.flag = 2; + lookBackBuffer[pIndex] = asU64( pa ); // complete global output location pSum[i] = gp + p; @@ -1091,6 +1116,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys prefix += prefixSumExclusive( prefix, &localPrefixSum[i] ); } + if( threadIdx.x == 0 ) + { + while( ( atomicAdd( gTailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) + ; + + atomicInc( gTailIterator, 0xFFFFFFFF ); + } + // reorder for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { @@ -1176,15 +1209,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } } -extern "C" __global__ void onesweep_reorderKey( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint32_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) -{ - onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); -} -extern 
"C" __global__ void onesweep_reorderKeyPair( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile uint32_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) -{ - onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); -} extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) { onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); diff --git a/Test/RadixSort/main.cpp b/Test/RadixSort/main.cpp index 57eb4e4..e36dda4 100644 --- a/Test/RadixSort/main.cpp +++ b/Test/RadixSort/main.cpp @@ -67,8 +67,6 @@ class SortTest OrochiUtils::malloc( srcGpu.key, testSize ); OrochiUtils::malloc( dstGpu.key, testSize ); - void* temp; - oroMalloc( (oroDeviceptr*)&temp, m_sort.getRequiredTemporalStorageBytes( testSize ) ); std::vector srcKey( testSize ); for( int i = 0; i < testSize; i++ ) @@ -104,11 +102,11 @@ class SortTest if constexpr( KEY_VALUE_PAIR ) { - m_sort.sort( srcGpu, dstGpu, testSize, 0, testBits, temp ); + m_sort.sort( srcGpu, dstGpu, testSize, 0, testBits ); } else { - m_sort.sort( srcGpu.key, dstGpu.key, testSize, 0, testBits, temp ); + m_sort.sort( srcGpu.key, dstGpu.key, testSize, 0, testBits ); } OrochiUtils::waitForCompletion(); @@ -180,7 +178,6 @@ class SortTest OrochiUtils::free( srcGpu.key ); OrochiUtils::free( dstGpu.key ); - oroFree( (oroDeviceptr)temp ); printf( "passed: %3.2fK keys\n", testSize / 1000.f ); } From 284b083a4d7545350fd36fbb2047929596a66788 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 21 Sep 2023 18:26:02 +0900 Subject: [PATCH 07/68] 
[ORO-0] Memset can be skipped in most situations --- ParallelPrimitives/RadixSort.cpp | 13 +++++++++---- ParallelPrimitives/RadixSortKernels.h | 3 +-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 864d756..3d77b53 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -294,13 +294,15 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit void* gpSumBuffer = m_tmpBuffer.ptr(); void* lookBackBuffer = (void*)( m_tmpBuffer.ptr() + sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ) ); + // counter for gHistogram. + void* counter = (uint8_t*)lookBackBuffer + ( 256 * LOOKBACK_TABLE_SIZE ) + sizeof( uint32_t ); + { - oroMemsetD32Async( (oroDeviceptr)gpSumBuffer, 0, 256 * sizeof( u32 /* key */ ), stream ); - oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, 1, stream ); + oroMemsetD32Async( (oroDeviceptr)m_tmpBuffer.ptr(), 0, m_tmpBuffer.size() / 4, stream ); const int nBlocks = 2048; - const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &lookBackBuffer }; + const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &counter }; OrochiUtils::launch1D( m_gHistogram, nBlocks * GHISTOGRAM_THREADS_PER_BLOCK, args, GHISTOGRAM_THREADS_PER_BLOCK, 0, stream ); } { @@ -312,7 +314,10 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit auto d = dst; for( int i = 0; i < nIteration; i++ ) { - oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, ( 256 * LOOKBACK_TABLE_SIZE + 1 ) * sizeof( uint64_t ) / 4, stream ); + if( numberOfBlocks < LOOKBACK_TABLE_SIZE * 2 ) + { + oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, ( 256 * LOOKBACK_TABLE_SIZE ) * sizeof( uint64_t ) / 4, stream ); + } // other wise, we can skip zero clear look back buffer if( keyPair ) { diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index c04a6f1..3090e31 100644 --- 
a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -764,7 +764,6 @@ extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, i #define RADIX_SORT_VALUE_TYPE u32 #define KEY_IS_16BYTE_ALIGNED 1 - typedef unsigned long long uint64_t; typedef unsigned int uint32_t; typedef unsigned short uint16_t; @@ -1121,7 +1120,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys while( ( atomicAdd( gTailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) ; - atomicInc( gTailIterator, 0xFFFFFFFF ); + atomicInc( gTailIterator, numberOfBlocks - 1 /* after the very last item, it will be zero */ ); } // reorder From d12fdea3016d4351157160bcf931519ef1f8458e Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 21 Sep 2023 22:54:20 +0900 Subject: [PATCH 08/68] [ORO-0] fix the wrong condition for MAX_LOOK_BACK --- ParallelPrimitives/RadixSortConfigs.h | 2 +- ParallelPrimitives/RadixSortKernels.h | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index ed64110..6eb68d1 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -51,7 +51,7 @@ static_assert( DEFAULT_SCAN_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); #define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) #define LOOKBACK_TABLE_SIZE ( 1024 ) -#define MAX_LOOK_BACK 32 +#define MAX_LOOK_BACK 64 #define TAIL_BITS 4 #define TAIL_COUNT ( 1u << TAIL_BITS ) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 3090e31..1530b0a 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -778,7 +778,7 @@ typedef unsigned char uint8_t; //#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) // //#define LOOKBACK_TABLE_SIZE ( 1024 ) -//#define MAX_LOOK_BACK 32 +//#define
MAX_LOOK_BACK 64 //#define TAIL_BITS 4 //#define TAIL_COUNT ( 1u << TAIL_BITS ) @@ -1083,13 +1083,15 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { int lookbackIndex = 256 * ( iBlock % LOOKBACK_TABLE_SIZE ) + i; ParitionID pa; + + // when you reach to the maximum, flag must be 2. flagRequire = 0b10 + // Otherwise, flag can be 1 or 2 flagRequire = 0b11 + int flagRequire = MAX_LOOK_BACK == blockIndex - iBlock ? 2 : 3; + do { pa = asPartition( lookBackBuffer[lookbackIndex] ); - - // when you reach to the maximum, flag must be 2 - if( MAX_LOOK_BACK == blockIndex - iBlock && pa.flag != 2 ) continue; - } while( pa.flag == 0 || pa.block != iBlock ); + } while( ( pa.flag & flagRequire ) == 0 || pa.block != iBlock ); uint32_t value = pa.value; p += value; From 688be9a0eae3aa57d258d78a3615a06bf415296a Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 22 Sep 2023 17:52:26 +0900 Subject: [PATCH 09/68] [ORO-0]fix wrong address for counter - it was luckly working --- ParallelPrimitives/RadixSort.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 3d77b53..691dae8 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -295,7 +295,7 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit void* lookBackBuffer = (void*)( m_tmpBuffer.ptr() + sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ) ); // counter for gHistogram. 
- void* counter = (uint8_t*)lookBackBuffer + ( 256 * LOOKBACK_TABLE_SIZE ) + sizeof( uint32_t ); + void* counter = (uint8_t*)lookBackBuffer + ( 256 * LOOKBACK_TABLE_SIZE ) * sizeof( uint64_t ) + sizeof( uint32_t ); { oroMemsetD32Async( (oroDeviceptr)m_tmpBuffer.ptr(), 0, m_tmpBuffer.size() / 4, stream ); From a3601869426d249fc233a049f6ebeb3de51bc9b0 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 13:45:32 +0900 Subject: [PATCH 10/68] use resizeAsync --- ParallelPrimitives/RadixSort.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 937db5e..f4c2d83 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -317,7 +317,7 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc u64 gpSumBuffer = sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ); u64 lookBackBuffer = next_multiple64( sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE + 1 ), 16 ); - m_tmpBuffer.resize( gpSumBuffer + lookBackBuffer ); + m_tmpBuffer.resizeAsync( gpSumBuffer + lookBackBuffer, false /*copy*/, stream ); //m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); From 1bb319ae8b4599c44f99f90005c1f48080802626 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 13:47:48 +0900 Subject: [PATCH 11/68] remove inl dependency --- ParallelPrimitives/RadixSort.h | 3 - ParallelPrimitives/RadixSort.inl | 112 +++++++++++++++---------------- 2 files changed, 56 insertions(+), 59 deletions(-) diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index 8b4f815..9e5b7e5 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -125,7 +125,4 @@ class RadixSort final GpuMemory m_tmpBuffer; }; - -//#include - }; // namespace Oro diff --git a/ParallelPrimitives/RadixSort.inl b/ParallelPrimitives/RadixSort.inl index fd42633..d001238 100644 --- 
a/ParallelPrimitives/RadixSort.inl +++ b/ParallelPrimitives/RadixSort.inl @@ -1,61 +1,61 @@ -namespace -{ - -struct Empty -{ -}; - -/// @brief Call the callable and measure the elapsed time using the Stopwatch. -/// @tparam CallableType The type of the callable to be invoked in this function. -/// @tparam RecordType The type of the object that stores the recorded times. -/// @tparam enable_profile The elapsed time will be recorded if this is set to True. -/// @param callable The callable object to be called. -/// @param time_record The object that stores the recorded times. -/// @param index The index indicates where to store the elapsed time in @c time_record -/// @param stream The GPU stream -template -constexpr void execute( CallableType&& callable, RecordType& time_record, const int index, const oroStream stream ) noexcept -{ - using TimerType = std::conditional_t; - - TimerType stopwatch; - - if constexpr( enable_profile ) - { - stopwatch.start(); - } - - std::invoke( std::forward( callable ) ); - - if constexpr( enable_profile ) - { - OrochiUtils::waitForCompletion( stream ); - stopwatch.stop(); - time_record[index] = stopwatch.getMs(); - } -} - -template -void resize_record( T& t ) noexcept -{ - if constexpr( enable_profile ) - { - t.resize( 3 ); - } -} - -template -void print_record( const T& t ) noexcept -{ - if constexpr( enable_profile ) - { - printf( "%3.2f, %3.2f, %3.2f\n", t[0], t[1], t[2] ); - } -} - -} // namespace +//namespace +//{ +// +//struct Empty +//{ +//}; +// +///// @brief Call the callable and measure the elapsed time using the Stopwatch. +///// @tparam CallableType The type of the callable to be invoked in this function. +///// @tparam RecordType The type of the object that stores the recorded times. +///// @tparam enable_profile The elapsed time will be recorded if this is set to True. +///// @param callable The callable object to be called. +///// @param time_record The object that stores the recorded times. 
+///// @param index The index indicates where to store the elapsed time in @c time_record +///// @param stream The GPU stream +//template +//constexpr void execute( CallableType&& callable, RecordType& time_record, const int index, const oroStream stream ) noexcept +//{ +// using TimerType = std::conditional_t; +// +// TimerType stopwatch; +// +// if constexpr( enable_profile ) +// { +// stopwatch.start(); +// } +// +// std::invoke( std::forward( callable ) ); +// +// if constexpr( enable_profile ) +// { +// OrochiUtils::waitForCompletion( stream ); +// stopwatch.stop(); +// time_record[index] = stopwatch.getMs(); +// } +//} +// +//template +//void resize_record( T& t ) noexcept +//{ +// if constexpr( enable_profile ) +// { +// t.resize( 3 ); +// } +//} +// +//template +//void print_record( const T& t ) noexcept +//{ +// if constexpr( enable_profile ) +// { +// printf( "%3.2f, %3.2f, %3.2f\n", t[0], t[1], t[2] ); +// } +//} +// +//} // namespace //template //void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept From cc8ad0eedc69566d030ec0653865dcae67c7c935 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 14:02:15 +0900 Subject: [PATCH 12/68] constexpr noexcept for helper funcs --- ParallelPrimitives/RadixSort.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index f4c2d83..9748901 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -18,8 +18,8 @@ #include #endif -inline uint64_t div_round_up64( uint64_t val, uint64_t divisor ) { return ( val + divisor - 1 ) / divisor; } -inline uint64_t next_multiple64( uint64_t val, uint64_t divisor ) { return div_round_up64( val, divisor ) * divisor; } +constexpr uint64_t div_round_up64( uint64_t val, uint64_t divisor ) noexcept { return ( val + divisor - 1 ) / divisor; } +constexpr uint64_t next_multiple64( uint64_t 
val, uint64_t divisor ) noexcept { return div_round_up64( val, divisor ) * divisor; } namespace { From e9b33f8647b5eef4df3e90dc89cd1a682ffeafb0 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 14:22:39 +0900 Subject: [PATCH 13/68] Use GPU timer --- Test/RadixSort/main.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Test/RadixSort/main.cpp b/Test/RadixSort/main.cpp index 645480f..9ee57a6 100644 --- a/Test/RadixSort/main.cpp +++ b/Test/RadixSort/main.cpp @@ -85,7 +85,6 @@ class SortTest } } - Stopwatch sw; for( int i = 0; i < nRuns; i++ ) { OrochiUtils::copyHtoD( srcGpu.key, srcKey.data(), testSize ); @@ -97,7 +96,8 @@ class SortTest OrochiUtils::waitForCompletion(); } - sw.start(); + OroStopwatch oroStream( nullptr ); + oroStream.start(); if constexpr( KEY_VALUE_PAIR ) { @@ -108,9 +108,10 @@ class SortTest m_sort.sort( srcGpu.key, dstGpu.key, testSize, 0, testBits ); } + oroStream.stop(); + OrochiUtils::waitForCompletion(); - sw.stop(); - float ms = sw.getMs(); + float ms = oroStream.getMs(); float gKeys_s = static_cast( testSize ) / 1000.f / 1000.f / ms; printf( "%5.2fms (%3.2fGKeys/s) sorting %3.1fMkeys [%s]\n", ms, gKeys_s, testSize / 1000.f / 1000.f, KEY_VALUE_PAIR ? "keyValue" : "key" ); } From 8681ee0b324a91b613adcf8f1d3ca4b6a3a18a57 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 17:08:15 +0900 Subject: [PATCH 14/68] other test variants --- Test/RadixSort/main.cpp | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/Test/RadixSort/main.cpp b/Test/RadixSort/main.cpp index 9ee57a6..2f7578d 100644 --- a/Test/RadixSort/main.cpp +++ b/Test/RadixSort/main.cpp @@ -49,6 +49,19 @@ class Stopwatch }; #endif +struct splitmix64 +{ + uint64_t x = 0; /* The state can be seeded with any value. 
*/ + + uint64_t next() + { + uint64_t z = ( x += 0x9e3779b97f4a7c15 ); + z = ( z ^ ( z >> 30 ) ) * 0xbf58476d1ce4e5b9; + z = ( z ^ ( z >> 27 ) ) * 0x94d049bb133111eb; + return z ^ ( z >> 31 ); + } +}; + using u64 = Oro::RadixSort::u64; using u32 = Oro::RadixSort::u32; @@ -68,9 +81,14 @@ class SortTest OrochiUtils::malloc( dstGpu.key, testSize ); std::vector srcKey( testSize ); + + splitmix64 rng; for( int i = 0; i < testSize; i++ ) { srcKey[i] = getRandom( 0u, (u32)( ( 1ull << (u64)testBits ) - 1 ) ); + + //u32 mask = (u32)( ( 1ull << (u64)testBits ) - 1 ); + //srcKey[i] = rng.next() & mask; } std::vector srcValue( testSize ); @@ -291,6 +309,7 @@ enum TestType TEST_SIMPLE, TEST_PERF, TEST_BITS, + TEST_CAPTURE, TEST_MISC, }; @@ -371,7 +390,11 @@ int main( int argc, char** argv ) sort.test( testSize, 32, nRuns ); } break; - + case TEST_CAPTURE: + { + sort.test( 1u << 27 /*2^29*/, 32, 9999999 ); + } + break; case TEST_MISC: { static constexpr auto file = "input.txt"; From f6de36d2a4d040a7c54dfe94c69752d1de90e9f1 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 18:04:52 +0900 Subject: [PATCH 15/68] Split iterators --- ParallelPrimitives/RadixSort.cpp | 15 ++++++++++----- ParallelPrimitives/RadixSort.h | 2 ++ ParallelPrimitives/RadixSortKernels.h | 19 +++++++++---------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 9748901..07c8afd 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -316,9 +316,12 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc compileKernels( kernelPath, includeDir ); u64 gpSumBuffer = sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ); - u64 lookBackBuffer = next_multiple64( sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE + 1 ), 16 ); + u64 lookBackBuffer = next_multiple64( sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE ), 16 ); m_tmpBuffer.resizeAsync( gpSumBuffer + 
lookBackBuffer, false /*copy*/, stream ); + m_tailIterator.resizeAsync( 1, false /*copy*/, stream ); + m_tailIterator.resetAsync( stream ); + m_gpSumCounter.resizeAsync( 1, false /*copy*/, stream ); //m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); ///// The tmp buffer size of the count kernel and the scan kernel. @@ -371,11 +374,13 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit // Buffers void* gpSumBuffer = m_tmpBuffer.ptr(); void* lookBackBuffer = (void*)( m_tmpBuffer.ptr() + sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ) ); + void* tailIteratorBuffer = m_tailIterator.ptr(); // counter for gHistogram. - void* counter = (uint8_t*)lookBackBuffer + ( 256 * LOOKBACK_TABLE_SIZE ) * sizeof( uint64_t ) + sizeof( uint32_t ); - { + m_gpSumCounter.resetAsync( stream ); + void* counter = m_gpSumCounter.ptr(); + oroMemsetD32Async( (oroDeviceptr)m_tmpBuffer.ptr(), 0, m_tmpBuffer.size() / 4, stream ); const int nBlocks = 2048; @@ -399,12 +404,12 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit if( keyPair ) { - const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; + const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, &gpSumBuffer, &lookBackBuffer, &tailIteratorBuffer, & startBit, &i }; OrochiUtils::launch1D( m_onesweep_reorderKeyPair64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } else { - const void* args[] = { &s.key, &d.key, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; + const void* args[] = { &s.key, &d.key, &n, &gpSumBuffer, &lookBackBuffer, &tailIteratorBuffer, &startBit, &i }; OrochiUtils::launch1D( m_onesweep_reorderKey64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } std::swap( s, d ); diff --git a/ParallelPrimitives/RadixSort.h 
b/ParallelPrimitives/RadixSort.h index 9e5b7e5..bb51673 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -124,5 +124,7 @@ class RadixSort final oroFunction m_onesweep_reorderKeyPair64; GpuMemory m_tmpBuffer; + GpuMemory m_gpSumCounter; + GpuMemory m_tailIterator; }; }; // namespace Oro diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 1530b0a..8919e99 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -962,7 +962,7 @@ extern "C" __global__ void gPrefixSum( uint32_t* gpSumBuffer ) } __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) + volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, uint32_t iteration ) { struct ElementLocation { @@ -1052,12 +1052,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys return x; }; - uint32_t* gTailIterator = (uint32_t*)( lookBackBuffer + LOOKBACK_TABLE_SIZE * 256 ); - if( threadIdx.x == 0 && LOOKBACK_TABLE_SIZE <= blockIndex ) { uint32_t mustBeDone = blockIndex - LOOKBACK_TABLE_SIZE + MAX_LOOK_BACK; - while( ( atomicAdd( gTailIterator, 0 ) >> TAIL_BITS ) * TAIL_COUNT <= mustBeDone ) + while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) * TAIL_COUNT <= mustBeDone ) ; } __syncthreads(); @@ -1119,10 +1117,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( threadIdx.x == 0 ) { - while( ( atomicAdd( gTailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) + while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) ; - atomicInc( gTailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ 
); + atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); } // reorder @@ -1210,12 +1208,13 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } } -extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) +extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, + uint32_t iteration ) { - onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); + onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) + volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, uint32_t iteration ) { - onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); + onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } \ No newline at end of file From ed8fb9d3e9302fb89aa4f837f4917516b8c43551 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 18:26:14 +0900 Subject: [PATCH 16/68] Split temp buffer for simplicity --- 
ParallelPrimitives/RadixSort.cpp | 20 +++++++++++--------- ParallelPrimitives/RadixSort.h | 3 ++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 07c8afd..7c58cda 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -316,8 +316,10 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc compileKernels( kernelPath, includeDir ); u64 gpSumBuffer = sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ); - u64 lookBackBuffer = next_multiple64( sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE ), 16 ); - m_tmpBuffer.resizeAsync( gpSumBuffer + lookBackBuffer, false /*copy*/, stream ); + m_gpSumBuffer.resizeAsync( gpSumBuffer, false /*copy*/, stream ); + + u64 lookBackBuffer = sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE ); + m_lookbackBuffer.resizeAsync( lookBackBuffer, false /*copy*/, stream ); m_tailIterator.resizeAsync( 1, false /*copy*/, stream ); m_tailIterator.resetAsync( stream ); @@ -372,17 +374,17 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit uint64_t numberOfBlocks = div_round_up64( n, RADIX_SORT_BLOCK_SIZE ); // Buffers - void* gpSumBuffer = m_tmpBuffer.ptr(); - void* lookBackBuffer = (void*)( m_tmpBuffer.ptr() + sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ) ); + void* gpSumBuffer = m_gpSumBuffer.ptr(); + void* lookBackBuffer = m_lookbackBuffer.ptr(); void* tailIteratorBuffer = m_tailIterator.ptr(); + m_lookbackBuffer.resetAsync( stream ); + m_gpSumCounter.resetAsync( stream ); + m_gpSumBuffer.resetAsync( stream ); + // counter for gHistogram. 
{ - m_gpSumCounter.resetAsync( stream ); void* counter = m_gpSumCounter.ptr(); - - oroMemsetD32Async( (oroDeviceptr)m_tmpBuffer.ptr(), 0, m_tmpBuffer.size() / 4, stream ); - const int nBlocks = 2048; const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &counter }; @@ -399,7 +401,7 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit { if( numberOfBlocks < LOOKBACK_TABLE_SIZE * 2 ) { - oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, ( 256 * LOOKBACK_TABLE_SIZE ) * sizeof( uint64_t ) / 4, stream ); + m_lookbackBuffer.resetAsync( stream ); } // other wise, we can skip zero clear look back buffer if( keyPair ) diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index bb51673..e74cc56 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -123,7 +123,8 @@ class RadixSort final oroFunction m_onesweep_reorderKey64; oroFunction m_onesweep_reorderKeyPair64; - GpuMemory m_tmpBuffer; + GpuMemory m_lookbackBuffer; + GpuMemory m_gpSumBuffer; GpuMemory m_gpSumCounter; GpuMemory m_tailIterator; }; From d6c37b676a23146f6ff5f40f33de1e3a2a381a61 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 09:00:08 +0900 Subject: [PATCH 17/68] to constexprs --- ParallelPrimitives/RadixSortConfigs.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 6eb68d1..c5e2c0a 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -42,17 +42,17 @@ static_assert( BIN_SIZE % 2 == 0 ); static_assert( DEFAULT_COUNT_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); static_assert( DEFAULT_SCAN_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); -#define RADIX_SORT_BLOCK_SIZE 2048 +constexpr int RADIX_SORT_BLOCK_SIZE = 2048; -#define GHISTOGRAM_ITEM_PER_BLOCK 2048 -#define GHISTOGRAM_THREADS_PER_BLOCK 256 +constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; 
+constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; -#define REORDER_NUMBER_OF_WARPS 8 -#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) +constexpr int REORDER_NUMBER_OF_WARPS = 8; +constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; -#define LOOKBACK_TABLE_SIZE ( 1024 ) -#define MAX_LOOK_BACK 64 -#define TAIL_BITS 4 -#define TAIL_COUNT ( 1u << TAIL_BITS ) +constexpr int LOOKBACK_TABLE_SIZE = 1024; +constexpr int MAX_LOOK_BACK = 64; +constexpr int TAIL_BITS = 4; +constexpr int TAIL_COUNT = 1u << TAIL_BITS; }; // namespace Oro \ No newline at end of file From 56fa76d32ac0fddeae78ad86a8395801d466e789 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 09:12:54 +0900 Subject: [PATCH 18/68] Fix smaller n execution. --- ParallelPrimitives/RadixSort.cpp | 61 +++++++++++++------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 7c58cda..0e2d06d 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -190,43 +190,30 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string }; -// for( const auto& record : records ) -// { -//#if defined( ORO_PP_LOAD_FROM_STRING ) -// oroFunctions[record.kernelType] = oroutils.getFunctionFromString( device, hip_RadixSortKernels, currentKernelPath.c_str(), record.kernelName.c_str(), &opts, 1, hip::RadixSortKernelsArgs, hip::RadixSortKernelsIncludes ); -//#else -// -// if constexpr( useBitCode ) -// { -// oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary( binaryPath.c_str(), record.kernelName.c_str() ); -// } -// else -// { -// const auto includeArg{ "-I" + currentIncludeDir }; -// const auto overwrite_flag = "-DOVERWRITE"; -// const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); -// const auto scan_block_size_param = 
"-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); -// const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); -// const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( m_num_warps_per_block_for_sort ); -// -// std::vector opts; -// opts.push_back( "-ffast-math" ); -// opts.push_back( includeArg.c_str() ); -// opts.push_back( overwrite_flag ); -// opts.push_back( count_block_size_param.c_str() ); -// opts.push_back( scan_block_size_param.c_str() ); -// opts.push_back( sort_block_size_param.c_str() ); -// opts.push_back( sort_num_warps_param.c_str() ); -// -// oroFunctions[record.kernelType] = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), record.kernelName.c_str(), &opts ); -// } -// -//#endif -// if( m_flags == Flag::LOG ) -// { -// printKernelInfo( record.kernelName, oroFunctions[record.kernelType] ); -// } -// } + for( const auto& record : records ) + { +#if defined( ORO_PP_LOAD_FROM_STRING ) + oroFunctions[record.kernelType] = oroutils.getFunctionFromString( device, hip_RadixSortKernels, currentKernelPath.c_str(), record.kernelName.c_str(), &opts, 1, hip::RadixSortKernelsArgs, hip::RadixSortKernelsIncludes ); +#else + + if constexpr( useBitCode ) + { + oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary( binaryPath.c_str(), record.kernelName.c_str() ); + } + else + { + const auto includeArg{ "-I" + currentIncludeDir }; + std::vector opts; + opts.push_back( includeArg.c_str() ); + oroFunctions[record.kernelType] = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), record.kernelName.c_str(), &opts ); + } + +#endif + if( m_flags == Flag::LOG ) + { + printKernelInfo( record.kernelName, oroFunctions[record.kernelType] ); + } + } // TODO: bit code support? 
#define LOAD_FUNC( var, kernel ) var = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), kernel, &opts ); From 0ba36d579a723fc0e824198b47af9911497cd3d1 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 09:39:39 +0900 Subject: [PATCH 19/68] adaptive blocksize for counting --- ParallelPrimitives/RadixSort.cpp | 6 ++++-- ParallelPrimitives/RadixSort.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 0e2d06d..876fe0f 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -75,7 +75,7 @@ namespace Oro RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils, oroStream stream, const std::string& kernelPath, const std::string& includeDir ) : m_device{ device }, m_oroutils{ oroutils } { - // oroGetDeviceProperties( &m_props, device ); + oroGetDeviceProperties( &m_props, device ); configure( kernelPath, includeDir, stream ); } @@ -372,7 +372,9 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit // counter for gHistogram. { void* counter = m_gpSumCounter.ptr(); - const int nBlocks = 2048; + int maxBlocksPerMP = 0; + oroError e = oroOccupancyMaxActiveBlocksPerMultiprocessor( &maxBlocksPerMP, m_gHistogram, GHISTOGRAM_THREADS_PER_BLOCK, 0 ); + const int nBlocks = e == oroSuccess ? 
maxBlocksPerMP * m_props.multiProcessorCount : 2048; const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &counter }; OrochiUtils::launch1D( m_gHistogram, nBlocks * GHISTOGRAM_THREADS_PER_BLOCK, args, GHISTOGRAM_THREADS_PER_BLOCK, 0, stream ); diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index e74cc56..c04854c 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -105,7 +105,7 @@ class RadixSort final //GpuMemory m_is_ready; oroDevice m_device{}; - //oroDeviceProp m_props{}; + oroDeviceProp m_props{}; OrochiUtils& m_oroutils; From d704f749c8b243d5aca554c2be9569fbadf8f052 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 09:51:48 +0900 Subject: [PATCH 20/68] use const ref --- ParallelPrimitives/RadixSort.cpp | 2 +- ParallelPrimitives/RadixSort.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 876fe0f..8829c0b 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -334,7 +334,7 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc } void RadixSort::setFlag( Flag flag ) noexcept { m_flags = flag; } -void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, oroStream stream ) noexcept +void RadixSort::sort( const KeyValueSoA& src, const KeyValueSoA& dst, uint32_t n, int startBit, int endBit, oroStream stream ) noexcept { bool keyPair = src.value != nullptr; diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index c04854c..f530d79 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -43,7 +43,7 @@ class RadixSort final void setFlag( Flag flag ) noexcept; - void sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, oroStream stream = 0 ) noexcept; + void sort( const KeyValueSoA& src, const KeyValueSoA& dst, 
uint32_t n, int startBit, int endBit, oroStream stream = 0 ) noexcept; void sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, oroStream stream = 0 ) noexcept; From 11671a4ddad15089bbe614cab3d4873b8217ed53 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 09:57:53 +0900 Subject: [PATCH 21/68] remove define --- ParallelPrimitives/RadixSortKernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 8919e99..bf1c050 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -760,8 +760,8 @@ extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, i // SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); //} -#define RADIX_SORT_KEY_TYPE u32 -#define RADIX_SORT_VALUE_TYPE u32 +using RADIX_SORT_KEY_TYPE = uint32_t; +using RADIX_SORT_VALUE_TYPE = uint32_t; #define KEY_IS_16BYTE_ALIGNED 1 typedef unsigned long long uint64_t; From 99cc300b45bf392c8b9af2bd7ef0109a20de3360 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:02:45 +0900 Subject: [PATCH 22/68] fix compile error and remove unused comments --- ParallelPrimitives/RadixSortKernels.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index bf1c050..4341a81 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -760,8 +760,6 @@ extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, i // SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); //} -using RADIX_SORT_KEY_TYPE = uint32_t; -using RADIX_SORT_VALUE_TYPE = uint32_t; #define KEY_IS_16BYTE_ALIGNED 1 typedef unsigned long long uint64_t; @@ -769,18 +767,8 @@ typedef 
unsigned int uint32_t; typedef unsigned short uint16_t; typedef unsigned char uint8_t; -//#define RADIX_SORT_BLOCK_SIZE 2048 -// -//#define GHISTOGRAM_ITEM_PER_BLOCK 2048 -//#define GHISTOGRAM_THREADS_PER_BLOCK 256 -// -//#define REORDER_NUMBER_OF_WARPS 8 -//#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) -// -//#define LOOKBACK_TABLE_SIZE ( 1024 ) -//#define MAX_LOOK_BACK 64 -//#define TAIL_BITS 4 -//#define TAIL_COUNT ( 1u << TAIL_BITS ) +using RADIX_SORT_KEY_TYPE = uint32_t; +using RADIX_SORT_VALUE_TYPE = uint32_t; #if defined( DESCENDING_ORDER ) #define ORDER_MASK_32 0xFFFFFFFF From 1fd425cd0e32eab796d5c3fc289e97293df3fdc9 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:13:02 +0900 Subject: [PATCH 23/68] remove macro --- ParallelPrimitives/RadixSortKernels.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 4341a81..82a8d55 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -760,7 +760,7 @@ extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, i // SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); //} -#define KEY_IS_16BYTE_ALIGNED 1 +constexpr auto KEY_IS_16BYTE_ALIGNED = true; typedef unsigned long long uint64_t; typedef unsigned int uint32_t; @@ -872,8 +872,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num { hasData = true; -#if defined( KEY_IS_16BYTE_ALIGNED ) - if( ( iBlock + 1 ) * GHISTOGRAM_ITEM_PER_BLOCK <= numberOfInputs ) + if( KEY_IS_16BYTE_ALIGNED && ( iBlock + 1 ) * GHISTOGRAM_ITEM_PER_BLOCK <= numberOfInputs ) { for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK * 4 ) { @@ -896,7 +895,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num } } else -#endif + { 
for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK ) { uint32_t itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x + i; @@ -911,7 +910,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num } } } - + } __syncthreads(); if( threadIdx.x == 0 ) @@ -984,8 +983,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); // count -#if defined( KEY_IS_16BYTE_ALIGNED ) - if( ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs ) + if( KEY_IS_16BYTE_ALIGNED && ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs ) { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK * 4 ) { @@ -1005,7 +1003,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } else -#endif { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { From c0ee24b39ee2653b4b90cb0132d302ad15062cf6 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:17:56 +0900 Subject: [PATCH 24/68] unified types --- ParallelPrimitives/RadixSortKernels.h | 159 +++++++++++++------------- 1 file changed, 77 insertions(+), 82 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 82a8d55..4088335 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -762,13 +762,8 @@ extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, i constexpr auto KEY_IS_16BYTE_ALIGNED = true; -typedef unsigned long long uint64_t; -typedef unsigned int uint32_t; -typedef unsigned short uint16_t; -typedef unsigned char uint8_t; - -using RADIX_SORT_KEY_TYPE = uint32_t; -using RADIX_SORT_VALUE_TYPE = uint32_t; +using RADIX_SORT_KEY_TYPE = u32; +using RADIX_SORT_VALUE_TYPE = u32; #if defined( DESCENDING_ORDER ) #define ORDER_MASK_32 0xFFFFFFFF @@ -782,7 +777,7 @@ using RADIX_SORT_VALUE_TYPE 
= uint32_t; #define ITS 1 #endif -__device__ inline uint32_t div_round_up( uint32_t val, uint32_t divisor ) { return ( val + divisor - 1 ) / divisor; } +__device__ inline u32 div_round_up( u32 val, u32 divisor ) { return ( val + divisor - 1 ) / divisor; } template __device__ void clearShared( T* sMem, T value ) { @@ -795,31 +790,31 @@ __device__ void clearShared( T* sMem, T value ) } } -__device__ inline uint32_t getKeyBits( uint32_t x ) { return x ^ ORDER_MASK_32; } -__device__ inline uint64_t getKeyBits( uint64_t x ) { return x ^ ORDER_MASK_64; } -__device__ inline uint32_t getKeyBits( float x ) +__device__ inline u32 getKeyBits( u32 x ) { return x ^ ORDER_MASK_32; } +__device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } +__device__ inline u32 getKeyBits( float x ) { if( x == 0.0f ) x = 0.0f; - uint32_t flip = uint32_t( __float_as_int( x ) >> 31 ) | 0x80000000; + u32 flip = u32( __float_as_int( x ) >> 31 ) | 0x80000000; return __float_as_uint( x ) ^ flip ^ ORDER_MASK_32; } -__device__ inline uint64_t getKeyBits( double x ) +__device__ inline u64 getKeyBits( double x ) { if( x == 0.0 ) x = 0.0; - uint64_t flip = uint64_t( __double_as_longlong( x ) >> 63 ) | 0x8000000000000000llu; - return (uint64_t)__double_as_longlong( x ) ^ flip ^ ORDER_MASK_64; + u64 flip = u64( __double_as_longlong( x ) >> 63 ) | 0x8000000000000000llu; + return (u64)__double_as_longlong( x ) ^ flip ^ ORDER_MASK_64; } template -__device__ inline uint32_t prefixSumExclusive( uint32_t prefix, uint32_t* sMemIO ) +__device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) { - uint32_t value = sMemIO[threadIdx.x]; + u32 value = sMemIO[threadIdx.x]; - for( uint32_t offset = 1; offset < NThreads; offset <<= 1 ) + for( u32 offset = 1; offset < NThreads; offset <<= 1 ) { - uint32_t x = sMemIO[threadIdx.x]; + u32 x = sMemIO[threadIdx.x]; if( offset <= threadIdx.x ) { @@ -832,7 +827,7 @@ __device__ inline uint32_t prefixSumExclusive( uint32_t prefix, uint32_t* sMemIO 
__syncthreads(); } - uint32_t sum = sMemIO[NThreads - 1]; + u32 sum = sMemIO[NThreads - 1]; __syncthreads(); @@ -843,9 +838,9 @@ __device__ inline uint32_t prefixSumExclusive( uint32_t prefix, uint32_t* sMemIO return sum; } -extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t numberOfInputs, uint32_t* gpSumBuffer, uint32_t startBits, uint32_t* counter ) +extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOfInputs, u32* gpSumBuffer, u32 startBits, u32* counter ) { - __shared__ uint32_t localCounters[sizeof( RADIX_SORT_KEY_TYPE )][256]; + __shared__ u32 localCounters[sizeof( RADIX_SORT_KEY_TYPE )][256]; for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { @@ -857,8 +852,8 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num __syncthreads(); - uint32_t numberOfBlocks = div_round_up( numberOfInputs, GHISTOGRAM_ITEM_PER_BLOCK ); - __shared__ uint32_t iBlock; + u32 numberOfBlocks = div_round_up( numberOfInputs, GHISTOGRAM_ITEM_PER_BLOCK ); + __shared__ u32 iBlock; if( threadIdx.x == 0 ) { iBlock = atomicInc( counter, 0xFFFFFFFF ); @@ -876,7 +871,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num { for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK * 4 ) { - uint32_t itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i + threadIdx.x * 4; + u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i + threadIdx.x * 4; struct alignas( 16 ) Key4 { RADIX_SORT_KEY_TYPE xs[4]; @@ -887,8 +882,8 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num auto item = key4.xs[k]; for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - uint32_t bitLocation = startBits + i * 8; - uint32_t bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bitLocation = startBits + i * 8; + u32 bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } } @@ -898,14 +893,14 @@ 
extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num { for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK ) { - uint32_t itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x + i; + u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x + i; if( itemIndex < numberOfInputs ) { auto item = inputs[itemIndex]; for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - uint32_t bitLocation = startBits + i * 8; - uint32_t bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bitLocation = startBits + i * 8; + u32 bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } } @@ -935,9 +930,9 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num } } -extern "C" __global__ void gPrefixSum( uint32_t* gpSumBuffer ) +extern "C" __global__ void gPrefixSum( u32* gpSumBuffer ) { - __shared__ uint32_t smem[256]; + __shared__ u32 smem[256]; smem[threadIdx.x] = gpSumBuffer[blockIdx.x * 256 + threadIdx.x]; @@ -948,29 +943,29 @@ extern "C" __global__ void gPrefixSum( uint32_t* gpSumBuffer ) gpSumBuffer[blockIdx.x * 256 + threadIdx.x] = smem[threadIdx.x]; } -__device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, uint32_t iteration ) +__device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, u32 numberOfInputs, u32* gpSumBuffer, + volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { struct ElementLocation { - uint32_t localSrcIndex : 12; - uint32_t localOffset : 12; - uint32_t bucket : 8; + 
u32 localSrcIndex : 12; + u32 localOffset : 12; + u32 bucket : 8; }; - __shared__ uint32_t pSum[256]; - __shared__ uint32_t localPrefixSum[256]; - __shared__ uint32_t counters[256]; + __shared__ u32 pSum[256]; + __shared__ u32 localPrefixSum[256]; + __shared__ u32 counters[256]; __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; - __shared__ uint8_t elementBuckets[RADIX_SORT_BLOCK_SIZE]; - __shared__ uint32_t matchMasks[REORDER_NUMBER_OF_WARPS][256]; + __shared__ u8 elementBuckets[RADIX_SORT_BLOCK_SIZE]; + __shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][256]; - uint32_t bitLocation = startBits + 8 * iteration; - uint32_t blockIndex = blockIdx.x; - uint32_t numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); + u32 bitLocation = startBits + 8 * iteration; + u32 blockIndex = blockIdx.x; + u32 numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); - clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, uint32_t>( localPrefixSum, 0 ); - clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, uint32_t>( counters, 0 ); + clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, u32>( localPrefixSum, 0 ); + clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, u32>( counters, 0 ); for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) { @@ -987,7 +982,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK * 4 ) { - uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x * 4; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x * 4; struct alignas( 16 ) Key4 { RADIX_SORT_KEY_TYPE xs[4]; @@ -996,7 +991,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int k = 0; k < 4; k++ ) { auto item = key4.xs[k]; - uint32_t bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; 
atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); elementBuckets[i + threadIdx.x * 4 + k] = bucketIndex; } @@ -1006,11 +1001,11 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; if( itemIndex < numberOfInputs ) { auto item = inputKeys[itemIndex]; - uint32_t bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); elementBuckets[i + threadIdx.x] = bucketIndex; @@ -1020,11 +1015,11 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys struct ParitionID { - uint64_t value : 32; - uint64_t block : 30; - uint64_t flag : 2; + u64 value : 32; + u64 block : 30; + u64 flag : 2; }; - auto asPartition = []( uint64_t x ) + auto asPartition = []( u64 x ) { ParitionID pa; memcpy( &pa, &x, sizeof( ParitionID ) ); @@ -1032,14 +1027,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; auto asU64 = []( ParitionID pa ) { - uint64_t x; - memcpy( &x, &pa, sizeof( uint64_t ) ); + u64 x; + memcpy( &x, &pa, sizeof( u64 ) ); return x; }; if( threadIdx.x == 0 && LOOKBACK_TABLE_SIZE <= blockIndex ) { - uint32_t mustBeDone = blockIndex - LOOKBACK_TABLE_SIZE + MAX_LOOK_BACK; + u32 mustBeDone = blockIndex - LOOKBACK_TABLE_SIZE + MAX_LOOK_BACK; while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) * TAIL_COUNT <= mustBeDone ) ; } @@ -1047,7 +1042,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int i = threadIdx.x; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - uint32_t s = localPrefixSum[i]; + u32 s = localPrefixSum[i]; int pIndex = 256 * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; { @@ 
-1058,9 +1053,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys lookBackBuffer[pIndex] = asU64( pa ); } - uint32_t gp = gpSumBuffer[iteration * 256 + i]; + u32 gp = gpSumBuffer[iteration * 256 + i]; - uint32_t p = 0; + u32 p = 0; for( int iBlock = (int)blockIndex - 1; 0 <= iBlock; iBlock-- ) { @@ -1076,7 +1071,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pa = asPartition( lookBackBuffer[lookbackIndex] ); } while( ( pa.flag & flagRequire ) == 0 || pa.block != iBlock ); - uint32_t value = pa.value; + u32 value = pa.value; p += value; if( pa.flag == 2 ) { @@ -1094,7 +1089,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pSum[i] = gp + p; } - uint32_t prefix = 0; + u32 prefix = 0; for( int i = 0; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { prefix += prefixSumExclusive( prefix, &localPrefixSum[i] ); @@ -1111,8 +1106,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // reorder for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; - uint32_t bucketIndex = elementBuckets[i + threadIdx.x]; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 bucketIndex = elementBuckets[i + threadIdx.x]; __syncthreads(); @@ -1130,9 +1125,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( itemIndex < numberOfInputs ) { - uint32_t matchMask = matchMasks[warp][bucketIndex]; - uint32_t lowerMask = ( 1u << lane ) - 1; - uint32_t offset = __popc( matchMask & lowerMask ); + u32 matchMask = matchMasks[warp][bucketIndex]; + u32 lowerMask = ( 1u << lane ) - 1; + u32 offset = __popc( matchMask & lowerMask ); flushMask = offset == 0; @@ -1141,8 +1136,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys offset += __popc( 
matchMasks[w][bucketIndex] ); } - uint32_t localOffset = counters[bucketIndex] + offset; - uint32_t to = localOffset + localPrefixSum[bucketIndex]; + u32 localOffset = counters[bucketIndex] + offset; + u32 to = localOffset + localPrefixSum[bucketIndex]; ElementLocation el; el.localSrcIndex = i + threadIdx.x; @@ -1165,14 +1160,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; if( itemIndex < numberOfInputs ) { ElementLocation el = elementLocations[i + threadIdx.x]; - uint32_t srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - uint8_t bucketIndex = el.bucket; + u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + u8 bucketIndex = el.bucket; - uint32_t dstIndex = pSum[bucketIndex] + el.localOffset; + u32 dstIndex = pSum[bucketIndex] + el.localOffset; outputKeys[dstIndex] = inputKeys[srcIndex]; } } @@ -1180,26 +1175,26 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; if( itemIndex < numberOfInputs ) { ElementLocation el = elementLocations[i + threadIdx.x]; - uint32_t srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - uint8_t bucketIndex = el.bucket; + u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + u8 bucketIndex = el.bucket; - uint32_t dstIndex = pSum[bucketIndex] + el.localOffset; + u32 dstIndex = pSum[bucketIndex] + el.localOffset; outputValues[dstIndex] = inputValues[srcIndex]; } } } } -extern "C" __global__ void onesweep_reorderKey64( 
RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, - uint32_t iteration ) +extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, + u32 iteration ) { onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } -extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, uint32_t iteration ) +extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, + volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } \ No newline at end of file From 8dbf83a1c9bd24522771b3803d8cb20bd63cc9c4 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:19:41 +0900 Subject: [PATCH 25/68] to constexpr noexcept --- ParallelPrimitives/RadixSortKernels.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 4088335..e843d5d 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -777,7 +777,8 @@ using RADIX_SORT_VALUE_TYPE = u32; #define 
ITS 1 #endif -__device__ inline u32 div_round_up( u32 val, u32 divisor ) { return ( val + divisor - 1 ) / divisor; } +__device__ constexpr u32 div_round_up( u32 val, u32 divisor ) noexcept { return ( val + divisor - 1 ) / divisor; } + template __device__ void clearShared( T* sMem, T value ) { From be5f26e64b0f991fd37a7ac7f7dc3df26dbd41ce Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:39:37 +0900 Subject: [PATCH 26/68] use constexpr and remove unused functions --- ParallelPrimitives/RadixSortKernels.h | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index e843d5d..28a578b 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -1,6 +1,5 @@ #include #define LDS_BARRIER __syncthreads() - namespace { @@ -766,15 +765,11 @@ using RADIX_SORT_KEY_TYPE = u32; using RADIX_SORT_VALUE_TYPE = u32; #if defined( DESCENDING_ORDER ) -#define ORDER_MASK_32 0xFFFFFFFF -#define ORDER_MASK_64 0xFFFFFFFFFFFFFFFFllu +constexpr u32 ORDER_MASK_32 = 0xFFFFFFFF; +constexpr u64 ORDER_MASK_64 = 0xFFFFFFFFFFFFFFFFllu; #else -#define ORDER_MASK_32 0 -#define ORDER_MASK_64 0llu -#endif - -#if defined( CUDART_VERSION ) && CUDART_VERSION >= 9000 -#define ITS 1 +constexpr u32 ORDER_MASK_32 = 0; +constexpr u64 ORDER_MASK_64 = 0llu; #endif __device__ constexpr u32 div_round_up( u32 val, u32 divisor ) noexcept { return ( val + divisor - 1 ) / divisor; } @@ -793,20 +788,6 @@ __device__ void clearShared( T* sMem, T value ) __device__ inline u32 getKeyBits( u32 x ) { return x ^ ORDER_MASK_32; } __device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } -__device__ inline u32 getKeyBits( float x ) -{ - if( x == 0.0f ) x = 0.0f; - - u32 flip = u32( __float_as_int( x ) >> 31 ) | 0x80000000; - return __float_as_uint( x ) ^ flip ^ ORDER_MASK_32; -} -__device__ inline u64 getKeyBits( double x ) -{ - if( x 
== 0.0 ) x = 0.0; - - u64 flip = u64( __double_as_longlong( x ) >> 63 ) | 0x8000000000000000llu; - return (u64)__double_as_longlong( x ) ^ flip ^ ORDER_MASK_64; -} template __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) From a443768d42fd5f24532c70f012c6bece2ecf21ef Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:57:57 +0900 Subject: [PATCH 27/68] use BIN_SIZE constant --- ParallelPrimitives/RadixSort.cpp | 6 ++--- ParallelPrimitives/RadixSortConfigs.h | 3 +++ ParallelPrimitives/RadixSortKernels.h | 36 +++++++++++++-------------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 8829c0b..c2601c4 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -302,10 +302,10 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc { compileKernels( kernelPath, includeDir ); - u64 gpSumBuffer = sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ); + u64 gpSumBuffer = sizeof( u32 ) * BIN_SIZE * sizeof( u32 /* key type */ ); m_gpSumBuffer.resizeAsync( gpSumBuffer, false /*copy*/, stream ); - u64 lookBackBuffer = sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE ); + u64 lookBackBuffer = sizeof( u64 ) * ( BIN_SIZE * LOOKBACK_TABLE_SIZE ); m_lookbackBuffer.resizeAsync( lookBackBuffer, false /*copy*/, stream ); m_tailIterator.resizeAsync( 1, false /*copy*/, stream ); @@ -381,7 +381,7 @@ void RadixSort::sort( const KeyValueSoA& src, const KeyValueSoA& dst, uint32_t n } { const void* args[] = { &gpSumBuffer }; - OrochiUtils::launch1D( m_gPrefixSum, nIteration * 256, args, 256, 0, stream ); + OrochiUtils::launch1D( m_gPrefixSum, nIteration * BIN_SIZE, args, BIN_SIZE, 0, stream ); } auto s = src; diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index c5e2c0a..33c5f78 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ 
b/ParallelPrimitives/RadixSortConfigs.h @@ -55,4 +55,7 @@ constexpr int MAX_LOOK_BACK = 64; constexpr int TAIL_BITS = 4; constexpr int TAIL_COUNT = 1u << TAIL_BITS; +static_assert( REORDER_NUMBER_OF_THREADS_PER_BLOCK <= BIN_SIZE, "please check prefixSumExclusive on onesweep_reorder" ); +static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); + }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 28a578b..d251e72 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -904,9 +904,9 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - for( int j = threadIdx.x; j < 256; j += GHISTOGRAM_THREADS_PER_BLOCK ) + for( int j = threadIdx.x; j < BIN_SIZE; j += GHISTOGRAM_THREADS_PER_BLOCK ) { - atomicAdd( &gpSumBuffer[256 * i + j], localCounters[i][j] ); + atomicAdd( &gpSumBuffer[BIN_SIZE * i + j], localCounters[i][j] ); } } } @@ -914,15 +914,15 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf extern "C" __global__ void gPrefixSum( u32* gpSumBuffer ) { - __shared__ u32 smem[256]; + __shared__ u32 smem[BIN_SIZE]; - smem[threadIdx.x] = gpSumBuffer[blockIdx.x * 256 + threadIdx.x]; + smem[threadIdx.x] = gpSumBuffer[blockIdx.x * BIN_SIZE + threadIdx.x]; __syncthreads(); - prefixSumExclusive<256>( 0, smem ); + prefixSumExclusive( 0, smem ); - gpSumBuffer[blockIdx.x * 256 + threadIdx.x] = smem[threadIdx.x]; + gpSumBuffer[blockIdx.x * BIN_SIZE + threadIdx.x] = smem[threadIdx.x]; } __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, u32 numberOfInputs, u32* gpSumBuffer, @@ -935,23 +935,23 @@ __device__ 
__forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 bucket : 8; }; - __shared__ u32 pSum[256]; - __shared__ u32 localPrefixSum[256]; - __shared__ u32 counters[256]; + __shared__ u32 pSum[BIN_SIZE]; + __shared__ u32 localPrefixSum[BIN_SIZE]; + __shared__ u32 counters[BIN_SIZE]; __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; __shared__ u8 elementBuckets[RADIX_SORT_BLOCK_SIZE]; - __shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][256]; + __shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][BIN_SIZE]; u32 bitLocation = startBits + 8 * iteration; u32 blockIndex = blockIdx.x; u32 numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); - clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, u32>( localPrefixSum, 0 ); - clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, u32>( counters, 0 ); + clearShared( localPrefixSum, 0 ); + clearShared( counters, 0 ); for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) { - for( int i = 0; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { matchMasks[w][i + threadIdx.x] = 0; } @@ -1022,10 +1022,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } __syncthreads(); - for( int i = threadIdx.x; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { u32 s = localPrefixSum[i]; - int pIndex = 256 * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; + int pIndex = BIN_SIZE * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; { ParitionID pa; @@ -1035,13 +1035,13 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys lookBackBuffer[pIndex] = asU64( pa ); } - u32 gp = gpSumBuffer[iteration * 256 + i]; + u32 gp = gpSumBuffer[iteration * BIN_SIZE + i]; u32 p = 0; for( int iBlock = (int)blockIndex - 1; 0 <= iBlock; iBlock-- ) { - int lookbackIndex = 256 * ( iBlock % LOOKBACK_TABLE_SIZE 
) + i; + int lookbackIndex = BIN_SIZE * ( iBlock % LOOKBACK_TABLE_SIZE ) + i; ParitionID pa; // when you reach to the maximum, flag must be 2. flagRequire = 0b10 @@ -1072,7 +1072,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } u32 prefix = 0; - for( int i = 0; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { prefix += prefixSumExclusive( prefix, &localPrefixSum[i] ); } From 82d4aad99044db927c352cd88403c3997ac177db Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 11:24:07 +0900 Subject: [PATCH 28/68] extract common process as extractDigit() --- ParallelPrimitives/RadixSortKernels.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index d251e72..f7b3912 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -788,6 +788,8 @@ __device__ void clearShared( T* sMem, T value ) __device__ inline u32 getKeyBits( u32 x ) { return x ^ ORDER_MASK_32; } __device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } +__device__ inline u32 extractDigit( u32 x, u32 bitLocation ) { return ( x >> bitLocation ) & RADIX_MASK; } +__device__ inline u32 extractDigit( u64 x, u32 bitLocation ) { return (u32)( ( x >> bitLocation ) & RADIX_MASK ); } template __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) @@ -865,7 +867,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { u32 bitLocation = startBits + i * 8; - u32 bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bits = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } } @@ -882,7 +884,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf for( 
int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { u32 bitLocation = startBits + i * 8; - u32 bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bits = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } } @@ -973,7 +975,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int k = 0; k < 4; k++ ) { auto item = key4.xs[k]; - u32 bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); elementBuckets[i + threadIdx.x * 4 + k] = bucketIndex; } @@ -987,7 +989,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( itemIndex < numberOfInputs ) { auto item = inputKeys[itemIndex]; - u32 bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); elementBuckets[i + threadIdx.x] = bucketIndex; From 866b70c2e9f590b869faf30c7c4fe64576d2669f Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 11:41:42 +0900 Subject: [PATCH 29/68] keyPair as a template parameter --- ParallelPrimitives/RadixSortKernels.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index f7b3912..d4b7343 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -927,7 +927,8 @@ extern "C" __global__ void gPrefixSum( u32* gpSumBuffer ) gpSumBuffer[blockIdx.x * BIN_SIZE + threadIdx.x] = smem[threadIdx.x]; } -__device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, u32 numberOfInputs, u32* gpSumBuffer, +template +__device__ 
__forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { struct ElementLocation @@ -1155,7 +1156,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys outputKeys[dstIndex] = inputKeys[srcIndex]; } } - if( keyPair ) + if constexpr ( keyPair ) { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { @@ -1175,10 +1176,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { - onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); + onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { - onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); + onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } \ No newline at end of file From a9c4e61fcf35ec1fdd9392a3f24d0874633ac2ec Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: 
Tue, 26 Dec 2023 13:27:02 +0900 Subject: [PATCH 30/68] remove unused codes --- ParallelPrimitives/RadixSort.cpp | 149 +------- ParallelPrimitives/RadixSort.h | 50 +-- ParallelPrimitives/RadixSortConfigs.h | 22 -- ParallelPrimitives/RadixSortKernels.h | 510 -------------------------- 4 files changed, 4 insertions(+), 727 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index c2601c4..3f31121 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -79,23 +79,6 @@ RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils, oroStream stream, configure( kernelPath, includeDir, stream ); } -//void RadixSort::exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept -//{ -// const auto buffer_size = countsGpu.size(); -// -// std::vector counts = countsGpu.getData(); -// std::vector offsets( buffer_size ); -// -// int sum = 0; -// for( int i = 0; i < counts.size(); ++i ) -// { -// offsets[i] = sum; -// sum += counts[i]; -// } -// -// offsetsGpu.copyFromHost( offsets.data(), std::size( offsets ) ); -//} - void RadixSort::compileKernels( const std::string& kernelPath, const std::string& includeDir ) noexcept { static constexpr auto defaultKernelPath{ "../ParallelPrimitives/RadixSortKernels.h" }; @@ -127,53 +110,20 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string binaryPath = getCurrentDir(); binaryPath += isAmd ? 
"oro_compiled_kernels.hipfb" : "oro_compiled_kernels.fatbin"; log = "loading pre-compiled kernels at path : " + binaryPath; - - //m_num_threads_per_block_for_count = DEFAULT_COUNT_BLOCK_SIZE; - //m_num_threads_per_block_for_scan = DEFAULT_SCAN_BLOCK_SIZE; - //m_num_threads_per_block_for_sort = DEFAULT_SORT_BLOCK_SIZE; - - //const auto warp_size = DEFAULT_WARP_SIZE; - - //m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; } else { log = "compiling kernels at path : " + currentKernelPath + " in : " + currentIncludeDir; - - //m_num_threads_per_block_for_count = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_COUNT_BLOCK_SIZE; - //m_num_threads_per_block_for_scan = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SCAN_BLOCK_SIZE; - //m_num_threads_per_block_for_sort = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SORT_BLOCK_SIZE; - - //const auto warp_size = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; - - //m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; - - //assert( m_num_threads_per_block_for_count % warp_size == 0 ); - //assert( m_num_threads_per_block_for_scan % warp_size == 0 ); - //assert( m_num_threads_per_block_for_sort % warp_size == 0 ); } - //m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / m_warp_size; - if( m_flags == Flag::LOG ) { std::cout << log << std::endl; } const auto includeArg{ "-I" + currentIncludeDir }; - //const auto overwrite_flag = "-DOVERWRITE"; - //const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); - //const auto scan_block_size_param = "-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); - //const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); - //const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( 
m_num_warps_per_block_for_sort ); - std::vector opts; opts.push_back( includeArg.c_str() ); - //opts.push_back( overwrite_flag ); - //opts.push_back( count_block_size_param.c_str() ); - //opts.push_back( scan_block_size_param.c_str() ); - //opts.push_back( sort_block_size_param.c_str() ); - //opts.push_back( sort_num_warps_param.c_str() ); struct Record { @@ -181,10 +131,6 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string Kernel kernelType; }; - //const std::vector records{ - // { "CountKernel", Kernel::COUNT }, { "ParallelExclusiveScanSingleWG", Kernel::SCAN_SINGLE_WG }, { "ParallelExclusiveScanAllWG", Kernel::SCAN_PARALLEL }, { "SortKernel", Kernel::SORT }, - // { "SortKVKernel", Kernel::SORT_KV }, { "SortSinglePassKernel", Kernel::SORT_SINGLE_PASS }, { "SortSinglePassKVKernel", Kernel::SORT_SINGLE_PASS_KV }, - //}; const std::vector records{ { "SortSinglePassKernel", Kernel::SORT_SINGLE_PASS }, { "SortSinglePassKVKernel", Kernel::SORT_SINGLE_PASS_KV }, }; @@ -222,81 +168,8 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string LOAD_FUNC( m_onesweep_reorderKey64, "onesweep_reorderKey64" ); LOAD_FUNC( m_onesweep_reorderKeyPair64, "onesweep_reorderKeyPair64" ); #undef LOAD_FUNC - // const auto includeArg{ "-I" + currentIncludeDir }; - // const auto overwrite_flag = "-DOVERWRITE"; - // const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); - // const auto scan_block_size_param = "-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); - // const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); - // const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( m_num_warps_per_block_for_sort ); - - // std::vector opts; - - // if( const std::string device_name = m_props.name; device_name.find( "NVIDIA" ) != std::string::npos ) - // { - // 
opts.push_back( "--use_fast_math" ); - // } - // else - // { - // opts.push_back( "-ffast-math" ); - // } - - // opts.push_back( includeArg.c_str() ); - // opts.push_back( overwrite_flag ); - // opts.push_back( count_block_size_param.c_str() ); - // opts.push_back( scan_block_size_param.c_str() ); - // opts.push_back( sort_block_size_param.c_str() ); - // opts.push_back( sort_num_warps_param.c_str() ); - - // for( const auto& record : records ) - // { - // if constexpr( useBakeKernel ) - // { - // oroFunctions[record.kernelType] = m_oroutils.getFunctionFromString( m_device, hip_RadixSortKernels, currentKernelPath.c_str(), record.kernelName.c_str(), &opts, 1, hip::RadixSortKernelsArgs, hip::RadixSortKernelsIncludes ); - // } - // else if constexpr( useBitCode ) - // { - // oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary( binaryPath.c_str(), record.kernelName.c_str() ); - // } - // else - // { - // oroFunctions[record.kernelType] = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), record.kernelName.c_str(), &opts ); - // } - - // if( m_flags == Flag::LOG ) - // { - // printKernelInfo( record.kernelName, oroFunctions[record.kernelType] ); - // } - // } -} -//int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept -//{ -// const int warpSize = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; -// const int warpPerWG = blockSize / warpSize; -// const int warpPerWGP = m_props.maxThreadsPerMultiProcessor / warpSize; -// const int occupancyFromWarp = ( warpPerWGP > 0 ) ? ( warpPerWGP / warpPerWG ) : 1; -// -// const int occupancy = std::max( 1, occupancyFromWarp ); -// -// if( m_flags == Flag::LOG ) -// { -// std::cout << "Occupancy: " << occupancy << '\n'; -// } -// -// static constexpr auto min_num_blocks = 16; -// auto number_of_blocks = m_props.multiProcessorCount > 0 ? 
m_props.multiProcessorCount * occupancy : min_num_blocks; -// -// if( m_num_threads_per_block_for_scan > BIN_SIZE ) -// { -// // Note: both are divisible by 2 -// const auto base = m_num_threads_per_block_for_scan / BIN_SIZE; -// -// // Floor -// number_of_blocks = ( number_of_blocks / base ) * base; -// } -// -// return number_of_blocks; -//} +} void RadixSort::configure( const std::string& kernelPath, const std::string& includeDir, oroStream stream ) noexcept { @@ -311,26 +184,6 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc m_tailIterator.resizeAsync( 1, false /*copy*/, stream ); m_tailIterator.resetAsync( stream ); m_gpSumCounter.resizeAsync( 1, false /*copy*/, stream ); - //m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); - - ///// The tmp buffer size of the count kernel and the scan kernel. - - //const auto tmp_buffer_size = BIN_SIZE * m_num_blocks_for_count; - - ///// @c tmp_buffer_size must be divisible by @c m_num_threads_per_block_for_scan - ///// This is guaranteed since @c m_num_blocks_for_count will be adjusted accordingly - - //m_num_blocks_for_scan = tmp_buffer_size / m_num_threads_per_block_for_scan; - - //m_tmp_buffer.resizeAsync( tmp_buffer_size, false, stream ); - - //if( selectedScanAlgo == ScanAlgo::SCAN_GPU_PARALLEL ) - //{ - // // These are for the scan kernel - // m_partial_sum.resizeAsync( m_num_blocks_for_scan, false, stream ); - // m_is_ready.resizeAsync( m_num_blocks_for_scan, false, stream ); - // m_is_ready.resetAsync( stream ); - //} } void RadixSort::setFlag( Flag flag ) noexcept { m_flags = flag; } diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index f530d79..2a7c3be 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -48,76 +48,32 @@ class RadixSort final void sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, oroStream stream = 0 ) noexcept; private: - //template - //void sort1pass( 
const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept; - - ///// @brief Compile the kernels for radix sort. - ///// @param kernelPath The kernel path. - ///// @param includeDir The include directory. + // @brief Compile the kernels for radix sort. + // @param kernelPath The kernel path. + // @param includeDir The include directory. void compileKernels( const std::string& kernelPath, const std::string& includeDir ) noexcept; - //[[nodiscard]] int calculateWGsToExecute( const int blockSize ) const noexcept; - - ///// @brief Exclusive scan algorithm on CPU for testing. - ///// It copies the count result from the Device to Host before computation, and then copies the offsets back from Host to Device afterward. - ///// @param countsGpu The count result in GPU memory. Otuput: The offset. - ///// @param offsetsGpu The offsets. - //void exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept; - /// @brief Configure the settings, compile the kernels and allocate the memory. /// @param kernelPath The kernel path. /// @param includeDir The include directory. void configure( const std::string& kernelPath, const std::string& includeDir, oroStream stream ) noexcept; private: - //// GPU blocks for the count kernel - //int m_num_blocks_for_count{}; - - //// GPU blocks for the scan kernel - //int m_num_blocks_for_scan{}; - Flag m_flags{ Flag::NO_LOG }; enum class Kernel { - //COUNT, - //SCAN_SINGLE_WG, - //SCAN_PARALLEL, - //SORT, - //SORT_KV, SORT_SINGLE_PASS, SORT_SINGLE_PASS_KV, }; std::unordered_map oroFunctions; - /// @brief The enum class which indicates the selected algorithm of prefix scan. 
- //enum class ScanAlgo - //{ - // SCAN_CPU, - // SCAN_GPU_SINGLE_WG, - // SCAN_GPU_PARALLEL, - //}; - - //constexpr static auto selectedScanAlgo{ ScanAlgo::SCAN_GPU_PARALLEL }; - - //GpuMemory m_partial_sum; - //GpuMemory m_is_ready; - oroDevice m_device{}; oroDeviceProp m_props{}; OrochiUtils& m_oroutils; - // This buffer holds the "bucket" table from all GPU blocks. - //GpuMemory m_tmp_buffer; - - //int m_num_threads_per_block_for_count{}; - //int m_num_threads_per_block_for_scan{}; - //int m_num_threads_per_block_for_sort{}; - - //int m_num_warps_per_block_for_sort{}; - oroFunction m_gHistogram; oroFunction m_gPrefixSum; oroFunction m_onesweep_reorderKey64; diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 33c5f78..ccdd81a 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -7,29 +7,11 @@ constexpr auto N_RADIX{ 8 }; constexpr auto BIN_SIZE{ 1 << N_RADIX }; constexpr auto RADIX_MASK{ ( 1 << N_RADIX ) - 1 }; constexpr auto PACK_FACTOR{ sizeof( int ) / sizeof( char ) }; -constexpr auto N_PACKED{ BIN_SIZE / PACK_FACTOR }; -constexpr auto PACK_MAX{ 255 }; -constexpr auto N_PACKED_PER_WI{ N_PACKED / WG_SIZE }; -constexpr auto N_BINS_PER_WI{ BIN_SIZE / WG_SIZE }; constexpr auto N_BINS_4BIT{ 16 }; constexpr auto N_BINS_PACK_FACTOR{ sizeof( long long ) / sizeof( short ) }; constexpr auto N_BINS_PACKED_4BIT{ N_BINS_4BIT / N_BINS_PACK_FACTOR }; -constexpr auto N_BINS_8BIT{ 1 << 8 }; - -constexpr auto DEFAULT_WARP_SIZE{ 32 }; - -constexpr auto DEFAULT_NUM_WARPS_PER_BLOCK{ 8 }; - -// count config - -constexpr auto DEFAULT_COUNT_BLOCK_SIZE{ DEFAULT_WARP_SIZE * DEFAULT_NUM_WARPS_PER_BLOCK }; - -// scan configs -constexpr auto DEFAULT_SCAN_BLOCK_SIZE{ DEFAULT_WARP_SIZE * DEFAULT_NUM_WARPS_PER_BLOCK }; - // sort configs -constexpr auto DEFAULT_SORT_BLOCK_SIZE{ DEFAULT_WARP_SIZE * DEFAULT_NUM_WARPS_PER_BLOCK }; constexpr auto SORT_N_ITEMS_PER_WI{ 12 }; constexpr auto 
SINGLE_SORT_N_ITEMS_PER_WI{ 24 }; constexpr auto SINGLE_SORT_WG_SIZE{ 128 }; @@ -38,10 +20,6 @@ constexpr auto SINGLE_SORT_WG_SIZE{ 128 }; static_assert( BIN_SIZE % 2 == 0 ); -// Notice that, on some GPUs, the max size of a GPU block cannot be greater than 256 -static_assert( DEFAULT_COUNT_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); -static_assert( DEFAULT_SCAN_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); - constexpr int RADIX_SORT_BLOCK_SIZE = 2048; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index d4b7343..3647a39 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -11,137 +11,6 @@ using u32 = unsigned int; using u64 = unsigned long long; } // namespace -// #define NV_WORKAROUND 1 - -// default values -//#if defined( OVERWRITE ) -// -//constexpr auto COUNT_WG_SIZE{ COUNT_WG_SIZE_VAL }; -//constexpr auto SCAN_WG_SIZE{ SCAN_WG_SIZE_VAL }; -//constexpr auto SORT_WG_SIZE{ SORT_WG_SIZE_VAL }; -//constexpr auto SORT_NUM_WARPS_PER_BLOCK{ SORT_NUM_WARPS_PER_BLOCK_VAL }; -// -//#else -// -//constexpr auto COUNT_WG_SIZE{ DEFAULT_COUNT_BLOCK_SIZE }; -//constexpr auto SCAN_WG_SIZE{ DEFAULT_SCAN_BLOCK_SIZE }; -//constexpr auto SORT_WG_SIZE{ DEFAULT_SORT_BLOCK_SIZE }; -//constexpr auto SORT_NUM_WARPS_PER_BLOCK{ DEFAULT_NUM_WARPS_PER_BLOCK }; -// -//#endif - -//__device__ constexpr u32 getMaskedBits( const u32 value, const u32 shift ) noexcept { return ( value >> shift ) & RADIX_MASK; } -// -//extern "C" __global__ void CountKernel( int* gSrc, int* gDst, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) -//{ -// __shared__ int table[BIN_SIZE]; -// -// for( int i = threadIdx.x; i < BIN_SIZE; i += COUNT_WG_SIZE ) -// { -// table[i] = 0; -// } -// -// __syncthreads(); -// -// const int offset = blockIdx.x * gNItemsPerWG; -// const int upperBound = ( offset + gNItemsPerWG > gN ) ? 
gN - offset : gNItemsPerWG; -// -// for( int i = threadIdx.x; i < upperBound; i += COUNT_WG_SIZE ) -// { -// const int idx = offset + i; -// const int tableIdx = getMaskedBits( gSrc[idx], START_BIT ); -// atomicAdd( &table[tableIdx], 1 ); -// } -// -// __syncthreads(); -// -// for( int i = threadIdx.x; i < BIN_SIZE; i += COUNT_WG_SIZE ) -// { -// gDst[i * N_WGS_EXECUTED + blockIdx.x] = table[i]; -// } -//} - -template -struct ScanImpl -{ - __device__ static T exec( T a ) - { - T b = __shfl( a, threadIdx.x - STRIDE ); - if( threadIdx.x >= STRIDE ) a += b; - return ScanImpl::exec( a ); - } -}; - -template -struct ScanImpl -{ - __device__ static T exec( T a ) { return a; } -}; - -template -__device__ void waveScanInclusive( T& a, int width ) -{ -#if 0 - a = ScanImpl::exec( a ); -#else - for( int i = 1; i < width; i *= 2 ) - { - T b = __shfl( a, threadIdx.x - i ); - if( threadIdx.x >= i ) a += b; - } -#endif -} - -template -__device__ T waveScanExclusive( T& a, int width ) -{ - waveScanInclusive( a, width ); - - T sum = __shfl( a, width - 1 ); - a = __shfl( a, threadIdx.x - 1 ); - if( threadIdx.x == 0 ) a = 0; - - return sum; -} - -template -__device__ void ldsScanInclusive( T* lds, int width ) -{ - // The width cannot exceed WG_SIZE - __shared__ T temp[2][WG_SIZE]; - - constexpr int MAX_INDEX = 1; - int outIndex = 0; - int inIndex = 1; - - temp[outIndex][threadIdx.x] = lds[threadIdx.x]; - __syncthreads(); - - for( int i = 1; i < width; i *= 2 ) - { - // Swap in and out index for the buffers - - outIndex = MAX_INDEX - outIndex; - inIndex = MAX_INDEX - outIndex; - - if( threadIdx.x >= i ) - { - temp[outIndex][threadIdx.x] = temp[inIndex][threadIdx.x] + temp[inIndex][threadIdx.x - i]; - } - else - { - temp[outIndex][threadIdx.x] = temp[inIndex][threadIdx.x]; - } - - __syncthreads(); - } - - lds[threadIdx.x] = temp[outIndex][threadIdx.x]; - - // Ensure the results are written in LDS and are observable in a block (workgroup) before return. 
- __threadfence_block(); -} - template __device__ T ldsScanExclusive( T* lds, int width ) { @@ -325,140 +194,6 @@ __device__ void localSort4bitMulti( int* keys, u32* ldsKeys, int* values, u32* l } } -//__device__ void localSort8bitMulti_shared_bin( int* keys, u32* ldsKeys, const int START_BIT ) -//{ -// __shared__ unsigned table[BIN_SIZE]; -// -// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) -// { -// table[i] = 0U; -// } -// -// LDS_BARRIER; -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) -// { -// const int tableIdx = ( keys[i] >> START_BIT ) & RADIX_MASK; -// atomicAdd( &table[tableIdx], 1 ); -// } -// -// LDS_BARRIER; -// -// int globalSum = 0; -// for( int binId = 0; binId < BIN_SIZE; binId += SORT_WG_SIZE * 2 ) -// { -// unsigned* globalOffset = &table[binId]; -// const unsigned currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); -// globalOffset[threadIdx.x * 2] += globalSum; -// globalOffset[threadIdx.x * 2 + 1] += globalSum; -// globalSum += currentGlobalSum; -// } -// -// LDS_BARRIER; -// -// __shared__ u32 keyBuffer[SORT_WG_SIZE * SORT_N_ITEMS_PER_WI]; -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) -// { -// keyBuffer[threadIdx.x * SORT_N_ITEMS_PER_WI + i] = keys[i]; -// } -// -// LDS_BARRIER; -// -// if( threadIdx.x == 0 ) -// { -// for( int i = 0; i < SORT_WG_SIZE * SORT_N_ITEMS_PER_WI; ++i ) -// { -// const int tableIdx = ( keyBuffer[i] >> START_BIT ) & RADIX_MASK; -// const int writeIndex = table[tableIdx]; -// -// ldsKeys[writeIndex] = keyBuffer[i]; -// -// ++table[tableIdx]; -// } -// } -// -// LDS_BARRIER; -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) -// { -// keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; -// } -//} -// -//__device__ void localSort8bitMulti_group( int* keys, u32* ldsKeys, const int START_BIT ) -//{ -// constexpr auto N_GROUP_SIZE{ N_BINS_8BIT / ( sizeof( u64 ) / sizeof( u16 ) ) }; -// -// __shared__ union -// { -// u16 m_ungrouped[SORT_WG_SIZE + 
1][N_BINS_8BIT]; -// u64 m_grouped[SORT_WG_SIZE + 1][N_GROUP_SIZE]; -// } lds; -// -// for( int i = 0; i < N_GROUP_SIZE; ++i ) -// { -// lds.m_grouped[threadIdx.x][i] = 0U; -// } -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) -// { -// const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; -// ++lds.m_ungrouped[threadIdx.x][in8bit]; -// } -// -// LDS_BARRIER; -// -// for( int groupId = threadIdx.x; groupId < N_GROUP_SIZE; groupId += SORT_WG_SIZE ) -// { -// u64 sum = 0U; -// for( int i = 0; i < SORT_WG_SIZE; i++ ) -// { -// const auto current = lds.m_grouped[i][groupId]; -// lds.m_grouped[i][groupId] = sum; -// sum += current; -// } -// lds.m_grouped[SORT_WG_SIZE][groupId] = sum; -// } -// -// LDS_BARRIER; -// -// int globalSum = 0; -// for( int binId = 0; binId < N_BINS_8BIT; binId += SORT_WG_SIZE * 2 ) -// { -// auto* globalOffset = &lds.m_ungrouped[SORT_WG_SIZE][binId]; -// const int currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); -// globalOffset[threadIdx.x * 2] += globalSum; -// globalOffset[threadIdx.x * 2 + 1] += globalSum; -// globalSum += currentGlobalSum; -// } -// -// LDS_BARRIER; -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) -// { -// const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; -// const auto offset = lds.m_ungrouped[SORT_WG_SIZE][in8bit]; -// const auto rank = lds.m_ungrouped[threadIdx.x][in8bit]++; -// -// ldsKeys[offset + rank] = keys[i]; -// } -// -// LDS_BARRIER; -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) -// { -// keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; -// } -//} - -//template -//__device__ void localSort8bitMulti( int* keys, u32* ldsKeys, int* values, u32* ldsValues, const int START_BIT ) -//{ -// localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT ); -// if( N_RADIX > 4 ) localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT + 4 ); -//} - template __device__ void SortSinglePass( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, 
int gN, const int START_BIT, const int END_BIT ) { @@ -513,251 +248,6 @@ extern "C" __global__ void SortSinglePassKernel( int* gSrcKey, int* gDstKey, int extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int gN, const int START_BIT, const int END_BIT ) { SortSinglePass( gSrcKey, gSrcVal, gDstKey, gDstVal, gN, START_BIT, END_BIT ); } -//extern "C" __global__ void ParallelExclusiveScanSingleWG( int* gCount, int* gHistogram, const int N_WGS_EXECUTED ) -//{ -// // Use a single WG. -// if( blockIdx.x != 0 ) -// { -// return; -// } -// -// // LDS for the parallel scan of the global sum: -// // First we store the sum of the counters of each number to it, -// // then we compute the global offset using parallel exclusive scan. -// __shared__ int blockBuffer[BIN_SIZE]; -// -// // fill the LDS with the local sum -// -// for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) -// { -// // Do exclusive scan for each segment handled by each WI in a WG -// -// int localThreadSum = 0; -// for( int i = 0; i < N_WGS_EXECUTED; ++i ) -// { -// int current = gCount[binId * N_WGS_EXECUTED + i]; -// gCount[binId * N_WGS_EXECUTED + i] = localThreadSum; -// -// localThreadSum += current; -// } -// -// // Store the thread local sum to LDS. -// -// blockBuffer[binId] = localThreadSum; -// } -// -// LDS_BARRIER; -// -// // Do parallel exclusive scan on the LDS -// -// int globalSum = 0; -// for( int binId = 0; binId < BIN_SIZE; binId += WG_SIZE * 2 ) -// { -// int* globalOffset = &blockBuffer[binId]; -// int currentGlobalSum = ldsScanExclusive( globalOffset, WG_SIZE * 2 ); -// globalOffset[threadIdx.x * 2] += globalSum; -// globalOffset[threadIdx.x * 2 + 1] += globalSum; -// globalSum += currentGlobalSum; -// } -// -// LDS_BARRIER; -// -// // Add the global offset to the global histogram. 
-// -// for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) -// { -// for( int i = 0; i < N_WGS_EXECUTED; ++i ) -// { -// gHistogram[binId * N_WGS_EXECUTED + i] += blockBuffer[binId]; -// } -// } -//} -// -//extern "C" __device__ void WorkgroupSync( int threadId, int blockId, int currentSegmentSum, int* currentGlobalOffset, volatile int* gPartialSum, volatile bool* gIsReady ) -//{ -// if( threadId == 0 ) -// { -// int offset = 0; -// -// if( blockId != 0 ) -// { -// while( !gIsReady[blockId - 1] ) -// { -// } -// -// offset = gPartialSum[blockId - 1]; -// -// __threadfence(); -// -// // Reset the value -// gIsReady[blockId - 1] = false; -// } -// -// gPartialSum[blockId] = offset + currentSegmentSum; -// -// // Ensure that the gIsReady is only modified after the gPartialSum is written. -// __threadfence(); -// -// gIsReady[blockId] = true; -// -// *currentGlobalOffset = offset; -// } -// -// __syncthreads(); -//} -// -//extern "C" __global__ void ParallelExclusiveScanAllWG( int* gCount, int* gHistogram, volatile int* gPartialSum, volatile bool* gIsReady ) -//{ -// // Fill the LDS with the partial sum of each segment -// __shared__ int blockBuffer[SCAN_WG_SIZE]; -// -// blockBuffer[threadIdx.x] = gCount[blockIdx.x * blockDim.x + threadIdx.x]; -// -// __syncthreads(); -// -// // Do parallel exclusive scan on the LDS -// -// int currentSegmentSum = ldsScanExclusive( blockBuffer, SCAN_WG_SIZE ); -// -// __syncthreads(); -// -// // Sync all the Workgroups to calculate the global offset. -// -// __shared__ int currentGlobalOffset; -// WorkgroupSync( threadIdx.x, blockIdx.x, currentSegmentSum, ¤tGlobalOffset, gPartialSum, gIsReady ); -// -// // Write back the result. 
-// -// gHistogram[blockIdx.x * blockDim.x + threadIdx.x] = blockBuffer[threadIdx.x] + currentGlobalOffset; -//} -// -//template -//__device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int numberOfInputs, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) -//{ -// __shared__ u32 globalOffset[BIN_SIZE]; -// __shared__ u32 localPrefixSum[BIN_SIZE]; -// __shared__ u32 counters[BIN_SIZE]; -// -// __shared__ u32 matchMasks[SORT_NUM_WARPS_PER_BLOCK][BIN_SIZE]; -// -// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) -// { -// // Note: The size of gHistogram is always BIN_SIZE * N_WGS_EXECUTED -// globalOffset[i] = gHistogram[i * N_WGS_EXECUTED + blockIdx.x]; -// -// counters[i] = 0; -// localPrefixSum[i] = 0; -// } -// -// for( int w = 0; w < SORT_NUM_WARPS_PER_BLOCK; ++w ) -// { -// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) -// { -// matchMasks[w][i] = 0; -// } -// } -// -// __syncthreads(); -// -// for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) -// { -// const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; -// if( itemIndex < numberOfInputs ) -// { -// const auto item = gSrcKey[itemIndex]; -// const u32 bucketIndex = getMaskedBits( item, START_BIT ); -// atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); -// } -// } -// -// __syncthreads(); -// -// // Compute Prefix Sum -// -// ldsScanExclusive( localPrefixSum, BIN_SIZE ); -// -// __syncthreads(); -// -// // Reorder -// -// for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) -// { -// const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; -// -// const auto item = gSrcKey[itemIndex]; -// const u32 bucketIndex = getMaskedBits( item, START_BIT ); -// -// const int warp = threadIdx.x / 32; -// const int lane = threadIdx.x % 32; -// -// __syncthreads(); -// -// if( itemIndex < numberOfInputs ) -// { -// atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); -// } -// -// __syncthreads(); -// -// bool 
flushMask = false; -// -// u32 localOffset = 0; -// u32 localSrcIndex = 0; -// -// if( itemIndex < numberOfInputs ) -// { -// const u32 matchMask = matchMasks[warp][bucketIndex]; -// const u32 lowerMask = ( 1u << lane ) - 1; -// u32 offset = __popc( matchMask & lowerMask ); -// -// flushMask = ( offset == 0 ); -// -// for( int w = 0; w < warp; ++w ) -// { -// offset += __popc( matchMasks[w][bucketIndex] ); -// } -// -// localOffset = counters[bucketIndex] + offset; -// localSrcIndex = i; -// } -// -// __syncthreads(); -// -// if( itemIndex < numberOfInputs ) -// { -// atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); -// } -// -// if( flushMask ) -// { -// matchMasks[warp][bucketIndex] = 0; -// } -// -// // Swap -// -// if( itemIndex < numberOfInputs ) -// { -// const u32 srcIndex = blockIdx.x * gNItemsPerWG + localSrcIndex; -// const u32 dstIndex = globalOffset[bucketIndex] + localOffset; -// gDstKey[dstIndex] = gSrcKey[srcIndex]; -// -// if constexpr( KEY_VALUE_PAIR ) -// { -// gDstVal[dstIndex] = gSrcVal[srcIndex]; -// } -// } -// } -//} -// -//extern "C" __global__ void SortKernel( int* gSrcKey, int* gDstKey, int* gHistogram, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) -//{ -// SortImpl( gSrcKey, nullptr, gDstKey, nullptr, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); -//} -// -//extern "C" __global__ void SortKVKernel( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) -//{ -// SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); -//} constexpr auto KEY_IS_16BYTE_ALIGNED = true; From 7ae1709f0c26b4c89ce2530ddfd8a203d032fed1 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 14:12:55 +0900 Subject: [PATCH 31/68] delete unused inl --- ParallelPrimitives/RadixSort.inl | 163 ------------------------------- 1 file changed, 163 deletions(-) delete mode 
100644 ParallelPrimitives/RadixSort.inl diff --git a/ParallelPrimitives/RadixSort.inl b/ParallelPrimitives/RadixSort.inl deleted file mode 100644 index d001238..0000000 --- a/ParallelPrimitives/RadixSort.inl +++ /dev/null @@ -1,163 +0,0 @@ - - -//namespace -//{ -// -//struct Empty -//{ -//}; -// -///// @brief Call the callable and measure the elapsed time using the Stopwatch. -///// @tparam CallableType The type of the callable to be invoked in this function. -///// @tparam RecordType The type of the object that stores the recorded times. -///// @tparam enable_profile The elapsed time will be recorded if this is set to True. -///// @param callable The callable object to be called. -///// @param time_record The object that stores the recorded times. -///// @param index The index indicates where to store the elapsed time in @c time_record -///// @param stream The GPU stream -//template -//constexpr void execute( CallableType&& callable, RecordType& time_record, const int index, const oroStream stream ) noexcept -//{ -// using TimerType = std::conditional_t; -// -// TimerType stopwatch; -// -// if constexpr( enable_profile ) -// { -// stopwatch.start(); -// } -// -// std::invoke( std::forward( callable ) ); -// -// if constexpr( enable_profile ) -// { -// OrochiUtils::waitForCompletion( stream ); -// stopwatch.stop(); -// time_record[index] = stopwatch.getMs(); -// } -//} -// -//template -//void resize_record( T& t ) noexcept -//{ -// if constexpr( enable_profile ) -// { -// t.resize( 3 ); -// } -//} -// -//template -//void print_record( const T& t ) noexcept -//{ -// if constexpr( enable_profile ) -// { -// printf( "%3.2f, %3.2f, %3.2f\n", t[0], t[1], t[2] ); -// } -//} -// -//} // namespace - -//template -//void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept -//{ -// static constexpr auto enable_profile = false; -// -// const u32* srcKey{ nullptr }; -// const u32* dstKey{ nullptr }; -// -// const u32* 
srcVal{ nullptr }; -// const u32* dstVal{ nullptr }; -// -// static constexpr auto enable_key_value_pair_sorting{ std::is_same_v }; -// -// if constexpr( enable_key_value_pair_sorting ) -// { -// srcKey = src.key; -// dstKey = dst.key; -// -// srcVal = src.value; -// dstVal = dst.value; -// } -// else -// { -// static_assert( std::is_same_v || std::is_same_v ); -// srcKey = src; -// dstKey = dst; -// } -// -// const int nItemPerWG = ( n + m_num_blocks_for_count - 1 ) / m_num_blocks_for_count; -// -// // Timer records -// -// using RecordType = std::conditional_t, Empty>; -// RecordType t; -// -// resize_record( t ); -// -// const auto launch_count_kernel = [&]() noexcept -// { -// const auto num_total_thread_for_count = m_num_threads_per_block_for_count * m_num_blocks_for_count; -// -// const auto func{ oroFunctions[Kernel::COUNT] }; -// const void* args[] = { &srcKey, arg_cast( m_tmp_buffer.address() ), &n, &nItemPerWG, &startBit, &m_num_blocks_for_count }; -// OrochiUtils::launch1D( func, num_total_thread_for_count, args, m_num_threads_per_block_for_count, 0, stream ); -// }; -// -// execute( launch_count_kernel, t, 0, stream ); -// -// const auto launch_scan_kernel = [&]() noexcept -// { -// switch( selectedScanAlgo ) -// { -// case ScanAlgo::SCAN_CPU: -// { -// exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); -// } -// break; -// -// case ScanAlgo::SCAN_GPU_SINGLE_WG: -// { -// const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), &m_num_blocks_for_count }; -// OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_SINGLE_WG], WG_SIZE * m_num_blocks_for_count, args, WG_SIZE, 0, stream ); -// } -// break; -// -// case ScanAlgo::SCAN_GPU_PARALLEL: -// { -// const auto num_total_thread_for_scan = m_num_threads_per_block_for_scan * m_num_blocks_for_scan; -// -// const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), arg_cast( m_partial_sum.address() ), arg_cast( m_is_ready.address() ) }; -// 
OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_PARALLEL], num_total_thread_for_scan, args, m_num_threads_per_block_for_scan, 0, stream ); -// } -// break; -// -// default: -// exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); -// break; -// } -// }; -// -// execute( launch_scan_kernel, t, 1, stream ); -// -// const auto launch_sort_kernel = [&]() noexcept -// { -// const auto num_blocks_for_sort = m_num_blocks_for_count; -// const auto num_total_thread_for_sort = m_num_threads_per_block_for_sort * num_blocks_for_sort; -// const auto num_items_per_block = nItemPerWG; -// -// if constexpr( enable_key_value_pair_sorting ) -// { -// const void* args[] = { &srcKey, &srcVal, &dstKey, &dstVal, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; -// OrochiUtils::launch1D( oroFunctions[Kernel::SORT_KV], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); -// } -// else -// { -// const void* args[] = { &srcKey, &dstKey, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; -// OrochiUtils::launch1D( oroFunctions[Kernel::SORT], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); -// } -// }; -// -// execute( launch_sort_kernel, t, 2, stream ); -// -// print_record( t ); -//} From 6c9fc499358e5556504f6b50d5fcd2b4b23cb771 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 15:45:14 +0900 Subject: [PATCH 32/68] Add a special case handling, all elements have the same digit, to reduce the overhead of thread conflicts --- ParallelPrimitives/RadixSortKernels.h | 29 ++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 3647a39..d71f64d 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -561,7 +561,14 @@ __device__ __forceinline__ void onesweep_reorder( 
RADIX_SORT_KEY_TYPE* inputKeys lookBackBuffer[pIndex] = asU64( pa ); // complete global output location - pSum[i] = gp + p; + u32 globalOutput = gp + p; + pSum[i] = globalOutput; + + // A special case handling: all elements have the same digit + if( s == RADIX_SORT_BLOCK_SIZE ) + { + matchMasks[0][0] = globalOutput + 1 /* +1 to avoid zero */; + } } u32 prefix = 0; @@ -578,6 +585,26 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); } + // A special case handling: all elements have the same digit + u32 globalOutput = matchMasks[0][0]; + if( globalOutput-- /* -1 for the actual offset */ ) + { + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + if( itemIndex < numberOfInputs ) + { + u32 dstIndex = globalOutput + i + threadIdx.x; + outputKeys[dstIndex] = inputKeys[itemIndex]; + if constexpr( keyPair ) + { + outputValues[dstIndex] = inputValues[itemIndex]; + } + } + } + return; + } + // reorder for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { From 35b265443ad225e412f33ede1593729448cfb541 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 16:40:36 +0900 Subject: [PATCH 33/68] Refactor indices --- ParallelPrimitives/RadixSortKernels.h | 46 +++++++++++++-------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index d71f64d..e2e6d86 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -365,17 +365,17 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf } else { - for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < GHISTOGRAM_ITEM_PER_BLOCK; i 
+= GHISTOGRAM_THREADS_PER_BLOCK ) { - u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x + i; + u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i; if( itemIndex < numberOfInputs ) { auto item = inputs[itemIndex]; - for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + for( int j = 0; j < sizeof( RADIX_SORT_KEY_TYPE ); j++ ) { - u32 bitLocation = startBits + i * 8; + u32 bitLocation = startBits + j * 8; u32 bits = extractDigit( getKeyBits( item ), bitLocation ); - atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); + atomicInc( &localCounters[j][bits], 0xFFFFFFFF ); } } } @@ -444,9 +444,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) { - for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - matchMasks[w][i + threadIdx.x] = 0; + matchMasks[w][i] = 0; } } @@ -474,16 +474,16 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } else { - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { auto item = inputKeys[itemIndex]; u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); - elementBuckets[i + threadIdx.x] = bucketIndex; + elementBuckets[i] = bucketIndex; } } } @@ -589,12 +589,12 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 globalOutput = matchMasks[0][0]; if( globalOutput-- /* -1 for the actual offset */ ) { - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = 
threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - u32 dstIndex = globalOutput + i + threadIdx.x; + u32 dstIndex = globalOutput + i; outputKeys[dstIndex] = inputKeys[itemIndex]; if constexpr( keyPair ) { @@ -606,10 +606,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } // reorder - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; - u32 bucketIndex = elementBuckets[i + threadIdx.x]; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + u32 bucketIndex = elementBuckets[i]; __syncthreads(); @@ -642,7 +642,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 to = localOffset + localPrefixSum[bucketIndex]; ElementLocation el; - el.localSrcIndex = i + threadIdx.x; + el.localSrcIndex = i; el.localOffset = localOffset; el.bucket = bucketIndex; elementLocations[to] = el; @@ -660,12 +660,12 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - ElementLocation el = elementLocations[i + threadIdx.x]; + ElementLocation el = elementLocations[i]; u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; u8 bucketIndex = el.bucket; @@ -675,12 +675,12 @@ __device__ __forceinline__ void onesweep_reorder( 
RADIX_SORT_KEY_TYPE* inputKeys } if constexpr ( keyPair ) { - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - ElementLocation el = elementLocations[i + threadIdx.x]; + ElementLocation el = elementLocations[i]; u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; u8 bucketIndex = el.bucket; From 7daad5c500e15ac4b56ccdef8787931f910d5184 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 28 Dec 2023 16:43:14 +0900 Subject: [PATCH 34/68] implement counting part --- ParallelPrimitives/RadixSortConfigs.h | 2 + ParallelPrimitives/RadixSortKernels.h | 106 ++++++++++++++++++++------ 2 files changed, 85 insertions(+), 23 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index ccdd81a..82f0c10 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -27,6 +27,8 @@ constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; constexpr int REORDER_NUMBER_OF_WARPS = 8; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; +constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = GHISTOGRAM_ITEM_PER_BLOCK / REORDER_NUMBER_OF_WARPS; +constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; constexpr int LOOKBACK_TABLE_SIZE = 1024; constexpr int MAX_LOOK_BACK = 64; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index e2e6d86..e9dc245 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -1,5 +1,10 @@ #include #define LDS_BARRIER __syncthreads() + +#if defined( CUDART_VERSION ) && CUDART_VERSION >= 9000 +#define ITS 1 +#endif + namespace 
{ @@ -280,6 +285,7 @@ __device__ inline u32 getKeyBits( u32 x ) { return x ^ ORDER_MASK_32; } __device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } __device__ inline u32 extractDigit( u32 x, u32 bitLocation ) { return ( x >> bitLocation ) & RADIX_MASK; } __device__ inline u32 extractDigit( u64 x, u32 bitLocation ) { return (u32)( ( x >> bitLocation ) & RADIX_MASK ); } +__device__ __forceinline__ u32 u32min( u32 x, u32 y ) { return ( y < x ) ? y : x; } template __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) @@ -428,6 +434,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 bucket : 8; }; + __shared__ u32 blockHistogram[BIN_SIZE]; + __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; + __shared__ u32 pSum[BIN_SIZE]; __shared__ u32 localPrefixSum[BIN_SIZE]; __shared__ u32 counters[BIN_SIZE]; @@ -450,6 +459,56 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } + clearShared( blockHistogram, 0 ); + clearShared( lpSum, 0 ); + + __syncthreads(); + + int warp = threadIdx.x / 32; + int lane = threadIdx.x % 32; + for( int i = lane; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32 ) + { + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; + + u32 bucketIndex = 0; + if( itemIndex < numberOfInputs ) + { + auto item = inputKeys[itemIndex]; + bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); + } + + int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 + u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; + + for( int i = 0; i < 8; ++i ) + { + u32 bit = ( bucketIndex >> i ) & 0x1; + u32 difference = ( 0xFFFFFFFF * bit ) ^ +#if defined( ITS ) + __ballot_sync( 0xFFFFFFFF, bit != 0 ); +#else + __ballot( bit != 0 ); +#endif + broThreads &= ~difference; + } + int laneIndex = threadIdx.x % 32; + u32 lowerMask = ( 1u << laneIndex ) - 1; + bool leader = ( broThreads & lowerMask ) == 
0; + if( itemIndex < numberOfInputs && leader ) + { + u32 n = __popc( broThreads ); + atomicAdd( &blockHistogram[bucketIndex], n ); + lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] = n; + } + } + + { + u32 prefix = 0; + for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + prefix += prefixSumExclusive( prefix, &lpSum[i] ); + } + } __syncthreads(); // count @@ -517,7 +576,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 s = localPrefixSum[i]; + //u32 s = localPrefixSum[i]; + u32 s = blockHistogram[i]; int pIndex = BIN_SIZE * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; { @@ -565,10 +625,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pSum[i] = globalOutput; // A special case handling: all elements have the same digit - if( s == RADIX_SORT_BLOCK_SIZE ) - { - matchMasks[0][0] = globalOutput + 1 /* +1 to avoid zero */; - } + //if( s == RADIX_SORT_BLOCK_SIZE ) + //{ + // matchMasks[0][0] = globalOutput + 1 /* +1 to avoid zero */; + //} } u32 prefix = 0; @@ -586,24 +646,24 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } // A special case handling: all elements have the same digit - u32 globalOutput = matchMasks[0][0]; - if( globalOutput-- /* -1 for the actual offset */ ) - { - for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - if( itemIndex < numberOfInputs ) - { - u32 dstIndex = globalOutput + i; - outputKeys[dstIndex] = inputKeys[itemIndex]; - if constexpr( keyPair ) - { - outputValues[dstIndex] = inputValues[itemIndex]; - } - } - } - return; - } + //u32 globalOutput = matchMasks[0][0]; + //if( globalOutput-- /* -1 for the actual offset */ ) + //{ + // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += 
REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // if( itemIndex < numberOfInputs ) + // { + // u32 dstIndex = globalOutput + i; + // outputKeys[dstIndex] = inputKeys[itemIndex]; + // if constexpr( keyPair ) + // { + // outputValues[dstIndex] = inputValues[itemIndex]; + // } + // } + // } + // return; + //} // reorder for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) From 5043a034b6a669aead7471d4a970c7a22080cd3c Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 28 Dec 2023 18:43:41 +0900 Subject: [PATCH 35/68] slow but works --- ParallelPrimitives/RadixSortConfigs.h | 7 +- ParallelPrimitives/RadixSortKernels.h | 304 +++++++++++++++----------- 2 files changed, 186 insertions(+), 125 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 82f0c10..f305940 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -20,19 +20,20 @@ constexpr auto SINGLE_SORT_WG_SIZE{ 128 }; static_assert( BIN_SIZE % 2 == 0 ); -constexpr int RADIX_SORT_BLOCK_SIZE = 2048; +constexpr int RADIX_SORT_BLOCK_SIZE = 2048 * 2; +// constexpr int RADIX_SORT_BLOCK_SIZE = 512; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; constexpr int REORDER_NUMBER_OF_WARPS = 8; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; -constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = GHISTOGRAM_ITEM_PER_BLOCK / REORDER_NUMBER_OF_WARPS; +constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = RADIX_SORT_BLOCK_SIZE / REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; constexpr int LOOKBACK_TABLE_SIZE = 1024; constexpr int MAX_LOOK_BACK = 64; -constexpr int TAIL_BITS = 4; +constexpr int TAIL_BITS = 5; constexpr int TAIL_COUNT = 1u << TAIL_BITS; static_assert( 
REORDER_NUMBER_OF_THREADS_PER_BLOCK <= BIN_SIZE, "please check prefixSumExclusive on onesweep_reorder" ); diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index e9dc245..83105f9 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -436,37 +436,40 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __shared__ u32 blockHistogram[BIN_SIZE]; __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; - __shared__ u32 pSum[BIN_SIZE]; - __shared__ u32 localPrefixSum[BIN_SIZE]; - __shared__ u32 counters[BIN_SIZE]; __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; - __shared__ u8 elementBuckets[RADIX_SORT_BLOCK_SIZE]; - __shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][BIN_SIZE]; + + //__shared__ u32 localPrefixSum[BIN_SIZE]; + //__shared__ u32 counters[BIN_SIZE]; + //__shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + //__shared__ u8 elementBuckets[RADIX_SORT_BLOCK_SIZE]; + //__shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][BIN_SIZE]; u32 bitLocation = startBits + 8 * iteration; u32 blockIndex = blockIdx.x; u32 numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); - clearShared( localPrefixSum, 0 ); - clearShared( counters, 0 ); + // clearShared( localPrefixSum, 0 ); + // clearShared( counters, 0 ); - for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) - { - for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - matchMasks[w][i] = 0; - } - } + //for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) + //{ + // for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // matchMasks[w][i] = 0; + // } + //} clearShared( blockHistogram, 0 ); clearShared( lpSum, 0 ); __syncthreads(); + u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + int warp = threadIdx.x / 32; int lane = threadIdx.x % 32; - for( int i = lane; i < 
REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32 ) + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; @@ -476,13 +479,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys auto item = inputKeys[itemIndex]; bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); } + bucketIndices[k] = bucketIndex; int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; - for( int i = 0; i < 8; ++i ) + for( int j = 0; j < 8; ++j ) { - u32 bit = ( bucketIndex >> i ) & 0x1; + u32 bit = ( bucketIndex >> j ) & 0x1; u32 difference = ( 0xFFFFFFFF * bit ) ^ #if defined( ITS ) __ballot_sync( 0xFFFFFFFF, bit != 0 ); @@ -498,55 +502,13 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { u32 n = __popc( broThreads ); atomicAdd( &blockHistogram[bucketIndex], n ); - lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] = n; + lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; } + // warpOffsets[k] = __popc( broThreads & lowerMask ); } - { - u32 prefix = 0; - for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - prefix += prefixSumExclusive( prefix, &lpSum[i] ); - } - } __syncthreads(); - // count - if( KEY_IS_16BYTE_ALIGNED && ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs ) - { - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK * 4 ) - { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x * 4; - struct alignas( 16 ) Key4 - { - RADIX_SORT_KEY_TYPE xs[4]; - }; - Key4 key4 = *(Key4*)&inputKeys[itemIndex]; - for( int k = 0; k < 4; k++ ) - { - auto item = key4.xs[k]; - u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); - atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); - elementBuckets[i + 
threadIdx.x * 4 + k] = bucketIndex; - } - } - } - else - { - for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - if( itemIndex < numberOfInputs ) - { - auto item = inputKeys[itemIndex]; - u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); - atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); - - elementBuckets[i] = bucketIndex; - } - } - } - struct ParitionID { u64 value : 32; @@ -623,19 +585,36 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // complete global output location u32 globalOutput = gp + p; pSum[i] = globalOutput; - - // A special case handling: all elements have the same digit - //if( s == RADIX_SORT_BLOCK_SIZE ) - //{ - // matchMasks[0][0] = globalOutput + 1 /* +1 to avoid zero */; - //} } + u32 prefix = 0; for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - prefix += prefixSumExclusive( prefix, &localPrefixSum[i] ); + prefix += prefixSumExclusive( prefix, &blockHistogram[i] ); } + { + int bucketIndex = threadIdx.x; + u32 s = blockHistogram[bucketIndex]; + for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) + { + int index = bucketIndex * REORDER_NUMBER_OF_WARPS + warp; + u32 n = lpSum[index]; + lpSum[index] = s; + s += n; + } + } + // printf( "[%d] %d\n", threadIdx.x, blockHistogram[threadIdx.x] ); + + //{ + // u32 prefix = 0; + // for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // prefix += prefixSumExclusive( prefix, &lpSum[i] ); + // } + //} + + __syncthreads(); if( threadIdx.x == 0 ) { @@ -645,81 +624,57 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); } - // A special case handling: all elements have the same digit - //u32 globalOutput = matchMasks[0][0]; - //if( 
globalOutput-- /* -1 for the actual offset */ ) //{ - // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // u32 prefix = 0; + // for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) // { - // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // if( itemIndex < numberOfInputs ) - // { - // u32 dstIndex = globalOutput + i; - // outputKeys[dstIndex] = inputKeys[itemIndex]; - // if constexpr( keyPair ) - // { - // outputValues[dstIndex] = inputValues[itemIndex]; - // } - // } + // prefix += prefixSumExclusive( prefix, &lpSum[i] ); // } - // return; //} + __syncthreads(); - // reorder - for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - u32 bucketIndex = elementBuckets[i]; - - __syncthreads(); + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; + u32 bucketIndex = bucketIndices[k]; - int warp = threadIdx.x / 32; - int lane = threadIdx.x % 32; + int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 + u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; - if( itemIndex < numberOfInputs ) + for( int j = 0; j < 8; ++j ) { - atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); + u32 bit = ( bucketIndex >> j ) & 0x1; + u32 difference = ( 0xFFFFFFFF * bit ) ^ +#if defined( ITS ) + __ballot_sync( 0xFFFFFFFF, bit != 0 ); +#else + __ballot( bit != 0 ); +#endif + broThreads &= ~difference; } - - __syncthreads(); - - bool flushMask = false; + int laneIndex = threadIdx.x % 32; + u32 lowerMask = ( 1u << laneIndex ) - 1; + bool leader = ( broThreads & lowerMask ) == 0; if( itemIndex < numberOfInputs ) { - u32 matchMask = matchMasks[warp][bucketIndex]; - u32 lowerMask = ( 1u << lane ) - 1; - u32 offset = __popc( matchMask & 
lowerMask ); - - flushMask = offset == 0; - - for( int w = 0; w < warp; w++ ) - { - offset += __popc( matchMasks[w][bucketIndex] ); - } - - u32 localOffset = counters[bucketIndex] + offset; - u32 to = localOffset + localPrefixSum[bucketIndex]; + u32 localBase = lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; + u32 to = localBase + __popc( broThreads & lowerMask ); ElementLocation el; - el.localSrcIndex = i; - el.localOffset = localOffset; + el.localSrcIndex = itemIndex - blockIndex * RADIX_SORT_BLOCK_SIZE; + el.localOffset = to - blockHistogram[bucketIndex]; el.bucket = bucketIndex; elementLocations[to] = el; } - - __syncthreads(); - - if( itemIndex < numberOfInputs ) - { - atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); - } - if( flushMask ) + if( itemIndex < numberOfInputs && leader ) { - matchMasks[warp][bucketIndex] = 0; + lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += __popc( broThreads ); } } + __syncthreads(); + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; @@ -733,7 +688,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys outputKeys[dstIndex] = inputKeys[srcIndex]; } } - if constexpr ( keyPair ) + if constexpr( keyPair ) { for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { @@ -749,6 +704,111 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } } + + // A special case handling: all elements have the same digit + //u32 globalOutput = matchMasks[0][0]; + //if( globalOutput-- /* -1 for the actual offset */ ) + //{ + // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // if( itemIndex < numberOfInputs ) + // { + // u32 dstIndex = globalOutput + i; + // outputKeys[dstIndex] = inputKeys[itemIndex]; + // if 
constexpr( keyPair ) + // { + // outputValues[dstIndex] = inputValues[itemIndex]; + // } + // } + // } + // return; + //} + + // reorder + //for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + //{ + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // u32 bucketIndex = elementBuckets[i]; + + // __syncthreads(); + + // int warp = threadIdx.x / 32; + // int lane = threadIdx.x % 32; + + // if( itemIndex < numberOfInputs ) + // { + // atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); + // } + + // __syncthreads(); + + // bool flushMask = false; + + // if( itemIndex < numberOfInputs ) + // { + // u32 matchMask = matchMasks[warp][bucketIndex]; + // u32 lowerMask = ( 1u << lane ) - 1; + // u32 offset = __popc( matchMask & lowerMask ); + + // flushMask = offset == 0; + + // for( int w = 0; w < warp; w++ ) + // { + // offset += __popc( matchMasks[w][bucketIndex] ); + // } + + // u32 localOffset = counters[bucketIndex] + offset; + // u32 to = localOffset + localPrefixSum[bucketIndex]; + + // ElementLocation el; + // el.localSrcIndex = i; + // el.localOffset = localOffset; + // el.bucket = bucketIndex; + // elementLocations[to] = el; + // } + + // __syncthreads(); + + // if( itemIndex < numberOfInputs ) + // { + // atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); + // } + // if( flushMask ) + // { + // matchMasks[warp][bucketIndex] = 0; + // } + //} + + //for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + //{ + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // if( itemIndex < numberOfInputs ) + // { + // ElementLocation el = elementLocations[i]; + // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + // u8 bucketIndex = el.bucket; + + // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // outputKeys[dstIndex] = inputKeys[srcIndex]; + // } + //} + //if constexpr ( keyPair ) + //{ + // for( int i = threadIdx.x; i < 
RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // if( itemIndex < numberOfInputs ) + // { + // ElementLocation el = elementLocations[i]; + // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + // u8 bucketIndex = el.bucket; + + // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // outputValues[dstIndex] = inputValues[srcIndex]; + // } + // } + //} } extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) From 265578239a7fc297cfeb981ef97c89aa4c0586db Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 28 Dec 2023 19:54:03 +0900 Subject: [PATCH 36/68] Simplify --- ParallelPrimitives/RadixSortKernels.h | 51 +++++++-------------------- 1 file changed, 12 insertions(+), 39 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 83105f9..89bb8c9 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -466,6 +466,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + // u32 bros[REORDER_NUMBER_OF_ITEM_PER_THREAD]; int warp = threadIdx.x / 32; int lane = threadIdx.x % 32; @@ -495,8 +497,15 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys #endif broThreads &= ~difference; } + // bros[k] = broThreads; int laneIndex = threadIdx.x % 32; u32 lowerMask = ( 1u << laneIndex ) - 1; + + if( itemIndex < numberOfInputs ) + { + warpOffsets[k] = lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); + } + bool leader = ( broThreads & lowerMask ) == 0; if( itemIndex < 
numberOfInputs && leader ) { @@ -504,7 +513,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys atomicAdd( &blockHistogram[bucketIndex], n ); lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; } - // warpOffsets[k] = __popc( broThreads & lowerMask ); } __syncthreads(); @@ -606,14 +614,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } // printf( "[%d] %d\n", threadIdx.x, blockHistogram[threadIdx.x] ); - //{ - // u32 prefix = 0; - // for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // prefix += prefixSumExclusive( prefix, &lpSum[i] ); - // } - //} - __syncthreads(); if( threadIdx.x == 0 ) @@ -624,13 +624,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); } - //{ - // u32 prefix = 0; - // for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // prefix += prefixSumExclusive( prefix, &lpSum[i] ); - // } - //} __syncthreads(); for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) @@ -638,28 +631,11 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; u32 bucketIndex = bucketIndices[k]; - int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 - u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; - - for( int j = 0; j < 8; ++j ) - { - u32 bit = ( bucketIndex >> j ) & 0x1; - u32 difference = ( 0xFFFFFFFF * bit ) ^ -#if defined( ITS ) - __ballot_sync( 0xFFFFFFFF, bit != 0 ); -#else - __ballot( bit != 0 ); -#endif - broThreads &= ~difference; - } - int laneIndex = threadIdx.x % 32; - u32 lowerMask = ( 1u << laneIndex ) - 1; - bool leader = ( broThreads & lowerMask ) == 0; - if( itemIndex 
< numberOfInputs ) { u32 localBase = lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; - u32 to = localBase + __popc( broThreads & lowerMask ); + // u32 to = localBase + __popc( broThreads & lowerMask ); + u32 to = localBase + warpOffsets[k]; ElementLocation el; el.localSrcIndex = itemIndex - blockIndex * RADIX_SORT_BLOCK_SIZE; @@ -667,10 +643,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys el.bucket = bucketIndex; elementLocations[to] = el; } - if( itemIndex < numberOfInputs && leader ) - { - lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += __popc( broThreads ); - } } __syncthreads(); @@ -688,6 +660,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys outputKeys[dstIndex] = inputKeys[srcIndex]; } } + if constexpr( keyPair ) { for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) From d33d590f9077b1888949c15eb0313d852e01a011 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 28 Dec 2023 21:35:17 +0900 Subject: [PATCH 37/68] shared approach --- ParallelPrimitives/RadixSortKernels.h | 61 +++++++++++++++++++-------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 89bb8c9..adb8b04 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -435,9 +435,28 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; __shared__ u32 blockHistogram[BIN_SIZE]; - __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; __shared__ u32 pSum[BIN_SIZE]; - __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + // __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; + // __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + + struct SMem + { + struct Phase1 + { + u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; + }; + struct Phase2 + { + ElementLocation 
elementLocations[RADIX_SORT_BLOCK_SIZE]; + }; + + union + { + Phase1 phase1; + Phase2 phase2; + } u; + }; + __shared__ SMem smem; //__shared__ u32 localPrefixSum[BIN_SIZE]; //__shared__ u32 counters[BIN_SIZE]; @@ -461,7 +480,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys //} clearShared( blockHistogram, 0 ); - clearShared( lpSum, 0 ); + clearShared( smem.u.phase1.lpSum, 0 ); __syncthreads(); @@ -503,7 +522,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( itemIndex < numberOfInputs ) { - warpOffsets[k] = lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); + warpOffsets[k] = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); } bool leader = ( broThreads & lowerMask ) == 0; @@ -511,7 +530,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { u32 n = __popc( broThreads ); atomicAdd( &blockHistogram[bucketIndex], n ); - lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; + smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; } } @@ -607,12 +626,11 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) { int index = bucketIndex * REORDER_NUMBER_OF_WARPS + warp; - u32 n = lpSum[index]; - lpSum[index] = s; + u32 n = smem.u.phase1.lpSum[index]; + smem.u.phase1.lpSum[index] = s; s += n; } } - // printf( "[%d] %d\n", threadIdx.x, blockHistogram[threadIdx.x] ); __syncthreads(); @@ -626,6 +644,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); + for( int k = 0; k < REORDER_NUMBER_OF_ITEM_PER_THREAD; k++ ) + { + u32 bucketIndex = bucketIndices[k]; + warpOffsets[k] += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; + } + + __syncthreads(); + for( int i = lane, k = 0; i < 
REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; @@ -633,15 +659,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( itemIndex < numberOfInputs ) { - u32 localBase = lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; - // u32 to = localBase + __popc( broThreads & lowerMask ); - u32 to = localBase + warpOffsets[k]; + u32 to = warpOffsets[k]; ElementLocation el; el.localSrcIndex = itemIndex - blockIndex * RADIX_SORT_BLOCK_SIZE; - el.localOffset = to - blockHistogram[bucketIndex]; + // el.localOffset = to - blockHistogram[bucketIndex]; + el.localOffset = 0; el.bucket = bucketIndex; - elementLocations[to] = el; + smem.u.phase2.elementLocations[to] = el; } } @@ -652,11 +677,12 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - ElementLocation el = elementLocations[i]; + ElementLocation el = smem.u.phase2.elementLocations[i]; u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; u8 bucketIndex = el.bucket; - u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; outputKeys[dstIndex] = inputKeys[srcIndex]; } } @@ -668,11 +694,12 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - ElementLocation el = elementLocations[i]; + ElementLocation el = smem.u.phase2.elementLocations[i]; u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; u8 bucketIndex = el.bucket; - u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; 
outputValues[dstIndex] = inputValues[srcIndex]; } } From 35a02f99d6c67f704d52423e80ace051f435c555 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 29 Dec 2023 17:45:56 +0900 Subject: [PATCH 38/68] add explicit sync --- ParallelPrimitives/RadixSortKernels.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index adb8b04..bc0dd8a 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -524,7 +524,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { warpOffsets[k] = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); } - +#if defined( ITS ) + __syncwarp( 0xFFFFFFFF ); +#endif bool leader = ( broThreads & lowerMask ) == 0; if( itemIndex < numberOfInputs && leader ) { @@ -532,6 +534,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys atomicAdd( &blockHistogram[bucketIndex], n ); smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; } +#if defined( ITS ) + __syncwarp( 0xFFFFFFFF ); +#endif } __syncthreads(); From 2179be6ca83819807ec2017394c5b4068c1f7a57 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 29 Dec 2023 17:46:27 +0900 Subject: [PATCH 39/68] larger block --- ParallelPrimitives/RadixSortConfigs.h | 6 +-- ParallelPrimitives/RadixSortKernels.h | 57 ++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index f305940..16bb154 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -26,7 +26,7 @@ constexpr int RADIX_SORT_BLOCK_SIZE = 2048 * 2; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; -constexpr int REORDER_NUMBER_OF_WARPS = 8; +constexpr int 
REORDER_NUMBER_OF_WARPS = 16; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = RADIX_SORT_BLOCK_SIZE / REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; @@ -36,7 +36,7 @@ constexpr int MAX_LOOK_BACK = 64; constexpr int TAIL_BITS = 5; constexpr int TAIL_COUNT = 1u << TAIL_BITS; -static_assert( REORDER_NUMBER_OF_THREADS_PER_BLOCK <= BIN_SIZE, "please check prefixSumExclusive on onesweep_reorder" ); -static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); +//static_assert( REORDER_NUMBER_OF_THREADS_PER_BLOCK <= BIN_SIZE, "please check prefixSumExclusive on onesweep_reorder" ); +//static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index bc0dd8a..e859949 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -318,6 +318,49 @@ __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) return sum; } +__device__ inline u32 scanExclusive( u32 prefix, u32* sMemIO, int nElement ) +{ + // assert(nElement <= nThreads) + bool active = threadIdx.x < nElement; + u32 value = active ? 
sMemIO[threadIdx.x] : 0; + + for( u32 offset = 1; offset < nElement; offset <<= 1 ) + { + u32 x; + if( active ) + { + x = sMemIO[threadIdx.x]; + } + + if( active && offset <= threadIdx.x ) + { + x += sMemIO[threadIdx.x - offset]; + } + + __syncthreads(); + + if( active ) + { + sMemIO[threadIdx.x] = x; + } + + __syncthreads(); + } + + u32 sum = sMemIO[nElement - 1]; + + __syncthreads(); + + if( active ) + { + sMemIO[threadIdx.x] += prefix - value; + } + + __syncthreads(); + + return sum; +} + extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOfInputs, u32* gpSumBuffer, u32 startBits, u32* counter ) { __shared__ u32 localCounters[sizeof( RADIX_SORT_KEY_TYPE )][256]; @@ -620,11 +663,15 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } - u32 prefix = 0; - for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - prefix += prefixSumExclusive( prefix, &blockHistogram[i] ); - } + //u32 prefix = 0; + //for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + //{ + // prefix += prefixSumExclusive( prefix, &blockHistogram[i] ); + //} + + scanExclusive( 0, blockHistogram, BIN_SIZE ); + + if( threadIdx.x < BIN_SIZE ) { int bucketIndex = threadIdx.x; u32 s = blockHistogram[bucketIndex]; From 1269018a86c9f7913c629573bcb0bf3e33864213 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 29 Dec 2023 18:44:35 +0900 Subject: [PATCH 40/68] key cache --- ParallelPrimitives/RadixSortConfigs.h | 2 +- ParallelPrimitives/RadixSortKernels.h | 78 +++++++++++++++++---------- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 16bb154..d13fb08 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -26,7 +26,7 @@ constexpr int RADIX_SORT_BLOCK_SIZE = 2048 * 2; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; constexpr int 
GHISTOGRAM_THREADS_PER_BLOCK = 256; -constexpr int REORDER_NUMBER_OF_WARPS = 16; +constexpr int REORDER_NUMBER_OF_WARPS = 8; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = RADIX_SORT_BLOCK_SIZE / REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index e859949..f911b49 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -490,7 +490,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; struct Phase2 { - ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + // ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + RADIX_SORT_KEY_TYPE elements[RADIX_SORT_BLOCK_SIZE]; }; union @@ -527,7 +528,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + // u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + u32 keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; // u32 bros[REORDER_NUMBER_OF_ITEM_PER_THREAD]; @@ -542,8 +544,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { auto item = inputKeys[itemIndex]; bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); + keys[k] = item; } - bucketIndices[k] = bucketIndex; + // bucketIndices[k] = bucketIndex; int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; @@ -698,16 +701,19 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int k = 0; k < REORDER_NUMBER_OF_ITEM_PER_THREAD; k++ ) { - u32 bucketIndex = bucketIndices[k]; + // u32 bucketIndex = bucketIndices[k]; + u32 bucketIndex = extractDigit( 
getKeyBits( keys[k] ), bitLocation ); warpOffsets[k] += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; } __syncthreads(); + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; - u32 bucketIndex = bucketIndices[k]; + // u32 bucketIndex = bucketIndices[k]; + u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); if( itemIndex < numberOfInputs ) { @@ -718,10 +724,18 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // el.localOffset = to - blockHistogram[bucketIndex]; el.localOffset = 0; el.bucket = bucketIndex; - smem.u.phase2.elementLocations[to] = el; + // smem.u.phase2.elementLocations[to] = el; + + // smem.u.phase2.elements[to] = inputKeys[itemIndex]; + smem.u.phase2.elements[to] = keys[k]; } } + if( threadIdx.x < BIN_SIZE ) + { + pSum[threadIdx.x] -= blockHistogram[threadIdx.x]; + } + __syncthreads(); for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) @@ -729,33 +743,39 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - ElementLocation el = smem.u.phase2.elementLocations[i]; - u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - u8 bucketIndex = el.bucket; - - // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; - outputKeys[dstIndex] = inputKeys[srcIndex]; + // ElementLocation el = smem.u.phase2.elementLocations[i]; + // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + // u8 bucketIndex = el.bucket; + + // // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; + // outputKeys[dstIndex] = inputKeys[srcIndex]; + + auto item 
= smem.u.phase2.elements[i]; + u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); + // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; + u32 dstIndex = pSum[bucketIndex] + i; + outputKeys[dstIndex] = item; } } - if constexpr( keyPair ) - { - for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - if( itemIndex < numberOfInputs ) - { - ElementLocation el = smem.u.phase2.elementLocations[i]; - u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - u8 bucketIndex = el.bucket; + //if constexpr( keyPair ) + //{ + // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // if( itemIndex < numberOfInputs ) + // { + // ElementLocation el = smem.u.phase2.elementLocations[i]; + // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + // u8 bucketIndex = el.bucket; - // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; - outputValues[dstIndex] = inputValues[srcIndex]; - } - } - } + // // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; + // outputValues[dstIndex] = inputValues[srcIndex]; + // } + // } + //} // A special case handling: all elements have the same digit //u32 globalOutput = matchMasks[0][0]; From bcb56c9d120738278536acd2633ecabac4bd835a Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 29 Dec 2023 19:40:10 +0900 Subject: [PATCH 41/68] 16bit lpsum --- ParallelPrimitives/RadixSortConfigs.h | 2 +- ParallelPrimitives/RadixSortKernels.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index d13fb08..bd0b70f 100644 --- 
a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -20,7 +20,7 @@ constexpr auto SINGLE_SORT_WG_SIZE{ 128 }; static_assert( BIN_SIZE % 2 == 0 ); -constexpr int RADIX_SORT_BLOCK_SIZE = 2048 * 2; +constexpr int RADIX_SORT_BLOCK_SIZE = 2048 + 1024 + 1024; // constexpr int RADIX_SORT_BLOCK_SIZE = 512; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index f911b49..cac5368 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -486,7 +486,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { struct Phase1 { - u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; + u16 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; }; struct Phase2 { @@ -524,7 +524,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys //} clearShared( blockHistogram, 0 ); - clearShared( smem.u.phase1.lpSum, 0 ); + clearShared( smem.u.phase1.lpSum, 0 ); __syncthreads(); From 1207fdc082f56ef81998a1c1cac2ce720dbae4df Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 29 Dec 2023 20:38:35 +0900 Subject: [PATCH 42/68] 16bit blockHist --- ParallelPrimitives/RadixSortKernels.h | 31 ++++++++++++++++++++------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index cac5368..7f99d9c 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -318,15 +318,16 @@ __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) return sum; } -__device__ inline u32 scanExclusive( u32 prefix, u32* sMemIO, int nElement ) +template +__device__ inline T scanExclusive( T prefix, T* sMemIO, int nElement ) { // assert(nElement <= nThreads) bool active = threadIdx.x < nElement; - u32 value = active ? sMemIO[threadIdx.x] : 0; + T value = active ? 
sMemIO[threadIdx.x] : 0; for( u32 offset = 1; offset < nElement; offset <<= 1 ) { - u32 x; + T x; if( active ) { x = sMemIO[threadIdx.x]; @@ -347,7 +348,7 @@ __device__ inline u32 scanExclusive( u32 prefix, u32* sMemIO, int nElement ) __syncthreads(); } - u32 sum = sMemIO[nElement - 1]; + T sum = sMemIO[nElement - 1]; __syncthreads(); @@ -477,7 +478,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 bucket : 8; }; - __shared__ u32 blockHistogram[BIN_SIZE]; + __shared__ u16 blockHistogram[BIN_SIZE]; __shared__ u32 pSum[BIN_SIZE]; // __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; // __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; @@ -523,7 +524,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // } //} - clearShared( blockHistogram, 0 ); + // clearShared( blockHistogram, 0 ); clearShared( smem.u.phase1.lpSum, 0 ); __syncthreads(); @@ -572,21 +573,35 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); +#else + __threadfence_block(); #endif bool leader = ( broThreads & lowerMask ) == 0; if( itemIndex < numberOfInputs && leader ) { u32 n = __popc( broThreads ); - atomicAdd( &blockHistogram[bucketIndex], n ); smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; } #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); +#else + __threadfence_block(); #endif } __syncthreads(); + if( threadIdx.x < BIN_SIZE ) + { + int bucketIndex = threadIdx.x; + u32 s = 0; + for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) + { + s += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; + } + blockHistogram[bucketIndex] = s; + } + struct ParitionID { u64 value : 32; @@ -672,7 +687,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // prefix += prefixSumExclusive( prefix, &blockHistogram[i] ); //} - scanExclusive( 0, blockHistogram, BIN_SIZE 
); + scanExclusive( 0, blockHistogram, BIN_SIZE ); if( threadIdx.x < BIN_SIZE ) { From b78f5dc4a7632384bbb52d6e78099cc150681e6b Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sun, 31 Dec 2023 19:02:43 +0900 Subject: [PATCH 43/68] keyValue support --- ParallelPrimitives/RadixSortKernels.h | 92 ++++++++++++++++----------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 7f99d9c..65e91be 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -478,7 +478,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 bucket : 8; }; - __shared__ u16 blockHistogram[BIN_SIZE]; __shared__ u32 pSum[BIN_SIZE]; // __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; // __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; @@ -487,6 +486,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { struct Phase1 { + u16 blockHistogram[BIN_SIZE]; u16 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; }; struct Phase2 @@ -494,11 +494,18 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; RADIX_SORT_KEY_TYPE elements[RADIX_SORT_BLOCK_SIZE]; }; + struct Phase3 + { + // ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + RADIX_SORT_VALUE_TYPE elements[RADIX_SORT_BLOCK_SIZE]; + u8 buckets[RADIX_SORT_BLOCK_SIZE]; + }; union { Phase1 phase1; Phase2 phase2; + Phase3 phase3; } u; }; __shared__ SMem smem; @@ -524,7 +531,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // } //} - // clearShared( blockHistogram, 0 ); clearShared( smem.u.phase1.lpSum, 0 ); __syncthreads(); @@ -599,7 +605,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { s += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; } 
- blockHistogram[bucketIndex] = s; + smem.u.phase1.blockHistogram[bucketIndex] = s; } struct ParitionID @@ -632,7 +638,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { //u32 s = localPrefixSum[i]; - u32 s = blockHistogram[i]; + u32 s = smem.u.phase1.blockHistogram[i]; int pIndex = BIN_SIZE * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; { @@ -684,15 +690,15 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys //u32 prefix = 0; //for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) //{ - // prefix += prefixSumExclusive( prefix, &blockHistogram[i] ); + // prefix += prefixSumExclusive( prefix, &smem.u.phase1.blockHistogram[i] ); //} - scanExclusive( 0, blockHistogram, BIN_SIZE ); + scanExclusive( 0, smem.u.phase1.blockHistogram, BIN_SIZE ); if( threadIdx.x < BIN_SIZE ) { int bucketIndex = threadIdx.x; - u32 s = blockHistogram[bucketIndex]; + u32 s = smem.u.phase1.blockHistogram[bucketIndex]; for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) { int index = bucketIndex * REORDER_NUMBER_OF_WARPS + warp; @@ -716,64 +722,76 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int k = 0; k < REORDER_NUMBER_OF_ITEM_PER_THREAD; k++ ) { - // u32 bucketIndex = bucketIndices[k]; u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); warpOffsets[k] += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; } + if( threadIdx.x < BIN_SIZE ) + { + pSum[threadIdx.x] -= smem.u.phase1.blockHistogram[threadIdx.x]; + } + __syncthreads(); for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; - // u32 bucketIndex = bucketIndices[k]; u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); - if( itemIndex < numberOfInputs 
) { - u32 to = warpOffsets[k]; - - ElementLocation el; - el.localSrcIndex = itemIndex - blockIndex * RADIX_SORT_BLOCK_SIZE; - // el.localOffset = to - blockHistogram[bucketIndex]; - el.localOffset = 0; - el.bucket = bucketIndex; - // smem.u.phase2.elementLocations[to] = el; - - // smem.u.phase2.elements[to] = inputKeys[itemIndex]; - smem.u.phase2.elements[to] = keys[k]; + smem.u.phase2.elements[warpOffsets[k]] = keys[k]; } } - if( threadIdx.x < BIN_SIZE ) - { - pSum[threadIdx.x] -= blockHistogram[threadIdx.x]; - } - __syncthreads(); - for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x, k = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - // ElementLocation el = smem.u.phase2.elementLocations[i]; - // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - // u8 bucketIndex = el.bucket; - - // // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; - // outputKeys[dstIndex] = inputKeys[srcIndex]; - auto item = smem.u.phase2.elements[i]; u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); - // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; + + // u32 dstIndex = pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex]; u32 dstIndex = pSum[bucketIndex] + i; outputKeys[dstIndex] = item; } } + if constexpr( keyPair ) + { + __syncthreads(); + + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) + { + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; + u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); + if( itemIndex < numberOfInputs ) + { + smem.u.phase3.elements[warpOffsets[k]] = inputValues[itemIndex]; + smem.u.phase3.buckets[warpOffsets[k]] = bucketIndex; 
+ } + } + + __syncthreads(); + + for( int i = threadIdx.x, k = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK, k++ ) + { + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + if( itemIndex < numberOfInputs ) + { + auto item = smem.u.phase3.elements[i]; + u32 bucketIndex = smem.u.phase3.buckets[i]; + + // u32 dstIndex = pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex]; + u32 dstIndex = pSum[bucketIndex] + i; + outputValues[dstIndex] = item; + } + } + } + //if constexpr( keyPair ) //{ // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) @@ -786,7 +804,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // u8 bucketIndex = el.bucket; // // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; + // u32 dstIndex = pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex]; // outputValues[dstIndex] = inputValues[srcIndex]; // } // } From fd9357f02095ba61df8f1a272b7e2472969b5816 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sun, 31 Dec 2023 20:52:40 +0900 Subject: [PATCH 44/68] smaller warpOffsets --- ParallelPrimitives/RadixSortKernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 65e91be..2b989fd 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -536,8 +536,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); // u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - u32 keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + RADIX_SORT_KEY_TYPE keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + u16 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; // u32 bros[REORDER_NUMBER_OF_ITEM_PER_THREAD]; int warp = threadIdx.x / 32; 
From 1224e6d8c847b71e3770cb839dcb630a4870e55e Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 1 Jan 2024 07:33:15 +0900 Subject: [PATCH 45/68] n batch loading --- ParallelPrimitives/RadixSortKernels.h | 40 ++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 2b989fd..83a67fa 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -482,12 +482,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; // __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + constexpr int N_BATCH_LOAD = 4; struct SMem { struct Phase1 { u16 blockHistogram[BIN_SIZE]; u16 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; + RADIX_SORT_KEY_TYPE batchKeys[REORDER_NUMBER_OF_WARPS][N_BATCH_LOAD][32]; }; struct Phase2 { @@ -537,19 +539,49 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; RADIX_SORT_KEY_TYPE keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - u16 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; // u32 bros[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + bool batchLoading = ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs; + int warp = threadIdx.x / 32; int lane = threadIdx.x % 32; - for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) + for( int i = 0, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; + if( batchLoading && ( k % N_BATCH_LOAD ) == 0 ) + { + struct alignas( 16 ) BatchKeys + { + RADIX_SORT_KEY_TYPE xs[N_BATCH_LOAD]; + }; + int srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane * 
N_BATCH_LOAD; + BatchKeys batchKeys = *(BatchKeys*)&inputKeys[srcIndex]; + for( int v = 0; v < N_BATCH_LOAD; v++ ) + { + int indexInWarp = lane * N_BATCH_LOAD + v; + int toK = indexInWarp / 32; + int toLane = indexInWarp % 32; + smem.u.phase1.batchKeys[warp][toK][toLane] = batchKeys.xs[v]; + } + + __syncthreads(); + } + + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane; u32 bucketIndex = 0; if( itemIndex < numberOfInputs ) { - auto item = inputKeys[itemIndex]; + RADIX_SORT_KEY_TYPE item; + if( batchLoading ) + { + item = smem.u.phase1.batchKeys[warp][k % N_BATCH_LOAD][lane]; + } + else + { + item = inputKeys[itemIndex]; + } + bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); keys[k] = item; } From 2c997072e98d4a4c9029782d3faf5a8d6431bed0 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 1 Jan 2024 08:50:14 +0900 Subject: [PATCH 46/68] warp level is fine --- ParallelPrimitives/RadixSortKernels.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 83a67fa..a78d8c3 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -564,7 +564,11 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys smem.u.phase1.batchKeys[warp][toK][toLane] = batchKeys.xs[v]; } - __syncthreads(); +#if defined( ITS ) + __syncwarp( 0xFFFFFFFF ); +#else + __threadfence_block(); +#endif } u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane; From 59afd88d56093b5511987932adca7a5c5421eb40 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 2 Jan 2024 19:07:02 +0900 Subject: [PATCH 47/68] clean up --- ParallelPrimitives/RadixSortKernels.h | 149 -------------------------- 1 file changed, 149 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h 
index a78d8c3..0de4669 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -479,8 +479,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; __shared__ u32 pSum[BIN_SIZE]; - // __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; - // __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; constexpr int N_BATCH_LOAD = 4; struct SMem @@ -512,27 +510,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; __shared__ SMem smem; - //__shared__ u32 localPrefixSum[BIN_SIZE]; - //__shared__ u32 counters[BIN_SIZE]; - //__shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; - //__shared__ u8 elementBuckets[RADIX_SORT_BLOCK_SIZE]; - //__shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][BIN_SIZE]; - u32 bitLocation = startBits + 8 * iteration; u32 blockIndex = blockIdx.x; u32 numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); - // clearShared( localPrefixSum, 0 ); - // clearShared( counters, 0 ); - - //for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) - //{ - // for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // matchMasks[w][i] = 0; - // } - //} - clearShared( smem.u.phase1.lpSum, 0 ); __syncthreads(); @@ -722,13 +703,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pSum[i] = globalOutput; } - - //u32 prefix = 0; - //for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - //{ - // prefix += prefixSumExclusive( prefix, &smem.u.phase1.blockHistogram[i] ); - //} - scanExclusive( 0, smem.u.phase1.blockHistogram, BIN_SIZE ); if( threadIdx.x < BIN_SIZE ) @@ -827,129 +801,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } } - - //if constexpr( keyPair ) - //{ - // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // u32 
itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // if( itemIndex < numberOfInputs ) - // { - // ElementLocation el = smem.u.phase2.elementLocations[i]; - // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - // u8 bucketIndex = el.bucket; - - // // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - // u32 dstIndex = pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex]; - // outputValues[dstIndex] = inputValues[srcIndex]; - // } - // } - //} - - // A special case handling: all elements have the same digit - //u32 globalOutput = matchMasks[0][0]; - //if( globalOutput-- /* -1 for the actual offset */ ) - //{ - // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // if( itemIndex < numberOfInputs ) - // { - // u32 dstIndex = globalOutput + i; - // outputKeys[dstIndex] = inputKeys[itemIndex]; - // if constexpr( keyPair ) - // { - // outputValues[dstIndex] = inputValues[itemIndex]; - // } - // } - // } - // return; - //} - - // reorder - //for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - //{ - // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // u32 bucketIndex = elementBuckets[i]; - - // __syncthreads(); - - // int warp = threadIdx.x / 32; - // int lane = threadIdx.x % 32; - - // if( itemIndex < numberOfInputs ) - // { - // atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); - // } - - // __syncthreads(); - - // bool flushMask = false; - - // if( itemIndex < numberOfInputs ) - // { - // u32 matchMask = matchMasks[warp][bucketIndex]; - // u32 lowerMask = ( 1u << lane ) - 1; - // u32 offset = __popc( matchMask & lowerMask ); - - // flushMask = offset == 0; - - // for( int w = 0; w < warp; w++ ) - // { - // offset += __popc( matchMasks[w][bucketIndex] ); - // } - - // u32 localOffset = counters[bucketIndex] + offset; - // u32 to = localOffset + 
localPrefixSum[bucketIndex]; - - // ElementLocation el; - // el.localSrcIndex = i; - // el.localOffset = localOffset; - // el.bucket = bucketIndex; - // elementLocations[to] = el; - // } - - // __syncthreads(); - - // if( itemIndex < numberOfInputs ) - // { - // atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); - // } - // if( flushMask ) - // { - // matchMasks[warp][bucketIndex] = 0; - // } - //} - - //for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - //{ - // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // if( itemIndex < numberOfInputs ) - // { - // ElementLocation el = elementLocations[i]; - // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - // u8 bucketIndex = el.bucket; - - // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - // outputKeys[dstIndex] = inputKeys[srcIndex]; - // } - //} - //if constexpr ( keyPair ) - //{ - // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // if( itemIndex < numberOfInputs ) - // { - // ElementLocation el = elementLocations[i]; - // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - // u8 bucketIndex = el.bucket; - - // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - // outputValues[dstIndex] = inputValues[srcIndex]; - // } - // } - //} } extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) From fedd3c568fc652183543b69382021cf23f87969d Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 2 Jan 2024 19:10:53 +0900 Subject: [PATCH 48/68] psum in gHistogram --- ParallelPrimitives/RadixSort.cpp | 5 ----- ParallelPrimitives/RadixSort.h | 1 - ParallelPrimitives/RadixSortConfigs.h | 2 +- 
ParallelPrimitives/RadixSortKernels.h | 22 +++++----------------- 4 files changed, 6 insertions(+), 24 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 3f31121..4ab14c6 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -164,7 +164,6 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string // TODO: bit code support? #define LOAD_FUNC( var, kernel ) var = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), kernel, &opts ); LOAD_FUNC( m_gHistogram, "gHistogram" ); - LOAD_FUNC( m_gPrefixSum, "gPrefixSum" ); LOAD_FUNC( m_onesweep_reorderKey64, "onesweep_reorderKey64" ); LOAD_FUNC( m_onesweep_reorderKeyPair64, "onesweep_reorderKeyPair64" ); #undef LOAD_FUNC @@ -232,10 +231,6 @@ void RadixSort::sort( const KeyValueSoA& src, const KeyValueSoA& dst, uint32_t n const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &counter }; OrochiUtils::launch1D( m_gHistogram, nBlocks * GHISTOGRAM_THREADS_PER_BLOCK, args, GHISTOGRAM_THREADS_PER_BLOCK, 0, stream ); } - { - const void* args[] = { &gpSumBuffer }; - OrochiUtils::launch1D( m_gPrefixSum, nIteration * BIN_SIZE, args, BIN_SIZE, 0, stream ); - } auto s = src; auto d = dst; diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index 2a7c3be..a20c9f0 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -75,7 +75,6 @@ class RadixSort final OrochiUtils& m_oroutils; oroFunction m_gHistogram; - oroFunction m_gPrefixSum; oroFunction m_onesweep_reorderKey64; oroFunction m_onesweep_reorderKeyPair64; diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index bd0b70f..9f3701f 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -36,7 +36,7 @@ constexpr int MAX_LOOK_BACK = 64; constexpr int TAIL_BITS = 5; constexpr int TAIL_COUNT = 1u << TAIL_BITS; -//static_assert( 
REORDER_NUMBER_OF_THREADS_PER_BLOCK <= BIN_SIZE, "please check prefixSumExclusive on onesweep_reorder" ); +static_assert( BIN_SIZE <= REORDER_NUMBER_OF_THREADS_PER_BLOCK, "please check scanExclusive" ); //static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 0de4669..7e73717 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -442,7 +442,10 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf if( hasData ) { - __syncthreads(); + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + { + scanExclusive( 0, &localCounters[i][0], BIN_SIZE ); + } for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { @@ -454,19 +457,6 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf } } -extern "C" __global__ void gPrefixSum( u32* gpSumBuffer ) -{ - __shared__ u32 smem[BIN_SIZE]; - - smem[threadIdx.x] = gpSumBuffer[blockIdx.x * BIN_SIZE + threadIdx.x]; - - __syncthreads(); - - prefixSumExclusive( 0, smem ); - - gpSumBuffer[blockIdx.x * BIN_SIZE + threadIdx.x] = smem[threadIdx.x]; -} - template __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) @@ -518,12 +508,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - // u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; RADIX_SORT_KEY_TYPE keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - // u32 bros[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - bool batchLoading = ( 
blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs; + bool batchLoading = KEY_IS_16BYTE_ALIGNED && ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs; int warp = threadIdx.x / 32; int lane = threadIdx.x % 32; From 19dd9fa0165210b7c09407be0bc678479829bfec Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 2 Jan 2024 19:12:16 +0900 Subject: [PATCH 49/68] remove unused --- ParallelPrimitives/RadixSortKernels.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 7e73717..d71aefc 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -461,13 +461,6 @@ template __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { - struct ElementLocation - { - u32 localSrcIndex : 12; - u32 localOffset : 12; - u32 bucket : 8; - }; - __shared__ u32 pSum[BIN_SIZE]; constexpr int N_BATCH_LOAD = 4; @@ -481,12 +474,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; struct Phase2 { - // ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; RADIX_SORT_KEY_TYPE elements[RADIX_SORT_BLOCK_SIZE]; }; struct Phase3 { - // ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; RADIX_SORT_VALUE_TYPE elements[RADIX_SORT_BLOCK_SIZE]; u8 buckets[RADIX_SORT_BLOCK_SIZE]; }; @@ -558,7 +549,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); keys[k] = item; } - // bucketIndices[k] = bucketIndex; int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; @@ -574,7 
+564,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys #endif broThreads &= ~difference; } - // bros[k] = broThreads; + int laneIndex = threadIdx.x % 32; u32 lowerMask = ( 1u << laneIndex ) - 1; From ac0605d9711028d00bfb7958e1e276cbae506896 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 00:23:11 +0900 Subject: [PATCH 50/68] refactor --- ParallelPrimitives/RadixSortConfigs.h | 2 ++ ParallelPrimitives/RadixSortKernels.h | 34 +++++++++++++-------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 9f3701f..178ffa4 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -20,6 +20,8 @@ constexpr auto SINGLE_SORT_WG_SIZE{ 128 }; static_assert( BIN_SIZE % 2 == 0 ); +constexpr int WARP_SIZE = 32; + constexpr int RADIX_SORT_BLOCK_SIZE = 2048 + 1024 + 1024; // constexpr int RADIX_SORT_BLOCK_SIZE = 512; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index d71aefc..ee1e4af 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -364,11 +364,11 @@ __device__ inline T scanExclusive( T prefix, T* sMemIO, int nElement ) extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOfInputs, u32* gpSumBuffer, u32 startBits, u32* counter ) { - __shared__ u32 localCounters[sizeof( RADIX_SORT_KEY_TYPE )][256]; + __shared__ u32 localCounters[sizeof( RADIX_SORT_KEY_TYPE )][BIN_SIZE]; for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - for( int j = threadIdx.x; j < 256; j += GHISTOGRAM_THREADS_PER_BLOCK ) + for( int j = threadIdx.x; j < BIN_SIZE; j += GHISTOGRAM_THREADS_PER_BLOCK ) { localCounters[i][j] = 0; } @@ -406,7 +406,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf auto item = key4.xs[k]; for( int i = 0; i < sizeof( 
RADIX_SORT_KEY_TYPE ); i++ ) { - u32 bitLocation = startBits + i * 8; + u32 bitLocation = startBits + i * N_RADIX; u32 bits = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } @@ -423,7 +423,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf auto item = inputs[itemIndex]; for( int j = 0; j < sizeof( RADIX_SORT_KEY_TYPE ); j++ ) { - u32 bitLocation = startBits + j * 8; + u32 bitLocation = startBits + j * N_RADIX; u32 bits = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localCounters[j][bits], 0xFFFFFFFF ); } @@ -470,7 +470,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { u16 blockHistogram[BIN_SIZE]; u16 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; - RADIX_SORT_KEY_TYPE batchKeys[REORDER_NUMBER_OF_WARPS][N_BATCH_LOAD][32]; + RADIX_SORT_KEY_TYPE batchKeys[REORDER_NUMBER_OF_WARPS][N_BATCH_LOAD][WARP_SIZE]; }; struct Phase2 { @@ -491,7 +491,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; __shared__ SMem smem; - u32 bitLocation = startBits + 8 * iteration; + u32 bitLocation = startBits + N_RADIX * iteration; u32 blockIndex = blockIdx.x; u32 numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); @@ -504,9 +504,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys bool batchLoading = KEY_IS_16BYTE_ALIGNED && ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs; - int warp = threadIdx.x / 32; - int lane = threadIdx.x % 32; - for( int i = 0, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) + int warp = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x % WARP_SIZE; + for( int i = 0, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) { if( batchLoading && ( k % N_BATCH_LOAD ) == 0 ) { @@ -519,8 +519,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int v = 0; v < N_BATCH_LOAD; 
v++ ) { int indexInWarp = lane * N_BATCH_LOAD + v; - int toK = indexInWarp / 32; - int toLane = indexInWarp % 32; + int toK = indexInWarp / WARP_SIZE; + int toLane = indexInWarp % WARP_SIZE; smem.u.phase1.batchKeys[warp][toK][toLane] = batchKeys.xs[v]; } @@ -550,10 +550,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys keys[k] = item; } - int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 + int nNoneActiveItems = WARP_SIZE - u32min( numberOfInputs - ( itemIndex - lane ), WARP_SIZE ); // 0 - 32 u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; - for( int j = 0; j < 8; ++j ) + for( int j = 0; j < N_RADIX; ++j ) { u32 bit = ( bucketIndex >> j ) & 0x1; u32 difference = ( 0xFFFFFFFF * bit ) ^ @@ -565,8 +565,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys broThreads &= ~difference; } - int laneIndex = threadIdx.x % 32; - u32 lowerMask = ( 1u << laneIndex ) - 1; + u32 lowerMask = ( 1u << lane ) - 1; if( itemIndex < numberOfInputs ) { @@ -632,7 +631,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - //u32 s = localPrefixSum[i]; u32 s = smem.u.phase1.blockHistogram[i]; int pIndex = BIN_SIZE * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; @@ -722,7 +720,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); @@ -752,7 +750,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { __syncthreads(); - for( int i = lane, k = 0; i < 
REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); From 34ac3652390d19d8f555a54350d9ce5e36358645 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 11:10:22 +0900 Subject: [PATCH 51/68] fix undefined behavior and simplify --- ParallelPrimitives/RadixSortKernels.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index ee1e4af..3ec8bd5 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -285,7 +285,6 @@ __device__ inline u32 getKeyBits( u32 x ) { return x ^ ORDER_MASK_32; } __device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } __device__ inline u32 extractDigit( u32 x, u32 bitLocation ) { return ( x >> bitLocation ) & RADIX_MASK; } __device__ inline u32 extractDigit( u64 x, u32 bitLocation ) { return (u32)( ( x >> bitLocation ) & RADIX_MASK ); } -__device__ __forceinline__ u32 u32min( u32 x, u32 y ) { return ( y < x ) ? 
y : x; } template __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) @@ -550,8 +549,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys keys[k] = item; } - int nNoneActiveItems = WARP_SIZE - u32min( numberOfInputs - ( itemIndex - lane ), WARP_SIZE ); // 0 - 32 - u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; + // check the attendees + u32 broThreads = +#if defined( ITS ) + __ballot_sync( 0xFFFFFFFF, +#else + __ballot( +#endif + itemIndex < numberOfInputs ); for( int j = 0; j < N_RADIX; ++j ) { From 3289c1b663bec727fa19b6153271111542819870 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 11:40:09 +0900 Subject: [PATCH 52/68] simplify --- ParallelPrimitives/RadixSortKernels.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 3ec8bd5..207affb 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -571,18 +571,15 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } u32 lowerMask = ( 1u << lane ) - 1; - - if( itemIndex < numberOfInputs ) - { - warpOffsets[k] = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); - } + warpOffsets[k] = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); + #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); #else __threadfence_block(); #endif - bool leader = ( broThreads & lowerMask ) == 0; - if( itemIndex < numberOfInputs && leader ) + u32 leaderIdx = __ffs( broThreads ) - 1; + if( lane == leaderIdx ) { u32 n = __popc( broThreads ); smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; From 680a91026e4a386a9dd72726430846bc8cb8bf58 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 11:46:49 +0900 Subject: [PATCH 53/68] refactor --- 
ParallelPrimitives/RadixSortKernels.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 207affb..2e03f27 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -571,7 +571,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } u32 lowerMask = ( 1u << lane ) - 1; - warpOffsets[k] = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); + auto digitCount = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; + warpOffsets[k] = digitCount + __popc( broThreads & lowerMask ); #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); @@ -581,8 +582,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 leaderIdx = __ffs( broThreads ) - 1; if( lane == leaderIdx ) { - u32 n = __popc( broThreads ); - smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; + smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] = digitCount + __popc( broThreads ); } #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); @@ -593,9 +593,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - if( threadIdx.x < BIN_SIZE ) + for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - int bucketIndex = threadIdx.x; u32 s = 0; for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) { @@ -683,9 +682,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys scanExclusive( 0, smem.u.phase1.blockHistogram, BIN_SIZE ); - if( threadIdx.x < BIN_SIZE ) + for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - int bucketIndex = threadIdx.x; u32 s = smem.u.phase1.blockHistogram[bucketIndex]; for( int warp = 0; warp < 
REORDER_NUMBER_OF_WARPS; warp++ ) { @@ -714,9 +712,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys warpOffsets[k] += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; } - if( threadIdx.x < BIN_SIZE ) + for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - pSum[threadIdx.x] -= smem.u.phase1.blockHistogram[threadIdx.x]; + pSum[bucketIndex] -= smem.u.phase1.blockHistogram[bucketIndex]; } __syncthreads(); From d7d0274c3e4e59dce69cc3afea9f43dabf14c254 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 12:30:25 +0900 Subject: [PATCH 54/68] support non blockDim != 256 case --- ParallelPrimitives/RadixSortConfigs.h | 9 ++++----- ParallelPrimitives/RadixSortKernels.h | 8 +++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 178ffa4..1cb1639 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -22,23 +22,22 @@ static_assert( BIN_SIZE % 2 == 0 ); constexpr int WARP_SIZE = 32; -constexpr int RADIX_SORT_BLOCK_SIZE = 2048 + 1024 + 1024; -// constexpr int RADIX_SORT_BLOCK_SIZE = 512; +constexpr int RADIX_SORT_BLOCK_SIZE = 4096; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; constexpr int REORDER_NUMBER_OF_WARPS = 8; -constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; +constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = WARP_SIZE * REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = RADIX_SORT_BLOCK_SIZE / REORDER_NUMBER_OF_WARPS; -constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; +constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / WARP_SIZE; constexpr int LOOKBACK_TABLE_SIZE = 1024; constexpr int MAX_LOOK_BACK = 64; constexpr 
int TAIL_BITS = 5; constexpr int TAIL_COUNT = 1u << TAIL_BITS; -static_assert( BIN_SIZE <= REORDER_NUMBER_OF_THREADS_PER_BLOCK, "please check scanExclusive" ); +//static_assert( BIN_SIZE <= REORDER_NUMBER_OF_THREADS_PER_BLOCK, "please check scanExclusive" ); //static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 2e03f27..51a06f3 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -680,7 +680,13 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pSum[i] = globalOutput; } - scanExclusive( 0, smem.u.phase1.blockHistogram, BIN_SIZE ); + __syncthreads(); + + u32 prefix = 0; + for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + prefix += scanExclusive( prefix, smem.u.phase1.blockHistogram + i, min( REORDER_NUMBER_OF_THREADS_PER_BLOCK, BIN_SIZE ) ); + } for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { From dea141111282b61e9096a96404eec4d20a27cffd Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 12:33:15 +0900 Subject: [PATCH 55/68] remove unused --- ParallelPrimitives/RadixSortKernels.h | 31 --------------------------- 1 file changed, 31 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 51a06f3..1bec3eb 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -286,37 +286,6 @@ __device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } __device__ inline u32 extractDigit( u32 x, u32 bitLocation ) { return ( x >> bitLocation ) & RADIX_MASK; } __device__ inline u32 extractDigit( u64 x, u32 bitLocation ) { return (u32)( ( x >> bitLocation ) & RADIX_MASK ); } 
-template -__device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) -{ - u32 value = sMemIO[threadIdx.x]; - - for( u32 offset = 1; offset < NThreads; offset <<= 1 ) - { - u32 x = sMemIO[threadIdx.x]; - - if( offset <= threadIdx.x ) - { - x += sMemIO[threadIdx.x - offset]; - } - - __syncthreads(); - - sMemIO[threadIdx.x] = x; - - __syncthreads(); - } - u32 sum = sMemIO[NThreads - 1]; - - __syncthreads(); - - sMemIO[threadIdx.x] += prefix - value; - - __syncthreads(); - - return sum; -} - template __device__ inline T scanExclusive( T prefix, T* sMemIO, int nElement ) { From c6f871bfc6eb00d0fb0bc001f230eb4c14a74461 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 12:50:02 +0900 Subject: [PATCH 56/68] reduce loops and ealier tail iterator is better --- ParallelPrimitives/RadixSortKernels.h | 29 ++++++++++++--------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 1bec3eb..185b271 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -651,6 +651,16 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); + if( threadIdx.x == 0 ) + { + while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) + ; + + atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); + } + + __syncthreads(); + u32 prefix = 0; for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { @@ -660,6 +670,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { u32 s = smem.u.phase1.blockHistogram[bucketIndex]; + + pSum[bucketIndex] -= s; // pre-substruct to avoid pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex] to calculate destinations + for( 
int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) { int index = bucketIndex * REORDER_NUMBER_OF_WARPS + warp; @@ -671,30 +684,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - if( threadIdx.x == 0 ) - { - while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) - ; - - atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); - } - - __syncthreads(); - for( int k = 0; k < REORDER_NUMBER_OF_ITEM_PER_THREAD; k++ ) { u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); warpOffsets[k] += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; } - for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - pSum[bucketIndex] -= smem.u.phase1.blockHistogram[bucketIndex]; - } - __syncthreads(); - for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; From 402db80aef7902c3e036a26bd7d037bc98fcf69f Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 4 Jan 2024 00:06:58 +0900 Subject: [PATCH 57/68] remove redundant sync --- ParallelPrimitives/RadixSortKernels.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 185b271..854c0c1 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -342,8 +342,6 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf } } - __syncthreads(); - u32 numberOfBlocks = div_round_up( numberOfInputs, GHISTOGRAM_ITEM_PER_BLOCK ); __shared__ u32 iBlock; if( threadIdx.x == 0 ) From 16e046cf2d345e0ba9aa5354ab3f852eae9cf6b5 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 4 Jan 2024 00:07:14 +0900 Subject: [PATCH 58/68] use constant decl --- 
ParallelPrimitives/RadixSortConfigs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 1cb1639..7d4c29a 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -30,7 +30,7 @@ constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; constexpr int REORDER_NUMBER_OF_WARPS = 8; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = WARP_SIZE * REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = RADIX_SORT_BLOCK_SIZE / REORDER_NUMBER_OF_WARPS; -constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / WARP_SIZE; +constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; constexpr int LOOKBACK_TABLE_SIZE = 1024; constexpr int MAX_LOOK_BACK = 64; From 69b09b17560f56f5e887b900528b0ee6d6963183 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sun, 14 Jan 2024 13:05:18 +0900 Subject: [PATCH 59/68] shorten --- ParallelPrimitives/RadixSortKernels.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 854c0c1..a8ad5a3 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -498,20 +498,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane; - u32 bucketIndex = 0; if( itemIndex < numberOfInputs ) { - RADIX_SORT_KEY_TYPE item; - if( batchLoading ) - { - item = smem.u.phase1.batchKeys[warp][k % N_BATCH_LOAD][lane]; - } - else - { - item = inputKeys[itemIndex]; - } - + RADIX_SORT_KEY_TYPE item = batchLoading ? 
smem.u.phase1.batchKeys[warp][k % N_BATCH_LOAD][lane] : inputKeys[itemIndex]; bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); keys[k] = item; } From 2261c0280fc925a24e59396cf0d05c82c57201b3 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sun, 14 Jan 2024 20:35:12 +0900 Subject: [PATCH 60/68] remove unused --- ParallelPrimitives/RadixSortKernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index a8ad5a3..8436890 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -692,7 +692,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - for( int i = threadIdx.x, k = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK, k++ ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) @@ -723,7 +723,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - for( int i = threadIdx.x, k = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK, k++ ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) From 6723b8e88f6f4b1a84890b9332ab03a9b58ead09 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 18 Jan 2024 21:56:38 +0900 Subject: [PATCH 61/68] remove too much optimizations, fix potential sync issue etc --- ParallelPrimitives/RadixSortKernels.h | 60 ++++++++------------------- 1 file changed, 17 insertions(+), 43 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 8436890..7605016 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ 
b/ParallelPrimitives/RadixSortKernels.h @@ -292,15 +292,10 @@ __device__ inline T scanExclusive( T prefix, T* sMemIO, int nElement ) // assert(nElement <= nThreads) bool active = threadIdx.x < nElement; T value = active ? sMemIO[threadIdx.x] : 0; + T x = value; for( u32 offset = 1; offset < nElement; offset <<= 1 ) { - T x; - if( active ) - { - x = sMemIO[threadIdx.x]; - } - if( active && offset <= threadIdx.x ) { x += sMemIO[threadIdx.x - offset]; @@ -322,7 +317,7 @@ __device__ inline T scanExclusive( T prefix, T* sMemIO, int nElement ) if( active ) { - sMemIO[threadIdx.x] += prefix - value; + sMemIO[threadIdx.x] = x + prefix - value; } __syncthreads(); @@ -436,7 +431,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { u16 blockHistogram[BIN_SIZE]; u16 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; - RADIX_SORT_KEY_TYPE batchKeys[REORDER_NUMBER_OF_WARPS][N_BATCH_LOAD][WARP_SIZE]; }; struct Phase2 { @@ -468,43 +462,21 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys RADIX_SORT_KEY_TYPE keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - bool batchLoading = KEY_IS_16BYTE_ALIGNED && ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs; - int warp = threadIdx.x / WARP_SIZE; int lane = threadIdx.x % WARP_SIZE; + for( int i = 0, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) { - if( batchLoading && ( k % N_BATCH_LOAD ) == 0 ) - { - struct alignas( 16 ) BatchKeys - { - RADIX_SORT_KEY_TYPE xs[N_BATCH_LOAD]; - }; - int srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane * N_BATCH_LOAD; - BatchKeys batchKeys = *(BatchKeys*)&inputKeys[srcIndex]; - for( int v = 0; v < N_BATCH_LOAD; v++ ) - { - int indexInWarp = lane * N_BATCH_LOAD + v; - int toK = indexInWarp / WARP_SIZE; - int toLane = indexInWarp % WARP_SIZE; - smem.u.phase1.batchKeys[warp][toK][toLane] = batchKeys.xs[v]; - } - -#if defined( 
ITS ) - __syncwarp( 0xFFFFFFFF ); -#else - __threadfence_block(); -#endif - } - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane; - u32 bucketIndex = 0; if( itemIndex < numberOfInputs ) { - RADIX_SORT_KEY_TYPE item = batchLoading ? smem.u.phase1.batchKeys[warp][k % N_BATCH_LOAD][lane] : inputKeys[itemIndex]; - bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); - keys[k] = item; + keys[k] = inputKeys[itemIndex]; } + } + for( int i = 0, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) + { + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane; + u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); // check the attendees u32 broThreads = @@ -534,7 +506,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); #else - __threadfence_block(); + __syncthreads(); #endif u32 leaderIdx = __ffs( broThreads ) - 1; if( lane == leaderIdx ) @@ -544,7 +516,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); #else - __threadfence_block(); + __syncthreads(); #endif } @@ -661,9 +633,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pSum[bucketIndex] -= s; // pre-substruct to avoid pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex] to calculate destinations - for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) + for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) { - int index = bucketIndex * REORDER_NUMBER_OF_WARPS + warp; + int index = bucketIndex * REORDER_NUMBER_OF_WARPS + w; u32 n = smem.u.phase1.lpSum[index]; smem.u.phase1.lpSum[index] = s; s += n; @@ -738,12 +710,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } } -extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* 
inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, +extern "C" __global__ void __launch_bounds__( REORDER_NUMBER_OF_THREADS_PER_BLOCK ) onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } -extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, +extern "C" __global__ void __launch_bounds__( REORDER_NUMBER_OF_THREADS_PER_BLOCK ) onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, + u32 numberOfInputs, + u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); From 1d399efa658ab185a5ace715ff042c1cccbdf080 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 23 Feb 2024 18:41:31 +0900 Subject: [PATCH 62/68] remove unused branching. 
Thanks to ChihChen --- ParallelPrimitives/RadixSortKernels.h | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 7605016..ec52952 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -346,12 +346,8 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf __syncthreads(); - bool hasData = false; - while( iBlock < numberOfBlocks ) { - hasData = true; - if( KEY_IS_16BYTE_ALIGNED && ( iBlock + 1 ) * GHISTOGRAM_ITEM_PER_BLOCK <= numberOfInputs ) { for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK * 4 ) @@ -401,19 +397,16 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf __syncthreads(); } - if( hasData ) + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) - { - scanExclusive( 0, &localCounters[i][0], BIN_SIZE ); - } + scanExclusive( 0, &localCounters[i][0], BIN_SIZE ); + } - for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + { + for( int j = threadIdx.x; j < BIN_SIZE; j += GHISTOGRAM_THREADS_PER_BLOCK ) { - for( int j = threadIdx.x; j < BIN_SIZE; j += GHISTOGRAM_THREADS_PER_BLOCK ) - { - atomicAdd( &gpSumBuffer[BIN_SIZE * i + j], localCounters[i][j] ); - } + atomicAdd( &gpSumBuffer[BIN_SIZE * i + j], localCounters[i][j] ); } } } From 20b8e5569718b59518136cba0b3df67a4d9ed295 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 23 Feb 2024 20:31:37 +0900 Subject: [PATCH 63/68] refactor the tail iterator conditions --- ParallelPrimitives/RadixSortConfigs.h | 3 ++- ParallelPrimitives/RadixSortKernels.h | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 7d4c29a..87036e2 100644 
--- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -35,7 +35,8 @@ constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WAR constexpr int LOOKBACK_TABLE_SIZE = 1024; constexpr int MAX_LOOK_BACK = 64; constexpr int TAIL_BITS = 5; -constexpr int TAIL_COUNT = 1u << TAIL_BITS; +constexpr auto TAIL_MASK = 0xFFFFFFFFu << TAIL_BITS; +static_assert( MAX_LOOK_BACK < LOOKBACK_TABLE_SIZE, "" ); //static_assert( BIN_SIZE <= REORDER_NUMBER_OF_THREADS_PER_BLOCK, "please check scanExclusive" ); //static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index ec52952..2a1785b 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -546,8 +546,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( threadIdx.x == 0 && LOOKBACK_TABLE_SIZE <= blockIndex ) { - u32 mustBeDone = blockIndex - LOOKBACK_TABLE_SIZE + MAX_LOOK_BACK; - while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) * TAIL_COUNT <= mustBeDone ) + // Wait until blockIndex < tail - MAX_LOOK_BACK + LOOKBACK_TABLE_SIZE + while( ( atomicAdd( tailIterator, 0 ) & TAIL_MASK ) - MAX_LOOK_BACK + LOOKBACK_TABLE_SIZE <= blockIndex ) ; } __syncthreads(); @@ -606,7 +606,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( threadIdx.x == 0 ) { - while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) + while( ( atomicAdd( tailIterator, 0 ) & TAIL_MASK ) != ( blockIndex & TAIL_MASK ) ) ; atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); From a1731faa0827acec0b1385406f462f0b223ba11f Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 23 Feb 2024 23:14:26 +0900 Subject: [PATCH 64/68] simple code is just fine at gHistogram. 
No more KEY_IS_16BYTE_ALIGNED --- ParallelPrimitives/RadixSortConfigs.h | 1 + ParallelPrimitives/RadixSortKernels.h | 45 ++++++--------------------- 2 files changed, 10 insertions(+), 36 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 87036e2..40cd112 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -26,6 +26,7 @@ constexpr int RADIX_SORT_BLOCK_SIZE = 4096; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; +constexpr int GHISTOGRAM_ITEMS_PER_THREAD = GHISTOGRAM_ITEM_PER_BLOCK / GHISTOGRAM_THREADS_PER_BLOCK; constexpr int REORDER_NUMBER_OF_WARPS = 8; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = WARP_SIZE * REORDER_NUMBER_OF_WARPS; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 2a1785b..f2e229a 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -253,9 +253,6 @@ extern "C" __global__ void SortSinglePassKernel( int* gSrcKey, int* gDstKey, int extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int gN, const int START_BIT, const int END_BIT ) { SortSinglePass( gSrcKey, gSrcVal, gDstKey, gDstVal, gN, START_BIT, END_BIT ); } - -constexpr auto KEY_IS_16BYTE_ALIGNED = true; - using RADIX_SORT_KEY_TYPE = u32; using RADIX_SORT_VALUE_TYPE = u32; @@ -348,45 +345,21 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf while( iBlock < numberOfBlocks ) { - if( KEY_IS_16BYTE_ALIGNED && ( iBlock + 1 ) * GHISTOGRAM_ITEM_PER_BLOCK <= numberOfInputs ) + for( int j = 0; j < GHISTOGRAM_ITEMS_PER_THREAD; j++ ) { - for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK * 4 ) - { - u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i + threadIdx.x * 4; - struct alignas( 16 ) Key4 - { - RADIX_SORT_KEY_TYPE xs[4]; - }; - 
Key4 key4 = *(Key4*)&inputs[itemIndex]; - for( int k = 0; k < 4; k++ ) - { - auto item = key4.xs[k]; - for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) - { - u32 bitLocation = startBits + i * N_RADIX; - u32 bits = extractDigit( getKeyBits( item ), bitLocation ); - atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); - } - } - } - } - else - { - for( int i = threadIdx.x; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK ) + u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x * GHISTOGRAM_ITEMS_PER_THREAD + j; + if( itemIndex < numberOfInputs ) { - u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i; - if( itemIndex < numberOfInputs ) + auto item = inputs[itemIndex]; + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - auto item = inputs[itemIndex]; - for( int j = 0; j < sizeof( RADIX_SORT_KEY_TYPE ); j++ ) - { - u32 bitLocation = startBits + j * N_RADIX; - u32 bits = extractDigit( getKeyBits( item ), bitLocation ); - atomicInc( &localCounters[j][bits], 0xFFFFFFFF ); - } + u32 bitLocation = startBits + i * N_RADIX; + u32 bits = extractDigit( getKeyBits( item ), bitLocation ); + atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } } } + __syncthreads(); if( threadIdx.x == 0 ) From d5beef7e6ba7a93af662e74d0d948f0dd252ab0d Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sat, 24 Feb 2024 14:41:35 +0900 Subject: [PATCH 65/68] remove unused --- ParallelPrimitives/RadixSortKernels.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index f2e229a..9b5849e 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -390,7 +390,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { __shared__ u32 pSum[BIN_SIZE]; - constexpr int N_BATCH_LOAD = 4; struct SMem { struct Phase1 From 1ff67d955d7893e98e2b24d233f41be35222cf52 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sat, 
24 Feb 2024 15:09:34 +0900 Subject: [PATCH 66/68] unify atomicInc --- ParallelPrimitives/RadixSortKernels.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 9b5849e..75e9451 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -336,15 +336,18 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf u32 numberOfBlocks = div_round_up( numberOfInputs, GHISTOGRAM_ITEM_PER_BLOCK ); __shared__ u32 iBlock; - if( threadIdx.x == 0 ) + for(;;) { - iBlock = atomicInc( counter, 0xFFFFFFFF ); - } + if( threadIdx.x == 0 ) + { + iBlock = atomicInc( counter, 0xFFFFFFFF ); + } - __syncthreads(); + __syncthreads(); - while( iBlock < numberOfBlocks ) - { + if( numberOfBlocks <= iBlock ) + break; + for( int j = 0; j < GHISTOGRAM_ITEMS_PER_THREAD; j++ ) { u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x * GHISTOGRAM_ITEMS_PER_THREAD + j; @@ -361,13 +364,6 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf } __syncthreads(); - - if( threadIdx.x == 0 ) - { - iBlock = atomicInc( counter, 0xFFFFFFFF ); - } - - __syncthreads(); } for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) From b3af1e9dbd7d13b532f50a3230bf5de0a583eccc Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 1 Mar 2024 19:08:42 +0900 Subject: [PATCH 67/68] remove temporal splitmix64 --- Test/RadixSort/main.cpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/Test/RadixSort/main.cpp b/Test/RadixSort/main.cpp index 2f7578d..090da60 100644 --- a/Test/RadixSort/main.cpp +++ b/Test/RadixSort/main.cpp @@ -49,19 +49,6 @@ class Stopwatch }; #endif -struct splitmix64 -{ - uint64_t x = 0; /* The state can be seeded with any value. 
*/ - - uint64_t next() - { - uint64_t z = ( x += 0x9e3779b97f4a7c15 ); - z = ( z ^ ( z >> 30 ) ) * 0xbf58476d1ce4e5b9; - z = ( z ^ ( z >> 27 ) ) * 0x94d049bb133111eb; - return z ^ ( z >> 31 ); - } -}; - using u64 = Oro::RadixSort::u64; using u32 = Oro::RadixSort::u32; @@ -82,13 +69,9 @@ class SortTest std::vector srcKey( testSize ); - splitmix64 rng; for( int i = 0; i < testSize; i++ ) { srcKey[i] = getRandom( 0u, (u32)( ( 1ull << (u64)testBits ) - 1 ) ); - - //u32 mask = (u32)( ( 1ull << (u64)testBits ) - 1 ); - //srcKey[i] = rng.next() & mask; } std::vector srcValue( testSize ); From 656e5782b90349cc6680e2f16aec3f93b5ce8584 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 1 Mar 2024 19:13:43 +0900 Subject: [PATCH 68/68] use arg_cast instead --- ParallelPrimitives/RadixSort.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 4ab14c6..62c7415 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -212,23 +212,17 @@ void RadixSort::sort( const KeyValueSoA& src, const KeyValueSoA& dst, uint32_t n int nIteration = div_round_up64( endBit - startBit, 8 ); uint64_t numberOfBlocks = div_round_up64( n, RADIX_SORT_BLOCK_SIZE ); - // Buffers - void* gpSumBuffer = m_gpSumBuffer.ptr(); - void* lookBackBuffer = m_lookbackBuffer.ptr(); - void* tailIteratorBuffer = m_tailIterator.ptr(); - m_lookbackBuffer.resetAsync( stream ); m_gpSumCounter.resetAsync( stream ); m_gpSumBuffer.resetAsync( stream ); // counter for gHistogram. { - void* counter = m_gpSumCounter.ptr(); int maxBlocksPerMP = 0; oroError e = oroOccupancyMaxActiveBlocksPerMultiprocessor( &maxBlocksPerMP, m_gHistogram, GHISTOGRAM_THREADS_PER_BLOCK, 0 ); const int nBlocks = e == oroSuccess ? 
maxBlocksPerMP * m_props.multiProcessorCount : 2048; - const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &counter }; + const void* args[] = { &src.key, &n, arg_cast( m_gpSumBuffer.address() ), &startBit, arg_cast( m_gpSumCounter.address() ) }; OrochiUtils::launch1D( m_gHistogram, nBlocks * GHISTOGRAM_THREADS_PER_BLOCK, args, GHISTOGRAM_THREADS_PER_BLOCK, 0, stream ); } @@ -243,12 +237,12 @@ void RadixSort::sort( const KeyValueSoA& src, const KeyValueSoA& dst, uint32_t n if( keyPair ) { - const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, &gpSumBuffer, &lookBackBuffer, &tailIteratorBuffer, & startBit, &i }; + const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, arg_cast( m_gpSumBuffer.address() ), arg_cast( m_lookbackBuffer.address() ), arg_cast( m_tailIterator.address() ), &startBit, &i }; OrochiUtils::launch1D( m_onesweep_reorderKeyPair64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } else { - const void* args[] = { &s.key, &d.key, &n, &gpSumBuffer, &lookBackBuffer, &tailIteratorBuffer, &startBit, &i }; + const void* args[] = { &s.key, &d.key, &n, arg_cast( m_gpSumBuffer.address() ), arg_cast( m_lookbackBuffer.address() ), arg_cast( m_tailIterator.address() ), &startBit, &i }; OrochiUtils::launch1D( m_onesweep_reorderKey64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } std::swap( s, d );