From 49150057170d2911ccdb02e0f31ef110df0c0e2c Mon Sep 17 00:00:00 2001 From: Chih-Chen Kao Date: Thu, 7 Sep 2023 13:31:10 +0200 Subject: [PATCH 01/68] Add kernelPath and includeDir to the ctor Signed-off-by: Chih-Chen Kao --- ParallelPrimitives/RadixSort.cpp | 8 ++++---- ParallelPrimitives/RadixSort.h | 6 +++--- ParallelPrimitives/RadixSort.inl | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 2a80264..2edc763 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -56,7 +56,7 @@ void printKernelInfo( const std::string& name, oroFunction func ) namespace Oro { -RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils ) : m_device{ device }, m_oroutils{ oroutils } +RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils, const std::string& kernelPath, const std::string& includeDir ) : m_device{ device }, m_oroutils{ oroutils } { oroGetDeviceProperties( &m_props, device ); @@ -72,10 +72,10 @@ RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils ) : m_device{ devi assert( m_num_threads_per_block_for_scan % warp_size == 0 ); assert( m_num_threads_per_block_for_sort % warp_size == 0 ); - configure(); + configure( kernelPath, includeDir ); } -void RadixSort::exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu, oroStream stream ) const noexcept +void RadixSort::exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept { const auto buffer_size = countsGpu.size(); @@ -210,7 +210,7 @@ int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept return number_of_blocks; } -void RadixSort::configure( const std::string& kernelPath, const std::string& includeDir, oroStream stream ) noexcept +void RadixSort::configure( const std::string& kernelPath, const std::string& includeDir ) noexcept { compileKernels( kernelPath, includeDir ); diff --git 
a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index d529feb..6df72f5 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -31,7 +31,7 @@ class RadixSort final LOG, }; - RadixSort( oroDevice device, OrochiUtils& oroutils ); + RadixSort( oroDevice device, OrochiUtils& oroutils, const std::string& kernelPath = "", const std::string& includeDir = "" ); // Allow move but disallow copy. RadixSort( RadixSort&& ) noexcept = default; @@ -61,12 +61,12 @@ class RadixSort final /// It copies the count result from the Device to Host before computation, and then copies the offsets back from Host to Device afterward. /// @param countsGpu The count result in GPU memory. Otuput: The offset. /// @param offsetsGpu The offsets. - void exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu, oroStream stream ) const noexcept; + void exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept; /// @brief Configure the settings, compile the kernels and allocate the memory. /// @param kernelPath The kernel path. /// @param includeDir The include directory. 
- void configure( const std::string& kernelPath = "", const std::string& includeDir = "", oroStream stream = 0 ) noexcept; + void configure( const std::string& kernelPath, const std::string& includeDir ) noexcept; private: // GPU blocks for the count kernel diff --git a/ParallelPrimitives/RadixSort.inl b/ParallelPrimitives/RadixSort.inl index 0577132..77df463 100644 --- a/ParallelPrimitives/RadixSort.inl +++ b/ParallelPrimitives/RadixSort.inl @@ -95,7 +95,7 @@ void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int en { case ScanAlgo::SCAN_CPU: { - exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer, stream ); + exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); } break; @@ -116,7 +116,7 @@ void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int en break; default: - exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer, stream ); + exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); break; } }; From b725f75045b852d6632fa2ec428ad9f4f67c9bba Mon Sep 17 00:00:00 2001 From: Chih-Chen Kao Date: Fri, 8 Sep 2023 07:58:20 +0200 Subject: [PATCH 02/68] Add missing header Signed-off-by: Chih-Chen Kao --- ParallelPrimitives/RadixSort.h | 1 + 1 file changed, 1 insertion(+) diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index 6df72f5..d3e89bf 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include From 117a318d443ae9b8cc315c3738b97fdab531e46a Mon Sep 17 00:00:00 2001 From: Chih-Chen Kao Date: Fri, 8 Sep 2023 12:23:44 +0200 Subject: [PATCH 03/68] fix template constexpr rule Signed-off-by: Chih-Chen Kao --- ParallelPrimitives/RadixSort.inl | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/ParallelPrimitives/RadixSort.inl b/ParallelPrimitives/RadixSort.inl index 77df463..ad9ecdb 100644 --- a/ParallelPrimitives/RadixSort.inl +++ b/ParallelPrimitives/RadixSort.inl @@ -36,6 +36,25 @@ constexpr 
void execute( CallableType&& callable, RecordType& time_record, const time_record[index] = stopwatch.getMs(); } } + +template +void resize_record( T& t ) noexcept +{ + if constexpr( enable_profile ) + { + t.resize( 3 ); + } +} + +template +void print_record( const T& t ) noexcept +{ + if constexpr( enable_profile ) + { + printf( "%3.2f, %3.2f, %3.2f\n", t[0], t[1], t[2] ); + } +} + } // namespace template @@ -73,10 +92,7 @@ void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int en using RecordType = std::conditional_t, Empty>; RecordType t; - if constexpr( enable_profile ) - { - t.resize( 3 ); - } + resize_record( t ); const auto launch_count_kernel = [&]() noexcept { @@ -143,8 +159,5 @@ void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int en execute( launch_sort_kernel, t, 2, stream ); - if constexpr( enable_profile ) - { - printf( "%3.2f, %3.2f, %3.2f\n", t[0], t[1], t[2] ); - } + print_record( t ); } From 571111b402b5502d8a34204b4b44e6b188ab6a93 Mon Sep 17 00:00:00 2001 From: Chih-Chen Kao Date: Fri, 8 Sep 2023 16:40:08 +0200 Subject: [PATCH 04/68] Use default values for bitcode Signed-off-by: Chih-Chen Kao --- ParallelPrimitives/RadixSort.cpp | 43 ++++++++++++++++----------- ParallelPrimitives/RadixSortKernels.h | 17 +++++++++++ 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 2edc763..fd93056 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -59,19 +59,6 @@ namespace Oro RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils, const std::string& kernelPath, const std::string& includeDir ) : m_device{ device }, m_oroutils{ oroutils } { oroGetDeviceProperties( &m_props, device ); - - m_num_threads_per_block_for_count = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_COUNT_BLOCK_SIZE; - m_num_threads_per_block_for_scan = m_props.maxThreadsPerBlock > 0 ? 
m_props.maxThreadsPerBlock : DEFAULT_SCAN_BLOCK_SIZE; - m_num_threads_per_block_for_sort = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SORT_BLOCK_SIZE; - - const auto warp_size = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; - - m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; - - assert( m_num_threads_per_block_for_count % warp_size == 0 ); - assert( m_num_threads_per_block_for_scan % warp_size == 0 ); - assert( m_num_threads_per_block_for_sort % warp_size == 0 ); - configure( kernelPath, includeDir ); } @@ -123,10 +110,30 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string binaryPath = getCurrentDir(); binaryPath += isAmd ? "oro_compiled_kernels.hipfb" : "oro_compiled_kernels.fatbin"; log = "loading pre-compiled kernels at path : " + binaryPath; + + m_num_threads_per_block_for_count = DEFAULT_COUNT_BLOCK_SIZE; + m_num_threads_per_block_for_scan = DEFAULT_SCAN_BLOCK_SIZE; + m_num_threads_per_block_for_sort = DEFAULT_SORT_BLOCK_SIZE; + + const auto warp_size = DEFAULT_WARP_SIZE; + + m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; } else { log = "compiling kernels at path : " + currentKernelPath + " in : " + currentIncludeDir; + + m_num_threads_per_block_for_count = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_COUNT_BLOCK_SIZE; + m_num_threads_per_block_for_scan = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SCAN_BLOCK_SIZE; + m_num_threads_per_block_for_sort = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SORT_BLOCK_SIZE; + + const auto warp_size = ( m_props.warpSize != 0 ) ? 
m_props.warpSize : DEFAULT_WARP_SIZE; + + m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; + + assert( m_num_threads_per_block_for_count % warp_size == 0 ); + assert( m_num_threads_per_block_for_scan % warp_size == 0 ); + assert( m_num_threads_per_block_for_sort % warp_size == 0 ); } if( m_flags == Flag::LOG ) @@ -135,13 +142,15 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string } const auto includeArg{ "-I" + currentIncludeDir }; - const auto count_block_size_param = "-DCOUNT_WG_SIZE=" + std::to_string( m_num_threads_per_block_for_count ); - const auto scan_block_size_param = "-DSCAN_WG_SIZE=" + std::to_string( m_num_threads_per_block_for_scan ); - const auto sort_block_size_param = "-DSORT_WG_SIZE=" + std::to_string( m_num_threads_per_block_for_sort ); - const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK=" + std::to_string( m_num_warps_per_block_for_sort ); + const auto overwrite_flag = "-DOVERWRITE"; + const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); + const auto scan_block_size_param = "-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); + const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); + const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( m_num_warps_per_block_for_sort ); std::vector opts; opts.push_back( includeArg.c_str() ); + opts.push_back( overwrite_flag ); opts.push_back( count_block_size_param.c_str() ); opts.push_back( scan_block_size_param.c_str() ); opts.push_back( sort_block_size_param.c_str() ); diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 435569f..a529452 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -14,6 +14,23 @@ using u64 = unsigned long long; // #define NV_WORKAROUND 1 +// default 
values +#if defined( OVERWRITE ) + +constexpr auto COUNT_WG_SIZE{ COUNT_WG_SIZE_VAL }; +constexpr auto SCAN_WG_SIZE{ SCAN_WG_SIZE_VAL }; +constexpr auto SORT_WG_SIZE{ SORT_WG_SIZE_VAL }; +constexpr auto SORT_NUM_WARPS_PER_BLOCK{ SORT_NUM_WARPS_PER_BLOCK_VAL }; + +#else + +constexpr auto COUNT_WG_SIZE{ DEFAULT_COUNT_BLOCK_SIZE }; +constexpr auto SCAN_WG_SIZE{ DEFAULT_SCAN_BLOCK_SIZE }; +constexpr auto SORT_WG_SIZE{ DEFAULT_SORT_BLOCK_SIZE }; +constexpr auto SORT_NUM_WARPS_PER_BLOCK{ DEFAULT_NUM_WARPS_PER_BLOCK }; + +#endif + __device__ constexpr u32 getMaskedBits( const u32 value, const u32 shift ) noexcept { return ( value >> shift ) & RADIX_MASK; } extern "C" __global__ void CountKernel( int* gSrc, int* gDst, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) From c9bb91b3242504651d249e0ca847d3a2705350c8 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 14 Sep 2023 14:04:53 +0900 Subject: [PATCH 05/68] [ORO-0] simple porting --- ParallelPrimitives/RadixSort.cpp | 289 ++++--- ParallelPrimitives/RadixSort.h | 84 +- ParallelPrimitives/RadixSort.inl | 208 ++--- ParallelPrimitives/RadixSortConfigs.h | 13 + ParallelPrimitives/RadixSortKernels.h | 1057 +++++++++++++++++-------- Test/RadixSort/main.cpp | 7 +- 6 files changed, 1088 insertions(+), 570 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index fd93056..a33ff47 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -18,6 +18,9 @@ #include #endif +inline uint64_t div_round_up64( uint64_t val, uint64_t divisor ) { return ( val + divisor - 1 ) / divisor; } +inline uint64_t next_multiple64( uint64_t val, uint64_t divisor ) { return div_round_up64( val, divisor ) * divisor; } + namespace { #if defined( ORO_PRECOMPILED ) @@ -58,26 +61,26 @@ namespace Oro RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils, const std::string& kernelPath, const std::string& includeDir ) : m_device{ device }, 
m_oroutils{ oroutils } { - oroGetDeviceProperties( &m_props, device ); + //oroGetDeviceProperties( &m_props, device ); configure( kernelPath, includeDir ); } -void RadixSort::exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept -{ - const auto buffer_size = countsGpu.size(); - - std::vector counts = countsGpu.getData(); - std::vector offsets( buffer_size ); - - int sum = 0; - for( int i = 0; i < counts.size(); ++i ) - { - offsets[i] = sum; - sum += counts[i]; - } - - offsetsGpu.copyFromHost( offsets.data(), std::size( offsets ) ); -} +//void RadixSort::exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept +//{ +// const auto buffer_size = countsGpu.size(); +// +// std::vector counts = countsGpu.getData(); +// std::vector offsets( buffer_size ); +// +// int sum = 0; +// for( int i = 0; i < counts.size(); ++i ) +// { +// offsets[i] = sum; +// sum += counts[i]; +// } +// +// offsetsGpu.copyFromHost( offsets.data(), std::size( offsets ) ); +//} void RadixSort::compileKernels( const std::string& kernelPath, const std::string& includeDir ) noexcept { @@ -111,29 +114,29 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string binaryPath += isAmd ? 
"oro_compiled_kernels.hipfb" : "oro_compiled_kernels.fatbin"; log = "loading pre-compiled kernels at path : " + binaryPath; - m_num_threads_per_block_for_count = DEFAULT_COUNT_BLOCK_SIZE; - m_num_threads_per_block_for_scan = DEFAULT_SCAN_BLOCK_SIZE; - m_num_threads_per_block_for_sort = DEFAULT_SORT_BLOCK_SIZE; + //m_num_threads_per_block_for_count = DEFAULT_COUNT_BLOCK_SIZE; + //m_num_threads_per_block_for_scan = DEFAULT_SCAN_BLOCK_SIZE; + //m_num_threads_per_block_for_sort = DEFAULT_SORT_BLOCK_SIZE; - const auto warp_size = DEFAULT_WARP_SIZE; + //const auto warp_size = DEFAULT_WARP_SIZE; - m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; + //m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; } else { log = "compiling kernels at path : " + currentKernelPath + " in : " + currentIncludeDir; - m_num_threads_per_block_for_count = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_COUNT_BLOCK_SIZE; - m_num_threads_per_block_for_scan = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SCAN_BLOCK_SIZE; - m_num_threads_per_block_for_sort = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SORT_BLOCK_SIZE; + //m_num_threads_per_block_for_count = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_COUNT_BLOCK_SIZE; + //m_num_threads_per_block_for_scan = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SCAN_BLOCK_SIZE; + //m_num_threads_per_block_for_sort = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SORT_BLOCK_SIZE; - const auto warp_size = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; + //const auto warp_size = ( m_props.warpSize != 0 ) ? 
m_props.warpSize : DEFAULT_WARP_SIZE; - m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; + //m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; - assert( m_num_threads_per_block_for_count % warp_size == 0 ); - assert( m_num_threads_per_block_for_scan % warp_size == 0 ); - assert( m_num_threads_per_block_for_sort % warp_size == 0 ); + //assert( m_num_threads_per_block_for_count % warp_size == 0 ); + //assert( m_num_threads_per_block_for_scan % warp_size == 0 ); + //assert( m_num_threads_per_block_for_sort % warp_size == 0 ); } if( m_flags == Flag::LOG ) @@ -142,19 +145,19 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string } const auto includeArg{ "-I" + currentIncludeDir }; - const auto overwrite_flag = "-DOVERWRITE"; - const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); - const auto scan_block_size_param = "-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); - const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); - const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( m_num_warps_per_block_for_sort ); + //const auto overwrite_flag = "-DOVERWRITE"; + //const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); + //const auto scan_block_size_param = "-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); + //const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); + //const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( m_num_warps_per_block_for_sort ); std::vector opts; opts.push_back( includeArg.c_str() ); - opts.push_back( overwrite_flag ); - opts.push_back( count_block_size_param.c_str() ); - opts.push_back( scan_block_size_param.c_str() ); - 
opts.push_back( sort_block_size_param.c_str() ); - opts.push_back( sort_num_warps_param.c_str() ); + //opts.push_back( overwrite_flag ); + //opts.push_back( count_block_size_param.c_str() ); + //opts.push_back( scan_block_size_param.c_str() ); + //opts.push_back( sort_block_size_param.c_str() ); + //opts.push_back( sort_num_warps_param.c_str() ); struct Record { @@ -162,11 +165,15 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string Kernel kernelType; }; + //const std::vector records{ + // { "CountKernel", Kernel::COUNT }, { "ParallelExclusiveScanSingleWG", Kernel::SCAN_SINGLE_WG }, { "ParallelExclusiveScanAllWG", Kernel::SCAN_PARALLEL }, { "SortKernel", Kernel::SORT }, + // { "SortKVKernel", Kernel::SORT_KV }, { "SortSinglePassKernel", Kernel::SORT_SINGLE_PASS }, { "SortSinglePassKVKernel", Kernel::SORT_SINGLE_PASS_KV }, + //}; const std::vector records{ - { "CountKernel", Kernel::COUNT }, { "ParallelExclusiveScanSingleWG", Kernel::SCAN_SINGLE_WG }, { "ParallelExclusiveScanAllWG", Kernel::SCAN_PARALLEL }, { "SortKernel", Kernel::SORT }, - { "SortKVKernel", Kernel::SORT_KV }, { "SortSinglePassKernel", Kernel::SORT_SINGLE_PASS }, { "SortSinglePassKVKernel", Kernel::SORT_SINGLE_PASS_KV }, + { "SortSinglePassKernel", Kernel::SORT_SINGLE_PASS }, { "SortSinglePassKVKernel", Kernel::SORT_SINGLE_PASS_KV }, }; + for( const auto& record : records ) { #if defined( ORO_PP_LOAD_FROM_STRING ) @@ -188,118 +195,170 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string printKernelInfo( record.kernelName, oroFunctions[record.kernelType] ); } } -} - -int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept -{ - const int warpSize = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; - const int warpPerWG = blockSize / warpSize; - const int warpPerWGP = m_props.maxThreadsPerMultiProcessor / warpSize; - const int occupancyFromWarp = ( warpPerWGP > 0 ) ? 
( warpPerWGP / warpPerWG ) : 1; - const int occupancy = std::max( 1, occupancyFromWarp ); - - if( m_flags == Flag::LOG ) - { - std::cout << "Occupancy: " << occupancy << '\n'; - } - - static constexpr auto min_num_blocks = 16; - auto number_of_blocks = m_props.multiProcessorCount > 0 ? m_props.multiProcessorCount * occupancy : min_num_blocks; - - if( m_num_threads_per_block_for_scan > BIN_SIZE ) - { - // Note: both are divisible by 2 - const auto base = m_num_threads_per_block_for_scan / BIN_SIZE; - - // Floor - number_of_blocks = ( number_of_blocks / base ) * base; - } - - return number_of_blocks; + // TODO: bit code support? +#define LOAD_FUNC( var, kernel ) var = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), kernel, &opts ); + LOAD_FUNC( m_gHistogram, "gHistogram" ); + LOAD_FUNC( m_gPrefixSum, "gPrefixSum" ); + LOAD_FUNC( m_onesweep_reorderKey, "onesweep_reorderKey" ); + LOAD_FUNC( m_onesweep_reorderKeyPair, "onesweep_reorderKeyPair" ); + LOAD_FUNC( m_onesweep_reorderKey64, "onesweep_reorderKey64" ); + LOAD_FUNC( m_onesweep_reorderKeyPair64, "onesweep_reorderKeyPair64" ); +#undef LOAD_FUNC } +//int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept +//{ +// const int warpSize = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; +// const int warpPerWG = blockSize / warpSize; +// const int warpPerWGP = m_props.maxThreadsPerMultiProcessor / warpSize; +// const int occupancyFromWarp = ( warpPerWGP > 0 ) ? ( warpPerWGP / warpPerWG ) : 1; +// +// const int occupancy = std::max( 1, occupancyFromWarp ); +// +// if( m_flags == Flag::LOG ) +// { +// std::cout << "Occupancy: " << occupancy << '\n'; +// } +// +// static constexpr auto min_num_blocks = 16; +// auto number_of_blocks = m_props.multiProcessorCount > 0 ? 
m_props.multiProcessorCount * occupancy : min_num_blocks; +// +// if( m_num_threads_per_block_for_scan > BIN_SIZE ) +// { +// // Note: both are divisible by 2 +// const auto base = m_num_threads_per_block_for_scan / BIN_SIZE; +// +// // Floor +// number_of_blocks = ( number_of_blocks / base ) * base; +// } +// +// return number_of_blocks; +//} + void RadixSort::configure( const std::string& kernelPath, const std::string& includeDir ) noexcept { compileKernels( kernelPath, includeDir ); - m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); + //m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); - /// The tmp buffer size of the count kernel and the scan kernel. + ///// The tmp buffer size of the count kernel and the scan kernel. - const auto tmp_buffer_size = BIN_SIZE * m_num_blocks_for_count; + //const auto tmp_buffer_size = BIN_SIZE * m_num_blocks_for_count; - /// @c tmp_buffer_size must be divisible by @c m_num_threads_per_block_for_scan - /// This is guaranteed since @c m_num_blocks_for_count will be adjusted accordingly + ///// @c tmp_buffer_size must be divisible by @c m_num_threads_per_block_for_scan + ///// This is guaranteed since @c m_num_blocks_for_count will be adjusted accordingly - m_num_blocks_for_scan = tmp_buffer_size / m_num_threads_per_block_for_scan; + //m_num_blocks_for_scan = tmp_buffer_size / m_num_threads_per_block_for_scan; - m_tmp_buffer.resize( tmp_buffer_size ); + //m_tmp_buffer.resize( tmp_buffer_size ); - if( selectedScanAlgo == ScanAlgo::SCAN_GPU_PARALLEL ) - { - // These are for the scan kernel - m_partial_sum.resize( m_num_blocks_for_scan ); - m_is_ready.resize( m_num_blocks_for_scan ); - } + //if( selectedScanAlgo == ScanAlgo::SCAN_GPU_PARALLEL ) + //{ + // // These are for the scan kernel + // m_partial_sum.resize( m_num_blocks_for_scan ); + // m_is_ready.resize( m_num_blocks_for_scan ); + //} } void RadixSort::setFlag( Flag flag ) noexcept { m_flags = flag; } -void 
RadixSort::sort( const KeyValueSoA src, const KeyValueSoA dst, int n, int startBit, int endBit, oroStream stream ) noexcept +void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream ) noexcept { + bool keyPair = src.value != nullptr; + // todo. better to compute SINGLE_SORT_N_ITEMS_PER_WI which we use in the kernel dynamically rather than hard coding it to distribute the work evenly // right now, setting this as large as possible is faster than multi pass sorting if( n < SINGLE_SORT_WG_SIZE * SINGLE_SORT_N_ITEMS_PER_WI ) { - const auto func = oroFunctions[Kernel::SORT_SINGLE_PASS_KV]; - const void* args[] = { &src.key, &src.value, &dst.key, &dst.value, &n, &startBit, &endBit }; - OrochiUtils::launch1D( func, SINGLE_SORT_WG_SIZE, args, SINGLE_SORT_WG_SIZE, 0, stream ); + if( keyPair ) + { + const auto func = oroFunctions[Kernel::SORT_SINGLE_PASS_KV]; + const void* args[] = { &src.key, &src.value, &dst.key, &dst.value, &n, &startBit, &endBit }; + OrochiUtils::launch1D( func, SINGLE_SORT_WG_SIZE, args, SINGLE_SORT_WG_SIZE, 0, stream ); + } + else + { + const auto func = oroFunctions[Kernel::SORT_SINGLE_PASS]; + const void* args[] = { &src, &dst, &n, &startBit, &endBit }; + OrochiUtils::launch1D( func, SINGLE_SORT_WG_SIZE, args, SINGLE_SORT_WG_SIZE, 0, stream ); + } return; } - auto* s{ &src }; - auto* d{ &dst }; + int nIteration = div_round_up64( endBit - startBit, 8 ); + bool use64bitCounter = +#if defined( ENFORCE_64BIT_COUNTER ) + true; +#else + MAX_ELEMENTS_WITH_32BIT_COUNTER < n; +#endif + uint64_t numberOfBlocks = div_round_up64( n, RADIX_SORT_BLOCK_SIZE ); + + // Buffers + void* gpSumBuffer = tempStorage; + void* lookBackBuffer = (void*)( (char*)tempStorage + sizeof( uint32_t ) * 256 * sizeof( u32 /* key */ ) ); - for( int i = startBit; i < endBit; i += N_RADIX ) { - sort1pass( *s, *d, n, i, i + std::min( N_RADIX, endBit - i ), stream ); + oroMemsetD32Async( (oroDeviceptr)gpSumBuffer, 0, 
256 * sizeof( u32 /* key */ ), stream ); + oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, 1, stream ); - std::swap( s, d ); - } + const int nBlocks = 2048; - if( s == &src ) - { - OrochiUtils::copyDtoDAsync( dst.key, src.key, n, stream ); - OrochiUtils::copyDtoDAsync( dst.value, src.value, n, stream ); + const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &lookBackBuffer }; + OrochiUtils::launch1D( m_gHistogram, nBlocks * GHISTOGRAM_THREADS_PER_BLOCK, args, GHISTOGRAM_THREADS_PER_BLOCK, 0, stream ); } -} - -void RadixSort::sort( const u32* src, const u32* dst, int n, int startBit, int endBit, oroStream stream ) noexcept -{ - // todo. better to compute SINGLE_SORT_N_ITEMS_PER_WI which we use in the kernel dynamically rather than hard coding it to distribute the work evenly - // right now, setting this as large as possible is faster than multi pass sorting - if( n < SINGLE_SORT_WG_SIZE * SINGLE_SORT_N_ITEMS_PER_WI ) { - const auto func = oroFunctions[Kernel::SORT_SINGLE_PASS]; - const void* args[] = { &src, &dst, &n, &startBit, &endBit }; - OrochiUtils::launch1D( func, SINGLE_SORT_WG_SIZE, args, SINGLE_SORT_WG_SIZE, 0, stream ); - return; + const void* args[] = { &gpSumBuffer }; + OrochiUtils::launch1D( m_gPrefixSum, nIteration * 256, args, 256, 0, stream ); } - auto* s{ &src }; - auto* d{ &dst }; - - for( int i = startBit; i < endBit; i += N_RADIX ) + auto s = src; + auto d = dst; + for( int i = 0; i < nIteration; i++ ) { - sort1pass( *s, *d, n, i, i + std::min( N_RADIX, endBit - i ), stream ); + oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, 256 * numberOfBlocks * ( use64bitCounter ? 2 : 1 ), stream ); + if( keyPair ) + { + const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; + OrochiUtils::launch1D( use64bitCounter ? 
m_onesweep_reorderKeyPair64 : m_onesweep_reorderKeyPair, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); + } + else + { + const void* args[] = { &s.key, &d.key, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; + OrochiUtils::launch1D( use64bitCounter ? m_onesweep_reorderKey64 : m_onesweep_reorderKey, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); + } std::swap( s, d ); } - if( s == &src ) + if( s.key == src.key ) { - OrochiUtils::copyDtoDAsync( dst, src, n, stream ); + oroMemcpyDtoDAsync( (oroDeviceptr)dst.key, (oroDeviceptr)src.key, sizeof( uint32_t ) * n, stream ); + + if( keyPair ) + { + oroMemcpyDtoDAsync( (oroDeviceptr)dst.value, (oroDeviceptr)src.value, sizeof( uint32_t ) * n, stream ); + } } } +void RadixSort::sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream ) noexcept +{ + sort( KeyValueSoA{ src, nullptr }, KeyValueSoA{ dst, nullptr }, n, startBit, endBit, tempStorage, stream ); +} + +uint64_t RadixSort::getRequiredTemporalStorageBytes( u32 numberOfMaxInputs ) const +{ + static_assert( BIN_SIZE == 256, "check alignment of the buffers" ); + uint64_t numberOfBlocks = div_round_up64( numberOfMaxInputs, RADIX_SORT_BLOCK_SIZE ); + uint64_t gpSumBuffer = sizeof( uint32_t ) * 256 * sizeof( u32 /* key */ ); + uint64_t lookBackBuffer = sizeof( uint32_t ) * 256 * numberOfBlocks; +#if !defined( ENFORCE_64BIT_COUNTER ) + if( MAX_ELEMENTS_WITH_32BIT_COUNTER < numberOfMaxInputs ) +#endif + { + lookBackBuffer *= 2; // to 64bit counter + } + return gpSumBuffer + lookBackBuffer; +} }; // namespace Oro diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index d3e89bf..d30ee32 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -43,26 +43,27 @@ class RadixSort final void setFlag( Flag flag ) noexcept; - void sort( const KeyValueSoA src, const 
KeyValueSoA dst, int n, int startBit, int endBit, oroStream stream = 0 ) noexcept; + void sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream = 0 ) noexcept; - void sort( const u32* src, const u32* dst, int n, int startBit, int endBit, oroStream stream = 0 ) noexcept; + void sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream = 0 ) noexcept; + uint64_t getRequiredTemporalStorageBytes( u32 numberOfMaxInputs ) const; private: - template - void sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept; + //template + //void sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept; - /// @brief Compile the kernels for radix sort. - /// @param kernelPath The kernel path. - /// @param includeDir The include directory. + ///// @brief Compile the kernels for radix sort. + ///// @param kernelPath The kernel path. + ///// @param includeDir The include directory. void compileKernels( const std::string& kernelPath, const std::string& includeDir ) noexcept; - [[nodiscard]] int calculateWGsToExecute( const int blockSize ) const noexcept; + //[[nodiscard]] int calculateWGsToExecute( const int blockSize ) const noexcept; - /// @brief Exclusive scan algorithm on CPU for testing. - /// It copies the count result from the Device to Host before computation, and then copies the offsets back from Host to Device afterward. - /// @param countsGpu The count result in GPU memory. Otuput: The offset. - /// @param offsetsGpu The offsets. - void exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept; + ///// @brief Exclusive scan algorithm on CPU for testing. + ///// It copies the count result from the Device to Host before computation, and then copies the offsets back from Host to Device afterward. + ///// @param countsGpu The count result in GPU memory. 
Otuput: The offset. + ///// @param offsetsGpu The offsets. + //void exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept; /// @brief Configure the settings, compile the kernels and allocate the memory. /// @param kernelPath The kernel path. @@ -70,21 +71,21 @@ class RadixSort final void configure( const std::string& kernelPath, const std::string& includeDir ) noexcept; private: - // GPU blocks for the count kernel - int m_num_blocks_for_count{}; + //// GPU blocks for the count kernel + //int m_num_blocks_for_count{}; - // GPU blocks for the scan kernel - int m_num_blocks_for_scan{}; + //// GPU blocks for the scan kernel + //int m_num_blocks_for_scan{}; Flag m_flags{ Flag::NO_LOG }; enum class Kernel { - COUNT, - SCAN_SINGLE_WG, - SCAN_PARALLEL, - SORT, - SORT_KV, + //COUNT, + //SCAN_SINGLE_WG, + //SCAN_PARALLEL, + //SORT, + //SORT_KV, SORT_SINGLE_PASS, SORT_SINGLE_PASS_KV, }; @@ -92,33 +93,40 @@ class RadixSort final std::unordered_map oroFunctions; /// @brief The enum class which indicates the selected algorithm of prefix scan. - enum class ScanAlgo - { - SCAN_CPU, - SCAN_GPU_SINGLE_WG, - SCAN_GPU_PARALLEL, - }; + //enum class ScanAlgo + //{ + // SCAN_CPU, + // SCAN_GPU_SINGLE_WG, + // SCAN_GPU_PARALLEL, + //}; - constexpr static auto selectedScanAlgo{ ScanAlgo::SCAN_GPU_PARALLEL }; + //constexpr static auto selectedScanAlgo{ ScanAlgo::SCAN_GPU_PARALLEL }; - GpuMemory m_partial_sum; - GpuMemory m_is_ready; + //GpuMemory m_partial_sum; + //GpuMemory m_is_ready; oroDevice m_device{}; - oroDeviceProp m_props{}; + //oroDeviceProp m_props{}; OrochiUtils& m_oroutils; // This buffer holds the "bucket" table from all GPU blocks. 
- GpuMemory m_tmp_buffer; + //GpuMemory m_tmp_buffer; + + //int m_num_threads_per_block_for_count{}; + //int m_num_threads_per_block_for_scan{}; + //int m_num_threads_per_block_for_sort{}; - int m_num_threads_per_block_for_count{}; - int m_num_threads_per_block_for_scan{}; - int m_num_threads_per_block_for_sort{}; + //int m_num_warps_per_block_for_sort{}; - int m_num_warps_per_block_for_sort{}; + oroFunction m_gHistogram; + oroFunction m_gPrefixSum; + oroFunction m_onesweep_reorderKey; + oroFunction m_onesweep_reorderKeyPair; + oroFunction m_onesweep_reorderKey64; + oroFunction m_onesweep_reorderKeyPair64; }; -#include +//#include }; // namespace Oro diff --git a/ParallelPrimitives/RadixSort.inl b/ParallelPrimitives/RadixSort.inl index ad9ecdb..fd42633 100644 --- a/ParallelPrimitives/RadixSort.inl +++ b/ParallelPrimitives/RadixSort.inl @@ -57,107 +57,107 @@ void print_record( const T& t ) noexcept } // namespace -template -void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept -{ - static constexpr auto enable_profile = false; - - const u32* srcKey{ nullptr }; - const u32* dstKey{ nullptr }; - - const u32* srcVal{ nullptr }; - const u32* dstVal{ nullptr }; - - static constexpr auto enable_key_value_pair_sorting{ std::is_same_v }; - - if constexpr( enable_key_value_pair_sorting ) - { - srcKey = src.key; - dstKey = dst.key; - - srcVal = src.value; - dstVal = dst.value; - } - else - { - static_assert( std::is_same_v || std::is_same_v ); - srcKey = src; - dstKey = dst; - } - - const int nItemPerWG = ( n + m_num_blocks_for_count - 1 ) / m_num_blocks_for_count; - - // Timer records - - using RecordType = std::conditional_t, Empty>; - RecordType t; - - resize_record( t ); - - const auto launch_count_kernel = [&]() noexcept - { - const auto num_total_thread_for_count = m_num_threads_per_block_for_count * m_num_blocks_for_count; - - const auto func{ oroFunctions[Kernel::COUNT] }; - const void* args[] = { &srcKey, 
arg_cast( m_tmp_buffer.address() ), &n, &nItemPerWG, &startBit, &m_num_blocks_for_count }; - OrochiUtils::launch1D( func, num_total_thread_for_count, args, m_num_threads_per_block_for_count, 0, stream ); - }; - - execute( launch_count_kernel, t, 0, stream ); - - const auto launch_scan_kernel = [&]() noexcept - { - switch( selectedScanAlgo ) - { - case ScanAlgo::SCAN_CPU: - { - exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); - } - break; - - case ScanAlgo::SCAN_GPU_SINGLE_WG: - { - const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), &m_num_blocks_for_count }; - OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_SINGLE_WG], WG_SIZE * m_num_blocks_for_count, args, WG_SIZE, 0, stream ); - } - break; - - case ScanAlgo::SCAN_GPU_PARALLEL: - { - const auto num_total_thread_for_scan = m_num_threads_per_block_for_scan * m_num_blocks_for_scan; - - const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), arg_cast( m_partial_sum.address() ), arg_cast( m_is_ready.address() ) }; - OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_PARALLEL], num_total_thread_for_scan, args, m_num_threads_per_block_for_scan, 0, stream ); - } - break; - - default: - exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); - break; - } - }; - - execute( launch_scan_kernel, t, 1, stream ); - - const auto launch_sort_kernel = [&]() noexcept - { - const auto num_blocks_for_sort = m_num_blocks_for_count; - const auto num_total_thread_for_sort = m_num_threads_per_block_for_sort * num_blocks_for_sort; - const auto num_items_per_block = nItemPerWG; - - if constexpr( enable_key_value_pair_sorting ) - { - const void* args[] = { &srcKey, &srcVal, &dstKey, &dstVal, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; - OrochiUtils::launch1D( oroFunctions[Kernel::SORT_KV], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); - } - else - { - const void* args[] = { &srcKey, 
&dstKey, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; - OrochiUtils::launch1D( oroFunctions[Kernel::SORT], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); - } - }; - - execute( launch_sort_kernel, t, 2, stream ); - - print_record( t ); -} +//template +//void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept +//{ +// static constexpr auto enable_profile = false; +// +// const u32* srcKey{ nullptr }; +// const u32* dstKey{ nullptr }; +// +// const u32* srcVal{ nullptr }; +// const u32* dstVal{ nullptr }; +// +// static constexpr auto enable_key_value_pair_sorting{ std::is_same_v }; +// +// if constexpr( enable_key_value_pair_sorting ) +// { +// srcKey = src.key; +// dstKey = dst.key; +// +// srcVal = src.value; +// dstVal = dst.value; +// } +// else +// { +// static_assert( std::is_same_v || std::is_same_v ); +// srcKey = src; +// dstKey = dst; +// } +// +// const int nItemPerWG = ( n + m_num_blocks_for_count - 1 ) / m_num_blocks_for_count; +// +// // Timer records +// +// using RecordType = std::conditional_t, Empty>; +// RecordType t; +// +// resize_record( t ); +// +// const auto launch_count_kernel = [&]() noexcept +// { +// const auto num_total_thread_for_count = m_num_threads_per_block_for_count * m_num_blocks_for_count; +// +// const auto func{ oroFunctions[Kernel::COUNT] }; +// const void* args[] = { &srcKey, arg_cast( m_tmp_buffer.address() ), &n, &nItemPerWG, &startBit, &m_num_blocks_for_count }; +// OrochiUtils::launch1D( func, num_total_thread_for_count, args, m_num_threads_per_block_for_count, 0, stream ); +// }; +// +// execute( launch_count_kernel, t, 0, stream ); +// +// const auto launch_scan_kernel = [&]() noexcept +// { +// switch( selectedScanAlgo ) +// { +// case ScanAlgo::SCAN_CPU: +// { +// exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); +// } +// break; +// +// case ScanAlgo::SCAN_GPU_SINGLE_WG: +// { 
+// const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), &m_num_blocks_for_count }; +// OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_SINGLE_WG], WG_SIZE * m_num_blocks_for_count, args, WG_SIZE, 0, stream ); +// } +// break; +// +// case ScanAlgo::SCAN_GPU_PARALLEL: +// { +// const auto num_total_thread_for_scan = m_num_threads_per_block_for_scan * m_num_blocks_for_scan; +// +// const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), arg_cast( m_partial_sum.address() ), arg_cast( m_is_ready.address() ) }; +// OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_PARALLEL], num_total_thread_for_scan, args, m_num_threads_per_block_for_scan, 0, stream ); +// } +// break; +// +// default: +// exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); +// break; +// } +// }; +// +// execute( launch_scan_kernel, t, 1, stream ); +// +// const auto launch_sort_kernel = [&]() noexcept +// { +// const auto num_blocks_for_sort = m_num_blocks_for_count; +// const auto num_total_thread_for_sort = m_num_threads_per_block_for_sort * num_blocks_for_sort; +// const auto num_items_per_block = nItemPerWG; +// +// if constexpr( enable_key_value_pair_sorting ) +// { +// const void* args[] = { &srcKey, &srcVal, &dstKey, &dstVal, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; +// OrochiUtils::launch1D( oroFunctions[Kernel::SORT_KV], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); +// } +// else +// { +// const void* args[] = { &srcKey, &dstKey, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; +// OrochiUtils::launch1D( oroFunctions[Kernel::SORT], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); +// } +// }; +// +// execute( launch_sort_kernel, t, 2, stream ); +// +// print_record( t ); +//} diff --git a/ParallelPrimitives/RadixSortConfigs.h 
b/ParallelPrimitives/RadixSortConfigs.h index 1cc6f2b..c597238 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -42,4 +42,17 @@ static_assert( BIN_SIZE % 2 == 0 ); static_assert( DEFAULT_COUNT_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); static_assert( DEFAULT_SCAN_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); +#define RADIX_SORT_BLOCK_SIZE 2048 + +#define GHISTOGRAM_ITEM_PER_BLOCK 2048 +#define GHISTOGRAM_THREADS_PER_BLOCK 256 + +#define REORDER_NUMBER_OF_WARPS 8 +#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) + +#define MAX_ELEMENTS_WITH_32BIT_COUNTER 0x3FFFFFFF + +// Please uncomment this enforce 64bit counter for lookback counter to measure performance impact. +// #define ENFORCE_64BIT_COUNTER 1 + }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index a529452..56bca91 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -15,52 +15,52 @@ using u64 = unsigned long long; // #define NV_WORKAROUND 1 // default values -#if defined( OVERWRITE ) - -constexpr auto COUNT_WG_SIZE{ COUNT_WG_SIZE_VAL }; -constexpr auto SCAN_WG_SIZE{ SCAN_WG_SIZE_VAL }; -constexpr auto SORT_WG_SIZE{ SORT_WG_SIZE_VAL }; -constexpr auto SORT_NUM_WARPS_PER_BLOCK{ SORT_NUM_WARPS_PER_BLOCK_VAL }; - -#else - -constexpr auto COUNT_WG_SIZE{ DEFAULT_COUNT_BLOCK_SIZE }; -constexpr auto SCAN_WG_SIZE{ DEFAULT_SCAN_BLOCK_SIZE }; -constexpr auto SORT_WG_SIZE{ DEFAULT_SORT_BLOCK_SIZE }; -constexpr auto SORT_NUM_WARPS_PER_BLOCK{ DEFAULT_NUM_WARPS_PER_BLOCK }; - -#endif - -__device__ constexpr u32 getMaskedBits( const u32 value, const u32 shift ) noexcept { return ( value >> shift ) & RADIX_MASK; } - -extern "C" __global__ void CountKernel( int* gSrc, int* gDst, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) -{ - __shared__ int table[BIN_SIZE]; - - for( int i = threadIdx.x; i < 
BIN_SIZE; i += COUNT_WG_SIZE ) - { - table[i] = 0; - } - - __syncthreads(); - - const int offset = blockIdx.x * gNItemsPerWG; - const int upperBound = ( offset + gNItemsPerWG > gN ) ? gN - offset : gNItemsPerWG; - - for( int i = threadIdx.x; i < upperBound; i += COUNT_WG_SIZE ) - { - const int idx = offset + i; - const int tableIdx = getMaskedBits( gSrc[idx], START_BIT ); - atomicAdd( &table[tableIdx], 1 ); - } - - __syncthreads(); - - for( int i = threadIdx.x; i < BIN_SIZE; i += COUNT_WG_SIZE ) - { - gDst[i * N_WGS_EXECUTED + blockIdx.x] = table[i]; - } -} +//#if defined( OVERWRITE ) +// +//constexpr auto COUNT_WG_SIZE{ COUNT_WG_SIZE_VAL }; +//constexpr auto SCAN_WG_SIZE{ SCAN_WG_SIZE_VAL }; +//constexpr auto SORT_WG_SIZE{ SORT_WG_SIZE_VAL }; +//constexpr auto SORT_NUM_WARPS_PER_BLOCK{ SORT_NUM_WARPS_PER_BLOCK_VAL }; +// +//#else +// +//constexpr auto COUNT_WG_SIZE{ DEFAULT_COUNT_BLOCK_SIZE }; +//constexpr auto SCAN_WG_SIZE{ DEFAULT_SCAN_BLOCK_SIZE }; +//constexpr auto SORT_WG_SIZE{ DEFAULT_SORT_BLOCK_SIZE }; +//constexpr auto SORT_NUM_WARPS_PER_BLOCK{ DEFAULT_NUM_WARPS_PER_BLOCK }; +// +//#endif + +//__device__ constexpr u32 getMaskedBits( const u32 value, const u32 shift ) noexcept { return ( value >> shift ) & RADIX_MASK; } +// +//extern "C" __global__ void CountKernel( int* gSrc, int* gDst, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +//{ +// __shared__ int table[BIN_SIZE]; +// +// for( int i = threadIdx.x; i < BIN_SIZE; i += COUNT_WG_SIZE ) +// { +// table[i] = 0; +// } +// +// __syncthreads(); +// +// const int offset = blockIdx.x * gNItemsPerWG; +// const int upperBound = ( offset + gNItemsPerWG > gN ) ? 
gN - offset : gNItemsPerWG; +// +// for( int i = threadIdx.x; i < upperBound; i += COUNT_WG_SIZE ) +// { +// const int idx = offset + i; +// const int tableIdx = getMaskedBits( gSrc[idx], START_BIT ); +// atomicAdd( &table[tableIdx], 1 ); +// } +// +// __syncthreads(); +// +// for( int i = threadIdx.x; i < BIN_SIZE; i += COUNT_WG_SIZE ) +// { +// gDst[i * N_WGS_EXECUTED + blockIdx.x] = table[i]; +// } +//} template struct ScanImpl @@ -326,139 +326,139 @@ __device__ void localSort4bitMulti( int* keys, u32* ldsKeys, int* values, u32* l } } -__device__ void localSort8bitMulti_shared_bin( int* keys, u32* ldsKeys, const int START_BIT ) -{ - __shared__ unsigned table[BIN_SIZE]; - - for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) - { - table[i] = 0U; - } - - LDS_BARRIER; - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) - { - const int tableIdx = ( keys[i] >> START_BIT ) & RADIX_MASK; - atomicAdd( &table[tableIdx], 1 ); - } - - LDS_BARRIER; - - int globalSum = 0; - for( int binId = 0; binId < BIN_SIZE; binId += SORT_WG_SIZE * 2 ) - { - unsigned* globalOffset = &table[binId]; - const unsigned currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); - globalOffset[threadIdx.x * 2] += globalSum; - globalOffset[threadIdx.x * 2 + 1] += globalSum; - globalSum += currentGlobalSum; - } - - LDS_BARRIER; - - __shared__ u32 keyBuffer[SORT_WG_SIZE * SORT_N_ITEMS_PER_WI]; - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) - { - keyBuffer[threadIdx.x * SORT_N_ITEMS_PER_WI + i] = keys[i]; - } - - LDS_BARRIER; - - if( threadIdx.x == 0 ) - { - for( int i = 0; i < SORT_WG_SIZE * SORT_N_ITEMS_PER_WI; ++i ) - { - const int tableIdx = ( keyBuffer[i] >> START_BIT ) & RADIX_MASK; - const int writeIndex = table[tableIdx]; - - ldsKeys[writeIndex] = keyBuffer[i]; - - ++table[tableIdx]; - } - } - - LDS_BARRIER; - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) - { - keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; - } -} - -__device__ void 
localSort8bitMulti_group( int* keys, u32* ldsKeys, const int START_BIT ) -{ - constexpr auto N_GROUP_SIZE{ N_BINS_8BIT / ( sizeof( u64 ) / sizeof( u16 ) ) }; - - __shared__ union - { - u16 m_ungrouped[SORT_WG_SIZE + 1][N_BINS_8BIT]; - u64 m_grouped[SORT_WG_SIZE + 1][N_GROUP_SIZE]; - } lds; - - for( int i = 0; i < N_GROUP_SIZE; ++i ) - { - lds.m_grouped[threadIdx.x][i] = 0U; - } - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) - { - const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; - ++lds.m_ungrouped[threadIdx.x][in8bit]; - } - - LDS_BARRIER; - - for( int groupId = threadIdx.x; groupId < N_GROUP_SIZE; groupId += SORT_WG_SIZE ) - { - u64 sum = 0U; - for( int i = 0; i < SORT_WG_SIZE; i++ ) - { - const auto current = lds.m_grouped[i][groupId]; - lds.m_grouped[i][groupId] = sum; - sum += current; - } - lds.m_grouped[SORT_WG_SIZE][groupId] = sum; - } - - LDS_BARRIER; - - int globalSum = 0; - for( int binId = 0; binId < N_BINS_8BIT; binId += SORT_WG_SIZE * 2 ) - { - auto* globalOffset = &lds.m_ungrouped[SORT_WG_SIZE][binId]; - const int currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); - globalOffset[threadIdx.x * 2] += globalSum; - globalOffset[threadIdx.x * 2 + 1] += globalSum; - globalSum += currentGlobalSum; - } - - LDS_BARRIER; - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) - { - const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; - const auto offset = lds.m_ungrouped[SORT_WG_SIZE][in8bit]; - const auto rank = lds.m_ungrouped[threadIdx.x][in8bit]++; - - ldsKeys[offset + rank] = keys[i]; - } - - LDS_BARRIER; - - for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) - { - keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; - } -} - -template -__device__ void localSort8bitMulti( int* keys, u32* ldsKeys, int* values, u32* ldsValues, const int START_BIT ) -{ - localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT ); - if( N_RADIX > 4 ) localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT + 4 ); -} 
+//__device__ void localSort8bitMulti_shared_bin( int* keys, u32* ldsKeys, const int START_BIT ) +//{ +// __shared__ unsigned table[BIN_SIZE]; +// +// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) +// { +// table[i] = 0U; +// } +// +// LDS_BARRIER; +// +// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) +// { +// const int tableIdx = ( keys[i] >> START_BIT ) & RADIX_MASK; +// atomicAdd( &table[tableIdx], 1 ); +// } +// +// LDS_BARRIER; +// +// int globalSum = 0; +// for( int binId = 0; binId < BIN_SIZE; binId += SORT_WG_SIZE * 2 ) +// { +// unsigned* globalOffset = &table[binId]; +// const unsigned currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); +// globalOffset[threadIdx.x * 2] += globalSum; +// globalOffset[threadIdx.x * 2 + 1] += globalSum; +// globalSum += currentGlobalSum; +// } +// +// LDS_BARRIER; +// +// __shared__ u32 keyBuffer[SORT_WG_SIZE * SORT_N_ITEMS_PER_WI]; +// +// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) +// { +// keyBuffer[threadIdx.x * SORT_N_ITEMS_PER_WI + i] = keys[i]; +// } +// +// LDS_BARRIER; +// +// if( threadIdx.x == 0 ) +// { +// for( int i = 0; i < SORT_WG_SIZE * SORT_N_ITEMS_PER_WI; ++i ) +// { +// const int tableIdx = ( keyBuffer[i] >> START_BIT ) & RADIX_MASK; +// const int writeIndex = table[tableIdx]; +// +// ldsKeys[writeIndex] = keyBuffer[i]; +// +// ++table[tableIdx]; +// } +// } +// +// LDS_BARRIER; +// +// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) +// { +// keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; +// } +//} +// +//__device__ void localSort8bitMulti_group( int* keys, u32* ldsKeys, const int START_BIT ) +//{ +// constexpr auto N_GROUP_SIZE{ N_BINS_8BIT / ( sizeof( u64 ) / sizeof( u16 ) ) }; +// +// __shared__ union +// { +// u16 m_ungrouped[SORT_WG_SIZE + 1][N_BINS_8BIT]; +// u64 m_grouped[SORT_WG_SIZE + 1][N_GROUP_SIZE]; +// } lds; +// +// for( int i = 0; i < N_GROUP_SIZE; ++i ) +// { +// lds.m_grouped[threadIdx.x][i] = 0U; +// } +// +// for( int i = 0; i < 
SORT_N_ITEMS_PER_WI; i++ ) +// { +// const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; +// ++lds.m_ungrouped[threadIdx.x][in8bit]; +// } +// +// LDS_BARRIER; +// +// for( int groupId = threadIdx.x; groupId < N_GROUP_SIZE; groupId += SORT_WG_SIZE ) +// { +// u64 sum = 0U; +// for( int i = 0; i < SORT_WG_SIZE; i++ ) +// { +// const auto current = lds.m_grouped[i][groupId]; +// lds.m_grouped[i][groupId] = sum; +// sum += current; +// } +// lds.m_grouped[SORT_WG_SIZE][groupId] = sum; +// } +// +// LDS_BARRIER; +// +// int globalSum = 0; +// for( int binId = 0; binId < N_BINS_8BIT; binId += SORT_WG_SIZE * 2 ) +// { +// auto* globalOffset = &lds.m_ungrouped[SORT_WG_SIZE][binId]; +// const int currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); +// globalOffset[threadIdx.x * 2] += globalSum; +// globalOffset[threadIdx.x * 2 + 1] += globalSum; +// globalSum += currentGlobalSum; +// } +// +// LDS_BARRIER; +// +// for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) +// { +// const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; +// const auto offset = lds.m_ungrouped[SORT_WG_SIZE][in8bit]; +// const auto rank = lds.m_ungrouped[threadIdx.x][in8bit]++; +// +// ldsKeys[offset + rank] = keys[i]; +// } +// +// LDS_BARRIER; +// +// for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) +// { +// keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; +// } +//} + +//template +//__device__ void localSort8bitMulti( int* keys, u32* ldsKeys, int* values, u32* ldsValues, const int START_BIT ) +//{ +// localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT ); +// if( N_RADIX > 4 ) localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT + 4 ); +//} template __device__ void SortSinglePass( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int gN, const int START_BIT, const int END_BIT ) @@ -514,185 +514,594 @@ extern "C" __global__ void SortSinglePassKernel( int* gSrcKey, int* gDstKey, int extern "C" __global__ void SortSinglePassKVKernel( int* 
gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int gN, const int START_BIT, const int END_BIT ) { SortSinglePass( gSrcKey, gSrcVal, gDstKey, gDstVal, gN, START_BIT, END_BIT ); } -extern "C" __global__ void ParallelExclusiveScanSingleWG( int* gCount, int* gHistogram, const int N_WGS_EXECUTED ) +//extern "C" __global__ void ParallelExclusiveScanSingleWG( int* gCount, int* gHistogram, const int N_WGS_EXECUTED ) +//{ +// // Use a single WG. +// if( blockIdx.x != 0 ) +// { +// return; +// } +// +// // LDS for the parallel scan of the global sum: +// // First we store the sum of the counters of each number to it, +// // then we compute the global offset using parallel exclusive scan. +// __shared__ int blockBuffer[BIN_SIZE]; +// +// // fill the LDS with the local sum +// +// for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) +// { +// // Do exclusive scan for each segment handled by each WI in a WG +// +// int localThreadSum = 0; +// for( int i = 0; i < N_WGS_EXECUTED; ++i ) +// { +// int current = gCount[binId * N_WGS_EXECUTED + i]; +// gCount[binId * N_WGS_EXECUTED + i] = localThreadSum; +// +// localThreadSum += current; +// } +// +// // Store the thread local sum to LDS. +// +// blockBuffer[binId] = localThreadSum; +// } +// +// LDS_BARRIER; +// +// // Do parallel exclusive scan on the LDS +// +// int globalSum = 0; +// for( int binId = 0; binId < BIN_SIZE; binId += WG_SIZE * 2 ) +// { +// int* globalOffset = &blockBuffer[binId]; +// int currentGlobalSum = ldsScanExclusive( globalOffset, WG_SIZE * 2 ); +// globalOffset[threadIdx.x * 2] += globalSum; +// globalOffset[threadIdx.x * 2 + 1] += globalSum; +// globalSum += currentGlobalSum; +// } +// +// LDS_BARRIER; +// +// // Add the global offset to the global histogram. 
+// +// for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) +// { +// for( int i = 0; i < N_WGS_EXECUTED; ++i ) +// { +// gHistogram[binId * N_WGS_EXECUTED + i] += blockBuffer[binId]; +// } +// } +//} +// +//extern "C" __device__ void WorkgroupSync( int threadId, int blockId, int currentSegmentSum, int* currentGlobalOffset, volatile int* gPartialSum, volatile bool* gIsReady ) +//{ +// if( threadId == 0 ) +// { +// int offset = 0; +// +// if( blockId != 0 ) +// { +// while( !gIsReady[blockId - 1] ) +// { +// } +// +// offset = gPartialSum[blockId - 1]; +// +// __threadfence(); +// +// // Reset the value +// gIsReady[blockId - 1] = false; +// } +// +// gPartialSum[blockId] = offset + currentSegmentSum; +// +// // Ensure that the gIsReady is only modified after the gPartialSum is written. +// __threadfence(); +// +// gIsReady[blockId] = true; +// +// *currentGlobalOffset = offset; +// } +// +// __syncthreads(); +//} +// +//extern "C" __global__ void ParallelExclusiveScanAllWG( int* gCount, int* gHistogram, volatile int* gPartialSum, volatile bool* gIsReady ) +//{ +// // Fill the LDS with the partial sum of each segment +// __shared__ int blockBuffer[SCAN_WG_SIZE]; +// +// blockBuffer[threadIdx.x] = gCount[blockIdx.x * blockDim.x + threadIdx.x]; +// +// __syncthreads(); +// +// // Do parallel exclusive scan on the LDS +// +// int currentSegmentSum = ldsScanExclusive( blockBuffer, SCAN_WG_SIZE ); +// +// __syncthreads(); +// +// // Sync all the Workgroups to calculate the global offset. +// +// __shared__ int currentGlobalOffset; +// WorkgroupSync( threadIdx.x, blockIdx.x, currentSegmentSum, ¤tGlobalOffset, gPartialSum, gIsReady ); +// +// // Write back the result. 
+// +// gHistogram[blockIdx.x * blockDim.x + threadIdx.x] = blockBuffer[threadIdx.x] + currentGlobalOffset; +//} +// +//template +//__device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int numberOfInputs, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +//{ +// __shared__ u32 globalOffset[BIN_SIZE]; +// __shared__ u32 localPrefixSum[BIN_SIZE]; +// __shared__ u32 counters[BIN_SIZE]; +// +// __shared__ u32 matchMasks[SORT_NUM_WARPS_PER_BLOCK][BIN_SIZE]; +// +// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) +// { +// // Note: The size of gHistogram is always BIN_SIZE * N_WGS_EXECUTED +// globalOffset[i] = gHistogram[i * N_WGS_EXECUTED + blockIdx.x]; +// +// counters[i] = 0; +// localPrefixSum[i] = 0; +// } +// +// for( int w = 0; w < SORT_NUM_WARPS_PER_BLOCK; ++w ) +// { +// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) +// { +// matchMasks[w][i] = 0; +// } +// } +// +// __syncthreads(); +// +// for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) +// { +// const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; +// if( itemIndex < numberOfInputs ) +// { +// const auto item = gSrcKey[itemIndex]; +// const u32 bucketIndex = getMaskedBits( item, START_BIT ); +// atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); +// } +// } +// +// __syncthreads(); +// +// // Compute Prefix Sum +// +// ldsScanExclusive( localPrefixSum, BIN_SIZE ); +// +// __syncthreads(); +// +// // Reorder +// +// for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) +// { +// const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; +// +// const auto item = gSrcKey[itemIndex]; +// const u32 bucketIndex = getMaskedBits( item, START_BIT ); +// +// const int warp = threadIdx.x / 32; +// const int lane = threadIdx.x % 32; +// +// __syncthreads(); +// +// if( itemIndex < numberOfInputs ) +// { +// atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); +// } +// +// __syncthreads(); +// +// bool 
flushMask = false; +// +// u32 localOffset = 0; +// u32 localSrcIndex = 0; +// +// if( itemIndex < numberOfInputs ) +// { +// const u32 matchMask = matchMasks[warp][bucketIndex]; +// const u32 lowerMask = ( 1u << lane ) - 1; +// u32 offset = __popc( matchMask & lowerMask ); +// +// flushMask = ( offset == 0 ); +// +// for( int w = 0; w < warp; ++w ) +// { +// offset += __popc( matchMasks[w][bucketIndex] ); +// } +// +// localOffset = counters[bucketIndex] + offset; +// localSrcIndex = i; +// } +// +// __syncthreads(); +// +// if( itemIndex < numberOfInputs ) +// { +// atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); +// } +// +// if( flushMask ) +// { +// matchMasks[warp][bucketIndex] = 0; +// } +// +// // Swap +// +// if( itemIndex < numberOfInputs ) +// { +// const u32 srcIndex = blockIdx.x * gNItemsPerWG + localSrcIndex; +// const u32 dstIndex = globalOffset[bucketIndex] + localOffset; +// gDstKey[dstIndex] = gSrcKey[srcIndex]; +// +// if constexpr( KEY_VALUE_PAIR ) +// { +// gDstVal[dstIndex] = gSrcVal[srcIndex]; +// } +// } +// } +//} +// +//extern "C" __global__ void SortKernel( int* gSrcKey, int* gDstKey, int* gHistogram, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +//{ +// SortImpl( gSrcKey, nullptr, gDstKey, nullptr, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); +//} +// +//extern "C" __global__ void SortKVKernel( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +//{ +// SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); +//} + +#define RADIX_SORT_KEY_TYPE u32 +#define RADIX_SORT_VALUE_TYPE u32 +#define KEY_IS_16BYTE_ALIGNED 1 + + +typedef unsigned long long uint64_t; +typedef unsigned int uint32_t; +typedef unsigned short uint16_t; +typedef unsigned char uint8_t; + +//#define RADIX_SORT_BLOCK_SIZE 2048 +// +//#define GHISTOGRAM_ITEM_PER_BLOCK 2048 +//#define 
GHISTOGRAM_THREADS_PER_BLOCK 256 +// +//#define REORDER_NUMBER_OF_WARPS 8 +//#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) + +#define PARTITIOIN_BIT_A 0x80000000 +#define PARTITIOIN_BIT_P 0x40000000 +#define PARTITIOIN_FLAG_MASK ( PARTITIOIN_BIT_A | PARTITIOIN_BIT_P ) +#define PARTITIOIN_VALUE_MASK 0x3FFFFFFF + +#define PARTITIOIN_BIT_A_64 0x8000000000000000llu +#define PARTITIOIN_BIT_P_64 0x4000000000000000llu +#define PARTITIOIN_FLAG_MASK_64 ( PARTITIOIN_BIT_A_64 | PARTITIOIN_BIT_P_64 ) +#define PARTITIOIN_VALUE_MASK_64 0x3FFFFFFFFFFFFFFFllu + +__device__ inline void partitionStoreA( volatile uint32_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_A | x; } +__device__ inline void partitionStoreA( volatile uint64_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_A_64 | x; } +__device__ inline void partitionStoreP( volatile uint32_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_P | x; } +__device__ inline void partitionStoreP( volatile uint64_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_P_64 | x; } +__device__ inline bool partitionIsX( uint32_t x ) { return ( x & PARTITIOIN_FLAG_MASK ) == 0; } +__device__ inline bool partitionIsX( uint64_t x ) { return ( x & PARTITIOIN_FLAG_MASK_64 ) == 0; } +__device__ inline bool partitionIsP( uint32_t x ) { return ( x & PARTITIOIN_BIT_P ) != 0; } +__device__ inline bool partitionIsP( uint64_t x ) { return ( x & PARTITIOIN_BIT_P_64 ) != 0; } + +__device__ inline uint32_t partitionGetValue( uint32_t x ) { return x & PARTITIOIN_VALUE_MASK; } +__device__ inline uint32_t partitionGetValue( uint64_t x ) { return static_cast( x & PARTITIOIN_VALUE_MASK_64 ); } + +#if defined( DESCENDING_ORDER ) +#define ORDER_MASK_32 0xFFFFFFFF +#define ORDER_MASK_64 0xFFFFFFFFFFFFFFFFllu +#else +#define ORDER_MASK_32 0 +#define ORDER_MASK_64 0llu +#endif + +#if defined( CUDART_VERSION ) && CUDART_VERSION >= 9000 +#define ITS 1 +#endif + +__device__ inline uint32_t div_round_up( uint32_t val, uint32_t divisor ) { return ( val + divisor - 1 ) 
/ divisor; } +template +__device__ void clearShared( T* sMem, T value ) { - // Use a single WG. - if( blockIdx.x != 0 ) + for( int i = 0; i < NElement; i += NThread ) { - return; + if( i < NElement ) + { + sMem[i + threadIdx.x] = value; + } } +} + +__device__ inline uint32_t getKeyBits( uint32_t x ) { return x ^ ORDER_MASK_32; } +__device__ inline uint64_t getKeyBits( uint64_t x ) { return x ^ ORDER_MASK_64; } +__device__ inline uint32_t getKeyBits( float x ) +{ + if( x == 0.0f ) x = 0.0f; + + uint32_t flip = uint32_t( __float_as_int( x ) >> 31 ) | 0x80000000; + return __float_as_uint( x ) ^ flip ^ ORDER_MASK_32; +} +__device__ inline uint64_t getKeyBits( double x ) +{ + if( x == 0.0 ) x = 0.0; - // LDS for the parallel scan of the global sum: - // First we store the sum of the counters of each number to it, - // then we compute the global offset using parallel exclusive scan. - __shared__ int blockBuffer[BIN_SIZE]; + uint64_t flip = uint64_t( __double_as_longlong( x ) >> 63 ) | 0x8000000000000000llu; + return (uint64_t)__double_as_longlong( x ) ^ flip ^ ORDER_MASK_64; +} - // fill the LDS with the local sum +template +__device__ inline uint32_t prefixSumExclusive( uint32_t prefix, uint32_t* sMemIO ) +{ + uint32_t value = sMemIO[threadIdx.x]; - for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) + for( uint32_t offset = 1; offset < NThreads; offset <<= 1 ) { - // Do exclusive scan for each segment handled by each WI in a WG + uint32_t x = sMemIO[threadIdx.x]; - int localThreadSum = 0; - for( int i = 0; i < N_WGS_EXECUTED; ++i ) + if( offset <= threadIdx.x ) { - int current = gCount[binId * N_WGS_EXECUTED + i]; - gCount[binId * N_WGS_EXECUTED + i] = localThreadSum; - - localThreadSum += current; + x += sMemIO[threadIdx.x - offset]; } - // Store the thread local sum to LDS. 
+ __syncthreads(); + + sMemIO[threadIdx.x] = x; - blockBuffer[binId] = localThreadSum; + __syncthreads(); } + uint32_t sum = sMemIO[NThreads - 1]; - LDS_BARRIER; + __syncthreads(); - // Do parallel exclusive scan on the LDS + sMemIO[threadIdx.x] += prefix - value; - int globalSum = 0; - for( int binId = 0; binId < BIN_SIZE; binId += WG_SIZE * 2 ) - { - int* globalOffset = &blockBuffer[binId]; - int currentGlobalSum = ldsScanExclusive( globalOffset, WG_SIZE * 2 ); - globalOffset[threadIdx.x * 2] += globalSum; - globalOffset[threadIdx.x * 2 + 1] += globalSum; - globalSum += currentGlobalSum; - } + __syncthreads(); - LDS_BARRIER; + return sum; +} - // Add the global offset to the global histogram. +extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t numberOfInputs, uint32_t* gpSumBuffer, uint32_t startBits, uint32_t* counter ) +{ + __shared__ uint32_t localCounters[sizeof( RADIX_SORT_KEY_TYPE )][256]; - for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - for( int i = 0; i < N_WGS_EXECUTED; ++i ) + for( int j = threadIdx.x; j < 256; j += GHISTOGRAM_THREADS_PER_BLOCK ) { - gHistogram[binId * N_WGS_EXECUTED + i] += blockBuffer[binId]; + localCounters[i][j] = 0; } } -} -extern "C" __device__ void WorkgroupSync( int threadId, int blockId, int currentSegmentSum, int* currentGlobalOffset, volatile int* gPartialSum, volatile bool* gIsReady ) -{ - if( threadId == 0 ) + __syncthreads(); + + uint32_t numberOfBlocks = div_round_up( numberOfInputs, GHISTOGRAM_ITEM_PER_BLOCK ); + __shared__ uint32_t iBlock; + if( threadIdx.x == 0 ) + { + iBlock = atomicInc( counter, 0xFFFFFFFF ); + } + + __syncthreads(); + + bool hasData = false; + + while( iBlock < numberOfBlocks ) { - int offset = 0; + hasData = true; - if( blockId != 0 ) +#if defined( KEY_IS_16BYTE_ALIGNED ) + if( ( iBlock + 1 ) * GHISTOGRAM_ITEM_PER_BLOCK <= numberOfInputs ) { - while( !gIsReady[blockId - 1] ) + for( int i = 
0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK * 4 ) { + uint32_t itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i + threadIdx.x * 4; + struct alignas( 16 ) Key4 + { + RADIX_SORT_KEY_TYPE xs[4]; + }; + Key4 key4 = *(Key4*)&inputs[itemIndex]; + for( int k = 0; k < 4; k++ ) + { + auto item = key4.xs[k]; + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + { + uint32_t bitLocation = startBits + i * 8; + uint32_t bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); + } + } + } + } + else +#endif + for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK ) + { + uint32_t itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x + i; + if( itemIndex < numberOfInputs ) + { + auto item = inputs[itemIndex]; + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + { + uint32_t bitLocation = startBits + i * 8; + uint32_t bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); + } + } } - offset = gPartialSum[blockId - 1]; - - __threadfence(); + __syncthreads(); - // Reset the value - gIsReady[blockId - 1] = false; + if( threadIdx.x == 0 ) + { + iBlock = atomicInc( counter, 0xFFFFFFFF ); } - gPartialSum[blockId] = offset + currentSegmentSum; - - // Ensure that the gIsReady is only modified after the gPartialSum is written. 
- __threadfence(); + __syncthreads(); + } - gIsReady[blockId] = true; + if( hasData ) + { + __syncthreads(); - *currentGlobalOffset = offset; + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + { + for( int j = threadIdx.x; j < 256; j += GHISTOGRAM_THREADS_PER_BLOCK ) + { + atomicAdd( &gpSumBuffer[256 * i + j], localCounters[i][j] ); + } + } } - - __syncthreads(); } -extern "C" __global__ void ParallelExclusiveScanAllWG( int* gCount, int* gHistogram, volatile int* gPartialSum, volatile bool* gIsReady ) +extern "C" __global__ void gPrefixSum( uint32_t* gpSumBuffer ) { - // Fill the LDS with the partial sum of each segment - __shared__ int blockBuffer[SCAN_WG_SIZE]; - - blockBuffer[threadIdx.x] = gCount[blockIdx.x * blockDim.x + threadIdx.x]; + __shared__ uint32_t smem[256]; - __syncthreads(); - - // Do parallel exclusive scan on the LDS - - int currentSegmentSum = ldsScanExclusive( blockBuffer, SCAN_WG_SIZE ); + smem[threadIdx.x] = gpSumBuffer[blockIdx.x * 256 + threadIdx.x]; __syncthreads(); - // Sync all the Workgroups to calculate the global offset. - - __shared__ int currentGlobalOffset; - WorkgroupSync( threadIdx.x, blockIdx.x, currentSegmentSum, &currentGlobalOffset, gPartialSum, gIsReady ); + prefixSumExclusive<256>( 0, smem ); - // Write back the result.
- - gHistogram[blockIdx.x * blockDim.x + threadIdx.x] = blockBuffer[threadIdx.x] + currentGlobalOffset; + gpSumBuffer[blockIdx.x * 256 + threadIdx.x] = smem[threadIdx.x]; } -template -__device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int numberOfInputs, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +template +__device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, uint32_t numberOfInputs, uint32_t* gpSumBuffer, + volatile TLookBack* lookBackBuffer, uint32_t startBits, uint32_t iteration ) { - __shared__ u32 globalOffset[BIN_SIZE]; - __shared__ u32 localPrefixSum[BIN_SIZE]; - __shared__ u32 counters[BIN_SIZE]; - - __shared__ u32 matchMasks[SORT_NUM_WARPS_PER_BLOCK][BIN_SIZE]; - - for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) + struct ElementLocation { - // Note: The size of gHistogram is always BIN_SIZE * N_WGS_EXECUTED - globalOffset[i] = gHistogram[i * N_WGS_EXECUTED + blockIdx.x]; - - counters[i] = 0; - localPrefixSum[i] = 0; - } - - for( int w = 0; w < SORT_NUM_WARPS_PER_BLOCK; ++w ) + uint32_t localSrcIndex : 12; + uint32_t localOffset : 12; + uint32_t bucket : 8; + }; + + __shared__ uint32_t pSum[256]; + __shared__ uint32_t localPrefixSum[256]; + __shared__ uint32_t counters[256]; + __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + __shared__ uint8_t elementBuckets[RADIX_SORT_BLOCK_SIZE]; + __shared__ uint32_t matchMasks[REORDER_NUMBER_OF_WARPS][256]; + + uint32_t bitLocation = startBits + 8 * iteration; + uint32_t blockIndex = blockIdx.x; + uint32_t numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); + + clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, uint32_t>( localPrefixSum, 0 ); + clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, uint32_t>( counters, 0 ); + + for( int w = 0; w < 
REORDER_NUMBER_OF_WARPS; w++ ) { - for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) + for( int i = 0; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - matchMasks[w][i] = 0; + matchMasks[w][i + threadIdx.x] = 0; } } __syncthreads(); - for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) + // count +#if defined( KEY_IS_16BYTE_ALIGNED ) + if( ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs ) { - const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; - if( itemIndex < numberOfInputs ) + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK * 4 ) + { + uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x * 4; + struct alignas( 16 ) Key4 + { + RADIX_SORT_KEY_TYPE xs[4]; + }; + Key4 key4 = *(Key4*)&inputKeys[itemIndex]; + for( int k = 0; k < 4; k++ ) + { + auto item = key4.xs[k]; + uint32_t bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); + elementBuckets[i + threadIdx.x * 4 + k] = bucketIndex; + } + } + } + else +#endif + { + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - const auto item = gSrcKey[itemIndex]; - const u32 bucketIndex = getMaskedBits( item, START_BIT ); - atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); + uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + if( itemIndex < numberOfInputs ) + { + auto item = inputKeys[itemIndex]; + uint32_t bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); + + elementBuckets[i + threadIdx.x] = bucketIndex; + } } } __syncthreads(); - // Compute Prefix Sum + // Look back + for( int i = threadIdx.x; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + uint32_t s = localPrefixSum[i]; + partitionStoreA( &lookBackBuffer[256 * blockIdx.x + i], s ); + uint32_t gp = gpSumBuffer[iteration * 256 + i]; - ldsScanExclusive( localPrefixSum, 
BIN_SIZE ); + uint32_t p = 0; - __syncthreads(); + for( int iBlock = (int)blockIdx.x - 1; 0 <= iBlock; iBlock-- ) + { + TLookBack counter = lookBackBuffer[256 * iBlock + i]; + while( partitionIsX( counter ) ) + { + counter = lookBackBuffer[256 * iBlock + i]; + } - // Reorder + uint32_t value = partitionGetValue( counter ); + p += value; + if( partitionIsP( counter ) ) + { + break; + } + } - for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) - { - const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; + partitionStoreP( &lookBackBuffer[256 * blockIdx.x + i], p + s ); - const auto item = gSrcKey[itemIndex]; - const u32 bucketIndex = getMaskedBits( item, START_BIT ); + // complete global output location + pSum[i] = gp + p; + } + + uint32_t prefix = 0; + for( int i = 0; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + prefix += prefixSumExclusive( prefix, &localPrefixSum[i] ); + } - const int warp = threadIdx.x / 32; - const int lane = threadIdx.x % 32; + // reorder + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + uint32_t bucketIndex = elementBuckets[i + threadIdx.x]; __syncthreads(); + int warp = threadIdx.x / 32; + int lane = threadIdx.x % 32; + if( itemIndex < numberOfInputs ) { atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); @@ -702,24 +1111,27 @@ __device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal bool flushMask = false; - u32 localOffset = 0; - u32 localSrcIndex = 0; - if( itemIndex < numberOfInputs ) { - const u32 matchMask = matchMasks[warp][bucketIndex]; - const u32 lowerMask = ( 1u << lane ) - 1; - u32 offset = __popc( matchMask & lowerMask ); + uint32_t matchMask = matchMasks[warp][bucketIndex]; + uint32_t lowerMask = ( 1u << lane ) - 1; + uint32_t offset = __popc( matchMask & lowerMask ); - flushMask = ( offset == 0 ); + flushMask = offset == 0; - for( int w = 0; w < warp; ++w ) + 
for( int w = 0; w < warp; w++ ) { offset += __popc( matchMasks[w][bucketIndex] ); } - localOffset = counters[bucketIndex] + offset; - localSrcIndex = i; + uint32_t localOffset = counters[bucketIndex] + offset; + uint32_t to = localOffset + localPrefixSum[bucketIndex]; + + ElementLocation el; + el.localSrcIndex = i + threadIdx.x; + el.localOffset = localOffset; + el.bucket = bucketIndex; + elementLocations[to] = el; } __syncthreads(); @@ -728,34 +1140,57 @@ __device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal { atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); } - if( flushMask ) { matchMasks[warp][bucketIndex] = 0; } + } - // Swap - + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; if( itemIndex < numberOfInputs ) { - const u32 srcIndex = blockIdx.x * gNItemsPerWG + localSrcIndex; - const u32 dstIndex = globalOffset[bucketIndex] + localOffset; - gDstKey[dstIndex] = gSrcKey[srcIndex]; + ElementLocation el = elementLocations[i + threadIdx.x]; + uint32_t srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + uint8_t bucketIndex = el.bucket; - if constexpr( KEY_VALUE_PAIR ) + uint32_t dstIndex = pSum[bucketIndex] + el.localOffset; + outputKeys[dstIndex] = inputKeys[srcIndex]; + } + } + if( keyPair ) + { + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + if( itemIndex < numberOfInputs ) { - gDstVal[dstIndex] = gSrcVal[srcIndex]; + ElementLocation el = elementLocations[i + threadIdx.x]; + uint32_t srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + uint8_t bucketIndex = el.bucket; + + uint32_t dstIndex = pSum[bucketIndex] + el.localOffset; + outputValues[dstIndex] = inputValues[srcIndex]; } } } } - -extern "C" __global__ void SortKernel( int* gSrcKey, int* gDstKey, int* gHistogram, 
int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +extern "C" __global__ void onesweep_reorderKey( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint32_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) { - SortImpl( gSrcKey, nullptr, gDstKey, nullptr, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); + onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); } - -extern "C" __global__ void SortKVKernel( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) +extern "C" __global__ void onesweep_reorderKeyPair( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, uint32_t numberOfInputs, uint32_t* gpSumBuffer, + volatile uint32_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) { - SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); + onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); } +extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) +{ + onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); +} +extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, uint32_t numberOfInputs, uint32_t* gpSumBuffer, + volatile uint64_t* lookBackBuffer, uint32_t 
startBits, uint32_t iteration ) +{ + onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); +} \ No newline at end of file diff --git a/Test/RadixSort/main.cpp b/Test/RadixSort/main.cpp index fae9f55..57eb4e4 100644 --- a/Test/RadixSort/main.cpp +++ b/Test/RadixSort/main.cpp @@ -67,6 +67,8 @@ class SortTest OrochiUtils::malloc( srcGpu.key, testSize ); OrochiUtils::malloc( dstGpu.key, testSize ); + void* temp; + oroMalloc( (oroDeviceptr*)&temp, m_sort.getRequiredTemporalStorageBytes( testSize ) ); std::vector srcKey( testSize ); for( int i = 0; i < testSize; i++ ) @@ -102,11 +104,11 @@ class SortTest if constexpr( KEY_VALUE_PAIR ) { - m_sort.sort( srcGpu, dstGpu, testSize, 0, testBits); + m_sort.sort( srcGpu, dstGpu, testSize, 0, testBits, temp ); } else { - m_sort.sort( srcGpu.key, dstGpu.key, testSize, 0, testBits ); + m_sort.sort( srcGpu.key, dstGpu.key, testSize, 0, testBits, temp ); } OrochiUtils::waitForCompletion(); @@ -178,6 +180,7 @@ class SortTest OrochiUtils::free( srcGpu.key ); OrochiUtils::free( dstGpu.key ); + oroFree( (oroDeviceptr)temp ); printf( "passed: %3.2fK keys\n", testSize / 1000.f ); } From 12fd48c6624785f927f34f21879df60a9bd406e3 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 21 Sep 2023 16:54:07 +0900 Subject: [PATCH 06/68] [ORO-0] fixed storage ver --- ParallelPrimitives/RadixSort.cpp | 43 +++------- ParallelPrimitives/RadixSort.h | 9 +-- ParallelPrimitives/RadixSortConfigs.h | 8 +- ParallelPrimitives/RadixSortKernels.h | 110 ++++++++++++++++---------- Test/RadixSort/main.cpp | 7 +- 5 files changed, 89 insertions(+), 88 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index a33ff47..864d756 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -200,8 +200,6 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string #define LOAD_FUNC( 
var, kernel ) var = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), kernel, &opts ); LOAD_FUNC( m_gHistogram, "gHistogram" ); LOAD_FUNC( m_gPrefixSum, "gPrefixSum" ); - LOAD_FUNC( m_onesweep_reorderKey, "onesweep_reorderKey" ); - LOAD_FUNC( m_onesweep_reorderKeyPair, "onesweep_reorderKeyPair" ); LOAD_FUNC( m_onesweep_reorderKey64, "onesweep_reorderKey64" ); LOAD_FUNC( m_onesweep_reorderKeyPair64, "onesweep_reorderKeyPair64" ); #undef LOAD_FUNC @@ -240,6 +238,10 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc { compileKernels( kernelPath, includeDir ); + u64 gpSumBuffer = sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ); + u64 lookBackBuffer = next_multiple64( sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE + 1 ), 16 ); + m_tmpBuffer.resize( gpSumBuffer + lookBackBuffer ); + //m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); ///// The tmp buffer size of the count kernel and the scan kernel. @@ -262,7 +264,7 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc } void RadixSort::setFlag( Flag flag ) noexcept { m_flags = flag; } -void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream ) noexcept +void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, oroStream stream ) noexcept { bool keyPair = src.value != nullptr; @@ -286,17 +288,11 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit } int nIteration = div_round_up64( endBit - startBit, 8 ); - bool use64bitCounter = -#if defined( ENFORCE_64BIT_COUNTER ) - true; -#else - MAX_ELEMENTS_WITH_32BIT_COUNTER < n; -#endif uint64_t numberOfBlocks = div_round_up64( n, RADIX_SORT_BLOCK_SIZE ); // Buffers - void* gpSumBuffer = tempStorage; - void* lookBackBuffer = (void*)( (char*)tempStorage + sizeof( uint32_t ) * 256 * sizeof( u32 /* key */ ) ); + void* gpSumBuffer 
= m_tmpBuffer.ptr(); + void* lookBackBuffer = (void*)( m_tmpBuffer.ptr() + sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ) ); { oroMemsetD32Async( (oroDeviceptr)gpSumBuffer, 0, 256 * sizeof( u32 /* key */ ), stream ); @@ -316,17 +312,17 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit auto d = dst; for( int i = 0; i < nIteration; i++ ) { - oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, 256 * numberOfBlocks * ( use64bitCounter ? 2 : 1 ), stream ); + oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, ( 256 * LOOKBACK_TABLE_SIZE + 1 ) * sizeof( uint64_t ) / 4, stream ); if( keyPair ) { const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; - OrochiUtils::launch1D( use64bitCounter ? m_onesweep_reorderKeyPair64 : m_onesweep_reorderKeyPair, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); + OrochiUtils::launch1D( m_onesweep_reorderKeyPair64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } else { const void* args[] = { &s.key, &d.key, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; - OrochiUtils::launch1D( use64bitCounter ? 
m_onesweep_reorderKey64 : m_onesweep_reorderKey, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); + OrochiUtils::launch1D( m_onesweep_reorderKey64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } std::swap( s, d ); } @@ -342,23 +338,8 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit } } -void RadixSort::sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream ) noexcept +void RadixSort::sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, oroStream stream ) noexcept { - sort( KeyValueSoA{ src, nullptr }, KeyValueSoA{ dst, nullptr }, n, startBit, endBit, tempStorage, stream ); -} - -uint64_t RadixSort::getRequiredTemporalStorageBytes( u32 numberOfMaxInputs ) const -{ - static_assert( BIN_SIZE == 256, "check alignment of the buffers" ); - uint64_t numberOfBlocks = div_round_up64( numberOfMaxInputs, RADIX_SORT_BLOCK_SIZE ); - uint64_t gpSumBuffer = sizeof( uint32_t ) * 256 * sizeof( u32 /* key */ ); - uint64_t lookBackBuffer = sizeof( uint32_t ) * 256 * numberOfBlocks; -#if !defined( ENFORCE_64BIT_COUNTER ) - if( MAX_ELEMENTS_WITH_32BIT_COUNTER < numberOfMaxInputs ) -#endif - { - lookBackBuffer *= 2; // to 64bit counter - } - return gpSumBuffer + lookBackBuffer; + sort( KeyValueSoA{ src, nullptr }, KeyValueSoA{ dst, nullptr }, n, startBit, endBit, stream ); } }; // namespace Oro diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index d30ee32..e78a978 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -43,11 +43,10 @@ class RadixSort final void setFlag( Flag flag ) noexcept; - void sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream = 0 ) noexcept; + void sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, 
oroStream stream = 0 ) noexcept; - void sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, void* tempStorage, oroStream stream = 0 ) noexcept; + void sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, oroStream stream = 0 ) noexcept; - uint64_t getRequiredTemporalStorageBytes( u32 numberOfMaxInputs ) const; private: //template //void sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept; @@ -121,10 +120,10 @@ class RadixSort final oroFunction m_gHistogram; oroFunction m_gPrefixSum; - oroFunction m_onesweep_reorderKey; - oroFunction m_onesweep_reorderKeyPair; oroFunction m_onesweep_reorderKey64; oroFunction m_onesweep_reorderKeyPair64; + + GpuMemory m_tmpBuffer; }; //#include diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index c597238..ed64110 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -50,9 +50,9 @@ static_assert( DEFAULT_SCAN_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); #define REORDER_NUMBER_OF_WARPS 8 #define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) -#define MAX_ELEMENTS_WITH_32BIT_COUNTER 0x3FFFFFFF - -// Please uncomment this enforce 64bit counter for lookback counter to measure performance impact. 
-// #define ENFORCE_64BIT_COUNTER 1 +#define LOOKBACK_TABLE_SIZE ( 1024 ) +#define MAX_LOOK_BACK 32 +#define TAIL_BITS 4 +#define TAIL_COUNT ( 1u << TAIL_BITS ) }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 56bca91..c04a6f1 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -777,28 +777,11 @@ typedef unsigned char uint8_t; // //#define REORDER_NUMBER_OF_WARPS 8 //#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) - -#define PARTITIOIN_BIT_A 0x80000000 -#define PARTITIOIN_BIT_P 0x40000000 -#define PARTITIOIN_FLAG_MASK ( PARTITIOIN_BIT_A | PARTITIOIN_BIT_P ) -#define PARTITIOIN_VALUE_MASK 0x3FFFFFFF - -#define PARTITIOIN_BIT_A_64 0x8000000000000000llu -#define PARTITIOIN_BIT_P_64 0x4000000000000000llu -#define PARTITIOIN_FLAG_MASK_64 ( PARTITIOIN_BIT_A_64 | PARTITIOIN_BIT_P_64 ) -#define PARTITIOIN_VALUE_MASK_64 0x3FFFFFFFFFFFFFFFllu - -__device__ inline void partitionStoreA( volatile uint32_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_A | x; } -__device__ inline void partitionStoreA( volatile uint64_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_A_64 | x; } -__device__ inline void partitionStoreP( volatile uint32_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_P | x; } -__device__ inline void partitionStoreP( volatile uint64_t* to, uint32_t x ) { *to = PARTITIOIN_BIT_P_64 | x; } -__device__ inline bool partitionIsX( uint32_t x ) { return ( x & PARTITIOIN_FLAG_MASK ) == 0; } -__device__ inline bool partitionIsX( uint64_t x ) { return ( x & PARTITIOIN_FLAG_MASK_64 ) == 0; } -__device__ inline bool partitionIsP( uint32_t x ) { return ( x & PARTITIOIN_BIT_P ) != 0; } -__device__ inline bool partitionIsP( uint64_t x ) { return ( x & PARTITIOIN_BIT_P_64 ) != 0; } - -__device__ inline uint32_t partitionGetValue( uint32_t x ) { return x & PARTITIOIN_VALUE_MASK; } -__device__ inline uint32_t partitionGetValue( 
uint64_t x ) { return static_cast( x & PARTITIOIN_VALUE_MASK_64 ); } +// +//#define LOOKBACK_TABLE_SIZE ( 1024 ) +//#define MAX_LOOK_BACK 32 +//#define TAIL_BITS 4 +//#define TAIL_COUNT ( 1u << TAIL_BITS ) #if defined( DESCENDING_ORDER ) #define ORDER_MASK_32 0xFFFFFFFF @@ -979,9 +962,8 @@ extern "C" __global__ void gPrefixSum( uint32_t* gpSumBuffer ) gpSumBuffer[blockIdx.x * 256 + threadIdx.x] = smem[threadIdx.x]; } -template __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile TLookBack* lookBackBuffer, uint32_t startBits, uint32_t iteration ) + volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) { struct ElementLocation { @@ -1052,34 +1034,77 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } + struct ParitionID + { + uint64_t value : 32; + uint64_t block : 30; + uint64_t flag : 2; + }; + auto asPartition = []( uint64_t x ) + { + ParitionID pa; + memcpy( &pa, &x, sizeof( ParitionID ) ); + return pa; + }; + auto asU64 = []( ParitionID pa ) + { + uint64_t x; + memcpy( &x, &pa, sizeof( uint64_t ) ); + return x; + }; + + uint32_t* gTailIterator = (uint32_t*)( lookBackBuffer + LOOKBACK_TABLE_SIZE * 256 ); + + if( threadIdx.x == 0 && LOOKBACK_TABLE_SIZE <= blockIndex ) + { + uint32_t mustBeDone = blockIndex - LOOKBACK_TABLE_SIZE + MAX_LOOK_BACK; + while( ( atomicAdd( gTailIterator, 0 ) >> TAIL_BITS ) * TAIL_COUNT <= mustBeDone ) + ; + } __syncthreads(); - // Look back for( int i = threadIdx.x; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { uint32_t s = localPrefixSum[i]; - partitionStoreA( &lookBackBuffer[256 * blockIdx.x + i], s ); + int pIndex = 256 * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; + + { + ParitionID pa; + pa.value = s; + pa.block = blockIndex; + pa.flag = 1; + lookBackBuffer[pIndex] 
= asU64( pa ); + } + uint32_t gp = gpSumBuffer[iteration * 256 + i]; uint32_t p = 0; - for( int iBlock = (int)blockIdx.x - 1; 0 <= iBlock; iBlock-- ) + for( int iBlock = (int)blockIndex - 1; 0 <= iBlock; iBlock-- ) { - TLookBack counter = lookBackBuffer[256 * iBlock + i]; - while( partitionIsX( counter ) ) + int lookbackIndex = 256 * ( iBlock % LOOKBACK_TABLE_SIZE ) + i; + ParitionID pa; + do { - counter = lookBackBuffer[256 * iBlock + i]; - } + pa = asPartition( lookBackBuffer[lookbackIndex] ); - uint32_t value = partitionGetValue( counter ); + // when you reach to the maximum, flag must be 2 + if( MAX_LOOK_BACK == blockIndex - iBlock && pa.flag != 2 ) continue; + } while( pa.flag == 0 || pa.block != iBlock ); + + uint32_t value = pa.value; p += value; - if( partitionIsP( counter ) ) + if( pa.flag == 2 ) { break; } } - partitionStoreP( &lookBackBuffer[256 * blockIdx.x + i], p + s ); + ParitionID pa; + pa.value = p + s; + pa.block = blockIndex; + pa.flag = 2; + lookBackBuffer[pIndex] = asU64( pa ); // complete global output location pSum[i] = gp + p; @@ -1091,6 +1116,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys prefix += prefixSumExclusive( prefix, &localPrefixSum[i] ); } + if( threadIdx.x == 0 ) + { + while( ( atomicAdd( gTailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) + ; + + atomicInc( gTailIterator, 0xFFFFFFFF ); + } + // reorder for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { @@ -1176,15 +1209,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } } -extern "C" __global__ void onesweep_reorderKey( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint32_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) -{ - onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); -} -extern 
"C" __global__ void onesweep_reorderKeyPair( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile uint32_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) -{ - onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); -} extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) { onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); diff --git a/Test/RadixSort/main.cpp b/Test/RadixSort/main.cpp index 57eb4e4..e36dda4 100644 --- a/Test/RadixSort/main.cpp +++ b/Test/RadixSort/main.cpp @@ -67,8 +67,6 @@ class SortTest OrochiUtils::malloc( srcGpu.key, testSize ); OrochiUtils::malloc( dstGpu.key, testSize ); - void* temp; - oroMalloc( (oroDeviceptr*)&temp, m_sort.getRequiredTemporalStorageBytes( testSize ) ); std::vector srcKey( testSize ); for( int i = 0; i < testSize; i++ ) @@ -104,11 +102,11 @@ class SortTest if constexpr( KEY_VALUE_PAIR ) { - m_sort.sort( srcGpu, dstGpu, testSize, 0, testBits, temp ); + m_sort.sort( srcGpu, dstGpu, testSize, 0, testBits ); } else { - m_sort.sort( srcGpu.key, dstGpu.key, testSize, 0, testBits, temp ); + m_sort.sort( srcGpu.key, dstGpu.key, testSize, 0, testBits ); } OrochiUtils::waitForCompletion(); @@ -180,7 +178,6 @@ class SortTest OrochiUtils::free( srcGpu.key ); OrochiUtils::free( dstGpu.key ); - oroFree( (oroDeviceptr)temp ); printf( "passed: %3.2fK keys\n", testSize / 1000.f ); } From 284b083a4d7545350fd36fbb2047929596a66788 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 21 Sep 2023 18:26:02 +0900 Subject: [PATCH 07/68] 
[ORO-0] Memset can be skipped in most situations --- ParallelPrimitives/RadixSort.cpp | 13 +++++++++---- ParallelPrimitives/RadixSortKernels.h | 3 +-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 864d756..3d77b53 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -294,13 +294,15 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit void* gpSumBuffer = m_tmpBuffer.ptr(); void* lookBackBuffer = (void*)( m_tmpBuffer.ptr() + sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ) ); + // counter for gHistogram. + void* counter = (uint8_t*)lookBackBuffer + ( 256 * LOOKBACK_TABLE_SIZE ) + sizeof( uint32_t ); + { - oroMemsetD32Async( (oroDeviceptr)gpSumBuffer, 0, 256 * sizeof( u32 /* key */ ), stream ); - oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, 1, stream ); + oroMemsetD32Async( (oroDeviceptr)m_tmpBuffer.ptr(), 0, m_tmpBuffer.size() / 4, stream ); const int nBlocks = 2048; - const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &lookBackBuffer }; + const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &counter }; OrochiUtils::launch1D( m_gHistogram, nBlocks * GHISTOGRAM_THREADS_PER_BLOCK, args, GHISTOGRAM_THREADS_PER_BLOCK, 0, stream ); } { @@ -312,7 +314,10 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit auto d = dst; for( int i = 0; i < nIteration; i++ ) { - oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, ( 256 * LOOKBACK_TABLE_SIZE + 1 ) * sizeof( uint64_t ) / 4, stream ); + if( numberOfBlocks < LOOKBACK_TABLE_SIZE * 2 ) + { + oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, ( 256 * LOOKBACK_TABLE_SIZE ) * sizeof( uint64_t ) / 4, stream ); + } // other wise, we can skip zero clear look back buffer if( keyPair ) { diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index c04a6f1..3090e31 100644 --- 
a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -764,7 +764,6 @@ extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, i #define RADIX_SORT_VALUE_TYPE u32 #define KEY_IS_16BYTE_ALIGNED 1 - typedef unsigned long long uint64_t; typedef unsigned int uint32_t; typedef unsigned short uint16_t; @@ -1121,7 +1120,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys while( ( atomicAdd( gTailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) ; - atomicInc( gTailIterator, 0xFFFFFFFF ); + atomicInc( gTailIterator, numberOfBlocks - 1 /* after the very last item, it will be zero */ ); } // reorder From d12fdea3016d4351157160bcf931519ef1f8458e Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 21 Sep 2023 22:54:20 +0900 Subject: [PATCH 08/68] [ORO-0] fix the wrong condition for MAX_LOOK_BACK --- ParallelPrimitives/RadixSortConfigs.h | 2 +- ParallelPrimitives/RadixSortKernels.h | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index ed64110..6eb68d1 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -51,7 +51,7 @@ static_assert( DEFAULT_SCAN_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); #define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) #define LOOKBACK_TABLE_SIZE ( 1024 ) -#define MAX_LOOK_BACK 32 +#define MAX_LOOK_BACK 64 #define TAIL_BITS 4 #define TAIL_COUNT ( 1u << TAIL_BITS ) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 3090e31..1530b0a 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -778,7 +778,7 @@ typedef unsigned char uint8_t; //#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) // //#define LOOKBACK_TABLE_SIZE ( 1024 ) -//#define MAX_LOOK_BACK 32 +//#define
MAX_LOOK_BACK 64 //#define TAIL_BITS 4 //#define TAIL_COUNT ( 1u << TAIL_BITS ) @@ -1083,13 +1083,15 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { int lookbackIndex = 256 * ( iBlock % LOOKBACK_TABLE_SIZE ) + i; ParitionID pa; + + // when you reach to the maximum, flag must be 2. flagRequire = 0b10 + // Otherwise, flag can be 1 or 2 flagRequire = 0b11 + int flagRequire = MAX_LOOK_BACK == blockIndex - iBlock ? 2 : 3; + do { pa = asPartition( lookBackBuffer[lookbackIndex] ); - - // when you reach to the maximum, flag must be 2 - if( MAX_LOOK_BACK == blockIndex - iBlock && pa.flag != 2 ) continue; - } while( pa.flag == 0 || pa.block != iBlock ); + } while( ( pa.flag & flagRequire ) == 0 || pa.block != iBlock ); uint32_t value = pa.value; p += value; From 688be9a0eae3aa57d258d78a3615a06bf415296a Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 22 Sep 2023 17:52:26 +0900 Subject: [PATCH 09/68] [ORO-0]fix wrong address for counter - it was luckly working --- ParallelPrimitives/RadixSort.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 3d77b53..691dae8 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -295,7 +295,7 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit void* lookBackBuffer = (void*)( m_tmpBuffer.ptr() + sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ) ); // counter for gHistogram. 
- void* counter = (uint8_t*)lookBackBuffer + ( 256 * LOOKBACK_TABLE_SIZE ) + sizeof( uint32_t ); + void* counter = (uint8_t*)lookBackBuffer + ( 256 * LOOKBACK_TABLE_SIZE ) * sizeof( uint64_t ) + sizeof( uint32_t ); { oroMemsetD32Async( (oroDeviceptr)m_tmpBuffer.ptr(), 0, m_tmpBuffer.size() / 4, stream ); From a3601869426d249fc233a049f6ebeb3de51bc9b0 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 13:45:32 +0900 Subject: [PATCH 10/68] use resizeAsync --- ParallelPrimitives/RadixSort.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 937db5e..f4c2d83 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -317,7 +317,7 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc u64 gpSumBuffer = sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ); u64 lookBackBuffer = next_multiple64( sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE + 1 ), 16 ); - m_tmpBuffer.resize( gpSumBuffer + lookBackBuffer ); + m_tmpBuffer.resizeAsync( gpSumBuffer + lookBackBuffer, false /*copy*/, stream ); //m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); From 1bb319ae8b4599c44f99f90005c1f48080802626 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 13:47:48 +0900 Subject: [PATCH 11/68] remove inl dependency --- ParallelPrimitives/RadixSort.h | 3 - ParallelPrimitives/RadixSort.inl | 112 +++++++++++++++---------------- 2 files changed, 56 insertions(+), 59 deletions(-) diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index 8b4f815..9e5b7e5 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -125,7 +125,4 @@ class RadixSort final GpuMemory m_tmpBuffer; }; - -//#include - }; // namespace Oro diff --git a/ParallelPrimitives/RadixSort.inl b/ParallelPrimitives/RadixSort.inl index fd42633..d001238 100644 --- 
a/ParallelPrimitives/RadixSort.inl +++ b/ParallelPrimitives/RadixSort.inl @@ -1,61 +1,61 @@ -namespace -{ - -struct Empty -{ -}; - -/// @brief Call the callable and measure the elapsed time using the Stopwatch. -/// @tparam CallableType The type of the callable to be invoked in this function. -/// @tparam RecordType The type of the object that stores the recorded times. -/// @tparam enable_profile The elapsed time will be recorded if this is set to True. -/// @param callable The callable object to be called. -/// @param time_record The object that stores the recorded times. -/// @param index The index indicates where to store the elapsed time in @c time_record -/// @param stream The GPU stream -template -constexpr void execute( CallableType&& callable, RecordType& time_record, const int index, const oroStream stream ) noexcept -{ - using TimerType = std::conditional_t; - - TimerType stopwatch; - - if constexpr( enable_profile ) - { - stopwatch.start(); - } - - std::invoke( std::forward( callable ) ); - - if constexpr( enable_profile ) - { - OrochiUtils::waitForCompletion( stream ); - stopwatch.stop(); - time_record[index] = stopwatch.getMs(); - } -} - -template -void resize_record( T& t ) noexcept -{ - if constexpr( enable_profile ) - { - t.resize( 3 ); - } -} - -template -void print_record( const T& t ) noexcept -{ - if constexpr( enable_profile ) - { - printf( "%3.2f, %3.2f, %3.2f\n", t[0], t[1], t[2] ); - } -} - -} // namespace +//namespace +//{ +// +//struct Empty +//{ +//}; +// +///// @brief Call the callable and measure the elapsed time using the Stopwatch. +///// @tparam CallableType The type of the callable to be invoked in this function. +///// @tparam RecordType The type of the object that stores the recorded times. +///// @tparam enable_profile The elapsed time will be recorded if this is set to True. +///// @param callable The callable object to be called. +///// @param time_record The object that stores the recorded times. 
+///// @param index The index indicates where to store the elapsed time in @c time_record +///// @param stream The GPU stream +//template +//constexpr void execute( CallableType&& callable, RecordType& time_record, const int index, const oroStream stream ) noexcept +//{ +// using TimerType = std::conditional_t; +// +// TimerType stopwatch; +// +// if constexpr( enable_profile ) +// { +// stopwatch.start(); +// } +// +// std::invoke( std::forward( callable ) ); +// +// if constexpr( enable_profile ) +// { +// OrochiUtils::waitForCompletion( stream ); +// stopwatch.stop(); +// time_record[index] = stopwatch.getMs(); +// } +//} +// +//template +//void resize_record( T& t ) noexcept +//{ +// if constexpr( enable_profile ) +// { +// t.resize( 3 ); +// } +//} +// +//template +//void print_record( const T& t ) noexcept +//{ +// if constexpr( enable_profile ) +// { +// printf( "%3.2f, %3.2f, %3.2f\n", t[0], t[1], t[2] ); +// } +//} +// +//} // namespace //template //void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept From cc8ad0eedc69566d030ec0653865dcae67c7c935 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 14:02:15 +0900 Subject: [PATCH 12/68] constexpr noexcept for helper funcs --- ParallelPrimitives/RadixSort.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index f4c2d83..9748901 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -18,8 +18,8 @@ #include #endif -inline uint64_t div_round_up64( uint64_t val, uint64_t divisor ) { return ( val + divisor - 1 ) / divisor; } -inline uint64_t next_multiple64( uint64_t val, uint64_t divisor ) { return div_round_up64( val, divisor ) * divisor; } +constexpr uint64_t div_round_up64( uint64_t val, uint64_t divisor ) noexcept { return ( val + divisor - 1 ) / divisor; } +constexpr uint64_t next_multiple64( uint64_t 
val, uint64_t divisor ) noexcept { return div_round_up64( val, divisor ) * divisor; } namespace { From e9b33f8647b5eef4df3e90dc89cd1a682ffeafb0 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 14:22:39 +0900 Subject: [PATCH 13/68] Use GPU timer --- Test/RadixSort/main.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Test/RadixSort/main.cpp b/Test/RadixSort/main.cpp index 645480f..9ee57a6 100644 --- a/Test/RadixSort/main.cpp +++ b/Test/RadixSort/main.cpp @@ -85,7 +85,6 @@ class SortTest } } - Stopwatch sw; for( int i = 0; i < nRuns; i++ ) { OrochiUtils::copyHtoD( srcGpu.key, srcKey.data(), testSize ); @@ -97,7 +96,8 @@ class SortTest OrochiUtils::waitForCompletion(); } - sw.start(); + OroStopwatch oroStream( nullptr ); + oroStream.start(); if constexpr( KEY_VALUE_PAIR ) { @@ -108,9 +108,10 @@ class SortTest m_sort.sort( srcGpu.key, dstGpu.key, testSize, 0, testBits ); } + oroStream.stop(); + OrochiUtils::waitForCompletion(); - sw.stop(); - float ms = sw.getMs(); + float ms = oroStream.getMs(); float gKeys_s = static_cast( testSize ) / 1000.f / 1000.f / ms; printf( "%5.2fms (%3.2fGKeys/s) sorting %3.1fMkeys [%s]\n", ms, gKeys_s, testSize / 1000.f / 1000.f, KEY_VALUE_PAIR ? "keyValue" : "key" ); } From 8681ee0b324a91b613adcf8f1d3ca4b6a3a18a57 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 17:08:15 +0900 Subject: [PATCH 14/68] other test variants --- Test/RadixSort/main.cpp | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/Test/RadixSort/main.cpp b/Test/RadixSort/main.cpp index 9ee57a6..2f7578d 100644 --- a/Test/RadixSort/main.cpp +++ b/Test/RadixSort/main.cpp @@ -49,6 +49,19 @@ class Stopwatch }; #endif +struct splitmix64 +{ + uint64_t x = 0; /* The state can be seeded with any value. 
*/ + + uint64_t next() + { + uint64_t z = ( x += 0x9e3779b97f4a7c15 ); + z = ( z ^ ( z >> 30 ) ) * 0xbf58476d1ce4e5b9; + z = ( z ^ ( z >> 27 ) ) * 0x94d049bb133111eb; + return z ^ ( z >> 31 ); + } +}; + using u64 = Oro::RadixSort::u64; using u32 = Oro::RadixSort::u32; @@ -68,9 +81,14 @@ class SortTest OrochiUtils::malloc( dstGpu.key, testSize ); std::vector srcKey( testSize ); + + splitmix64 rng; for( int i = 0; i < testSize; i++ ) { srcKey[i] = getRandom( 0u, (u32)( ( 1ull << (u64)testBits ) - 1 ) ); + + //u32 mask = (u32)( ( 1ull << (u64)testBits ) - 1 ); + //srcKey[i] = rng.next() & mask; } std::vector srcValue( testSize ); @@ -291,6 +309,7 @@ enum TestType TEST_SIMPLE, TEST_PERF, TEST_BITS, + TEST_CAPTURE, TEST_MISC, }; @@ -371,7 +390,11 @@ int main( int argc, char** argv ) sort.test( testSize, 32, nRuns ); } break; - + case TEST_CAPTURE: + { + sort.test( 1u << 27 /*2^29*/, 32, 9999999 ); + } + break; case TEST_MISC: { static constexpr auto file = "input.txt"; From f6de36d2a4d040a7c54dfe94c69752d1de90e9f1 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 18:04:52 +0900 Subject: [PATCH 15/68] Split iterators --- ParallelPrimitives/RadixSort.cpp | 15 ++++++++++----- ParallelPrimitives/RadixSort.h | 2 ++ ParallelPrimitives/RadixSortKernels.h | 19 +++++++++---------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 9748901..07c8afd 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -316,9 +316,12 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc compileKernels( kernelPath, includeDir ); u64 gpSumBuffer = sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ); - u64 lookBackBuffer = next_multiple64( sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE + 1 ), 16 ); + u64 lookBackBuffer = next_multiple64( sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE ), 16 ); m_tmpBuffer.resizeAsync( gpSumBuffer + 
lookBackBuffer, false /*copy*/, stream ); + m_tailIterator.resizeAsync( 1, false /*copy*/, stream ); + m_tailIterator.resetAsync( stream ); + m_gpSumCounter.resizeAsync( 1, false /*copy*/, stream ); //m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); ///// The tmp buffer size of the count kernel and the scan kernel. @@ -371,11 +374,13 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit // Buffers void* gpSumBuffer = m_tmpBuffer.ptr(); void* lookBackBuffer = (void*)( m_tmpBuffer.ptr() + sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ) ); + void* tailIteratorBuffer = m_tailIterator.ptr(); // counter for gHistogram. - void* counter = (uint8_t*)lookBackBuffer + ( 256 * LOOKBACK_TABLE_SIZE ) * sizeof( uint64_t ) + sizeof( uint32_t ); - { + m_gpSumCounter.resetAsync( stream ); + void* counter = m_gpSumCounter.ptr(); + oroMemsetD32Async( (oroDeviceptr)m_tmpBuffer.ptr(), 0, m_tmpBuffer.size() / 4, stream ); const int nBlocks = 2048; @@ -399,12 +404,12 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit if( keyPair ) { - const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; + const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, &gpSumBuffer, &lookBackBuffer, &tailIteratorBuffer, & startBit, &i }; OrochiUtils::launch1D( m_onesweep_reorderKeyPair64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } else { - const void* args[] = { &s.key, &d.key, &n, &gpSumBuffer, &lookBackBuffer, &startBit, &i }; + const void* args[] = { &s.key, &d.key, &n, &gpSumBuffer, &lookBackBuffer, &tailIteratorBuffer, &startBit, &i }; OrochiUtils::launch1D( m_onesweep_reorderKey64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } std::swap( s, d ); diff --git a/ParallelPrimitives/RadixSort.h 
b/ParallelPrimitives/RadixSort.h index 9e5b7e5..bb51673 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -124,5 +124,7 @@ class RadixSort final oroFunction m_onesweep_reorderKeyPair64; GpuMemory m_tmpBuffer; + GpuMemory m_gpSumCounter; + GpuMemory m_tailIterator; }; }; // namespace Oro diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 1530b0a..8919e99 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -962,7 +962,7 @@ extern "C" __global__ void gPrefixSum( uint32_t* gpSumBuffer ) } __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) + volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, uint32_t iteration ) { struct ElementLocation { @@ -1052,12 +1052,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys return x; }; - uint32_t* gTailIterator = (uint32_t*)( lookBackBuffer + LOOKBACK_TABLE_SIZE * 256 ); - if( threadIdx.x == 0 && LOOKBACK_TABLE_SIZE <= blockIndex ) { uint32_t mustBeDone = blockIndex - LOOKBACK_TABLE_SIZE + MAX_LOOK_BACK; - while( ( atomicAdd( gTailIterator, 0 ) >> TAIL_BITS ) * TAIL_COUNT <= mustBeDone ) + while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) * TAIL_COUNT <= mustBeDone ) ; } __syncthreads(); @@ -1119,10 +1117,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( threadIdx.x == 0 ) { - while( ( atomicAdd( gTailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) + while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) ; - atomicInc( gTailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ 
); + atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); } // reorder @@ -1210,12 +1208,13 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } } -extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) +extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, + uint32_t iteration ) { - onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); + onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile uint64_t* lookBackBuffer, uint32_t startBits, uint32_t iteration ) + volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, uint32_t iteration ) { - onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, startBits, iteration ); + onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } \ No newline at end of file From ed8fb9d3e9302fb89aa4f837f4917516b8c43551 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 25 Dec 2023 18:26:14 +0900 Subject: [PATCH 16/68] Split temp buffer for simplicity --- 
ParallelPrimitives/RadixSort.cpp | 20 +++++++++++--------- ParallelPrimitives/RadixSort.h | 3 ++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 07c8afd..7c58cda 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -316,8 +316,10 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc compileKernels( kernelPath, includeDir ); u64 gpSumBuffer = sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ); - u64 lookBackBuffer = next_multiple64( sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE ), 16 ); - m_tmpBuffer.resizeAsync( gpSumBuffer + lookBackBuffer, false /*copy*/, stream ); + m_gpSumBuffer.resizeAsync( gpSumBuffer, false /*copy*/, stream ); + + u64 lookBackBuffer = sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE ); + m_lookbackBuffer.resizeAsync( lookBackBuffer, false /*copy*/, stream ); m_tailIterator.resizeAsync( 1, false /*copy*/, stream ); m_tailIterator.resetAsync( stream ); @@ -372,17 +374,17 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit uint64_t numberOfBlocks = div_round_up64( n, RADIX_SORT_BLOCK_SIZE ); // Buffers - void* gpSumBuffer = m_tmpBuffer.ptr(); - void* lookBackBuffer = (void*)( m_tmpBuffer.ptr() + sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ) ); + void* gpSumBuffer = m_gpSumBuffer.ptr(); + void* lookBackBuffer = m_lookbackBuffer.ptr(); void* tailIteratorBuffer = m_tailIterator.ptr(); + m_lookbackBuffer.resetAsync( stream ); + m_gpSumCounter.resetAsync( stream ); + m_gpSumBuffer.resetAsync( stream ); + // counter for gHistogram. 
{ - m_gpSumCounter.resetAsync( stream ); void* counter = m_gpSumCounter.ptr(); - - oroMemsetD32Async( (oroDeviceptr)m_tmpBuffer.ptr(), 0, m_tmpBuffer.size() / 4, stream ); - const int nBlocks = 2048; const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &counter }; @@ -399,7 +401,7 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit { if( numberOfBlocks < LOOKBACK_TABLE_SIZE * 2 ) { - oroMemsetD32Async( (oroDeviceptr)lookBackBuffer, 0, ( 256 * LOOKBACK_TABLE_SIZE ) * sizeof( uint64_t ) / 4, stream ); + m_lookbackBuffer.resetAsync( stream ); } // other wise, we can skip zero clear look back buffer if( keyPair ) diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index bb51673..e74cc56 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -123,7 +123,8 @@ class RadixSort final oroFunction m_onesweep_reorderKey64; oroFunction m_onesweep_reorderKeyPair64; - GpuMemory m_tmpBuffer; + GpuMemory m_lookbackBuffer; + GpuMemory m_gpSumBuffer; GpuMemory m_gpSumCounter; GpuMemory m_tailIterator; }; From d6c37b676a23146f6ff5f40f33de1e3a2a381a61 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 09:00:08 +0900 Subject: [PATCH 17/68] to constexprs --- ParallelPrimitives/RadixSortConfigs.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 6eb68d1..c5e2c0a 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -42,17 +42,17 @@ static_assert( BIN_SIZE % 2 == 0 ); static_assert( DEFAULT_COUNT_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); static_assert( DEFAULT_SCAN_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); -#define RADIX_SORT_BLOCK_SIZE 2048 +constexpr int RADIX_SORT_BLOCK_SIZE = 2048; -#define GHISTOGRAM_ITEM_PER_BLOCK 2048 -#define GHISTOGRAM_THREADS_PER_BLOCK 256 +constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; 
+constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; -#define REORDER_NUMBER_OF_WARPS 8 -#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) +constexpr int REORDER_NUMBER_OF_WARPS = 8; +constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; -#define LOOKBACK_TABLE_SIZE ( 1024 ) -#define MAX_LOOK_BACK 64 -#define TAIL_BITS 4 -#define TAIL_COUNT ( 1u << TAIL_BITS ) +constexpr int LOOKBACK_TABLE_SIZE = 1024; +constexpr int MAX_LOOK_BACK = 64; +constexpr int TAIL_BITS = 4; +constexpr int TAIL_COUNT = 1u << TAIL_BITS; }; // namespace Oro \ No newline at end of file From 56fa76d32ac0fddeae78ad86a8395801d466e789 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 09:12:54 +0900 Subject: [PATCH 18/68] Fix smaller n execution. --- ParallelPrimitives/RadixSort.cpp | 61 +++++++++++++------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 7c58cda..0e2d06d 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -190,43 +190,30 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string }; -// for( const auto& record : records ) -// { -//#if defined( ORO_PP_LOAD_FROM_STRING ) -// oroFunctions[record.kernelType] = oroutils.getFunctionFromString( device, hip_RadixSortKernels, currentKernelPath.c_str(), record.kernelName.c_str(), &opts, 1, hip::RadixSortKernelsArgs, hip::RadixSortKernelsIncludes ); -//#else -// -// if constexpr( useBitCode ) -// { -// oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary( binaryPath.c_str(), record.kernelName.c_str() ); -// } -// else -// { -// const auto includeArg{ "-I" + currentIncludeDir }; -// const auto overwrite_flag = "-DOVERWRITE"; -// const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); -// const auto scan_block_size_param = 
"-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); -// const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); -// const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( m_num_warps_per_block_for_sort ); -// -// std::vector opts; -// opts.push_back( "-ffast-math" ); -// opts.push_back( includeArg.c_str() ); -// opts.push_back( overwrite_flag ); -// opts.push_back( count_block_size_param.c_str() ); -// opts.push_back( scan_block_size_param.c_str() ); -// opts.push_back( sort_block_size_param.c_str() ); -// opts.push_back( sort_num_warps_param.c_str() ); -// -// oroFunctions[record.kernelType] = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), record.kernelName.c_str(), &opts ); -// } -// -//#endif -// if( m_flags == Flag::LOG ) -// { -// printKernelInfo( record.kernelName, oroFunctions[record.kernelType] ); -// } -// } + for( const auto& record : records ) + { +#if defined( ORO_PP_LOAD_FROM_STRING ) + oroFunctions[record.kernelType] = oroutils.getFunctionFromString( device, hip_RadixSortKernels, currentKernelPath.c_str(), record.kernelName.c_str(), &opts, 1, hip::RadixSortKernelsArgs, hip::RadixSortKernelsIncludes ); +#else + + if constexpr( useBitCode ) + { + oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary( binaryPath.c_str(), record.kernelName.c_str() ); + } + else + { + const auto includeArg{ "-I" + currentIncludeDir }; + std::vector opts; + opts.push_back( includeArg.c_str() ); + oroFunctions[record.kernelType] = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), record.kernelName.c_str(), &opts ); + } + +#endif + if( m_flags == Flag::LOG ) + { + printKernelInfo( record.kernelName, oroFunctions[record.kernelType] ); + } + } // TODO: bit code support? 
#define LOAD_FUNC( var, kernel ) var = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), kernel, &opts ); From 0ba36d579a723fc0e824198b47af9911497cd3d1 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 09:39:39 +0900 Subject: [PATCH 19/68] adaptive blocksize for counting --- ParallelPrimitives/RadixSort.cpp | 6 ++++-- ParallelPrimitives/RadixSort.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 0e2d06d..876fe0f 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -75,7 +75,7 @@ namespace Oro RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils, oroStream stream, const std::string& kernelPath, const std::string& includeDir ) : m_device{ device }, m_oroutils{ oroutils } { - // oroGetDeviceProperties( &m_props, device ); + oroGetDeviceProperties( &m_props, device ); configure( kernelPath, includeDir, stream ); } @@ -372,7 +372,9 @@ void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit // counter for gHistogram. { void* counter = m_gpSumCounter.ptr(); - const int nBlocks = 2048; + int maxBlocksPerMP = 0; + oroError e = oroOccupancyMaxActiveBlocksPerMultiprocessor( &maxBlocksPerMP, m_gHistogram, GHISTOGRAM_THREADS_PER_BLOCK, 0 ); + const int nBlocks = e == oroSuccess ? 
maxBlocksPerMP * m_props.multiProcessorCount : 2048; const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &counter }; OrochiUtils::launch1D( m_gHistogram, nBlocks * GHISTOGRAM_THREADS_PER_BLOCK, args, GHISTOGRAM_THREADS_PER_BLOCK, 0, stream ); diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index e74cc56..c04854c 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -105,7 +105,7 @@ class RadixSort final //GpuMemory m_is_ready; oroDevice m_device{}; - //oroDeviceProp m_props{}; + oroDeviceProp m_props{}; OrochiUtils& m_oroutils; From d704f749c8b243d5aca554c2be9569fbadf8f052 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 09:51:48 +0900 Subject: [PATCH 20/68] use const ref --- ParallelPrimitives/RadixSort.cpp | 2 +- ParallelPrimitives/RadixSort.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 876fe0f..8829c0b 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -334,7 +334,7 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc } void RadixSort::setFlag( Flag flag ) noexcept { m_flags = flag; } -void RadixSort::sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, oroStream stream ) noexcept +void RadixSort::sort( const KeyValueSoA& src, const KeyValueSoA& dst, uint32_t n, int startBit, int endBit, oroStream stream ) noexcept { bool keyPair = src.value != nullptr; diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index c04854c..f530d79 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -43,7 +43,7 @@ class RadixSort final void setFlag( Flag flag ) noexcept; - void sort( KeyValueSoA src, KeyValueSoA dst, uint32_t n, int startBit, int endBit, oroStream stream = 0 ) noexcept; + void sort( const KeyValueSoA& src, const KeyValueSoA& dst, 
uint32_t n, int startBit, int endBit, oroStream stream = 0 ) noexcept; void sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, oroStream stream = 0 ) noexcept; From 11671a4ddad15089bbe614cab3d4873b8217ed53 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 09:57:53 +0900 Subject: [PATCH 21/68] remove define --- ParallelPrimitives/RadixSortKernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 8919e99..bf1c050 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -760,8 +760,8 @@ extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, i // SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); //} -#define RADIX_SORT_KEY_TYPE u32 -#define RADIX_SORT_VALUE_TYPE u32 +using RADIX_SORT_KEY_TYPE = uint32_t; +using RADIX_SORT_VALUE_TYPE = uint32_t; #define KEY_IS_16BYTE_ALIGNED 1 typedef unsigned long long uint64_t; From 99cc300b45bf392c8b9af2bd7ef0109a20de3360 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:02:45 +0900 Subject: [PATCH 22/68] fix compile error and remove unused comments --- ParallelPrimitives/RadixSortKernels.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index bf1c050..4341a81 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -760,8 +760,6 @@ extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, i // SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); //} -using RADIX_SORT_KEY_TYPE = uint32_t; -using RADIX_SORT_VALUE_TYPE = uint32_t; #define KEY_IS_16BYTE_ALIGNED 1 typedef unsigned long long uint64_t; @@ -769,18 +767,8 @@ typedef 
unsigned int uint32_t; typedef unsigned short uint16_t; typedef unsigned char uint8_t; -//#define RADIX_SORT_BLOCK_SIZE 2048 -// -//#define GHISTOGRAM_ITEM_PER_BLOCK 2048 -//#define GHISTOGRAM_THREADS_PER_BLOCK 256 -// -//#define REORDER_NUMBER_OF_WARPS 8 -//#define REORDER_NUMBER_OF_THREADS_PER_BLOCK ( 32 * REORDER_NUMBER_OF_WARPS ) -// -//#define LOOKBACK_TABLE_SIZE ( 1024 ) -//#define MAX_LOOK_BACK 64 -//#define TAIL_BITS 4 -//#define TAIL_COUNT ( 1u << TAIL_BITS ) +using RADIX_SORT_KEY_TYPE = uint32_t; +using RADIX_SORT_VALUE_TYPE = uint32_t; #if defined( DESCENDING_ORDER ) #define ORDER_MASK_32 0xFFFFFFFF From 1fd425cd0e32eab796d5c3fc289e97293df3fdc9 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:13:02 +0900 Subject: [PATCH 23/68] remove macro --- ParallelPrimitives/RadixSortKernels.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 4341a81..82a8d55 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -760,7 +760,7 @@ extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, i // SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); //} -#define KEY_IS_16BYTE_ALIGNED 1 +constexpr auto KEY_IS_16BYTE_ALIGNED = true; typedef unsigned long long uint64_t; typedef unsigned int uint32_t; @@ -872,8 +872,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num { hasData = true; -#if defined( KEY_IS_16BYTE_ALIGNED ) - if( ( iBlock + 1 ) * GHISTOGRAM_ITEM_PER_BLOCK <= numberOfInputs ) + if( KEY_IS_16BYTE_ALIGNED && ( iBlock + 1 ) * GHISTOGRAM_ITEM_PER_BLOCK <= numberOfInputs ) { for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK * 4 ) { @@ -896,7 +895,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num } } else -#endif + { 
for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK ) { uint32_t itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x + i; @@ -911,7 +910,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num } } } - + } __syncthreads(); if( threadIdx.x == 0 ) @@ -984,8 +983,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); // count -#if defined( KEY_IS_16BYTE_ALIGNED ) - if( ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs ) + if( KEY_IS_16BYTE_ALIGNED && ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs ) { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK * 4 ) { @@ -1005,7 +1003,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } else -#endif { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { From c0ee24b39ee2653b4b90cb0132d302ad15062cf6 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:17:56 +0900 Subject: [PATCH 24/68] unified types --- ParallelPrimitives/RadixSortKernels.h | 159 +++++++++++++------------- 1 file changed, 77 insertions(+), 82 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 82a8d55..4088335 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -762,13 +762,8 @@ extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, i constexpr auto KEY_IS_16BYTE_ALIGNED = true; -typedef unsigned long long uint64_t; -typedef unsigned int uint32_t; -typedef unsigned short uint16_t; -typedef unsigned char uint8_t; - -using RADIX_SORT_KEY_TYPE = uint32_t; -using RADIX_SORT_VALUE_TYPE = uint32_t; +using RADIX_SORT_KEY_TYPE = u32; +using RADIX_SORT_VALUE_TYPE = u32; #if defined( DESCENDING_ORDER ) #define ORDER_MASK_32 0xFFFFFFFF @@ -782,7 +777,7 @@ using RADIX_SORT_VALUE_TYPE 
= uint32_t; #define ITS 1 #endif -__device__ inline uint32_t div_round_up( uint32_t val, uint32_t divisor ) { return ( val + divisor - 1 ) / divisor; } +__device__ inline u32 div_round_up( u32 val, u32 divisor ) { return ( val + divisor - 1 ) / divisor; } template __device__ void clearShared( T* sMem, T value ) { @@ -795,31 +790,31 @@ __device__ void clearShared( T* sMem, T value ) } } -__device__ inline uint32_t getKeyBits( uint32_t x ) { return x ^ ORDER_MASK_32; } -__device__ inline uint64_t getKeyBits( uint64_t x ) { return x ^ ORDER_MASK_64; } -__device__ inline uint32_t getKeyBits( float x ) +__device__ inline u32 getKeyBits( u32 x ) { return x ^ ORDER_MASK_32; } +__device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } +__device__ inline u32 getKeyBits( float x ) { if( x == 0.0f ) x = 0.0f; - uint32_t flip = uint32_t( __float_as_int( x ) >> 31 ) | 0x80000000; + u32 flip = u32( __float_as_int( x ) >> 31 ) | 0x80000000; return __float_as_uint( x ) ^ flip ^ ORDER_MASK_32; } -__device__ inline uint64_t getKeyBits( double x ) +__device__ inline u64 getKeyBits( double x ) { if( x == 0.0 ) x = 0.0; - uint64_t flip = uint64_t( __double_as_longlong( x ) >> 63 ) | 0x8000000000000000llu; - return (uint64_t)__double_as_longlong( x ) ^ flip ^ ORDER_MASK_64; + u64 flip = u64( __double_as_longlong( x ) >> 63 ) | 0x8000000000000000llu; + return (u64)__double_as_longlong( x ) ^ flip ^ ORDER_MASK_64; } template -__device__ inline uint32_t prefixSumExclusive( uint32_t prefix, uint32_t* sMemIO ) +__device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) { - uint32_t value = sMemIO[threadIdx.x]; + u32 value = sMemIO[threadIdx.x]; - for( uint32_t offset = 1; offset < NThreads; offset <<= 1 ) + for( u32 offset = 1; offset < NThreads; offset <<= 1 ) { - uint32_t x = sMemIO[threadIdx.x]; + u32 x = sMemIO[threadIdx.x]; if( offset <= threadIdx.x ) { @@ -832,7 +827,7 @@ __device__ inline uint32_t prefixSumExclusive( uint32_t prefix, uint32_t* sMemIO 
__syncthreads(); } - uint32_t sum = sMemIO[NThreads - 1]; + u32 sum = sMemIO[NThreads - 1]; __syncthreads(); @@ -843,9 +838,9 @@ __device__ inline uint32_t prefixSumExclusive( uint32_t prefix, uint32_t* sMemIO return sum; } -extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t numberOfInputs, uint32_t* gpSumBuffer, uint32_t startBits, uint32_t* counter ) +extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOfInputs, u32* gpSumBuffer, u32 startBits, u32* counter ) { - __shared__ uint32_t localCounters[sizeof( RADIX_SORT_KEY_TYPE )][256]; + __shared__ u32 localCounters[sizeof( RADIX_SORT_KEY_TYPE )][256]; for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { @@ -857,8 +852,8 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num __syncthreads(); - uint32_t numberOfBlocks = div_round_up( numberOfInputs, GHISTOGRAM_ITEM_PER_BLOCK ); - __shared__ uint32_t iBlock; + u32 numberOfBlocks = div_round_up( numberOfInputs, GHISTOGRAM_ITEM_PER_BLOCK ); + __shared__ u32 iBlock; if( threadIdx.x == 0 ) { iBlock = atomicInc( counter, 0xFFFFFFFF ); @@ -876,7 +871,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num { for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK * 4 ) { - uint32_t itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i + threadIdx.x * 4; + u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i + threadIdx.x * 4; struct alignas( 16 ) Key4 { RADIX_SORT_KEY_TYPE xs[4]; @@ -887,8 +882,8 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num auto item = key4.xs[k]; for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - uint32_t bitLocation = startBits + i * 8; - uint32_t bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bitLocation = startBits + i * 8; + u32 bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } } @@ -898,14 +893,14 @@ 
extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num { for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK ) { - uint32_t itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x + i; + u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x + i; if( itemIndex < numberOfInputs ) { auto item = inputs[itemIndex]; for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - uint32_t bitLocation = startBits + i * 8; - uint32_t bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bitLocation = startBits + i * 8; + u32 bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } } @@ -935,9 +930,9 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, uint32_t num } } -extern "C" __global__ void gPrefixSum( uint32_t* gpSumBuffer ) +extern "C" __global__ void gPrefixSum( u32* gpSumBuffer ) { - __shared__ uint32_t smem[256]; + __shared__ u32 smem[256]; smem[threadIdx.x] = gpSumBuffer[blockIdx.x * 256 + threadIdx.x]; @@ -948,29 +943,29 @@ extern "C" __global__ void gPrefixSum( uint32_t* gpSumBuffer ) gpSumBuffer[blockIdx.x * 256 + threadIdx.x] = smem[threadIdx.x]; } -__device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, uint32_t iteration ) +__device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, u32 numberOfInputs, u32* gpSumBuffer, + volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { struct ElementLocation { - uint32_t localSrcIndex : 12; - uint32_t localOffset : 12; - uint32_t bucket : 8; + 
u32 localSrcIndex : 12; + u32 localOffset : 12; + u32 bucket : 8; }; - __shared__ uint32_t pSum[256]; - __shared__ uint32_t localPrefixSum[256]; - __shared__ uint32_t counters[256]; + __shared__ u32 pSum[256]; + __shared__ u32 localPrefixSum[256]; + __shared__ u32 counters[256]; __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; - __shared__ uint8_t elementBuckets[RADIX_SORT_BLOCK_SIZE]; - __shared__ uint32_t matchMasks[REORDER_NUMBER_OF_WARPS][256]; + __shared__ u8 elementBuckets[RADIX_SORT_BLOCK_SIZE]; + __shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][256]; - uint32_t bitLocation = startBits + 8 * iteration; - uint32_t blockIndex = blockIdx.x; - uint32_t numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); + u32 bitLocation = startBits + 8 * iteration; + u32 blockIndex = blockIdx.x; + u32 numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); - clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, uint32_t>( localPrefixSum, 0 ); - clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, uint32_t>( counters, 0 ); + clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, u32>( localPrefixSum, 0 ); + clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, u32>( counters, 0 ); for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) { @@ -987,7 +982,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK * 4 ) { - uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x * 4; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x * 4; struct alignas( 16 ) Key4 { RADIX_SORT_KEY_TYPE xs[4]; @@ -996,7 +991,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int k = 0; k < 4; k++ ) { auto item = key4.xs[k]; - uint32_t bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; 
atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); elementBuckets[i + threadIdx.x * 4 + k] = bucketIndex; } @@ -1006,11 +1001,11 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; if( itemIndex < numberOfInputs ) { auto item = inputKeys[itemIndex]; - uint32_t bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); elementBuckets[i + threadIdx.x] = bucketIndex; @@ -1020,11 +1015,11 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys struct ParitionID { - uint64_t value : 32; - uint64_t block : 30; - uint64_t flag : 2; + u64 value : 32; + u64 block : 30; + u64 flag : 2; }; - auto asPartition = []( uint64_t x ) + auto asPartition = []( u64 x ) { ParitionID pa; memcpy( &pa, &x, sizeof( ParitionID ) ); @@ -1032,14 +1027,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; auto asU64 = []( ParitionID pa ) { - uint64_t x; - memcpy( &x, &pa, sizeof( uint64_t ) ); + u64 x; + memcpy( &x, &pa, sizeof( u64 ) ); return x; }; if( threadIdx.x == 0 && LOOKBACK_TABLE_SIZE <= blockIndex ) { - uint32_t mustBeDone = blockIndex - LOOKBACK_TABLE_SIZE + MAX_LOOK_BACK; + u32 mustBeDone = blockIndex - LOOKBACK_TABLE_SIZE + MAX_LOOK_BACK; while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) * TAIL_COUNT <= mustBeDone ) ; } @@ -1047,7 +1042,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int i = threadIdx.x; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - uint32_t s = localPrefixSum[i]; + u32 s = localPrefixSum[i]; int pIndex = 256 * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; { @@ 
-1058,9 +1053,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys lookBackBuffer[pIndex] = asU64( pa ); } - uint32_t gp = gpSumBuffer[iteration * 256 + i]; + u32 gp = gpSumBuffer[iteration * 256 + i]; - uint32_t p = 0; + u32 p = 0; for( int iBlock = (int)blockIndex - 1; 0 <= iBlock; iBlock-- ) { @@ -1076,7 +1071,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pa = asPartition( lookBackBuffer[lookbackIndex] ); } while( ( pa.flag & flagRequire ) == 0 || pa.block != iBlock ); - uint32_t value = pa.value; + u32 value = pa.value; p += value; if( pa.flag == 2 ) { @@ -1094,7 +1089,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pSum[i] = gp + p; } - uint32_t prefix = 0; + u32 prefix = 0; for( int i = 0; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { prefix += prefixSumExclusive( prefix, &localPrefixSum[i] ); @@ -1111,8 +1106,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // reorder for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; - uint32_t bucketIndex = elementBuckets[i + threadIdx.x]; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 bucketIndex = elementBuckets[i + threadIdx.x]; __syncthreads(); @@ -1130,9 +1125,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( itemIndex < numberOfInputs ) { - uint32_t matchMask = matchMasks[warp][bucketIndex]; - uint32_t lowerMask = ( 1u << lane ) - 1; - uint32_t offset = __popc( matchMask & lowerMask ); + u32 matchMask = matchMasks[warp][bucketIndex]; + u32 lowerMask = ( 1u << lane ) - 1; + u32 offset = __popc( matchMask & lowerMask ); flushMask = offset == 0; @@ -1141,8 +1136,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys offset += __popc( 
matchMasks[w][bucketIndex] ); } - uint32_t localOffset = counters[bucketIndex] + offset; - uint32_t to = localOffset + localPrefixSum[bucketIndex]; + u32 localOffset = counters[bucketIndex] + offset; + u32 to = localOffset + localPrefixSum[bucketIndex]; ElementLocation el; el.localSrcIndex = i + threadIdx.x; @@ -1165,14 +1160,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; if( itemIndex < numberOfInputs ) { ElementLocation el = elementLocations[i + threadIdx.x]; - uint32_t srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - uint8_t bucketIndex = el.bucket; + u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + u8 bucketIndex = el.bucket; - uint32_t dstIndex = pSum[bucketIndex] + el.localOffset; + u32 dstIndex = pSum[bucketIndex] + el.localOffset; outputKeys[dstIndex] = inputKeys[srcIndex]; } } @@ -1180,26 +1175,26 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - uint32_t itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; if( itemIndex < numberOfInputs ) { ElementLocation el = elementLocations[i + threadIdx.x]; - uint32_t srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - uint8_t bucketIndex = el.bucket; + u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + u8 bucketIndex = el.bucket; - uint32_t dstIndex = pSum[bucketIndex] + el.localOffset; + u32 dstIndex = pSum[bucketIndex] + el.localOffset; outputValues[dstIndex] = inputValues[srcIndex]; } } } } -extern "C" __global__ void onesweep_reorderKey64( 
RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, uint32_t numberOfInputs, uint32_t* gpSumBuffer, volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, - uint32_t iteration ) +extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, + u32 iteration ) { onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } -extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, uint32_t numberOfInputs, uint32_t* gpSumBuffer, - volatile uint64_t* lookBackBuffer, uint32_t* tailIterator, uint32_t startBits, uint32_t iteration ) +extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, + volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } \ No newline at end of file From 8dbf83a1c9bd24522771b3803d8cb20bd63cc9c4 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:19:41 +0900 Subject: [PATCH 25/68] to constexpr noexcept --- ParallelPrimitives/RadixSortKernels.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 4088335..e843d5d 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -777,7 +777,8 @@ using RADIX_SORT_VALUE_TYPE = u32; #define 
ITS 1 #endif -__device__ inline u32 div_round_up( u32 val, u32 divisor ) { return ( val + divisor - 1 ) / divisor; } +__device__ constexpr u32 div_round_up( u32 val, u32 divisor ) noexcept { return ( val + divisor - 1 ) / divisor; } + template __device__ void clearShared( T* sMem, T value ) { From be5f26e64b0f991fd37a7ac7f7dc3df26dbd41ce Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:39:37 +0900 Subject: [PATCH 26/68] use constexpr and remove unused functions --- ParallelPrimitives/RadixSortKernels.h | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index e843d5d..28a578b 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -1,6 +1,5 @@ #include #define LDS_BARRIER __syncthreads() - namespace { @@ -766,15 +765,11 @@ using RADIX_SORT_KEY_TYPE = u32; using RADIX_SORT_VALUE_TYPE = u32; #if defined( DESCENDING_ORDER ) -#define ORDER_MASK_32 0xFFFFFFFF -#define ORDER_MASK_64 0xFFFFFFFFFFFFFFFFllu +constexpr u32 ORDER_MASK_32 = 0xFFFFFFFF; +constexpr u64 ORDER_MASK_64 = 0xFFFFFFFFFFFFFFFFllu; #else -#define ORDER_MASK_32 0 -#define ORDER_MASK_64 0llu -#endif - -#if defined( CUDART_VERSION ) && CUDART_VERSION >= 9000 -#define ITS 1 +constexpr u32 ORDER_MASK_32 = 0; +constexpr u64 ORDER_MASK_64 = 0llu; #endif __device__ constexpr u32 div_round_up( u32 val, u32 divisor ) noexcept { return ( val + divisor - 1 ) / divisor; } @@ -793,20 +788,6 @@ __device__ void clearShared( T* sMem, T value ) __device__ inline u32 getKeyBits( u32 x ) { return x ^ ORDER_MASK_32; } __device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } -__device__ inline u32 getKeyBits( float x ) -{ - if( x == 0.0f ) x = 0.0f; - - u32 flip = u32( __float_as_int( x ) >> 31 ) | 0x80000000; - return __float_as_uint( x ) ^ flip ^ ORDER_MASK_32; -} -__device__ inline u64 getKeyBits( double x ) -{ - if( x 
== 0.0 ) x = 0.0; - - u64 flip = u64( __double_as_longlong( x ) >> 63 ) | 0x8000000000000000llu; - return (u64)__double_as_longlong( x ) ^ flip ^ ORDER_MASK_64; -} template __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) From a443768d42fd5f24532c70f012c6bece2ecf21ef Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 10:57:57 +0900 Subject: [PATCH 27/68] use BIN_SIZE constant --- ParallelPrimitives/RadixSort.cpp | 6 ++--- ParallelPrimitives/RadixSortConfigs.h | 3 +++ ParallelPrimitives/RadixSortKernels.h | 36 +++++++++++++-------------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 8829c0b..c2601c4 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -302,10 +302,10 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc { compileKernels( kernelPath, includeDir ); - u64 gpSumBuffer = sizeof( u32 ) * 256 * sizeof( u32 /* key type */ ); + u64 gpSumBuffer = sizeof( u32 ) * BIN_SIZE * sizeof( u32 /* key type */ ); m_gpSumBuffer.resizeAsync( gpSumBuffer, false /*copy*/, stream ); - u64 lookBackBuffer = sizeof( u64 ) * ( 256 * LOOKBACK_TABLE_SIZE ); + u64 lookBackBuffer = sizeof( u64 ) * ( BIN_SIZE * LOOKBACK_TABLE_SIZE ); m_lookbackBuffer.resizeAsync( lookBackBuffer, false /*copy*/, stream ); m_tailIterator.resizeAsync( 1, false /*copy*/, stream ); @@ -381,7 +381,7 @@ void RadixSort::sort( const KeyValueSoA& src, const KeyValueSoA& dst, uint32_t n } { const void* args[] = { &gpSumBuffer }; - OrochiUtils::launch1D( m_gPrefixSum, nIteration * 256, args, 256, 0, stream ); + OrochiUtils::launch1D( m_gPrefixSum, nIteration * BIN_SIZE, args, BIN_SIZE, 0, stream ); } auto s = src; diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index c5e2c0a..33c5f78 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ 
b/ParallelPrimitives/RadixSortConfigs.h @@ -55,4 +55,7 @@ constexpr int MAX_LOOK_BACK = 64; constexpr int TAIL_BITS = 4; constexpr int TAIL_COUNT = 1u << TAIL_BITS; +static_assert( REORDER_NUMBER_OF_THREADS_PER_BLOCK <= BIN_SIZE, "please check prefixSumExclusive on onesweep_reorder" ); +static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); + }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 28a578b..d251e72 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -904,9 +904,9 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - for( int j = threadIdx.x; j < 256; j += GHISTOGRAM_THREADS_PER_BLOCK ) + for( int j = threadIdx.x; j < BIN_SIZE; j += GHISTOGRAM_THREADS_PER_BLOCK ) { - atomicAdd( &gpSumBuffer[256 * i + j], localCounters[i][j] ); + atomicAdd( &gpSumBuffer[BIN_SIZE * i + j], localCounters[i][j] ); } } } @@ -914,15 +914,15 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf extern "C" __global__ void gPrefixSum( u32* gpSumBuffer ) { - __shared__ u32 smem[256]; + __shared__ u32 smem[BIN_SIZE]; - smem[threadIdx.x] = gpSumBuffer[blockIdx.x * 256 + threadIdx.x]; + smem[threadIdx.x] = gpSumBuffer[blockIdx.x * BIN_SIZE + threadIdx.x]; __syncthreads(); - prefixSumExclusive<256>( 0, smem ); + prefixSumExclusive( 0, smem ); - gpSumBuffer[blockIdx.x * 256 + threadIdx.x] = smem[threadIdx.x]; + gpSumBuffer[blockIdx.x * BIN_SIZE + threadIdx.x] = smem[threadIdx.x]; } __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, u32 numberOfInputs, u32* gpSumBuffer, @@ -935,23 +935,23 @@ __device__ 
__forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 bucket : 8; }; - __shared__ u32 pSum[256]; - __shared__ u32 localPrefixSum[256]; - __shared__ u32 counters[256]; + __shared__ u32 pSum[BIN_SIZE]; + __shared__ u32 localPrefixSum[BIN_SIZE]; + __shared__ u32 counters[BIN_SIZE]; __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; __shared__ u8 elementBuckets[RADIX_SORT_BLOCK_SIZE]; - __shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][256]; + __shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][BIN_SIZE]; u32 bitLocation = startBits + 8 * iteration; u32 blockIndex = blockIdx.x; u32 numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); - clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, u32>( localPrefixSum, 0 ); - clearShared<256, REORDER_NUMBER_OF_THREADS_PER_BLOCK, u32>( counters, 0 ); + clearShared( localPrefixSum, 0 ); + clearShared( counters, 0 ); for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) { - for( int i = 0; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { matchMasks[w][i + threadIdx.x] = 0; } @@ -1022,10 +1022,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } __syncthreads(); - for( int i = threadIdx.x; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { u32 s = localPrefixSum[i]; - int pIndex = 256 * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; + int pIndex = BIN_SIZE * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; { ParitionID pa; @@ -1035,13 +1035,13 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys lookBackBuffer[pIndex] = asU64( pa ); } - u32 gp = gpSumBuffer[iteration * 256 + i]; + u32 gp = gpSumBuffer[iteration * BIN_SIZE + i]; u32 p = 0; for( int iBlock = (int)blockIndex - 1; 0 <= iBlock; iBlock-- ) { - int lookbackIndex = 256 * ( iBlock % LOOKBACK_TABLE_SIZE 
) + i; + int lookbackIndex = BIN_SIZE * ( iBlock % LOOKBACK_TABLE_SIZE ) + i; ParitionID pa; // when you reach to the maximum, flag must be 2. flagRequire = 0b10 @@ -1072,7 +1072,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } u32 prefix = 0; - for( int i = 0; i < 256; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { prefix += prefixSumExclusive( prefix, &localPrefixSum[i] ); } From 82d4aad99044db927c352cd88403c3997ac177db Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 11:24:07 +0900 Subject: [PATCH 28/68] extract common process as extractDigit() --- ParallelPrimitives/RadixSortKernels.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index d251e72..f7b3912 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -788,6 +788,8 @@ __device__ void clearShared( T* sMem, T value ) __device__ inline u32 getKeyBits( u32 x ) { return x ^ ORDER_MASK_32; } __device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } +__device__ inline u32 extractDigit( u32 x, u32 bitLocation ) { return ( x >> bitLocation ) & RADIX_MASK; } +__device__ inline u32 extractDigit( u64 x, u32 bitLocation ) { return (u32)( ( x >> bitLocation ) & RADIX_MASK ); } template __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) @@ -865,7 +867,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { u32 bitLocation = startBits + i * 8; - u32 bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bits = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } } @@ -882,7 +884,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf for( 
int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { u32 bitLocation = startBits + i * 8; - u32 bits = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bits = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } } @@ -973,7 +975,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int k = 0; k < 4; k++ ) { auto item = key4.xs[k]; - u32 bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); elementBuckets[i + threadIdx.x * 4 + k] = bucketIndex; } @@ -987,7 +989,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( itemIndex < numberOfInputs ) { auto item = inputKeys[itemIndex]; - u32 bucketIndex = ( getKeyBits( item ) >> bitLocation ) & 0xFF; + u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); elementBuckets[i + threadIdx.x] = bucketIndex; From 866b70c2e9f590b869faf30c7c4fe64576d2669f Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 11:41:42 +0900 Subject: [PATCH 29/68] keyPair as a template parameter --- ParallelPrimitives/RadixSortKernels.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index f7b3912..d4b7343 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -927,7 +927,8 @@ extern "C" __global__ void gPrefixSum( u32* gpSumBuffer ) gpSumBuffer[blockIdx.x * BIN_SIZE + threadIdx.x] = smem[threadIdx.x]; } -__device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, bool keyPair, u32 numberOfInputs, u32* gpSumBuffer, +template +__device__ 
__forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { struct ElementLocation @@ -1155,7 +1156,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys outputKeys[dstIndex] = inputKeys[srcIndex]; } } - if( keyPair ) + if constexpr ( keyPair ) { for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { @@ -1175,10 +1176,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { - onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, false, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); + onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { - onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, true, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); + onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } \ No newline at end of file From a9c4e61fcf35ec1fdd9392a3f24d0874633ac2ec Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: 
Tue, 26 Dec 2023 13:27:02 +0900 Subject: [PATCH 30/68] remove unused codes --- ParallelPrimitives/RadixSort.cpp | 149 +------- ParallelPrimitives/RadixSort.h | 50 +-- ParallelPrimitives/RadixSortConfigs.h | 22 -- ParallelPrimitives/RadixSortKernels.h | 510 -------------------------- 4 files changed, 4 insertions(+), 727 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index c2601c4..3f31121 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -79,23 +79,6 @@ RadixSort::RadixSort( oroDevice device, OrochiUtils& oroutils, oroStream stream, configure( kernelPath, includeDir, stream ); } -//void RadixSort::exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept -//{ -// const auto buffer_size = countsGpu.size(); -// -// std::vector counts = countsGpu.getData(); -// std::vector offsets( buffer_size ); -// -// int sum = 0; -// for( int i = 0; i < counts.size(); ++i ) -// { -// offsets[i] = sum; -// sum += counts[i]; -// } -// -// offsetsGpu.copyFromHost( offsets.data(), std::size( offsets ) ); -//} - void RadixSort::compileKernels( const std::string& kernelPath, const std::string& includeDir ) noexcept { static constexpr auto defaultKernelPath{ "../ParallelPrimitives/RadixSortKernels.h" }; @@ -127,53 +110,20 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string binaryPath = getCurrentDir(); binaryPath += isAmd ? 
"oro_compiled_kernels.hipfb" : "oro_compiled_kernels.fatbin"; log = "loading pre-compiled kernels at path : " + binaryPath; - - //m_num_threads_per_block_for_count = DEFAULT_COUNT_BLOCK_SIZE; - //m_num_threads_per_block_for_scan = DEFAULT_SCAN_BLOCK_SIZE; - //m_num_threads_per_block_for_sort = DEFAULT_SORT_BLOCK_SIZE; - - //const auto warp_size = DEFAULT_WARP_SIZE; - - //m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; } else { log = "compiling kernels at path : " + currentKernelPath + " in : " + currentIncludeDir; - - //m_num_threads_per_block_for_count = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_COUNT_BLOCK_SIZE; - //m_num_threads_per_block_for_scan = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SCAN_BLOCK_SIZE; - //m_num_threads_per_block_for_sort = m_props.maxThreadsPerBlock > 0 ? m_props.maxThreadsPerBlock : DEFAULT_SORT_BLOCK_SIZE; - - //const auto warp_size = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; - - //m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / warp_size; - - //assert( m_num_threads_per_block_for_count % warp_size == 0 ); - //assert( m_num_threads_per_block_for_scan % warp_size == 0 ); - //assert( m_num_threads_per_block_for_sort % warp_size == 0 ); } - //m_num_warps_per_block_for_sort = m_num_threads_per_block_for_sort / m_warp_size; - if( m_flags == Flag::LOG ) { std::cout << log << std::endl; } const auto includeArg{ "-I" + currentIncludeDir }; - //const auto overwrite_flag = "-DOVERWRITE"; - //const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); - //const auto scan_block_size_param = "-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); - //const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); - //const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( 
m_num_warps_per_block_for_sort ); - std::vector opts; opts.push_back( includeArg.c_str() ); - //opts.push_back( overwrite_flag ); - //opts.push_back( count_block_size_param.c_str() ); - //opts.push_back( scan_block_size_param.c_str() ); - //opts.push_back( sort_block_size_param.c_str() ); - //opts.push_back( sort_num_warps_param.c_str() ); struct Record { @@ -181,10 +131,6 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string Kernel kernelType; }; - //const std::vector records{ - // { "CountKernel", Kernel::COUNT }, { "ParallelExclusiveScanSingleWG", Kernel::SCAN_SINGLE_WG }, { "ParallelExclusiveScanAllWG", Kernel::SCAN_PARALLEL }, { "SortKernel", Kernel::SORT }, - // { "SortKVKernel", Kernel::SORT_KV }, { "SortSinglePassKernel", Kernel::SORT_SINGLE_PASS }, { "SortSinglePassKVKernel", Kernel::SORT_SINGLE_PASS_KV }, - //}; const std::vector records{ { "SortSinglePassKernel", Kernel::SORT_SINGLE_PASS }, { "SortSinglePassKVKernel", Kernel::SORT_SINGLE_PASS_KV }, }; @@ -222,81 +168,8 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string LOAD_FUNC( m_onesweep_reorderKey64, "onesweep_reorderKey64" ); LOAD_FUNC( m_onesweep_reorderKeyPair64, "onesweep_reorderKeyPair64" ); #undef LOAD_FUNC - // const auto includeArg{ "-I" + currentIncludeDir }; - // const auto overwrite_flag = "-DOVERWRITE"; - // const auto count_block_size_param = "-DCOUNT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_count ); - // const auto scan_block_size_param = "-DSCAN_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_scan ); - // const auto sort_block_size_param = "-DSORT_WG_SIZE_VAL=" + std::to_string( m_num_threads_per_block_for_sort ); - // const auto sort_num_warps_param = "-DSORT_NUM_WARPS_PER_BLOCK_VAL=" + std::to_string( m_num_warps_per_block_for_sort ); - - // std::vector opts; - - // if( const std::string device_name = m_props.name; device_name.find( "NVIDIA" ) != std::string::npos ) - // { - // 
opts.push_back( "--use_fast_math" ); - // } - // else - // { - // opts.push_back( "-ffast-math" ); - // } - - // opts.push_back( includeArg.c_str() ); - // opts.push_back( overwrite_flag ); - // opts.push_back( count_block_size_param.c_str() ); - // opts.push_back( scan_block_size_param.c_str() ); - // opts.push_back( sort_block_size_param.c_str() ); - // opts.push_back( sort_num_warps_param.c_str() ); - - // for( const auto& record : records ) - // { - // if constexpr( useBakeKernel ) - // { - // oroFunctions[record.kernelType] = m_oroutils.getFunctionFromString( m_device, hip_RadixSortKernels, currentKernelPath.c_str(), record.kernelName.c_str(), &opts, 1, hip::RadixSortKernelsArgs, hip::RadixSortKernelsIncludes ); - // } - // else if constexpr( useBitCode ) - // { - // oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary( binaryPath.c_str(), record.kernelName.c_str() ); - // } - // else - // { - // oroFunctions[record.kernelType] = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), record.kernelName.c_str(), &opts ); - // } - - // if( m_flags == Flag::LOG ) - // { - // printKernelInfo( record.kernelName, oroFunctions[record.kernelType] ); - // } - // } -} -//int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept -//{ -// const int warpSize = ( m_props.warpSize != 0 ) ? m_props.warpSize : DEFAULT_WARP_SIZE; -// const int warpPerWG = blockSize / warpSize; -// const int warpPerWGP = m_props.maxThreadsPerMultiProcessor / warpSize; -// const int occupancyFromWarp = ( warpPerWGP > 0 ) ? ( warpPerWGP / warpPerWG ) : 1; -// -// const int occupancy = std::max( 1, occupancyFromWarp ); -// -// if( m_flags == Flag::LOG ) -// { -// std::cout << "Occupancy: " << occupancy << '\n'; -// } -// -// static constexpr auto min_num_blocks = 16; -// auto number_of_blocks = m_props.multiProcessorCount > 0 ? 
m_props.multiProcessorCount * occupancy : min_num_blocks; -// -// if( m_num_threads_per_block_for_scan > BIN_SIZE ) -// { -// // Note: both are divisible by 2 -// const auto base = m_num_threads_per_block_for_scan / BIN_SIZE; -// -// // Floor -// number_of_blocks = ( number_of_blocks / base ) * base; -// } -// -// return number_of_blocks; -//} +} void RadixSort::configure( const std::string& kernelPath, const std::string& includeDir, oroStream stream ) noexcept { @@ -311,26 +184,6 @@ void RadixSort::configure( const std::string& kernelPath, const std::string& inc m_tailIterator.resizeAsync( 1, false /*copy*/, stream ); m_tailIterator.resetAsync( stream ); m_gpSumCounter.resizeAsync( 1, false /*copy*/, stream ); - //m_num_blocks_for_count = calculateWGsToExecute( m_num_threads_per_block_for_count ); - - ///// The tmp buffer size of the count kernel and the scan kernel. - - //const auto tmp_buffer_size = BIN_SIZE * m_num_blocks_for_count; - - ///// @c tmp_buffer_size must be divisible by @c m_num_threads_per_block_for_scan - ///// This is guaranteed since @c m_num_blocks_for_count will be adjusted accordingly - - //m_num_blocks_for_scan = tmp_buffer_size / m_num_threads_per_block_for_scan; - - //m_tmp_buffer.resizeAsync( tmp_buffer_size, false, stream ); - - //if( selectedScanAlgo == ScanAlgo::SCAN_GPU_PARALLEL ) - //{ - // // These are for the scan kernel - // m_partial_sum.resizeAsync( m_num_blocks_for_scan, false, stream ); - // m_is_ready.resizeAsync( m_num_blocks_for_scan, false, stream ); - // m_is_ready.resetAsync( stream ); - //} } void RadixSort::setFlag( Flag flag ) noexcept { m_flags = flag; } diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index f530d79..2a7c3be 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -48,76 +48,32 @@ class RadixSort final void sort( u32* src, u32* dst, uint32_t n, int startBit, int endBit, oroStream stream = 0 ) noexcept; private: - //template - //void sort1pass( 
const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept; - - ///// @brief Compile the kernels for radix sort. - ///// @param kernelPath The kernel path. - ///// @param includeDir The include directory. + // @brief Compile the kernels for radix sort. + // @param kernelPath The kernel path. + // @param includeDir The include directory. void compileKernels( const std::string& kernelPath, const std::string& includeDir ) noexcept; - //[[nodiscard]] int calculateWGsToExecute( const int blockSize ) const noexcept; - - ///// @brief Exclusive scan algorithm on CPU for testing. - ///// It copies the count result from the Device to Host before computation, and then copies the offsets back from Host to Device afterward. - ///// @param countsGpu The count result in GPU memory. Otuput: The offset. - ///// @param offsetsGpu The offsets. - //void exclusiveScanCpu( const Oro::GpuMemory& countsGpu, Oro::GpuMemory& offsetsGpu ) const noexcept; - /// @brief Configure the settings, compile the kernels and allocate the memory. /// @param kernelPath The kernel path. /// @param includeDir The include directory. void configure( const std::string& kernelPath, const std::string& includeDir, oroStream stream ) noexcept; private: - //// GPU blocks for the count kernel - //int m_num_blocks_for_count{}; - - //// GPU blocks for the scan kernel - //int m_num_blocks_for_scan{}; - Flag m_flags{ Flag::NO_LOG }; enum class Kernel { - //COUNT, - //SCAN_SINGLE_WG, - //SCAN_PARALLEL, - //SORT, - //SORT_KV, SORT_SINGLE_PASS, SORT_SINGLE_PASS_KV, }; std::unordered_map oroFunctions; - /// @brief The enum class which indicates the selected algorithm of prefix scan. 
- //enum class ScanAlgo - //{ - // SCAN_CPU, - // SCAN_GPU_SINGLE_WG, - // SCAN_GPU_PARALLEL, - //}; - - //constexpr static auto selectedScanAlgo{ ScanAlgo::SCAN_GPU_PARALLEL }; - - //GpuMemory m_partial_sum; - //GpuMemory m_is_ready; - oroDevice m_device{}; oroDeviceProp m_props{}; OrochiUtils& m_oroutils; - // This buffer holds the "bucket" table from all GPU blocks. - //GpuMemory m_tmp_buffer; - - //int m_num_threads_per_block_for_count{}; - //int m_num_threads_per_block_for_scan{}; - //int m_num_threads_per_block_for_sort{}; - - //int m_num_warps_per_block_for_sort{}; - oroFunction m_gHistogram; oroFunction m_gPrefixSum; oroFunction m_onesweep_reorderKey64; diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 33c5f78..ccdd81a 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -7,29 +7,11 @@ constexpr auto N_RADIX{ 8 }; constexpr auto BIN_SIZE{ 1 << N_RADIX }; constexpr auto RADIX_MASK{ ( 1 << N_RADIX ) - 1 }; constexpr auto PACK_FACTOR{ sizeof( int ) / sizeof( char ) }; -constexpr auto N_PACKED{ BIN_SIZE / PACK_FACTOR }; -constexpr auto PACK_MAX{ 255 }; -constexpr auto N_PACKED_PER_WI{ N_PACKED / WG_SIZE }; -constexpr auto N_BINS_PER_WI{ BIN_SIZE / WG_SIZE }; constexpr auto N_BINS_4BIT{ 16 }; constexpr auto N_BINS_PACK_FACTOR{ sizeof( long long ) / sizeof( short ) }; constexpr auto N_BINS_PACKED_4BIT{ N_BINS_4BIT / N_BINS_PACK_FACTOR }; -constexpr auto N_BINS_8BIT{ 1 << 8 }; - -constexpr auto DEFAULT_WARP_SIZE{ 32 }; - -constexpr auto DEFAULT_NUM_WARPS_PER_BLOCK{ 8 }; - -// count config - -constexpr auto DEFAULT_COUNT_BLOCK_SIZE{ DEFAULT_WARP_SIZE * DEFAULT_NUM_WARPS_PER_BLOCK }; - -// scan configs -constexpr auto DEFAULT_SCAN_BLOCK_SIZE{ DEFAULT_WARP_SIZE * DEFAULT_NUM_WARPS_PER_BLOCK }; - // sort configs -constexpr auto DEFAULT_SORT_BLOCK_SIZE{ DEFAULT_WARP_SIZE * DEFAULT_NUM_WARPS_PER_BLOCK }; constexpr auto SORT_N_ITEMS_PER_WI{ 12 }; constexpr auto 
SINGLE_SORT_N_ITEMS_PER_WI{ 24 }; constexpr auto SINGLE_SORT_WG_SIZE{ 128 }; @@ -38,10 +20,6 @@ constexpr auto SINGLE_SORT_WG_SIZE{ 128 }; static_assert( BIN_SIZE % 2 == 0 ); -// Notice that, on some GPUs, the max size of a GPU block cannot be greater than 256 -static_assert( DEFAULT_COUNT_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); -static_assert( DEFAULT_SCAN_BLOCK_SIZE % DEFAULT_WARP_SIZE == 0 ); - constexpr int RADIX_SORT_BLOCK_SIZE = 2048; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index d4b7343..3647a39 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -11,137 +11,6 @@ using u32 = unsigned int; using u64 = unsigned long long; } // namespace -// #define NV_WORKAROUND 1 - -// default values -//#if defined( OVERWRITE ) -// -//constexpr auto COUNT_WG_SIZE{ COUNT_WG_SIZE_VAL }; -//constexpr auto SCAN_WG_SIZE{ SCAN_WG_SIZE_VAL }; -//constexpr auto SORT_WG_SIZE{ SORT_WG_SIZE_VAL }; -//constexpr auto SORT_NUM_WARPS_PER_BLOCK{ SORT_NUM_WARPS_PER_BLOCK_VAL }; -// -//#else -// -//constexpr auto COUNT_WG_SIZE{ DEFAULT_COUNT_BLOCK_SIZE }; -//constexpr auto SCAN_WG_SIZE{ DEFAULT_SCAN_BLOCK_SIZE }; -//constexpr auto SORT_WG_SIZE{ DEFAULT_SORT_BLOCK_SIZE }; -//constexpr auto SORT_NUM_WARPS_PER_BLOCK{ DEFAULT_NUM_WARPS_PER_BLOCK }; -// -//#endif - -//__device__ constexpr u32 getMaskedBits( const u32 value, const u32 shift ) noexcept { return ( value >> shift ) & RADIX_MASK; } -// -//extern "C" __global__ void CountKernel( int* gSrc, int* gDst, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) -//{ -// __shared__ int table[BIN_SIZE]; -// -// for( int i = threadIdx.x; i < BIN_SIZE; i += COUNT_WG_SIZE ) -// { -// table[i] = 0; -// } -// -// __syncthreads(); -// -// const int offset = blockIdx.x * gNItemsPerWG; -// const int upperBound = ( offset + gNItemsPerWG > gN ) ? 
gN - offset : gNItemsPerWG; -// -// for( int i = threadIdx.x; i < upperBound; i += COUNT_WG_SIZE ) -// { -// const int idx = offset + i; -// const int tableIdx = getMaskedBits( gSrc[idx], START_BIT ); -// atomicAdd( &table[tableIdx], 1 ); -// } -// -// __syncthreads(); -// -// for( int i = threadIdx.x; i < BIN_SIZE; i += COUNT_WG_SIZE ) -// { -// gDst[i * N_WGS_EXECUTED + blockIdx.x] = table[i]; -// } -//} - -template -struct ScanImpl -{ - __device__ static T exec( T a ) - { - T b = __shfl( a, threadIdx.x - STRIDE ); - if( threadIdx.x >= STRIDE ) a += b; - return ScanImpl::exec( a ); - } -}; - -template -struct ScanImpl -{ - __device__ static T exec( T a ) { return a; } -}; - -template -__device__ void waveScanInclusive( T& a, int width ) -{ -#if 0 - a = ScanImpl::exec( a ); -#else - for( int i = 1; i < width; i *= 2 ) - { - T b = __shfl( a, threadIdx.x - i ); - if( threadIdx.x >= i ) a += b; - } -#endif -} - -template -__device__ T waveScanExclusive( T& a, int width ) -{ - waveScanInclusive( a, width ); - - T sum = __shfl( a, width - 1 ); - a = __shfl( a, threadIdx.x - 1 ); - if( threadIdx.x == 0 ) a = 0; - - return sum; -} - -template -__device__ void ldsScanInclusive( T* lds, int width ) -{ - // The width cannot exceed WG_SIZE - __shared__ T temp[2][WG_SIZE]; - - constexpr int MAX_INDEX = 1; - int outIndex = 0; - int inIndex = 1; - - temp[outIndex][threadIdx.x] = lds[threadIdx.x]; - __syncthreads(); - - for( int i = 1; i < width; i *= 2 ) - { - // Swap in and out index for the buffers - - outIndex = MAX_INDEX - outIndex; - inIndex = MAX_INDEX - outIndex; - - if( threadIdx.x >= i ) - { - temp[outIndex][threadIdx.x] = temp[inIndex][threadIdx.x] + temp[inIndex][threadIdx.x - i]; - } - else - { - temp[outIndex][threadIdx.x] = temp[inIndex][threadIdx.x]; - } - - __syncthreads(); - } - - lds[threadIdx.x] = temp[outIndex][threadIdx.x]; - - // Ensure the results are written in LDS and are observable in a block (workgroup) before return. 
- __threadfence_block(); -} - template __device__ T ldsScanExclusive( T* lds, int width ) { @@ -325,140 +194,6 @@ __device__ void localSort4bitMulti( int* keys, u32* ldsKeys, int* values, u32* l } } -//__device__ void localSort8bitMulti_shared_bin( int* keys, u32* ldsKeys, const int START_BIT ) -//{ -// __shared__ unsigned table[BIN_SIZE]; -// -// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) -// { -// table[i] = 0U; -// } -// -// LDS_BARRIER; -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) -// { -// const int tableIdx = ( keys[i] >> START_BIT ) & RADIX_MASK; -// atomicAdd( &table[tableIdx], 1 ); -// } -// -// LDS_BARRIER; -// -// int globalSum = 0; -// for( int binId = 0; binId < BIN_SIZE; binId += SORT_WG_SIZE * 2 ) -// { -// unsigned* globalOffset = &table[binId]; -// const unsigned currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); -// globalOffset[threadIdx.x * 2] += globalSum; -// globalOffset[threadIdx.x * 2 + 1] += globalSum; -// globalSum += currentGlobalSum; -// } -// -// LDS_BARRIER; -// -// __shared__ u32 keyBuffer[SORT_WG_SIZE * SORT_N_ITEMS_PER_WI]; -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) -// { -// keyBuffer[threadIdx.x * SORT_N_ITEMS_PER_WI + i] = keys[i]; -// } -// -// LDS_BARRIER; -// -// if( threadIdx.x == 0 ) -// { -// for( int i = 0; i < SORT_WG_SIZE * SORT_N_ITEMS_PER_WI; ++i ) -// { -// const int tableIdx = ( keyBuffer[i] >> START_BIT ) & RADIX_MASK; -// const int writeIndex = table[tableIdx]; -// -// ldsKeys[writeIndex] = keyBuffer[i]; -// -// ++table[tableIdx]; -// } -// } -// -// LDS_BARRIER; -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; ++i ) -// { -// keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; -// } -//} -// -//__device__ void localSort8bitMulti_group( int* keys, u32* ldsKeys, const int START_BIT ) -//{ -// constexpr auto N_GROUP_SIZE{ N_BINS_8BIT / ( sizeof( u64 ) / sizeof( u16 ) ) }; -// -// __shared__ union -// { -// u16 m_ungrouped[SORT_WG_SIZE + 
1][N_BINS_8BIT]; -// u64 m_grouped[SORT_WG_SIZE + 1][N_GROUP_SIZE]; -// } lds; -// -// for( int i = 0; i < N_GROUP_SIZE; ++i ) -// { -// lds.m_grouped[threadIdx.x][i] = 0U; -// } -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) -// { -// const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; -// ++lds.m_ungrouped[threadIdx.x][in8bit]; -// } -// -// LDS_BARRIER; -// -// for( int groupId = threadIdx.x; groupId < N_GROUP_SIZE; groupId += SORT_WG_SIZE ) -// { -// u64 sum = 0U; -// for( int i = 0; i < SORT_WG_SIZE; i++ ) -// { -// const auto current = lds.m_grouped[i][groupId]; -// lds.m_grouped[i][groupId] = sum; -// sum += current; -// } -// lds.m_grouped[SORT_WG_SIZE][groupId] = sum; -// } -// -// LDS_BARRIER; -// -// int globalSum = 0; -// for( int binId = 0; binId < N_BINS_8BIT; binId += SORT_WG_SIZE * 2 ) -// { -// auto* globalOffset = &lds.m_ungrouped[SORT_WG_SIZE][binId]; -// const int currentGlobalSum = ldsScanExclusive( globalOffset, SORT_WG_SIZE * 2 ); -// globalOffset[threadIdx.x * 2] += globalSum; -// globalOffset[threadIdx.x * 2 + 1] += globalSum; -// globalSum += currentGlobalSum; -// } -// -// LDS_BARRIER; -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) -// { -// const auto in8bit = ( keys[i] >> START_BIT ) & RADIX_MASK; -// const auto offset = lds.m_ungrouped[SORT_WG_SIZE][in8bit]; -// const auto rank = lds.m_ungrouped[threadIdx.x][in8bit]++; -// -// ldsKeys[offset + rank] = keys[i]; -// } -// -// LDS_BARRIER; -// -// for( int i = 0; i < SORT_N_ITEMS_PER_WI; i++ ) -// { -// keys[i] = ldsKeys[threadIdx.x * SORT_N_ITEMS_PER_WI + i]; -// } -//} - -//template -//__device__ void localSort8bitMulti( int* keys, u32* ldsKeys, int* values, u32* ldsValues, const int START_BIT ) -//{ -// localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT ); -// if( N_RADIX > 4 ) localSort4bitMulti( keys, ldsKeys, values, ldsValues, START_BIT + 4 ); -//} - template __device__ void SortSinglePass( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, 
int gN, const int START_BIT, const int END_BIT ) { @@ -513,251 +248,6 @@ extern "C" __global__ void SortSinglePassKernel( int* gSrcKey, int* gDstKey, int extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int gN, const int START_BIT, const int END_BIT ) { SortSinglePass( gSrcKey, gSrcVal, gDstKey, gDstVal, gN, START_BIT, END_BIT ); } -//extern "C" __global__ void ParallelExclusiveScanSingleWG( int* gCount, int* gHistogram, const int N_WGS_EXECUTED ) -//{ -// // Use a single WG. -// if( blockIdx.x != 0 ) -// { -// return; -// } -// -// // LDS for the parallel scan of the global sum: -// // First we store the sum of the counters of each number to it, -// // then we compute the global offset using parallel exclusive scan. -// __shared__ int blockBuffer[BIN_SIZE]; -// -// // fill the LDS with the local sum -// -// for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) -// { -// // Do exclusive scan for each segment handled by each WI in a WG -// -// int localThreadSum = 0; -// for( int i = 0; i < N_WGS_EXECUTED; ++i ) -// { -// int current = gCount[binId * N_WGS_EXECUTED + i]; -// gCount[binId * N_WGS_EXECUTED + i] = localThreadSum; -// -// localThreadSum += current; -// } -// -// // Store the thread local sum to LDS. -// -// blockBuffer[binId] = localThreadSum; -// } -// -// LDS_BARRIER; -// -// // Do parallel exclusive scan on the LDS -// -// int globalSum = 0; -// for( int binId = 0; binId < BIN_SIZE; binId += WG_SIZE * 2 ) -// { -// int* globalOffset = &blockBuffer[binId]; -// int currentGlobalSum = ldsScanExclusive( globalOffset, WG_SIZE * 2 ); -// globalOffset[threadIdx.x * 2] += globalSum; -// globalOffset[threadIdx.x * 2 + 1] += globalSum; -// globalSum += currentGlobalSum; -// } -// -// LDS_BARRIER; -// -// // Add the global offset to the global histogram. 
-// -// for( int binId = threadIdx.x; binId < BIN_SIZE; binId += WG_SIZE ) -// { -// for( int i = 0; i < N_WGS_EXECUTED; ++i ) -// { -// gHistogram[binId * N_WGS_EXECUTED + i] += blockBuffer[binId]; -// } -// } -//} -// -//extern "C" __device__ void WorkgroupSync( int threadId, int blockId, int currentSegmentSum, int* currentGlobalOffset, volatile int* gPartialSum, volatile bool* gIsReady ) -//{ -// if( threadId == 0 ) -// { -// int offset = 0; -// -// if( blockId != 0 ) -// { -// while( !gIsReady[blockId - 1] ) -// { -// } -// -// offset = gPartialSum[blockId - 1]; -// -// __threadfence(); -// -// // Reset the value -// gIsReady[blockId - 1] = false; -// } -// -// gPartialSum[blockId] = offset + currentSegmentSum; -// -// // Ensure that the gIsReady is only modified after the gPartialSum is written. -// __threadfence(); -// -// gIsReady[blockId] = true; -// -// *currentGlobalOffset = offset; -// } -// -// __syncthreads(); -//} -// -//extern "C" __global__ void ParallelExclusiveScanAllWG( int* gCount, int* gHistogram, volatile int* gPartialSum, volatile bool* gIsReady ) -//{ -// // Fill the LDS with the partial sum of each segment -// __shared__ int blockBuffer[SCAN_WG_SIZE]; -// -// blockBuffer[threadIdx.x] = gCount[blockIdx.x * blockDim.x + threadIdx.x]; -// -// __syncthreads(); -// -// // Do parallel exclusive scan on the LDS -// -// int currentSegmentSum = ldsScanExclusive( blockBuffer, SCAN_WG_SIZE ); -// -// __syncthreads(); -// -// // Sync all the Workgroups to calculate the global offset. -// -// __shared__ int currentGlobalOffset; -// WorkgroupSync( threadIdx.x, blockIdx.x, currentSegmentSum, ¤tGlobalOffset, gPartialSum, gIsReady ); -// -// // Write back the result. 
-// -// gHistogram[blockIdx.x * blockDim.x + threadIdx.x] = blockBuffer[threadIdx.x] + currentGlobalOffset; -//} -// -//template -//__device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int numberOfInputs, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) -//{ -// __shared__ u32 globalOffset[BIN_SIZE]; -// __shared__ u32 localPrefixSum[BIN_SIZE]; -// __shared__ u32 counters[BIN_SIZE]; -// -// __shared__ u32 matchMasks[SORT_NUM_WARPS_PER_BLOCK][BIN_SIZE]; -// -// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) -// { -// // Note: The size of gHistogram is always BIN_SIZE * N_WGS_EXECUTED -// globalOffset[i] = gHistogram[i * N_WGS_EXECUTED + blockIdx.x]; -// -// counters[i] = 0; -// localPrefixSum[i] = 0; -// } -// -// for( int w = 0; w < SORT_NUM_WARPS_PER_BLOCK; ++w ) -// { -// for( int i = threadIdx.x; i < BIN_SIZE; i += SORT_WG_SIZE ) -// { -// matchMasks[w][i] = 0; -// } -// } -// -// __syncthreads(); -// -// for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) -// { -// const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; -// if( itemIndex < numberOfInputs ) -// { -// const auto item = gSrcKey[itemIndex]; -// const u32 bucketIndex = getMaskedBits( item, START_BIT ); -// atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); -// } -// } -// -// __syncthreads(); -// -// // Compute Prefix Sum -// -// ldsScanExclusive( localPrefixSum, BIN_SIZE ); -// -// __syncthreads(); -// -// // Reorder -// -// for( int i = threadIdx.x; i < gNItemsPerWG; i += SORT_WG_SIZE ) -// { -// const u32 itemIndex = blockIdx.x * gNItemsPerWG + i; -// -// const auto item = gSrcKey[itemIndex]; -// const u32 bucketIndex = getMaskedBits( item, START_BIT ); -// -// const int warp = threadIdx.x / 32; -// const int lane = threadIdx.x % 32; -// -// __syncthreads(); -// -// if( itemIndex < numberOfInputs ) -// { -// atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); -// } -// -// __syncthreads(); -// -// bool 
flushMask = false; -// -// u32 localOffset = 0; -// u32 localSrcIndex = 0; -// -// if( itemIndex < numberOfInputs ) -// { -// const u32 matchMask = matchMasks[warp][bucketIndex]; -// const u32 lowerMask = ( 1u << lane ) - 1; -// u32 offset = __popc( matchMask & lowerMask ); -// -// flushMask = ( offset == 0 ); -// -// for( int w = 0; w < warp; ++w ) -// { -// offset += __popc( matchMasks[w][bucketIndex] ); -// } -// -// localOffset = counters[bucketIndex] + offset; -// localSrcIndex = i; -// } -// -// __syncthreads(); -// -// if( itemIndex < numberOfInputs ) -// { -// atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); -// } -// -// if( flushMask ) -// { -// matchMasks[warp][bucketIndex] = 0; -// } -// -// // Swap -// -// if( itemIndex < numberOfInputs ) -// { -// const u32 srcIndex = blockIdx.x * gNItemsPerWG + localSrcIndex; -// const u32 dstIndex = globalOffset[bucketIndex] + localOffset; -// gDstKey[dstIndex] = gSrcKey[srcIndex]; -// -// if constexpr( KEY_VALUE_PAIR ) -// { -// gDstVal[dstIndex] = gSrcVal[srcIndex]; -// } -// } -// } -//} -// -//extern "C" __global__ void SortKernel( int* gSrcKey, int* gDstKey, int* gHistogram, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) -//{ -// SortImpl( gSrcKey, nullptr, gDstKey, nullptr, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); -//} -// -//extern "C" __global__ void SortKVKernel( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int* gHistogram, int gN, int gNItemsPerWG, const int START_BIT, const int N_WGS_EXECUTED ) -//{ -// SortImpl( gSrcKey, gSrcVal, gDstKey, gDstVal, gHistogram, gN, gNItemsPerWG, START_BIT, N_WGS_EXECUTED ); -//} constexpr auto KEY_IS_16BYTE_ALIGNED = true; From 7ae1709f0c26b4c89ce2530ddfd8a203d032fed1 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 14:12:55 +0900 Subject: [PATCH 31/68] delete unused inl --- ParallelPrimitives/RadixSort.inl | 163 ------------------------------- 1 file changed, 163 deletions(-) delete mode 
100644 ParallelPrimitives/RadixSort.inl diff --git a/ParallelPrimitives/RadixSort.inl b/ParallelPrimitives/RadixSort.inl deleted file mode 100644 index d001238..0000000 --- a/ParallelPrimitives/RadixSort.inl +++ /dev/null @@ -1,163 +0,0 @@ - - -//namespace -//{ -// -//struct Empty -//{ -//}; -// -///// @brief Call the callable and measure the elapsed time using the Stopwatch. -///// @tparam CallableType The type of the callable to be invoked in this function. -///// @tparam RecordType The type of the object that stores the recorded times. -///// @tparam enable_profile The elapsed time will be recorded if this is set to True. -///// @param callable The callable object to be called. -///// @param time_record The object that stores the recorded times. -///// @param index The index indicates where to store the elapsed time in @c time_record -///// @param stream The GPU stream -//template -//constexpr void execute( CallableType&& callable, RecordType& time_record, const int index, const oroStream stream ) noexcept -//{ -// using TimerType = std::conditional_t; -// -// TimerType stopwatch; -// -// if constexpr( enable_profile ) -// { -// stopwatch.start(); -// } -// -// std::invoke( std::forward( callable ) ); -// -// if constexpr( enable_profile ) -// { -// OrochiUtils::waitForCompletion( stream ); -// stopwatch.stop(); -// time_record[index] = stopwatch.getMs(); -// } -//} -// -//template -//void resize_record( T& t ) noexcept -//{ -// if constexpr( enable_profile ) -// { -// t.resize( 3 ); -// } -//} -// -//template -//void print_record( const T& t ) noexcept -//{ -// if constexpr( enable_profile ) -// { -// printf( "%3.2f, %3.2f, %3.2f\n", t[0], t[1], t[2] ); -// } -//} -// -//} // namespace - -//template -//void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int endBit, oroStream stream ) noexcept -//{ -// static constexpr auto enable_profile = false; -// -// const u32* srcKey{ nullptr }; -// const u32* dstKey{ nullptr }; -// -// const u32* 
srcVal{ nullptr }; -// const u32* dstVal{ nullptr }; -// -// static constexpr auto enable_key_value_pair_sorting{ std::is_same_v }; -// -// if constexpr( enable_key_value_pair_sorting ) -// { -// srcKey = src.key; -// dstKey = dst.key; -// -// srcVal = src.value; -// dstVal = dst.value; -// } -// else -// { -// static_assert( std::is_same_v || std::is_same_v ); -// srcKey = src; -// dstKey = dst; -// } -// -// const int nItemPerWG = ( n + m_num_blocks_for_count - 1 ) / m_num_blocks_for_count; -// -// // Timer records -// -// using RecordType = std::conditional_t, Empty>; -// RecordType t; -// -// resize_record( t ); -// -// const auto launch_count_kernel = [&]() noexcept -// { -// const auto num_total_thread_for_count = m_num_threads_per_block_for_count * m_num_blocks_for_count; -// -// const auto func{ oroFunctions[Kernel::COUNT] }; -// const void* args[] = { &srcKey, arg_cast( m_tmp_buffer.address() ), &n, &nItemPerWG, &startBit, &m_num_blocks_for_count }; -// OrochiUtils::launch1D( func, num_total_thread_for_count, args, m_num_threads_per_block_for_count, 0, stream ); -// }; -// -// execute( launch_count_kernel, t, 0, stream ); -// -// const auto launch_scan_kernel = [&]() noexcept -// { -// switch( selectedScanAlgo ) -// { -// case ScanAlgo::SCAN_CPU: -// { -// exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); -// } -// break; -// -// case ScanAlgo::SCAN_GPU_SINGLE_WG: -// { -// const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), &m_num_blocks_for_count }; -// OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_SINGLE_WG], WG_SIZE * m_num_blocks_for_count, args, WG_SIZE, 0, stream ); -// } -// break; -// -// case ScanAlgo::SCAN_GPU_PARALLEL: -// { -// const auto num_total_thread_for_scan = m_num_threads_per_block_for_scan * m_num_blocks_for_scan; -// -// const void* args[] = { arg_cast( m_tmp_buffer.address() ), arg_cast( m_tmp_buffer.address() ), arg_cast( m_partial_sum.address() ), arg_cast( m_is_ready.address() ) }; -// 
OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_PARALLEL], num_total_thread_for_scan, args, m_num_threads_per_block_for_scan, 0, stream ); -// } -// break; -// -// default: -// exclusiveScanCpu( m_tmp_buffer, m_tmp_buffer ); -// break; -// } -// }; -// -// execute( launch_scan_kernel, t, 1, stream ); -// -// const auto launch_sort_kernel = [&]() noexcept -// { -// const auto num_blocks_for_sort = m_num_blocks_for_count; -// const auto num_total_thread_for_sort = m_num_threads_per_block_for_sort * num_blocks_for_sort; -// const auto num_items_per_block = nItemPerWG; -// -// if constexpr( enable_key_value_pair_sorting ) -// { -// const void* args[] = { &srcKey, &srcVal, &dstKey, &dstVal, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; -// OrochiUtils::launch1D( oroFunctions[Kernel::SORT_KV], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); -// } -// else -// { -// const void* args[] = { &srcKey, &dstKey, arg_cast( m_tmp_buffer.address() ), &n, &num_items_per_block, &startBit, &num_blocks_for_sort }; -// OrochiUtils::launch1D( oroFunctions[Kernel::SORT], num_total_thread_for_sort, args, m_num_threads_per_block_for_sort, 0, stream ); -// } -// }; -// -// execute( launch_sort_kernel, t, 2, stream ); -// -// print_record( t ); -//} From 6c9fc499358e5556504f6b50d5fcd2b4b23cb771 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 15:45:14 +0900 Subject: [PATCH 32/68] Add a special case handling, all elements have the same digit, to reduce the overhead of thread conflicts --- ParallelPrimitives/RadixSortKernels.h | 29 ++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 3647a39..d71f64d 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -561,7 +561,14 @@ __device__ __forceinline__ void onesweep_reorder( 
RADIX_SORT_KEY_TYPE* inputKeys lookBackBuffer[pIndex] = asU64( pa ); // complete global output location - pSum[i] = gp + p; + u32 globalOutput = gp + p; + pSum[i] = globalOutput; + + // A special case handling: all elements have the same digit + if( s == RADIX_SORT_BLOCK_SIZE ) + { + matchMasks[0][0] = globalOutput + 1 /* +1 to avoid zero */; + } } u32 prefix = 0; @@ -578,6 +585,26 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); } + // A special case handling: all elements have the same digit + u32 globalOutput = matchMasks[0][0]; + if( globalOutput-- /* -1 for the actual offset */ ) + { + for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + if( itemIndex < numberOfInputs ) + { + u32 dstIndex = globalOutput + i + threadIdx.x; + outputKeys[dstIndex] = inputKeys[itemIndex]; + if constexpr( keyPair ) + { + outputValues[dstIndex] = inputValues[itemIndex]; + } + } + } + return; + } + // reorder for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { From 35b265443ad225e412f33ede1593729448cfb541 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 26 Dec 2023 16:40:36 +0900 Subject: [PATCH 33/68] Refactor indices --- ParallelPrimitives/RadixSortKernels.h | 46 +++++++++++++-------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index d71f64d..e2e6d86 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -365,17 +365,17 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf } else { - for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < GHISTOGRAM_ITEM_PER_BLOCK; i 
+= GHISTOGRAM_THREADS_PER_BLOCK ) { - u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x + i; + u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i; if( itemIndex < numberOfInputs ) { auto item = inputs[itemIndex]; - for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + for( int j = 0; j < sizeof( RADIX_SORT_KEY_TYPE ); j++ ) { - u32 bitLocation = startBits + i * 8; + u32 bitLocation = startBits + j * 8; u32 bits = extractDigit( getKeyBits( item ), bitLocation ); - atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); + atomicInc( &localCounters[j][bits], 0xFFFFFFFF ); } } } @@ -444,9 +444,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) { - for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - matchMasks[w][i + threadIdx.x] = 0; + matchMasks[w][i] = 0; } } @@ -474,16 +474,16 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } else { - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { auto item = inputKeys[itemIndex]; u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); - elementBuckets[i + threadIdx.x] = bucketIndex; + elementBuckets[i] = bucketIndex; } } } @@ -589,12 +589,12 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 globalOutput = matchMasks[0][0]; if( globalOutput-- /* -1 for the actual offset */ ) { - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = 
threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - u32 dstIndex = globalOutput + i + threadIdx.x; + u32 dstIndex = globalOutput + i; outputKeys[dstIndex] = inputKeys[itemIndex]; if constexpr( keyPair ) { @@ -606,10 +606,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } // reorder - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; - u32 bucketIndex = elementBuckets[i + threadIdx.x]; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + u32 bucketIndex = elementBuckets[i]; __syncthreads(); @@ -642,7 +642,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 to = localOffset + localPrefixSum[bucketIndex]; ElementLocation el; - el.localSrcIndex = i + threadIdx.x; + el.localSrcIndex = i; el.localOffset = localOffset; el.bucket = bucketIndex; elementLocations[to] = el; @@ -660,12 +660,12 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - ElementLocation el = elementLocations[i + threadIdx.x]; + ElementLocation el = elementLocations[i]; u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; u8 bucketIndex = el.bucket; @@ -675,12 +675,12 @@ __device__ __forceinline__ void onesweep_reorder( 
RADIX_SORT_KEY_TYPE* inputKeys } if constexpr ( keyPair ) { - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x; + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - ElementLocation el = elementLocations[i + threadIdx.x]; + ElementLocation el = elementLocations[i]; u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; u8 bucketIndex = el.bucket; From 7daad5c500e15ac4b56ccdef8787931f910d5184 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 28 Dec 2023 16:43:14 +0900 Subject: [PATCH 34/68] implement counting part --- ParallelPrimitives/RadixSortConfigs.h | 2 + ParallelPrimitives/RadixSortKernels.h | 106 ++++++++++++++++++++------ 2 files changed, 85 insertions(+), 23 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index ccdd81a..82f0c10 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -27,6 +27,8 @@ constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; constexpr int REORDER_NUMBER_OF_WARPS = 8; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; +constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = GHISTOGRAM_ITEM_PER_BLOCK / REORDER_NUMBER_OF_WARPS; +constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; constexpr int LOOKBACK_TABLE_SIZE = 1024; constexpr int MAX_LOOK_BACK = 64; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index e2e6d86..e9dc245 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -1,5 +1,10 @@ #include #define LDS_BARRIER __syncthreads() + +#if defined( CUDART_VERSION ) && CUDART_VERSION >= 9000 +#define ITS 1 +#endif + namespace 
{ @@ -280,6 +285,7 @@ __device__ inline u32 getKeyBits( u32 x ) { return x ^ ORDER_MASK_32; } __device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } __device__ inline u32 extractDigit( u32 x, u32 bitLocation ) { return ( x >> bitLocation ) & RADIX_MASK; } __device__ inline u32 extractDigit( u64 x, u32 bitLocation ) { return (u32)( ( x >> bitLocation ) & RADIX_MASK ); } +__device__ __forceinline__ u32 u32min( u32 x, u32 y ) { return ( y < x ) ? y : x; } template __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) @@ -428,6 +434,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 bucket : 8; }; + __shared__ u32 blockHistogram[BIN_SIZE]; + __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; + __shared__ u32 pSum[BIN_SIZE]; __shared__ u32 localPrefixSum[BIN_SIZE]; __shared__ u32 counters[BIN_SIZE]; @@ -450,6 +459,56 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } + clearShared( blockHistogram, 0 ); + clearShared( lpSum, 0 ); + + __syncthreads(); + + int warp = threadIdx.x / 32; + int lane = threadIdx.x % 32; + for( int i = lane; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32 ) + { + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; + + u32 bucketIndex = 0; + if( itemIndex < numberOfInputs ) + { + auto item = inputKeys[itemIndex]; + bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); + } + + int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 + u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; + + for( int i = 0; i < 8; ++i ) + { + u32 bit = ( bucketIndex >> i ) & 0x1; + u32 difference = ( 0xFFFFFFFF * bit ) ^ +#if defined( ITS ) + __ballot_sync( 0xFFFFFFFF, bit != 0 ); +#else + __ballot( bit != 0 ); +#endif + broThreads &= ~difference; + } + int laneIndex = threadIdx.x % 32; + u32 lowerMask = ( 1u << laneIndex ) - 1; + bool leader = ( broThreads & lowerMask ) == 
0; + if( itemIndex < numberOfInputs && leader ) + { + u32 n = __popc( broThreads ); + atomicAdd( &blockHistogram[bucketIndex], n ); + lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] = n; + } + } + + { + u32 prefix = 0; + for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + prefix += prefixSumExclusive( prefix, &lpSum[i] ); + } + } __syncthreads(); // count @@ -517,7 +576,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - u32 s = localPrefixSum[i]; + //u32 s = localPrefixSum[i]; + u32 s = blockHistogram[i]; int pIndex = BIN_SIZE * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; { @@ -565,10 +625,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pSum[i] = globalOutput; // A special case handling: all elements have the same digit - if( s == RADIX_SORT_BLOCK_SIZE ) - { - matchMasks[0][0] = globalOutput + 1 /* +1 to avoid zero */; - } + //if( s == RADIX_SORT_BLOCK_SIZE ) + //{ + // matchMasks[0][0] = globalOutput + 1 /* +1 to avoid zero */; + //} } u32 prefix = 0; @@ -586,24 +646,24 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } // A special case handling: all elements have the same digit - u32 globalOutput = matchMasks[0][0]; - if( globalOutput-- /* -1 for the actual offset */ ) - { - for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - if( itemIndex < numberOfInputs ) - { - u32 dstIndex = globalOutput + i; - outputKeys[dstIndex] = inputKeys[itemIndex]; - if constexpr( keyPair ) - { - outputValues[dstIndex] = inputValues[itemIndex]; - } - } - } - return; - } + //u32 globalOutput = matchMasks[0][0]; + //if( globalOutput-- /* -1 for the actual offset */ ) + //{ + // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += 
REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // if( itemIndex < numberOfInputs ) + // { + // u32 dstIndex = globalOutput + i; + // outputKeys[dstIndex] = inputKeys[itemIndex]; + // if constexpr( keyPair ) + // { + // outputValues[dstIndex] = inputValues[itemIndex]; + // } + // } + // } + // return; + //} // reorder for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) From 5043a034b6a669aead7471d4a970c7a22080cd3c Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 28 Dec 2023 18:43:41 +0900 Subject: [PATCH 35/68] slow but works --- ParallelPrimitives/RadixSortConfigs.h | 7 +- ParallelPrimitives/RadixSortKernels.h | 304 +++++++++++++++----------- 2 files changed, 186 insertions(+), 125 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 82f0c10..f305940 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -20,19 +20,20 @@ constexpr auto SINGLE_SORT_WG_SIZE{ 128 }; static_assert( BIN_SIZE % 2 == 0 ); -constexpr int RADIX_SORT_BLOCK_SIZE = 2048; +constexpr int RADIX_SORT_BLOCK_SIZE = 2048 * 2; +// constexpr int RADIX_SORT_BLOCK_SIZE = 512; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; constexpr int REORDER_NUMBER_OF_WARPS = 8; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; -constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = GHISTOGRAM_ITEM_PER_BLOCK / REORDER_NUMBER_OF_WARPS; +constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = RADIX_SORT_BLOCK_SIZE / REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; constexpr int LOOKBACK_TABLE_SIZE = 1024; constexpr int MAX_LOOK_BACK = 64; -constexpr int TAIL_BITS = 4; +constexpr int TAIL_BITS = 5; constexpr int TAIL_COUNT = 1u << TAIL_BITS; static_assert( 
REORDER_NUMBER_OF_THREADS_PER_BLOCK <= BIN_SIZE, "please check prefixSumExclusive on onesweep_reorder" ); diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index e9dc245..83105f9 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -436,37 +436,40 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __shared__ u32 blockHistogram[BIN_SIZE]; __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; - __shared__ u32 pSum[BIN_SIZE]; - __shared__ u32 localPrefixSum[BIN_SIZE]; - __shared__ u32 counters[BIN_SIZE]; __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; - __shared__ u8 elementBuckets[RADIX_SORT_BLOCK_SIZE]; - __shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][BIN_SIZE]; + + //__shared__ u32 localPrefixSum[BIN_SIZE]; + //__shared__ u32 counters[BIN_SIZE]; + //__shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + //__shared__ u8 elementBuckets[RADIX_SORT_BLOCK_SIZE]; + //__shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][BIN_SIZE]; u32 bitLocation = startBits + 8 * iteration; u32 blockIndex = blockIdx.x; u32 numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); - clearShared( localPrefixSum, 0 ); - clearShared( counters, 0 ); + // clearShared( localPrefixSum, 0 ); + // clearShared( counters, 0 ); - for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) - { - for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - matchMasks[w][i] = 0; - } - } + //for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) + //{ + // for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // matchMasks[w][i] = 0; + // } + //} clearShared( blockHistogram, 0 ); clearShared( lpSum, 0 ); __syncthreads(); + u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + int warp = threadIdx.x / 32; int lane = threadIdx.x % 32; - for( int i = lane; i < 
REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32 ) + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; @@ -476,13 +479,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys auto item = inputKeys[itemIndex]; bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); } + bucketIndices[k] = bucketIndex; int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; - for( int i = 0; i < 8; ++i ) + for( int j = 0; j < 8; ++j ) { - u32 bit = ( bucketIndex >> i ) & 0x1; + u32 bit = ( bucketIndex >> j ) & 0x1; u32 difference = ( 0xFFFFFFFF * bit ) ^ #if defined( ITS ) __ballot_sync( 0xFFFFFFFF, bit != 0 ); @@ -498,55 +502,13 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { u32 n = __popc( broThreads ); atomicAdd( &blockHistogram[bucketIndex], n ); - lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] = n; + lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; } + // warpOffsets[k] = __popc( broThreads & lowerMask ); } - { - u32 prefix = 0; - for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - prefix += prefixSumExclusive( prefix, &lpSum[i] ); - } - } __syncthreads(); - // count - if( KEY_IS_16BYTE_ALIGNED && ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs ) - { - for( int i = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK * 4 ) - { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i + threadIdx.x * 4; - struct alignas( 16 ) Key4 - { - RADIX_SORT_KEY_TYPE xs[4]; - }; - Key4 key4 = *(Key4*)&inputKeys[itemIndex]; - for( int k = 0; k < 4; k++ ) - { - auto item = key4.xs[k]; - u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); - atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); - elementBuckets[i + 
threadIdx.x * 4 + k] = bucketIndex; - } - } - } - else - { - for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - if( itemIndex < numberOfInputs ) - { - auto item = inputKeys[itemIndex]; - u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); - atomicInc( &localPrefixSum[bucketIndex], 0xFFFFFFFF ); - - elementBuckets[i] = bucketIndex; - } - } - } - struct ParitionID { u64 value : 32; @@ -623,19 +585,36 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // complete global output location u32 globalOutput = gp + p; pSum[i] = globalOutput; - - // A special case handling: all elements have the same digit - //if( s == RADIX_SORT_BLOCK_SIZE ) - //{ - // matchMasks[0][0] = globalOutput + 1 /* +1 to avoid zero */; - //} } + u32 prefix = 0; for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - prefix += prefixSumExclusive( prefix, &localPrefixSum[i] ); + prefix += prefixSumExclusive( prefix, &blockHistogram[i] ); } + { + int bucketIndex = threadIdx.x; + u32 s = blockHistogram[bucketIndex]; + for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) + { + int index = bucketIndex * REORDER_NUMBER_OF_WARPS + warp; + u32 n = lpSum[index]; + lpSum[index] = s; + s += n; + } + } + // printf( "[%d] %d\n", threadIdx.x, blockHistogram[threadIdx.x] ); + + //{ + // u32 prefix = 0; + // for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // prefix += prefixSumExclusive( prefix, &lpSum[i] ); + // } + //} + + __syncthreads(); if( threadIdx.x == 0 ) { @@ -645,81 +624,57 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); } - // A special case handling: all elements have the same digit - //u32 globalOutput = matchMasks[0][0]; - //if( 
globalOutput-- /* -1 for the actual offset */ ) //{ - // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // u32 prefix = 0; + // for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) // { - // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // if( itemIndex < numberOfInputs ) - // { - // u32 dstIndex = globalOutput + i; - // outputKeys[dstIndex] = inputKeys[itemIndex]; - // if constexpr( keyPair ) - // { - // outputValues[dstIndex] = inputValues[itemIndex]; - // } - // } + // prefix += prefixSumExclusive( prefix, &lpSum[i] ); // } - // return; //} + __syncthreads(); - // reorder - for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - u32 bucketIndex = elementBuckets[i]; - - __syncthreads(); + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; + u32 bucketIndex = bucketIndices[k]; - int warp = threadIdx.x / 32; - int lane = threadIdx.x % 32; + int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 + u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; - if( itemIndex < numberOfInputs ) + for( int j = 0; j < 8; ++j ) { - atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); + u32 bit = ( bucketIndex >> j ) & 0x1; + u32 difference = ( 0xFFFFFFFF * bit ) ^ +#if defined( ITS ) + __ballot_sync( 0xFFFFFFFF, bit != 0 ); +#else + __ballot( bit != 0 ); +#endif + broThreads &= ~difference; } - - __syncthreads(); - - bool flushMask = false; + int laneIndex = threadIdx.x % 32; + u32 lowerMask = ( 1u << laneIndex ) - 1; + bool leader = ( broThreads & lowerMask ) == 0; if( itemIndex < numberOfInputs ) { - u32 matchMask = matchMasks[warp][bucketIndex]; - u32 lowerMask = ( 1u << lane ) - 1; - u32 offset = __popc( matchMask & 
lowerMask ); - - flushMask = offset == 0; - - for( int w = 0; w < warp; w++ ) - { - offset += __popc( matchMasks[w][bucketIndex] ); - } - - u32 localOffset = counters[bucketIndex] + offset; - u32 to = localOffset + localPrefixSum[bucketIndex]; + u32 localBase = lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; + u32 to = localBase + __popc( broThreads & lowerMask ); ElementLocation el; - el.localSrcIndex = i; - el.localOffset = localOffset; + el.localSrcIndex = itemIndex - blockIndex * RADIX_SORT_BLOCK_SIZE; + el.localOffset = to - blockHistogram[bucketIndex]; el.bucket = bucketIndex; elementLocations[to] = el; } - - __syncthreads(); - - if( itemIndex < numberOfInputs ) - { - atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); - } - if( flushMask ) + if( itemIndex < numberOfInputs && leader ) { - matchMasks[warp][bucketIndex] = 0; + lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += __popc( broThreads ); } } + __syncthreads(); + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; @@ -733,7 +688,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys outputKeys[dstIndex] = inputKeys[srcIndex]; } } - if constexpr ( keyPair ) + if constexpr( keyPair ) { for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { @@ -749,6 +704,111 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } } + + // A special case handling: all elements have the same digit + //u32 globalOutput = matchMasks[0][0]; + //if( globalOutput-- /* -1 for the actual offset */ ) + //{ + // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // if( itemIndex < numberOfInputs ) + // { + // u32 dstIndex = globalOutput + i; + // outputKeys[dstIndex] = inputKeys[itemIndex]; + // if 
constexpr( keyPair ) + // { + // outputValues[dstIndex] = inputValues[itemIndex]; + // } + // } + // } + // return; + //} + + // reorder + //for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + //{ + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // u32 bucketIndex = elementBuckets[i]; + + // __syncthreads(); + + // int warp = threadIdx.x / 32; + // int lane = threadIdx.x % 32; + + // if( itemIndex < numberOfInputs ) + // { + // atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); + // } + + // __syncthreads(); + + // bool flushMask = false; + + // if( itemIndex < numberOfInputs ) + // { + // u32 matchMask = matchMasks[warp][bucketIndex]; + // u32 lowerMask = ( 1u << lane ) - 1; + // u32 offset = __popc( matchMask & lowerMask ); + + // flushMask = offset == 0; + + // for( int w = 0; w < warp; w++ ) + // { + // offset += __popc( matchMasks[w][bucketIndex] ); + // } + + // u32 localOffset = counters[bucketIndex] + offset; + // u32 to = localOffset + localPrefixSum[bucketIndex]; + + // ElementLocation el; + // el.localSrcIndex = i; + // el.localOffset = localOffset; + // el.bucket = bucketIndex; + // elementLocations[to] = el; + // } + + // __syncthreads(); + + // if( itemIndex < numberOfInputs ) + // { + // atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); + // } + // if( flushMask ) + // { + // matchMasks[warp][bucketIndex] = 0; + // } + //} + + //for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + //{ + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // if( itemIndex < numberOfInputs ) + // { + // ElementLocation el = elementLocations[i]; + // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + // u8 bucketIndex = el.bucket; + + // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // outputKeys[dstIndex] = inputKeys[srcIndex]; + // } + //} + //if constexpr ( keyPair ) + //{ + // for( int i = threadIdx.x; i < 
RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // if( itemIndex < numberOfInputs ) + // { + // ElementLocation el = elementLocations[i]; + // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + // u8 bucketIndex = el.bucket; + + // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // outputValues[dstIndex] = inputValues[srcIndex]; + // } + // } + //} } extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) From 265578239a7fc297cfeb981ef97c89aa4c0586db Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 28 Dec 2023 19:54:03 +0900 Subject: [PATCH 36/68] Simplify --- ParallelPrimitives/RadixSortKernels.h | 51 +++++++-------------------- 1 file changed, 12 insertions(+), 39 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 83105f9..89bb8c9 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -466,6 +466,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + // u32 bros[REORDER_NUMBER_OF_ITEM_PER_THREAD]; int warp = threadIdx.x / 32; int lane = threadIdx.x % 32; @@ -495,8 +497,15 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys #endif broThreads &= ~difference; } + // bros[k] = broThreads; int laneIndex = threadIdx.x % 32; u32 lowerMask = ( 1u << laneIndex ) - 1; + + if( itemIndex < numberOfInputs ) + { + warpOffsets[k] = lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); + } + bool leader = ( broThreads & lowerMask ) == 0; if( itemIndex < 
numberOfInputs && leader ) { @@ -504,7 +513,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys atomicAdd( &blockHistogram[bucketIndex], n ); lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; } - // warpOffsets[k] = __popc( broThreads & lowerMask ); } __syncthreads(); @@ -606,14 +614,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } // printf( "[%d] %d\n", threadIdx.x, blockHistogram[threadIdx.x] ); - //{ - // u32 prefix = 0; - // for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // prefix += prefixSumExclusive( prefix, &lpSum[i] ); - // } - //} - __syncthreads(); if( threadIdx.x == 0 ) @@ -624,13 +624,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); } - //{ - // u32 prefix = 0; - // for( int i = 0; i < 256 * REORDER_NUMBER_OF_WARPS; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // prefix += prefixSumExclusive( prefix, &lpSum[i] ); - // } - //} __syncthreads(); for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) @@ -638,28 +631,11 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; u32 bucketIndex = bucketIndices[k]; - int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 - u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; - - for( int j = 0; j < 8; ++j ) - { - u32 bit = ( bucketIndex >> j ) & 0x1; - u32 difference = ( 0xFFFFFFFF * bit ) ^ -#if defined( ITS ) - __ballot_sync( 0xFFFFFFFF, bit != 0 ); -#else - __ballot( bit != 0 ); -#endif - broThreads &= ~difference; - } - int laneIndex = threadIdx.x % 32; - u32 lowerMask = ( 1u << laneIndex ) - 1; - bool leader = ( broThreads & lowerMask ) == 0; - if( itemIndex 
< numberOfInputs ) { u32 localBase = lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; - u32 to = localBase + __popc( broThreads & lowerMask ); + // u32 to = localBase + __popc( broThreads & lowerMask ); + u32 to = localBase + warpOffsets[k]; ElementLocation el; el.localSrcIndex = itemIndex - blockIndex * RADIX_SORT_BLOCK_SIZE; @@ -667,10 +643,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys el.bucket = bucketIndex; elementLocations[to] = el; } - if( itemIndex < numberOfInputs && leader ) - { - lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += __popc( broThreads ); - } } __syncthreads(); @@ -688,6 +660,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys outputKeys[dstIndex] = inputKeys[srcIndex]; } } + if constexpr( keyPair ) { for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) From d33d590f9077b1888949c15eb0313d852e01a011 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 28 Dec 2023 21:35:17 +0900 Subject: [PATCH 37/68] shared approach --- ParallelPrimitives/RadixSortKernels.h | 61 +++++++++++++++++++-------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 89bb8c9..adb8b04 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -435,9 +435,28 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; __shared__ u32 blockHistogram[BIN_SIZE]; - __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; __shared__ u32 pSum[BIN_SIZE]; - __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + // __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; + // __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + + struct SMem + { + struct Phase1 + { + u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; + }; + struct Phase2 + { + ElementLocation 
elementLocations[RADIX_SORT_BLOCK_SIZE]; + }; + + union + { + Phase1 phase1; + Phase2 phase2; + } u; + }; + __shared__ SMem smem; //__shared__ u32 localPrefixSum[BIN_SIZE]; //__shared__ u32 counters[BIN_SIZE]; @@ -461,7 +480,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys //} clearShared( blockHistogram, 0 ); - clearShared( lpSum, 0 ); + clearShared( smem.u.phase1.lpSum, 0 ); __syncthreads(); @@ -503,7 +522,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( itemIndex < numberOfInputs ) { - warpOffsets[k] = lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); + warpOffsets[k] = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); } bool leader = ( broThreads & lowerMask ) == 0; @@ -511,7 +530,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { u32 n = __popc( broThreads ); atomicAdd( &blockHistogram[bucketIndex], n ); - lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; + smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; } } @@ -607,12 +626,11 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) { int index = bucketIndex * REORDER_NUMBER_OF_WARPS + warp; - u32 n = lpSum[index]; - lpSum[index] = s; + u32 n = smem.u.phase1.lpSum[index]; + smem.u.phase1.lpSum[index] = s; s += n; } } - // printf( "[%d] %d\n", threadIdx.x, blockHistogram[threadIdx.x] ); __syncthreads(); @@ -626,6 +644,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); + for( int k = 0; k < REORDER_NUMBER_OF_ITEM_PER_THREAD; k++ ) + { + u32 bucketIndex = bucketIndices[k]; + warpOffsets[k] += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; + } + + __syncthreads(); + for( int i = lane, k = 0; i < 
REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; @@ -633,15 +659,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( itemIndex < numberOfInputs ) { - u32 localBase = lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; - // u32 to = localBase + __popc( broThreads & lowerMask ); - u32 to = localBase + warpOffsets[k]; + u32 to = warpOffsets[k]; ElementLocation el; el.localSrcIndex = itemIndex - blockIndex * RADIX_SORT_BLOCK_SIZE; - el.localOffset = to - blockHistogram[bucketIndex]; + // el.localOffset = to - blockHistogram[bucketIndex]; + el.localOffset = 0; el.bucket = bucketIndex; - elementLocations[to] = el; + smem.u.phase2.elementLocations[to] = el; } } @@ -652,11 +677,12 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - ElementLocation el = elementLocations[i]; + ElementLocation el = smem.u.phase2.elementLocations[i]; u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; u8 bucketIndex = el.bucket; - u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; outputKeys[dstIndex] = inputKeys[srcIndex]; } } @@ -668,11 +694,12 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - ElementLocation el = elementLocations[i]; + ElementLocation el = smem.u.phase2.elementLocations[i]; u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; u8 bucketIndex = el.bucket; - u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; 
outputValues[dstIndex] = inputValues[srcIndex]; } } From 35a02f99d6c67f704d52423e80ace051f435c555 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 29 Dec 2023 17:45:56 +0900 Subject: [PATCH 38/68] add explicit sync --- ParallelPrimitives/RadixSortKernels.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index adb8b04..bc0dd8a 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -524,7 +524,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { warpOffsets[k] = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); } - +#if defined( ITS ) + __syncwarp( 0xFFFFFFFF ); +#endif bool leader = ( broThreads & lowerMask ) == 0; if( itemIndex < numberOfInputs && leader ) { @@ -532,6 +534,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys atomicAdd( &blockHistogram[bucketIndex], n ); smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; } +#if defined( ITS ) + __syncwarp( 0xFFFFFFFF ); +#endif } __syncthreads(); From 2179be6ca83819807ec2017394c5b4068c1f7a57 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 29 Dec 2023 17:46:27 +0900 Subject: [PATCH 39/68] larger block --- ParallelPrimitives/RadixSortConfigs.h | 6 +-- ParallelPrimitives/RadixSortKernels.h | 57 ++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index f305940..16bb154 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -26,7 +26,7 @@ constexpr int RADIX_SORT_BLOCK_SIZE = 2048 * 2; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; -constexpr int REORDER_NUMBER_OF_WARPS = 8; +constexpr int 
REORDER_NUMBER_OF_WARPS = 16; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = RADIX_SORT_BLOCK_SIZE / REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; @@ -36,7 +36,7 @@ constexpr int MAX_LOOK_BACK = 64; constexpr int TAIL_BITS = 5; constexpr int TAIL_COUNT = 1u << TAIL_BITS; -static_assert( REORDER_NUMBER_OF_THREADS_PER_BLOCK <= BIN_SIZE, "please check prefixSumExclusive on onesweep_reorder" ); -static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); +//static_assert( REORDER_NUMBER_OF_THREADS_PER_BLOCK <= BIN_SIZE, "please check prefixSumExclusive on onesweep_reorder" ); +//static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index bc0dd8a..e859949 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -318,6 +318,49 @@ __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) return sum; } +__device__ inline u32 scanExclusive( u32 prefix, u32* sMemIO, int nElement ) +{ + // assert(nElement <= nThreads) + bool active = threadIdx.x < nElement; + u32 value = active ? 
sMemIO[threadIdx.x] : 0; + + for( u32 offset = 1; offset < nElement; offset <<= 1 ) + { + u32 x; + if( active ) + { + x = sMemIO[threadIdx.x]; + } + + if( active && offset <= threadIdx.x ) + { + x += sMemIO[threadIdx.x - offset]; + } + + __syncthreads(); + + if( active ) + { + sMemIO[threadIdx.x] = x; + } + + __syncthreads(); + } + + u32 sum = sMemIO[nElement - 1]; + + __syncthreads(); + + if( active ) + { + sMemIO[threadIdx.x] += prefix - value; + } + + __syncthreads(); + + return sum; +} + extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOfInputs, u32* gpSumBuffer, u32 startBits, u32* counter ) { __shared__ u32 localCounters[sizeof( RADIX_SORT_KEY_TYPE )][256]; @@ -620,11 +663,15 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } - u32 prefix = 0; - for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - prefix += prefixSumExclusive( prefix, &blockHistogram[i] ); - } + //u32 prefix = 0; + //for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + //{ + // prefix += prefixSumExclusive( prefix, &blockHistogram[i] ); + //} + + scanExclusive( 0, blockHistogram, BIN_SIZE ); + + if( threadIdx.x < BIN_SIZE ) { int bucketIndex = threadIdx.x; u32 s = blockHistogram[bucketIndex]; From 1269018a86c9f7913c629573bcb0bf3e33864213 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 29 Dec 2023 18:44:35 +0900 Subject: [PATCH 40/68] key cache --- ParallelPrimitives/RadixSortConfigs.h | 2 +- ParallelPrimitives/RadixSortKernels.h | 78 +++++++++++++++++---------- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 16bb154..d13fb08 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -26,7 +26,7 @@ constexpr int RADIX_SORT_BLOCK_SIZE = 2048 * 2; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; constexpr int 
GHISTOGRAM_THREADS_PER_BLOCK = 256; -constexpr int REORDER_NUMBER_OF_WARPS = 16; +constexpr int REORDER_NUMBER_OF_WARPS = 8; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = RADIX_SORT_BLOCK_SIZE / REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index e859949..f911b49 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -490,7 +490,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; struct Phase2 { - ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + // ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + RADIX_SORT_KEY_TYPE elements[RADIX_SORT_BLOCK_SIZE]; }; union @@ -527,7 +528,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + // u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + u32 keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; // u32 bros[REORDER_NUMBER_OF_ITEM_PER_THREAD]; @@ -542,8 +544,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { auto item = inputKeys[itemIndex]; bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); + keys[k] = item; } - bucketIndices[k] = bucketIndex; + // bucketIndices[k] = bucketIndex; int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; @@ -698,16 +701,19 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int k = 0; k < REORDER_NUMBER_OF_ITEM_PER_THREAD; k++ ) { - u32 bucketIndex = bucketIndices[k]; + // u32 bucketIndex = bucketIndices[k]; + u32 bucketIndex = extractDigit( 
getKeyBits( keys[k] ), bitLocation ); warpOffsets[k] += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; } __syncthreads(); + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; - u32 bucketIndex = bucketIndices[k]; + // u32 bucketIndex = bucketIndices[k]; + u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); if( itemIndex < numberOfInputs ) { @@ -718,10 +724,18 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // el.localOffset = to - blockHistogram[bucketIndex]; el.localOffset = 0; el.bucket = bucketIndex; - smem.u.phase2.elementLocations[to] = el; + // smem.u.phase2.elementLocations[to] = el; + + // smem.u.phase2.elements[to] = inputKeys[itemIndex]; + smem.u.phase2.elements[to] = keys[k]; } } + if( threadIdx.x < BIN_SIZE ) + { + pSum[threadIdx.x] -= blockHistogram[threadIdx.x]; + } + __syncthreads(); for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) @@ -729,33 +743,39 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - ElementLocation el = smem.u.phase2.elementLocations[i]; - u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - u8 bucketIndex = el.bucket; - - // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; - outputKeys[dstIndex] = inputKeys[srcIndex]; + // ElementLocation el = smem.u.phase2.elementLocations[i]; + // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + // u8 bucketIndex = el.bucket; + + // // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; + // outputKeys[dstIndex] = inputKeys[srcIndex]; + + auto item 
= smem.u.phase2.elements[i]; + u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); + // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; + u32 dstIndex = pSum[bucketIndex] + i; + outputKeys[dstIndex] = item; } } - if constexpr( keyPair ) - { - for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - if( itemIndex < numberOfInputs ) - { - ElementLocation el = smem.u.phase2.elementLocations[i]; - u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - u8 bucketIndex = el.bucket; + //if constexpr( keyPair ) + //{ + // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + // { + // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + // if( itemIndex < numberOfInputs ) + // { + // ElementLocation el = smem.u.phase2.elementLocations[i]; + // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; + // u8 bucketIndex = el.bucket; - // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; - outputValues[dstIndex] = inputValues[srcIndex]; - } - } - } + // // u32 dstIndex = pSum[bucketIndex] + el.localOffset; + // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; + // outputValues[dstIndex] = inputValues[srcIndex]; + // } + // } + //} // A special case handling: all elements have the same digit //u32 globalOutput = matchMasks[0][0]; From bcb56c9d120738278536acd2633ecabac4bd835a Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 29 Dec 2023 19:40:10 +0900 Subject: [PATCH 41/68] 16bit lpsum --- ParallelPrimitives/RadixSortConfigs.h | 2 +- ParallelPrimitives/RadixSortKernels.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index d13fb08..bd0b70f 100644 --- 
a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -20,7 +20,7 @@ constexpr auto SINGLE_SORT_WG_SIZE{ 128 }; static_assert( BIN_SIZE % 2 == 0 ); -constexpr int RADIX_SORT_BLOCK_SIZE = 2048 * 2; +constexpr int RADIX_SORT_BLOCK_SIZE = 2048 + 1024 + 1024; // constexpr int RADIX_SORT_BLOCK_SIZE = 512; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index f911b49..cac5368 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -486,7 +486,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { struct Phase1 { - u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; + u16 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; }; struct Phase2 { @@ -524,7 +524,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys //} clearShared( blockHistogram, 0 ); - clearShared( smem.u.phase1.lpSum, 0 ); + clearShared( smem.u.phase1.lpSum, 0 ); __syncthreads(); From 1207fdc082f56ef81998a1c1cac2ce720dbae4df Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 29 Dec 2023 20:38:35 +0900 Subject: [PATCH 42/68] 16bit blockHist --- ParallelPrimitives/RadixSortKernels.h | 31 ++++++++++++++++++++------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index cac5368..7f99d9c 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -318,15 +318,16 @@ __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) return sum; } -__device__ inline u32 scanExclusive( u32 prefix, u32* sMemIO, int nElement ) +template +__device__ inline T scanExclusive( T prefix, T* sMemIO, int nElement ) { // assert(nElement <= nThreads) bool active = threadIdx.x < nElement; - u32 value = active ? sMemIO[threadIdx.x] : 0; + T value = active ? 
sMemIO[threadIdx.x] : 0; for( u32 offset = 1; offset < nElement; offset <<= 1 ) { - u32 x; + T x; if( active ) { x = sMemIO[threadIdx.x]; @@ -347,7 +348,7 @@ __device__ inline u32 scanExclusive( u32 prefix, u32* sMemIO, int nElement ) __syncthreads(); } - u32 sum = sMemIO[nElement - 1]; + T sum = sMemIO[nElement - 1]; __syncthreads(); @@ -477,7 +478,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 bucket : 8; }; - __shared__ u32 blockHistogram[BIN_SIZE]; + __shared__ u16 blockHistogram[BIN_SIZE]; __shared__ u32 pSum[BIN_SIZE]; // __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; // __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; @@ -523,7 +524,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // } //} - clearShared( blockHistogram, 0 ); + // clearShared( blockHistogram, 0 ); clearShared( smem.u.phase1.lpSum, 0 ); __syncthreads(); @@ -572,21 +573,35 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); +#else + __threadfence_block(); #endif bool leader = ( broThreads & lowerMask ) == 0; if( itemIndex < numberOfInputs && leader ) { u32 n = __popc( broThreads ); - atomicAdd( &blockHistogram[bucketIndex], n ); smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; } #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); +#else + __threadfence_block(); #endif } __syncthreads(); + if( threadIdx.x < BIN_SIZE ) + { + int bucketIndex = threadIdx.x; + u32 s = 0; + for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) + { + s += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; + } + blockHistogram[bucketIndex] = s; + } + struct ParitionID { u64 value : 32; @@ -672,7 +687,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // prefix += prefixSumExclusive( prefix, &blockHistogram[i] ); //} - scanExclusive( 0, blockHistogram, BIN_SIZE 
); + scanExclusive( 0, blockHistogram, BIN_SIZE ); if( threadIdx.x < BIN_SIZE ) { From b78f5dc4a7632384bbb52d6e78099cc150681e6b Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sun, 31 Dec 2023 19:02:43 +0900 Subject: [PATCH 43/68] keyValue support --- ParallelPrimitives/RadixSortKernels.h | 92 ++++++++++++++++----------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 7f99d9c..65e91be 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -478,7 +478,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 bucket : 8; }; - __shared__ u16 blockHistogram[BIN_SIZE]; __shared__ u32 pSum[BIN_SIZE]; // __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; // __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; @@ -487,6 +486,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { struct Phase1 { + u16 blockHistogram[BIN_SIZE]; u16 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; }; struct Phase2 @@ -494,11 +494,18 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; RADIX_SORT_KEY_TYPE elements[RADIX_SORT_BLOCK_SIZE]; }; + struct Phase3 + { + // ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + RADIX_SORT_VALUE_TYPE elements[RADIX_SORT_BLOCK_SIZE]; + u8 buckets[RADIX_SORT_BLOCK_SIZE]; + }; union { Phase1 phase1; Phase2 phase2; + Phase3 phase3; } u; }; __shared__ SMem smem; @@ -524,7 +531,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // } //} - // clearShared( blockHistogram, 0 ); clearShared( smem.u.phase1.lpSum, 0 ); __syncthreads(); @@ -599,7 +605,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { s += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; } 
- blockHistogram[bucketIndex] = s; + smem.u.phase1.blockHistogram[bucketIndex] = s; } struct ParitionID @@ -632,7 +638,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { //u32 s = localPrefixSum[i]; - u32 s = blockHistogram[i]; + u32 s = smem.u.phase1.blockHistogram[i]; int pIndex = BIN_SIZE * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; { @@ -684,15 +690,15 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys //u32 prefix = 0; //for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) //{ - // prefix += prefixSumExclusive( prefix, &blockHistogram[i] ); + // prefix += prefixSumExclusive( prefix, &smem.u.phase1.blockHistogram[i] ); //} - scanExclusive( 0, blockHistogram, BIN_SIZE ); + scanExclusive( 0, smem.u.phase1.blockHistogram, BIN_SIZE ); if( threadIdx.x < BIN_SIZE ) { int bucketIndex = threadIdx.x; - u32 s = blockHistogram[bucketIndex]; + u32 s = smem.u.phase1.blockHistogram[bucketIndex]; for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) { int index = bucketIndex * REORDER_NUMBER_OF_WARPS + warp; @@ -716,64 +722,76 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int k = 0; k < REORDER_NUMBER_OF_ITEM_PER_THREAD; k++ ) { - // u32 bucketIndex = bucketIndices[k]; u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); warpOffsets[k] += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; } + if( threadIdx.x < BIN_SIZE ) + { + pSum[threadIdx.x] -= smem.u.phase1.blockHistogram[threadIdx.x]; + } + __syncthreads(); for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; - // u32 bucketIndex = bucketIndices[k]; u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); - if( itemIndex < numberOfInputs 
) { - u32 to = warpOffsets[k]; - - ElementLocation el; - el.localSrcIndex = itemIndex - blockIndex * RADIX_SORT_BLOCK_SIZE; - // el.localOffset = to - blockHistogram[bucketIndex]; - el.localOffset = 0; - el.bucket = bucketIndex; - // smem.u.phase2.elementLocations[to] = el; - - // smem.u.phase2.elements[to] = inputKeys[itemIndex]; - smem.u.phase2.elements[to] = keys[k]; + smem.u.phase2.elements[warpOffsets[k]] = keys[k]; } } - if( threadIdx.x < BIN_SIZE ) - { - pSum[threadIdx.x] -= blockHistogram[threadIdx.x]; - } - __syncthreads(); - for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + for( int i = threadIdx.x, k = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) { - // ElementLocation el = smem.u.phase2.elementLocations[i]; - // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - // u8 bucketIndex = el.bucket; - - // // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; - // outputKeys[dstIndex] = inputKeys[srcIndex]; - auto item = smem.u.phase2.elements[i]; u32 bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); - // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; + + // u32 dstIndex = pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex]; u32 dstIndex = pSum[bucketIndex] + i; outputKeys[dstIndex] = item; } } + if constexpr( keyPair ) + { + __syncthreads(); + + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) + { + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; + u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); + if( itemIndex < numberOfInputs ) + { + smem.u.phase3.elements[warpOffsets[k]] = inputValues[itemIndex]; + smem.u.phase3.buckets[warpOffsets[k]] = bucketIndex; 
+ } + } + + __syncthreads(); + + for( int i = threadIdx.x, k = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK, k++ ) + { + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; + if( itemIndex < numberOfInputs ) + { + auto item = smem.u.phase3.elements[i]; + u32 bucketIndex = smem.u.phase3.buckets[i]; + + // u32 dstIndex = pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex]; + u32 dstIndex = pSum[bucketIndex] + i; + outputValues[dstIndex] = item; + } + } + } + //if constexpr( keyPair ) //{ // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) @@ -786,7 +804,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // u8 bucketIndex = el.bucket; // // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - // u32 dstIndex = pSum[bucketIndex] + i - blockHistogram[bucketIndex]; + // u32 dstIndex = pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex]; // outputValues[dstIndex] = inputValues[srcIndex]; // } // } From fd9357f02095ba61df8f1a272b7e2472969b5816 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sun, 31 Dec 2023 20:52:40 +0900 Subject: [PATCH 44/68] smaller warpOffsets --- ParallelPrimitives/RadixSortKernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 65e91be..2b989fd 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -536,8 +536,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); // u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - u32 keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + RADIX_SORT_KEY_TYPE keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + u16 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; // u32 bros[REORDER_NUMBER_OF_ITEM_PER_THREAD]; int warp = threadIdx.x / 32; 
From 1224e6d8c847b71e3770cb839dcb630a4870e55e Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 1 Jan 2024 07:33:15 +0900 Subject: [PATCH 45/68] n batch loading --- ParallelPrimitives/RadixSortKernels.h | 40 ++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 2b989fd..83a67fa 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -482,12 +482,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; // __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; + constexpr int N_BATCH_LOAD = 4; struct SMem { struct Phase1 { u16 blockHistogram[BIN_SIZE]; u16 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; + RADIX_SORT_KEY_TYPE batchKeys[REORDER_NUMBER_OF_WARPS][N_BATCH_LOAD][32]; }; struct Phase2 { @@ -537,19 +539,49 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys // u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; RADIX_SORT_KEY_TYPE keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - u16 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; // u32 bros[REORDER_NUMBER_OF_ITEM_PER_THREAD]; + bool batchLoading = ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs; + int warp = threadIdx.x / 32; int lane = threadIdx.x % 32; - for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) + for( int i = 0, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) { - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; + if( batchLoading && ( k % N_BATCH_LOAD ) == 0 ) + { + struct alignas( 16 ) BatchKeys + { + RADIX_SORT_KEY_TYPE xs[N_BATCH_LOAD]; + }; + int srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane * 
N_BATCH_LOAD; + BatchKeys batchKeys = *(BatchKeys*)&inputKeys[srcIndex]; + for( int v = 0; v < N_BATCH_LOAD; v++ ) + { + int indexInWarp = lane * N_BATCH_LOAD + v; + int toK = indexInWarp / 32; + int toLane = indexInWarp % 32; + smem.u.phase1.batchKeys[warp][toK][toLane] = batchKeys.xs[v]; + } + + __syncthreads(); + } + + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane; u32 bucketIndex = 0; if( itemIndex < numberOfInputs ) { - auto item = inputKeys[itemIndex]; + RADIX_SORT_KEY_TYPE item; + if( batchLoading ) + { + item = smem.u.phase1.batchKeys[warp][k % N_BATCH_LOAD][lane]; + } + else + { + item = inputKeys[itemIndex]; + } + bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); keys[k] = item; } From 2c997072e98d4a4c9029782d3faf5a8d6431bed0 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Mon, 1 Jan 2024 08:50:14 +0900 Subject: [PATCH 46/68] warp level is fine --- ParallelPrimitives/RadixSortKernels.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 83a67fa..a78d8c3 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -564,7 +564,11 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys smem.u.phase1.batchKeys[warp][toK][toLane] = batchKeys.xs[v]; } - __syncthreads(); +#if defined( ITS ) + __syncwarp( 0xFFFFFFFF ); +#else + __threadfence_block(); +#endif } u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane; From 59afd88d56093b5511987932adca7a5c5421eb40 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 2 Jan 2024 19:07:02 +0900 Subject: [PATCH 47/68] clean up --- ParallelPrimitives/RadixSortKernels.h | 149 -------------------------- 1 file changed, 149 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h 
index a78d8c3..0de4669 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -479,8 +479,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; __shared__ u32 pSum[BIN_SIZE]; - // __shared__ u32 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; - // __shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; constexpr int N_BATCH_LOAD = 4; struct SMem @@ -512,27 +510,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; __shared__ SMem smem; - //__shared__ u32 localPrefixSum[BIN_SIZE]; - //__shared__ u32 counters[BIN_SIZE]; - //__shared__ ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; - //__shared__ u8 elementBuckets[RADIX_SORT_BLOCK_SIZE]; - //__shared__ u32 matchMasks[REORDER_NUMBER_OF_WARPS][BIN_SIZE]; - u32 bitLocation = startBits + 8 * iteration; u32 blockIndex = blockIdx.x; u32 numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); - // clearShared( localPrefixSum, 0 ); - // clearShared( counters, 0 ); - - //for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) - //{ - // for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // matchMasks[w][i] = 0; - // } - //} - clearShared( smem.u.phase1.lpSum, 0 ); __syncthreads(); @@ -722,13 +703,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pSum[i] = globalOutput; } - - //u32 prefix = 0; - //for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - //{ - // prefix += prefixSumExclusive( prefix, &smem.u.phase1.blockHistogram[i] ); - //} - scanExclusive( 0, smem.u.phase1.blockHistogram, BIN_SIZE ); if( threadIdx.x < BIN_SIZE ) @@ -827,129 +801,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } } - - //if constexpr( keyPair ) - //{ - // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // u32 
itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // if( itemIndex < numberOfInputs ) - // { - // ElementLocation el = smem.u.phase2.elementLocations[i]; - // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - // u8 bucketIndex = el.bucket; - - // // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - // u32 dstIndex = pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex]; - // outputValues[dstIndex] = inputValues[srcIndex]; - // } - // } - //} - - // A special case handling: all elements have the same digit - //u32 globalOutput = matchMasks[0][0]; - //if( globalOutput-- /* -1 for the actual offset */ ) - //{ - // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // if( itemIndex < numberOfInputs ) - // { - // u32 dstIndex = globalOutput + i; - // outputKeys[dstIndex] = inputKeys[itemIndex]; - // if constexpr( keyPair ) - // { - // outputValues[dstIndex] = inputValues[itemIndex]; - // } - // } - // } - // return; - //} - - // reorder - //for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - //{ - // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // u32 bucketIndex = elementBuckets[i]; - - // __syncthreads(); - - // int warp = threadIdx.x / 32; - // int lane = threadIdx.x % 32; - - // if( itemIndex < numberOfInputs ) - // { - // atomicOr( &matchMasks[warp][bucketIndex], 1u << lane ); - // } - - // __syncthreads(); - - // bool flushMask = false; - - // if( itemIndex < numberOfInputs ) - // { - // u32 matchMask = matchMasks[warp][bucketIndex]; - // u32 lowerMask = ( 1u << lane ) - 1; - // u32 offset = __popc( matchMask & lowerMask ); - - // flushMask = offset == 0; - - // for( int w = 0; w < warp; w++ ) - // { - // offset += __popc( matchMasks[w][bucketIndex] ); - // } - - // u32 localOffset = counters[bucketIndex] + offset; - // u32 to = localOffset + 
localPrefixSum[bucketIndex]; - - // ElementLocation el; - // el.localSrcIndex = i; - // el.localOffset = localOffset; - // el.bucket = bucketIndex; - // elementLocations[to] = el; - // } - - // __syncthreads(); - - // if( itemIndex < numberOfInputs ) - // { - // atomicInc( &counters[bucketIndex], 0xFFFFFFFF ); - // } - // if( flushMask ) - // { - // matchMasks[warp][bucketIndex] = 0; - // } - //} - - //for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - //{ - // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // if( itemIndex < numberOfInputs ) - // { - // ElementLocation el = elementLocations[i]; - // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - // u8 bucketIndex = el.bucket; - - // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - // outputKeys[dstIndex] = inputKeys[srcIndex]; - // } - //} - //if constexpr ( keyPair ) - //{ - // for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - // { - // u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; - // if( itemIndex < numberOfInputs ) - // { - // ElementLocation el = elementLocations[i]; - // u32 srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + el.localSrcIndex; - // u8 bucketIndex = el.bucket; - - // u32 dstIndex = pSum[bucketIndex] + el.localOffset; - // outputValues[dstIndex] = inputValues[srcIndex]; - // } - // } - //} } extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) From fedd3c568fc652183543b69382021cf23f87969d Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 2 Jan 2024 19:10:53 +0900 Subject: [PATCH 48/68] psum in gHistogram --- ParallelPrimitives/RadixSort.cpp | 5 ----- ParallelPrimitives/RadixSort.h | 1 - ParallelPrimitives/RadixSortConfigs.h | 2 +- 
ParallelPrimitives/RadixSortKernels.h | 22 +++++----------------- 4 files changed, 6 insertions(+), 24 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 3f31121..4ab14c6 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -164,7 +164,6 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string // TODO: bit code support? #define LOAD_FUNC( var, kernel ) var = m_oroutils.getFunctionFromFile( m_device, currentKernelPath.c_str(), kernel, &opts ); LOAD_FUNC( m_gHistogram, "gHistogram" ); - LOAD_FUNC( m_gPrefixSum, "gPrefixSum" ); LOAD_FUNC( m_onesweep_reorderKey64, "onesweep_reorderKey64" ); LOAD_FUNC( m_onesweep_reorderKeyPair64, "onesweep_reorderKeyPair64" ); #undef LOAD_FUNC @@ -232,10 +231,6 @@ void RadixSort::sort( const KeyValueSoA& src, const KeyValueSoA& dst, uint32_t n const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &counter }; OrochiUtils::launch1D( m_gHistogram, nBlocks * GHISTOGRAM_THREADS_PER_BLOCK, args, GHISTOGRAM_THREADS_PER_BLOCK, 0, stream ); } - { - const void* args[] = { &gpSumBuffer }; - OrochiUtils::launch1D( m_gPrefixSum, nIteration * BIN_SIZE, args, BIN_SIZE, 0, stream ); - } auto s = src; auto d = dst; diff --git a/ParallelPrimitives/RadixSort.h b/ParallelPrimitives/RadixSort.h index 2a7c3be..a20c9f0 100644 --- a/ParallelPrimitives/RadixSort.h +++ b/ParallelPrimitives/RadixSort.h @@ -75,7 +75,6 @@ class RadixSort final OrochiUtils& m_oroutils; oroFunction m_gHistogram; - oroFunction m_gPrefixSum; oroFunction m_onesweep_reorderKey64; oroFunction m_onesweep_reorderKeyPair64; diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index bd0b70f..9f3701f 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -36,7 +36,7 @@ constexpr int MAX_LOOK_BACK = 64; constexpr int TAIL_BITS = 5; constexpr int TAIL_COUNT = 1u << TAIL_BITS; -//static_assert( 
REORDER_NUMBER_OF_THREADS_PER_BLOCK <= BIN_SIZE, "please check prefixSumExclusive on onesweep_reorder" ); +static_assert( BIN_SIZE <= REORDER_NUMBER_OF_THREADS_PER_BLOCK, "please check scanExclusive" ); //static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 0de4669..7e73717 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -442,7 +442,10 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf if( hasData ) { - __syncthreads(); + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + { + scanExclusive( 0, &localCounters[i][0], BIN_SIZE ); + } for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { @@ -454,19 +457,6 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf } } -extern "C" __global__ void gPrefixSum( u32* gpSumBuffer ) -{ - __shared__ u32 smem[BIN_SIZE]; - - smem[threadIdx.x] = gpSumBuffer[blockIdx.x * BIN_SIZE + threadIdx.x]; - - __syncthreads(); - - prefixSumExclusive( 0, smem ); - - gpSumBuffer[blockIdx.x * BIN_SIZE + threadIdx.x] = smem[threadIdx.x]; -} - template __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) @@ -518,12 +508,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - // u8 bucketIndices[REORDER_NUMBER_OF_ITEM_PER_THREAD]; RADIX_SORT_KEY_TYPE keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - // u32 bros[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - bool batchLoading = ( 
blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs; + bool batchLoading = KEY_IS_16BYTE_ALIGNED && ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs; int warp = threadIdx.x / 32; int lane = threadIdx.x % 32; From 19dd9fa0165210b7c09407be0bc678479829bfec Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Tue, 2 Jan 2024 19:12:16 +0900 Subject: [PATCH 49/68] remove unused --- ParallelPrimitives/RadixSortKernels.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 7e73717..d71aefc 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -461,13 +461,6 @@ template __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { - struct ElementLocation - { - u32 localSrcIndex : 12; - u32 localOffset : 12; - u32 bucket : 8; - }; - __shared__ u32 pSum[BIN_SIZE]; constexpr int N_BATCH_LOAD = 4; @@ -481,12 +474,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; struct Phase2 { - // ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; RADIX_SORT_KEY_TYPE elements[RADIX_SORT_BLOCK_SIZE]; }; struct Phase3 { - // ElementLocation elementLocations[RADIX_SORT_BLOCK_SIZE]; RADIX_SORT_VALUE_TYPE elements[RADIX_SORT_BLOCK_SIZE]; u8 buckets[RADIX_SORT_BLOCK_SIZE]; }; @@ -558,7 +549,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); keys[k] = item; } - // bucketIndices[k] = bucketIndex; int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; @@ -574,7 
+564,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys #endif broThreads &= ~difference; } - // bros[k] = broThreads; + int laneIndex = threadIdx.x % 32; u32 lowerMask = ( 1u << laneIndex ) - 1; From ac0605d9711028d00bfb7958e1e276cbae506896 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 00:23:11 +0900 Subject: [PATCH 50/68] refactor --- ParallelPrimitives/RadixSortConfigs.h | 2 ++ ParallelPrimitives/RadixSortKernels.h | 34 +++++++++++++-------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 9f3701f..178ffa4 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -20,6 +20,8 @@ constexpr auto SINGLE_SORT_WG_SIZE{ 128 }; static_assert( BIN_SIZE % 2 == 0 ); +constexpr int WARP_SIZE = 32; + constexpr int RADIX_SORT_BLOCK_SIZE = 2048 + 1024 + 1024; // constexpr int RADIX_SORT_BLOCK_SIZE = 512; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index d71aefc..ee1e4af 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -364,11 +364,11 @@ __device__ inline T scanExclusive( T prefix, T* sMemIO, int nElement ) extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOfInputs, u32* gpSumBuffer, u32 startBits, u32* counter ) { - __shared__ u32 localCounters[sizeof( RADIX_SORT_KEY_TYPE )][256]; + __shared__ u32 localCounters[sizeof( RADIX_SORT_KEY_TYPE )][BIN_SIZE]; for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - for( int j = threadIdx.x; j < 256; j += GHISTOGRAM_THREADS_PER_BLOCK ) + for( int j = threadIdx.x; j < BIN_SIZE; j += GHISTOGRAM_THREADS_PER_BLOCK ) { localCounters[i][j] = 0; } @@ -406,7 +406,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf auto item = key4.xs[k]; for( int i = 0; i < sizeof( 
RADIX_SORT_KEY_TYPE ); i++ ) { - u32 bitLocation = startBits + i * 8; + u32 bitLocation = startBits + i * N_RADIX; u32 bits = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } @@ -423,7 +423,7 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf auto item = inputs[itemIndex]; for( int j = 0; j < sizeof( RADIX_SORT_KEY_TYPE ); j++ ) { - u32 bitLocation = startBits + j * 8; + u32 bitLocation = startBits + j * N_RADIX; u32 bits = extractDigit( getKeyBits( item ), bitLocation ); atomicInc( &localCounters[j][bits], 0xFFFFFFFF ); } @@ -470,7 +470,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { u16 blockHistogram[BIN_SIZE]; u16 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; - RADIX_SORT_KEY_TYPE batchKeys[REORDER_NUMBER_OF_WARPS][N_BATCH_LOAD][32]; + RADIX_SORT_KEY_TYPE batchKeys[REORDER_NUMBER_OF_WARPS][N_BATCH_LOAD][WARP_SIZE]; }; struct Phase2 { @@ -491,7 +491,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys }; __shared__ SMem smem; - u32 bitLocation = startBits + 8 * iteration; + u32 bitLocation = startBits + N_RADIX * iteration; u32 blockIndex = blockIdx.x; u32 numberOfBlocks = div_round_up( numberOfInputs, RADIX_SORT_BLOCK_SIZE ); @@ -504,9 +504,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys bool batchLoading = KEY_IS_16BYTE_ALIGNED && ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs; - int warp = threadIdx.x / 32; - int lane = threadIdx.x % 32; - for( int i = 0, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) + int warp = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x % WARP_SIZE; + for( int i = 0, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) { if( batchLoading && ( k % N_BATCH_LOAD ) == 0 ) { @@ -519,8 +519,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int v = 0; v < N_BATCH_LOAD; 
v++ ) { int indexInWarp = lane * N_BATCH_LOAD + v; - int toK = indexInWarp / 32; - int toLane = indexInWarp % 32; + int toK = indexInWarp / WARP_SIZE; + int toLane = indexInWarp % WARP_SIZE; smem.u.phase1.batchKeys[warp][toK][toLane] = batchKeys.xs[v]; } @@ -550,10 +550,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys keys[k] = item; } - int nNoneActiveItems = 32 - u32min( numberOfInputs - ( itemIndex - lane ), 32 ); // 0 - 32 + int nNoneActiveItems = WARP_SIZE - u32min( numberOfInputs - ( itemIndex - lane ), WARP_SIZE ); // 0 - 32 u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; - for( int j = 0; j < 8; ++j ) + for( int j = 0; j < N_RADIX; ++j ) { u32 bit = ( bucketIndex >> j ) & 0x1; u32 difference = ( 0xFFFFFFFF * bit ) ^ @@ -565,8 +565,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys broThreads &= ~difference; } - int laneIndex = threadIdx.x % 32; - u32 lowerMask = ( 1u << laneIndex ) - 1; + u32 lowerMask = ( 1u << lane ) - 1; if( itemIndex < numberOfInputs ) { @@ -632,7 +631,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int i = threadIdx.x; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - //u32 s = localPrefixSum[i]; u32 s = smem.u.phase1.blockHistogram[i]; int pIndex = BIN_SIZE * ( blockIndex % LOOKBACK_TABLE_SIZE ) + i; @@ -722,7 +720,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); @@ -752,7 +750,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { __syncthreads(); - for( int i = lane, k = 0; i < 
REORDER_NUMBER_OF_ITEM_PER_WARP; i += 32, k++ ) + for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); From 34ac3652390d19d8f555a54350d9ce5e36358645 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 11:10:22 +0900 Subject: [PATCH 51/68] fix undefined behavior and simplify --- ParallelPrimitives/RadixSortKernels.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index ee1e4af..3ec8bd5 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -285,7 +285,6 @@ __device__ inline u32 getKeyBits( u32 x ) { return x ^ ORDER_MASK_32; } __device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } __device__ inline u32 extractDigit( u32 x, u32 bitLocation ) { return ( x >> bitLocation ) & RADIX_MASK; } __device__ inline u32 extractDigit( u64 x, u32 bitLocation ) { return (u32)( ( x >> bitLocation ) & RADIX_MASK ); } -__device__ __forceinline__ u32 u32min( u32 x, u32 y ) { return ( y < x ) ? 
y : x; } template __device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) @@ -550,8 +549,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys keys[k] = item; } - int nNoneActiveItems = WARP_SIZE - u32min( numberOfInputs - ( itemIndex - lane ), WARP_SIZE ); // 0 - 32 - u32 broThreads = 0xFFFFFFFF >> nNoneActiveItems; + // check the attendees + u32 broThreads = +#if defined( ITS ) + __ballot_sync( 0xFFFFFFFF, +#else + __ballot( +#endif + itemIndex < numberOfInputs ); for( int j = 0; j < N_RADIX; ++j ) { From 3289c1b663bec727fa19b6153271111542819870 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 11:40:09 +0900 Subject: [PATCH 52/68] simplify --- ParallelPrimitives/RadixSortKernels.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 3ec8bd5..207affb 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -571,18 +571,15 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } u32 lowerMask = ( 1u << lane ) - 1; - - if( itemIndex < numberOfInputs ) - { - warpOffsets[k] = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); - } + warpOffsets[k] = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); + #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); #else __threadfence_block(); #endif - bool leader = ( broThreads & lowerMask ) == 0; - if( itemIndex < numberOfInputs && leader ) + u32 leaderIdx = __ffs( broThreads ) - 1; + if( lane == leaderIdx ) { u32 n = __popc( broThreads ); smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; From 680a91026e4a386a9dd72726430846bc8cb8bf58 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 11:46:49 +0900 Subject: [PATCH 53/68] refactor --- 
ParallelPrimitives/RadixSortKernels.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 207affb..2e03f27 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -571,7 +571,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } u32 lowerMask = ( 1u << lane ) - 1; - warpOffsets[k] = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] + __popc( broThreads & lowerMask ); + auto digitCount = smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; + warpOffsets[k] = digitCount + __popc( broThreads & lowerMask ); #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); @@ -581,8 +582,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys u32 leaderIdx = __ffs( broThreads ) - 1; if( lane == leaderIdx ) { - u32 n = __popc( broThreads ); - smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] += n; + smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp] = digitCount + __popc( broThreads ); } #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); @@ -593,9 +593,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - if( threadIdx.x < BIN_SIZE ) + for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - int bucketIndex = threadIdx.x; u32 s = 0; for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) { @@ -683,9 +682,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys scanExclusive( 0, smem.u.phase1.blockHistogram, BIN_SIZE ); - if( threadIdx.x < BIN_SIZE ) + for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - int bucketIndex = threadIdx.x; u32 s = smem.u.phase1.blockHistogram[bucketIndex]; for( int warp = 0; warp < 
REORDER_NUMBER_OF_WARPS; warp++ ) { @@ -714,9 +712,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys warpOffsets[k] += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; } - if( threadIdx.x < BIN_SIZE ) + for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { - pSum[threadIdx.x] -= smem.u.phase1.blockHistogram[threadIdx.x]; + pSum[bucketIndex] -= smem.u.phase1.blockHistogram[bucketIndex]; } __syncthreads(); From d7d0274c3e4e59dce69cc3afea9f43dabf14c254 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 12:30:25 +0900 Subject: [PATCH 54/68] support non blockDim != 256 case --- ParallelPrimitives/RadixSortConfigs.h | 9 ++++----- ParallelPrimitives/RadixSortKernels.h | 8 +++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 178ffa4..1cb1639 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -22,23 +22,22 @@ static_assert( BIN_SIZE % 2 == 0 ); constexpr int WARP_SIZE = 32; -constexpr int RADIX_SORT_BLOCK_SIZE = 2048 + 1024 + 1024; -// constexpr int RADIX_SORT_BLOCK_SIZE = 512; +constexpr int RADIX_SORT_BLOCK_SIZE = 4096; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; constexpr int REORDER_NUMBER_OF_WARPS = 8; -constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = 32 * REORDER_NUMBER_OF_WARPS; +constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = WARP_SIZE * REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = RADIX_SORT_BLOCK_SIZE / REORDER_NUMBER_OF_WARPS; -constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; +constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / WARP_SIZE; constexpr int LOOKBACK_TABLE_SIZE = 1024; constexpr int MAX_LOOK_BACK = 64; constexpr 
int TAIL_BITS = 5; constexpr int TAIL_COUNT = 1u << TAIL_BITS; -static_assert( BIN_SIZE <= REORDER_NUMBER_OF_THREADS_PER_BLOCK, "please check scanExclusive" ); +//static_assert( BIN_SIZE <= REORDER_NUMBER_OF_THREADS_PER_BLOCK, "please check scanExclusive" ); //static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); }; // namespace Oro \ No newline at end of file diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 2e03f27..51a06f3 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -680,7 +680,13 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pSum[i] = globalOutput; } - scanExclusive( 0, smem.u.phase1.blockHistogram, BIN_SIZE ); + __syncthreads(); + + u32 prefix = 0; + for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) + { + prefix += scanExclusive( prefix, smem.u.phase1.blockHistogram + i, min( REORDER_NUMBER_OF_THREADS_PER_BLOCK, BIN_SIZE ) ); + } for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { From dea141111282b61e9096a96404eec4d20a27cffd Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 12:33:15 +0900 Subject: [PATCH 55/68] remove unused --- ParallelPrimitives/RadixSortKernels.h | 31 --------------------------- 1 file changed, 31 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 51a06f3..1bec3eb 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -286,37 +286,6 @@ __device__ inline u64 getKeyBits( u64 x ) { return x ^ ORDER_MASK_64; } __device__ inline u32 extractDigit( u32 x, u32 bitLocation ) { return ( x >> bitLocation ) & RADIX_MASK; } __device__ inline u32 extractDigit( u64 x, u32 bitLocation ) { return (u32)( ( x >> bitLocation ) & RADIX_MASK ); } 
-template -__device__ inline u32 prefixSumExclusive( u32 prefix, u32* sMemIO ) -{ - u32 value = sMemIO[threadIdx.x]; - - for( u32 offset = 1; offset < NThreads; offset <<= 1 ) - { - u32 x = sMemIO[threadIdx.x]; - - if( offset <= threadIdx.x ) - { - x += sMemIO[threadIdx.x - offset]; - } - - __syncthreads(); - - sMemIO[threadIdx.x] = x; - - __syncthreads(); - } - u32 sum = sMemIO[NThreads - 1]; - - __syncthreads(); - - sMemIO[threadIdx.x] += prefix - value; - - __syncthreads(); - - return sum; -} - template __device__ inline T scanExclusive( T prefix, T* sMemIO, int nElement ) { From c6f871bfc6eb00d0fb0bc001f230eb4c14a74461 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Wed, 3 Jan 2024 12:50:02 +0900 Subject: [PATCH 56/68] reduce loops and ealier tail iterator is better --- ParallelPrimitives/RadixSortKernels.h | 29 ++++++++++++--------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 1bec3eb..185b271 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -651,6 +651,16 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); + if( threadIdx.x == 0 ) + { + while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) + ; + + atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); + } + + __syncthreads(); + u32 prefix = 0; for( int i = 0; i < BIN_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { @@ -660,6 +670,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { u32 s = smem.u.phase1.blockHistogram[bucketIndex]; + + pSum[bucketIndex] -= s; // pre-substruct to avoid pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex] to calculate destinations + for( 
int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) { int index = bucketIndex * REORDER_NUMBER_OF_WARPS + warp; @@ -671,30 +684,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - if( threadIdx.x == 0 ) - { - while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) - ; - - atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); - } - - __syncthreads(); - for( int k = 0; k < REORDER_NUMBER_OF_ITEM_PER_THREAD; k++ ) { u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); warpOffsets[k] += smem.u.phase1.lpSum[bucketIndex * REORDER_NUMBER_OF_WARPS + warp]; } - for( int bucketIndex = threadIdx.x; bucketIndex < BIN_SIZE; bucketIndex += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) - { - pSum[bucketIndex] -= smem.u.phase1.blockHistogram[bucketIndex]; - } - __syncthreads(); - for( int i = lane, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i; From 402db80aef7902c3e036a26bd7d037bc98fcf69f Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 4 Jan 2024 00:06:58 +0900 Subject: [PATCH 57/68] remove redundant sync --- ParallelPrimitives/RadixSortKernels.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 185b271..854c0c1 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -342,8 +342,6 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf } } - __syncthreads(); - u32 numberOfBlocks = div_round_up( numberOfInputs, GHISTOGRAM_ITEM_PER_BLOCK ); __shared__ u32 iBlock; if( threadIdx.x == 0 ) From 16e046cf2d345e0ba9aa5354ab3f852eae9cf6b5 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 4 Jan 2024 00:07:14 +0900 Subject: [PATCH 58/68] use constant decl --- 
ParallelPrimitives/RadixSortConfigs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 1cb1639..7d4c29a 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -30,7 +30,7 @@ constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; constexpr int REORDER_NUMBER_OF_WARPS = 8; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = WARP_SIZE * REORDER_NUMBER_OF_WARPS; constexpr int REORDER_NUMBER_OF_ITEM_PER_WARP = RADIX_SORT_BLOCK_SIZE / REORDER_NUMBER_OF_WARPS; -constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / WARP_SIZE; +constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WARP / 32; constexpr int LOOKBACK_TABLE_SIZE = 1024; constexpr int MAX_LOOK_BACK = 64; From 69b09b17560f56f5e887b900528b0ee6d6963183 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sun, 14 Jan 2024 13:05:18 +0900 Subject: [PATCH 59/68] shorten --- ParallelPrimitives/RadixSortKernels.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 854c0c1..a8ad5a3 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -498,20 +498,10 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane; - u32 bucketIndex = 0; if( itemIndex < numberOfInputs ) { - RADIX_SORT_KEY_TYPE item; - if( batchLoading ) - { - item = smem.u.phase1.batchKeys[warp][k % N_BATCH_LOAD][lane]; - } - else - { - item = inputKeys[itemIndex]; - } - + RADIX_SORT_KEY_TYPE item = batchLoading ? 
smem.u.phase1.batchKeys[warp][k % N_BATCH_LOAD][lane] : inputKeys[itemIndex]; bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); keys[k] = item; } From 2261c0280fc925a24e59396cf0d05c82c57201b3 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sun, 14 Jan 2024 20:35:12 +0900 Subject: [PATCH 60/68] remove unused --- ParallelPrimitives/RadixSortKernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index a8ad5a3..8436890 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -692,7 +692,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - for( int i = threadIdx.x, k = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK, k++ ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) @@ -723,7 +723,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys __syncthreads(); - for( int i = threadIdx.x, k = 0; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK, k++ ) + for( int i = threadIdx.x; i < RADIX_SORT_BLOCK_SIZE; i += REORDER_NUMBER_OF_THREADS_PER_BLOCK ) { u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + i; if( itemIndex < numberOfInputs ) From 6723b8e88f6f4b1a84890b9332ab03a9b58ead09 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Thu, 18 Jan 2024 21:56:38 +0900 Subject: [PATCH 61/68] remove too much optimizations, fix potential sync issue etc --- ParallelPrimitives/RadixSortKernels.h | 60 ++++++++------------------- 1 file changed, 17 insertions(+), 43 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 8436890..7605016 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ 
b/ParallelPrimitives/RadixSortKernels.h @@ -292,15 +292,10 @@ __device__ inline T scanExclusive( T prefix, T* sMemIO, int nElement ) // assert(nElement <= nThreads) bool active = threadIdx.x < nElement; T value = active ? sMemIO[threadIdx.x] : 0; + T x = value; for( u32 offset = 1; offset < nElement; offset <<= 1 ) { - T x; - if( active ) - { - x = sMemIO[threadIdx.x]; - } - if( active && offset <= threadIdx.x ) { x += sMemIO[threadIdx.x - offset]; @@ -322,7 +317,7 @@ __device__ inline T scanExclusive( T prefix, T* sMemIO, int nElement ) if( active ) { - sMemIO[threadIdx.x] += prefix - value; + sMemIO[threadIdx.x] = x + prefix - value; } __syncthreads(); @@ -436,7 +431,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { u16 blockHistogram[BIN_SIZE]; u16 lpSum[BIN_SIZE * REORDER_NUMBER_OF_WARPS]; - RADIX_SORT_KEY_TYPE batchKeys[REORDER_NUMBER_OF_WARPS][N_BATCH_LOAD][WARP_SIZE]; }; struct Phase2 { @@ -468,43 +462,21 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys RADIX_SORT_KEY_TYPE keys[REORDER_NUMBER_OF_ITEM_PER_THREAD]; u32 warpOffsets[REORDER_NUMBER_OF_ITEM_PER_THREAD]; - bool batchLoading = KEY_IS_16BYTE_ALIGNED && ( blockIndex + 1 ) * RADIX_SORT_BLOCK_SIZE <= numberOfInputs; - int warp = threadIdx.x / WARP_SIZE; int lane = threadIdx.x % WARP_SIZE; + for( int i = 0, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) { - if( batchLoading && ( k % N_BATCH_LOAD ) == 0 ) - { - struct alignas( 16 ) BatchKeys - { - RADIX_SORT_KEY_TYPE xs[N_BATCH_LOAD]; - }; - int srcIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane * N_BATCH_LOAD; - BatchKeys batchKeys = *(BatchKeys*)&inputKeys[srcIndex]; - for( int v = 0; v < N_BATCH_LOAD; v++ ) - { - int indexInWarp = lane * N_BATCH_LOAD + v; - int toK = indexInWarp / WARP_SIZE; - int toLane = indexInWarp % WARP_SIZE; - smem.u.phase1.batchKeys[warp][toK][toLane] = batchKeys.xs[v]; - } - -#if defined( 
ITS ) - __syncwarp( 0xFFFFFFFF ); -#else - __threadfence_block(); -#endif - } - u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane; - u32 bucketIndex = 0; if( itemIndex < numberOfInputs ) { - RADIX_SORT_KEY_TYPE item = batchLoading ? smem.u.phase1.batchKeys[warp][k % N_BATCH_LOAD][lane] : inputKeys[itemIndex]; - bucketIndex = extractDigit( getKeyBits( item ), bitLocation ); - keys[k] = item; + keys[k] = inputKeys[itemIndex]; } + } + for( int i = 0, k = 0; i < REORDER_NUMBER_OF_ITEM_PER_WARP; i += WARP_SIZE, k++ ) + { + u32 itemIndex = blockIndex * RADIX_SORT_BLOCK_SIZE + warp * REORDER_NUMBER_OF_ITEM_PER_WARP + i + lane; + u32 bucketIndex = extractDigit( getKeyBits( keys[k] ), bitLocation ); // check the attendees u32 broThreads = @@ -534,7 +506,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); #else - __threadfence_block(); + __syncthreads(); #endif u32 leaderIdx = __ffs( broThreads ) - 1; if( lane == leaderIdx ) @@ -544,7 +516,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys #if defined( ITS ) __syncwarp( 0xFFFFFFFF ); #else - __threadfence_block(); + __syncthreads(); #endif } @@ -661,9 +633,9 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys pSum[bucketIndex] -= s; // pre-substruct to avoid pSum[bucketIndex] + i - smem.u.phase1.blockHistogram[bucketIndex] to calculate destinations - for( int warp = 0; warp < REORDER_NUMBER_OF_WARPS; warp++ ) + for( int w = 0; w < REORDER_NUMBER_OF_WARPS; w++ ) { - int index = bucketIndex * REORDER_NUMBER_OF_WARPS + warp; + int index = bucketIndex * REORDER_NUMBER_OF_WARPS + w; u32 n = smem.u.phase1.lpSum[index]; smem.u.phase1.lpSum[index] = s; s += n; @@ -738,12 +710,14 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys } } } -extern "C" __global__ void onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* 
inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, +extern "C" __global__ void __launch_bounds__( REORDER_NUMBER_OF_THREADS_PER_BLOCK ) onesweep_reorderKey64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, u32 numberOfInputs, u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { onesweep_reorder( inputKeys, outputKeys, nullptr, nullptr, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); } -extern "C" __global__ void onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, u32 numberOfInputs, u32* gpSumBuffer, +extern "C" __global__ void __launch_bounds__( REORDER_NUMBER_OF_THREADS_PER_BLOCK ) onesweep_reorderKeyPair64( RADIX_SORT_KEY_TYPE* inputKeys, RADIX_SORT_KEY_TYPE* outputKeys, RADIX_SORT_VALUE_TYPE* inputValues, RADIX_SORT_VALUE_TYPE* outputValues, + u32 numberOfInputs, + u32* gpSumBuffer, volatile u64* lookBackBuffer, u32* tailIterator, u32 startBits, u32 iteration ) { onesweep_reorder( inputKeys, outputKeys, inputValues, outputValues, numberOfInputs, gpSumBuffer, lookBackBuffer, tailIterator, startBits, iteration ); From 1d399efa658ab185a5ace715ff042c1cccbdf080 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 23 Feb 2024 18:41:31 +0900 Subject: [PATCH 62/68] remove unused branching. 
Thanks to ChihChen --- ParallelPrimitives/RadixSortKernels.h | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 7605016..ec52952 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -346,12 +346,8 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf __syncthreads(); - bool hasData = false; - while( iBlock < numberOfBlocks ) { - hasData = true; - if( KEY_IS_16BYTE_ALIGNED && ( iBlock + 1 ) * GHISTOGRAM_ITEM_PER_BLOCK <= numberOfInputs ) { for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK * 4 ) @@ -401,19 +397,16 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf __syncthreads(); } - if( hasData ) + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) - { - scanExclusive( 0, &localCounters[i][0], BIN_SIZE ); - } + scanExclusive( 0, &localCounters[i][0], BIN_SIZE ); + } - for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) + { + for( int j = threadIdx.x; j < BIN_SIZE; j += GHISTOGRAM_THREADS_PER_BLOCK ) { - for( int j = threadIdx.x; j < BIN_SIZE; j += GHISTOGRAM_THREADS_PER_BLOCK ) - { - atomicAdd( &gpSumBuffer[BIN_SIZE * i + j], localCounters[i][j] ); - } + atomicAdd( &gpSumBuffer[BIN_SIZE * i + j], localCounters[i][j] ); } } } From 20b8e5569718b59518136cba0b3df67a4d9ed295 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 23 Feb 2024 20:31:37 +0900 Subject: [PATCH 63/68] refactor the tail iterator conditions --- ParallelPrimitives/RadixSortConfigs.h | 3 ++- ParallelPrimitives/RadixSortKernels.h | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 7d4c29a..87036e2 100644 
--- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -35,7 +35,8 @@ constexpr int REORDER_NUMBER_OF_ITEM_PER_THREAD = REORDER_NUMBER_OF_ITEM_PER_WAR constexpr int LOOKBACK_TABLE_SIZE = 1024; constexpr int MAX_LOOK_BACK = 64; constexpr int TAIL_BITS = 5; -constexpr int TAIL_COUNT = 1u << TAIL_BITS; +constexpr auto TAIL_MASK = 0xFFFFFFFFu << TAIL_BITS; +static_assert( MAX_LOOK_BACK < LOOKBACK_TABLE_SIZE, "" ); //static_assert( BIN_SIZE <= REORDER_NUMBER_OF_THREADS_PER_BLOCK, "please check scanExclusive" ); //static_assert( BIN_SIZE % REORDER_NUMBER_OF_THREADS_PER_BLOCK == 0, "please check prefixSumExclusive on onesweep_reorder" ); diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index ec52952..2a1785b 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -546,8 +546,8 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( threadIdx.x == 0 && LOOKBACK_TABLE_SIZE <= blockIndex ) { - u32 mustBeDone = blockIndex - LOOKBACK_TABLE_SIZE + MAX_LOOK_BACK; - while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) * TAIL_COUNT <= mustBeDone ) + // Wait until blockIndex < tail - MAX_LOOK_BACK + LOOKBACK_TABLE_SIZE + while( ( atomicAdd( tailIterator, 0 ) & TAIL_MASK ) - MAX_LOOK_BACK + LOOKBACK_TABLE_SIZE <= blockIndex ) ; } __syncthreads(); @@ -606,7 +606,7 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys if( threadIdx.x == 0 ) { - while( ( atomicAdd( tailIterator, 0 ) >> TAIL_BITS ) != blockIndex / TAIL_COUNT ) + while( ( atomicAdd( tailIterator, 0 ) & TAIL_MASK ) != ( blockIndex & TAIL_MASK ) ) ; atomicInc( tailIterator, numberOfBlocks - 1 /* after the vary last item, it will be zero */ ); From a1731faa0827acec0b1385406f462f0b223ba11f Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 23 Feb 2024 23:14:26 +0900 Subject: [PATCH 64/68] simple code is just fine at gHistogram. 
No more KEY_IS_16BYTE_ALIGNED --- ParallelPrimitives/RadixSortConfigs.h | 1 + ParallelPrimitives/RadixSortKernels.h | 45 ++++++--------------------- 2 files changed, 10 insertions(+), 36 deletions(-) diff --git a/ParallelPrimitives/RadixSortConfigs.h b/ParallelPrimitives/RadixSortConfigs.h index 87036e2..40cd112 100644 --- a/ParallelPrimitives/RadixSortConfigs.h +++ b/ParallelPrimitives/RadixSortConfigs.h @@ -26,6 +26,7 @@ constexpr int RADIX_SORT_BLOCK_SIZE = 4096; constexpr int GHISTOGRAM_ITEM_PER_BLOCK = 2048; constexpr int GHISTOGRAM_THREADS_PER_BLOCK = 256; +constexpr int GHISTOGRAM_ITEMS_PER_THREAD = GHISTOGRAM_ITEM_PER_BLOCK / GHISTOGRAM_THREADS_PER_BLOCK; constexpr int REORDER_NUMBER_OF_WARPS = 8; constexpr int REORDER_NUMBER_OF_THREADS_PER_BLOCK = WARP_SIZE * REORDER_NUMBER_OF_WARPS; diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 2a1785b..f2e229a 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -253,9 +253,6 @@ extern "C" __global__ void SortSinglePassKernel( int* gSrcKey, int* gDstKey, int extern "C" __global__ void SortSinglePassKVKernel( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal, int gN, const int START_BIT, const int END_BIT ) { SortSinglePass( gSrcKey, gSrcVal, gDstKey, gDstVal, gN, START_BIT, END_BIT ); } - -constexpr auto KEY_IS_16BYTE_ALIGNED = true; - using RADIX_SORT_KEY_TYPE = u32; using RADIX_SORT_VALUE_TYPE = u32; @@ -348,45 +345,21 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf while( iBlock < numberOfBlocks ) { - if( KEY_IS_16BYTE_ALIGNED && ( iBlock + 1 ) * GHISTOGRAM_ITEM_PER_BLOCK <= numberOfInputs ) + for( int j = 0; j < GHISTOGRAM_ITEMS_PER_THREAD; j++ ) { - for( int i = 0; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK * 4 ) - { - u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i + threadIdx.x * 4; - struct alignas( 16 ) Key4 - { - RADIX_SORT_KEY_TYPE xs[4]; - }; - 
Key4 key4 = *(Key4*)&inputs[itemIndex]; - for( int k = 0; k < 4; k++ ) - { - auto item = key4.xs[k]; - for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) - { - u32 bitLocation = startBits + i * N_RADIX; - u32 bits = extractDigit( getKeyBits( item ), bitLocation ); - atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); - } - } - } - } - else - { - for( int i = threadIdx.x; i < GHISTOGRAM_ITEM_PER_BLOCK; i += GHISTOGRAM_THREADS_PER_BLOCK ) + u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x * GHISTOGRAM_ITEMS_PER_THREAD + j; + if( itemIndex < numberOfInputs ) { - u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + i; - if( itemIndex < numberOfInputs ) + auto item = inputs[itemIndex]; + for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) { - auto item = inputs[itemIndex]; - for( int j = 0; j < sizeof( RADIX_SORT_KEY_TYPE ); j++ ) - { - u32 bitLocation = startBits + j * N_RADIX; - u32 bits = extractDigit( getKeyBits( item ), bitLocation ); - atomicInc( &localCounters[j][bits], 0xFFFFFFFF ); - } + u32 bitLocation = startBits + i * N_RADIX; + u32 bits = extractDigit( getKeyBits( item ), bitLocation ); + atomicInc( &localCounters[i][bits], 0xFFFFFFFF ); } } } + __syncthreads(); if( threadIdx.x == 0 ) From d5beef7e6ba7a93af662e74d0d948f0dd252ab0d Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sat, 24 Feb 2024 14:41:35 +0900 Subject: [PATCH 65/68] remove unused --- ParallelPrimitives/RadixSortKernels.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index f2e229a..9b5849e 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -390,7 +390,6 @@ __device__ __forceinline__ void onesweep_reorder( RADIX_SORT_KEY_TYPE* inputKeys { __shared__ u32 pSum[BIN_SIZE]; - constexpr int N_BATCH_LOAD = 4; struct SMem { struct Phase1 From 1ff67d955d7893e98e2b24d233f41be35222cf52 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Sat, 
24 Feb 2024 15:09:34 +0900 Subject: [PATCH 66/68] unify atomicInc --- ParallelPrimitives/RadixSortKernels.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h index 9b5849e..75e9451 100644 --- a/ParallelPrimitives/RadixSortKernels.h +++ b/ParallelPrimitives/RadixSortKernels.h @@ -336,15 +336,18 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf u32 numberOfBlocks = div_round_up( numberOfInputs, GHISTOGRAM_ITEM_PER_BLOCK ); __shared__ u32 iBlock; - if( threadIdx.x == 0 ) + for(;;) { - iBlock = atomicInc( counter, 0xFFFFFFFF ); - } + if( threadIdx.x == 0 ) + { + iBlock = atomicInc( counter, 0xFFFFFFFF ); + } - __syncthreads(); + __syncthreads(); - while( iBlock < numberOfBlocks ) - { + if( numberOfBlocks <= iBlock ) + break; + for( int j = 0; j < GHISTOGRAM_ITEMS_PER_THREAD; j++ ) { u32 itemIndex = iBlock * GHISTOGRAM_ITEM_PER_BLOCK + threadIdx.x * GHISTOGRAM_ITEMS_PER_THREAD + j; @@ -361,13 +364,6 @@ extern "C" __global__ void gHistogram( RADIX_SORT_KEY_TYPE* inputs, u32 numberOf } __syncthreads(); - - if( threadIdx.x == 0 ) - { - iBlock = atomicInc( counter, 0xFFFFFFFF ); - } - - __syncthreads(); } for( int i = 0; i < sizeof( RADIX_SORT_KEY_TYPE ); i++ ) From b3af1e9dbd7d13b532f50a3230bf5de0a583eccc Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 1 Mar 2024 19:08:42 +0900 Subject: [PATCH 67/68] remove temporal splitmix64 --- Test/RadixSort/main.cpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/Test/RadixSort/main.cpp b/Test/RadixSort/main.cpp index 2f7578d..090da60 100644 --- a/Test/RadixSort/main.cpp +++ b/Test/RadixSort/main.cpp @@ -49,19 +49,6 @@ class Stopwatch }; #endif -struct splitmix64 -{ - uint64_t x = 0; /* The state can be seeded with any value. 
*/ - - uint64_t next() - { - uint64_t z = ( x += 0x9e3779b97f4a7c15 ); - z = ( z ^ ( z >> 30 ) ) * 0xbf58476d1ce4e5b9; - z = ( z ^ ( z >> 27 ) ) * 0x94d049bb133111eb; - return z ^ ( z >> 31 ); - } -}; - using u64 = Oro::RadixSort::u64; using u32 = Oro::RadixSort::u32; @@ -82,13 +69,9 @@ class SortTest std::vector srcKey( testSize ); - splitmix64 rng; for( int i = 0; i < testSize; i++ ) { srcKey[i] = getRandom( 0u, (u32)( ( 1ull << (u64)testBits ) - 1 ) ); - - //u32 mask = (u32)( ( 1ull << (u64)testBits ) - 1 ); - //srcKey[i] = rng.next() & mask; } std::vector srcValue( testSize ); From 656e5782b90349cc6680e2f16aec3f93b5ce8584 Mon Sep 17 00:00:00 2001 From: "Atsushi.Yoshimura" Date: Fri, 1 Mar 2024 19:13:43 +0900 Subject: [PATCH 68/68] use arg_cast instead --- ParallelPrimitives/RadixSort.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 4ab14c6..62c7415 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -212,23 +212,17 @@ void RadixSort::sort( const KeyValueSoA& src, const KeyValueSoA& dst, uint32_t n int nIteration = div_round_up64( endBit - startBit, 8 ); uint64_t numberOfBlocks = div_round_up64( n, RADIX_SORT_BLOCK_SIZE ); - // Buffers - void* gpSumBuffer = m_gpSumBuffer.ptr(); - void* lookBackBuffer = m_lookbackBuffer.ptr(); - void* tailIteratorBuffer = m_tailIterator.ptr(); - m_lookbackBuffer.resetAsync( stream ); m_gpSumCounter.resetAsync( stream ); m_gpSumBuffer.resetAsync( stream ); // counter for gHistogram. { - void* counter = m_gpSumCounter.ptr(); int maxBlocksPerMP = 0; oroError e = oroOccupancyMaxActiveBlocksPerMultiprocessor( &maxBlocksPerMP, m_gHistogram, GHISTOGRAM_THREADS_PER_BLOCK, 0 ); const int nBlocks = e == oroSuccess ? 
maxBlocksPerMP * m_props.multiProcessorCount : 2048; - const void* args[] = { &src.key, &n, &gpSumBuffer, &startBit, &counter }; + const void* args[] = { &src.key, &n, arg_cast( m_gpSumBuffer.address() ), &startBit, arg_cast( m_gpSumCounter.address() ) }; OrochiUtils::launch1D( m_gHistogram, nBlocks * GHISTOGRAM_THREADS_PER_BLOCK, args, GHISTOGRAM_THREADS_PER_BLOCK, 0, stream ); } @@ -243,12 +237,12 @@ void RadixSort::sort( const KeyValueSoA& src, const KeyValueSoA& dst, uint32_t n if( keyPair ) { - const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, &gpSumBuffer, &lookBackBuffer, &tailIteratorBuffer, & startBit, &i }; + const void* args[] = { &s.key, &d.key, &s.value, &d.value, &n, arg_cast( m_gpSumBuffer.address() ), arg_cast( m_lookbackBuffer.address() ), arg_cast( m_tailIterator.address() ), &startBit, &i }; OrochiUtils::launch1D( m_onesweep_reorderKeyPair64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } else { - const void* args[] = { &s.key, &d.key, &n, &gpSumBuffer, &lookBackBuffer, &tailIteratorBuffer, &startBit, &i }; + const void* args[] = { &s.key, &d.key, &n, arg_cast( m_gpSumBuffer.address() ), arg_cast( m_lookbackBuffer.address() ), arg_cast( m_tailIterator.address() ), &startBit, &i }; OrochiUtils::launch1D( m_onesweep_reorderKey64, numberOfBlocks * REORDER_NUMBER_OF_THREADS_PER_BLOCK, args, REORDER_NUMBER_OF_THREADS_PER_BLOCK, 0, stream ); } std::swap( s, d );