Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 77 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,83 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Szeyu Chan
* [LinkedIn](https://www.linkedin.com/in/szeyuchan11/)
* Tested on: Windows 10, i7-10510U @ 1.80GHz 16GB, MX250 2048MB (Personal Laptop)

### (TODO: Your README)
### Features
* CPU Scan & Stream Compaction
* Naive GPU Scan Algorithm
* Work-Efficient GPU Scan & Stream Compaction

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
### Analysis

#### Block Size (Threads per Block)
![](results/blockSize.png)
Based on this experiment, I chose 128 as the block size for both the naive and the work-efficient implementations.

#### Array Size
![](results/arraySize.png)
The Thrust implementation is remarkably efficient. Based on the Nsight timeline, it likely uses shared memory as an optimization.

#### Performance Bottleneck
![](results/timeline.png)
For the naive scan implementation, the performance bottleneck is computation (the dark red part), whereas for the work-efficient scan implementation, memory copies cost much more than computation. (Array size = 2^24)

### Output
```
****************
** SCAN TESTS **
****************
[ 25 8 8 44 34 15 13 24 12 37 33 49 10 ... 17 0 ]
==== cpu scan, power-of-two ====
elapsed time: 65.7953ms (std::chrono Measured)
[ 0 25 33 41 85 119 134 147 171 183 220 253 302 ... 411028981 411028998 ]
==== cpu scan, non-power-of-two ====
elapsed time: 23.7746ms (std::chrono Measured)
[ 0 25 33 41 85 119 134 147 171 183 220 253 302 ... 411028920 411028958 ]
passed
==== naive scan, power-of-two ====
elapsed time: 88.662ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 88.7263ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 35.2655ms (CUDA Measured)
[ 0 25 33 41 85 119 134 147 171 183 220 253 302 ... 411028981 411028998 ]
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 35.2287ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 3.67821ms (CUDA Measured)
[ 0 25 33 41 85 119 134 147 171 183 220 253 302 ... 411028981 411028998 ]
passed
==== thrust scan, non-power-of-two ====
elapsed time: 3.54509ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 1 3 1 3 2 2 0 2 3 3 1 3 3 ... 1 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 35.6661ms (std::chrono Measured)
[ 1 3 1 3 2 2 2 3 3 1 3 3 2 ... 2 1 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 35.8474ms (std::chrono Measured)
[ 1 3 1 3 2 2 2 3 3 1 3 3 2 ... 1 2 ]
passed
==== cpu compact with scan ====
elapsed time: 134.251ms (std::chrono Measured)
[ 1 3 1 3 2 2 2 3 3 1 3 3 2 ... 2 1 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 47.6788ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 47.8573ms (CUDA Measured)
passed
```
Binary file added profile.xlsx
Binary file not shown.
Binary file added results/arraySize.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added results/blockSize.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added results/naiveScan.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added results/thrust.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added results/timeline.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added results/workEfficientScan.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 3 additions & 3 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 24; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down Expand Up @@ -71,7 +71,7 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
Expand All @@ -85,7 +85,7 @@ int main(int argc, char* argv[]) {
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
Expand Down
19 changes: 17 additions & 2 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // One thread per element: flag kept (non-zero) elements with 1,
    // removed (zero) elements with 0.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n)
    {
        bools[index] = (idata[index] != 0) ? 1 : 0;
    }
}

/**
Expand All @@ -32,7 +38,16 @@ namespace StreamCompaction {
*/
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // Scatter step of stream compaction: each kept element (bools[i] == 1)
    // of idata is written to odata at the position given by the
    // exclusive-scan result indices[i].
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= n || bools[index] != 1)
    {
        return;
    }
    odata[indices[index]] = idata[index];
}

}
Expand Down
2 changes: 2 additions & 0 deletions stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

const int threadsPerBlock = 128;

/**
* Check for CUDA errors; print and exit if there was a problem.
*/
Expand Down
49 changes: 44 additions & 5 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@ namespace StreamCompaction {
*/
void scan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();
    // Sequential exclusive prefix sum: odata[i] = sum of idata[0..i-1].
    // Guard n <= 0 so we never write odata[0] out of bounds on empty input.
    if (n > 0)
    {
        odata[0] = 0;  // identity element for the exclusive scan
        for (int i = 1; i < n; ++i)
        {
            odata[i] = odata[i - 1] + idata[i - 1];
        }
    }
    timer().endCpuTimer();
}

Expand All @@ -30,9 +35,18 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();
    // Single pass: copy every non-zero element forward, tracking the
    // output cursor, which doubles as the kept-element count.
    int count = 0;
    for (int i = 0; i < n; ++i)
    {
        if (idata[i] != 0)
        {
            odata[count] = idata[i];
            count++;
        }
    }
    timer().endCpuTimer();
    // Return the number of elements remaining (the stale `return -1;`
    // made the real count unreachable).
    return count;
}

/**
Expand All @@ -41,10 +55,35 @@ namespace StreamCompaction {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
    if (n <= 0)
    {
        return 0;  // nothing to compact; also avoids 0-length scratch arrays
    }
    int *bools = new int[n];
    int *indices = new int[n];
    timer().startCpuTimer();
    // Step 1: map idata to 0/1 keep flags.
    for (int i = 0; i < n; ++i)
    {
        bools[i] = (idata[i] != 0) ? 1 : 0;
    }
    // Step 2: exclusive scan of the flags gives each kept element's
    // destination index in the output.
    indices[0] = 0;
    for (int i = 1; i < n; ++i)
    {
        indices[i] = indices[i - 1] + bools[i - 1];
    }
    // Step 3: scatter kept elements to their scanned positions.
    int count = 0;
    for (int i = 0; i < n; ++i)
    {
        if (bools[i] == 1)
        {
            odata[indices[i]] = idata[i];
            count++;
        }
    }
    timer().endCpuTimer();
    // Free scratch before returning — the original's stale `return -1;`
    // made the delete[]s unreachable, leaking both arrays every call.
    delete[] bools;
    delete[] indices;
    return count;
}
}
}
138 changes: 135 additions & 3 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,85 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}

// Up-sweep (reduce) phase of the work-efficient scan.
// nSwept: number of tree nodes active at this depth
// scaleIndex: 2^(d + 1), stride between adjacent nodes
// offsetLeft: 2^(d) - 1, offset of the left child within a node
// offsetRight: 2^(d + 1) - 1, offset of the right child within a node
__global__ void kernUpSweep(int* oData, int nSwept, int scaleIndex, int offsetLeft, int offsetRight)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < nSwept)
    {
        // Fold the left child's partial sum into the right child.
        int base = index * scaleIndex;
        oData[base + offsetRight] += oData[base + offsetLeft];
    }
}

// Down-sweep phase of the work-efficient (Blelloch) scan.
// nSwept: number of tree nodes active at this depth
// scaleIndex: 2^(d + 1), stride between adjacent nodes
// offsetLeft: 2^(d) - 1, offset of the left child within a node
// offsetRight: 2^(d + 1) - 1, offset of the right child within a node
__global__ void kernDownSweep(int* oData, int nSwept, int scaleIndex, int offsetLeft, int offsetRight)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= nSwept)
    {
        return;
    }
    int k = index * scaleIndex;
    // Save left child, replace it with the parent's value (held in the
    // right child), then add the saved left sum into the right child.
    int t = oData[k + offsetLeft];
    oData[k + offsetLeft] = oData[k + offsetRight];
    oData[k + offsetRight] += t;
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
    if (n <= 0)
    {
        return;  // nothing to scan; avoids ilog2ceil / 1 << level on n <= 0
    }
    int *dev_data;
    int level = ilog2ceil(n);
    int nPOT = 1 << level;  // pad to the next power of two for the tree sweeps
    cudaMalloc((void**)&dev_data, nPOT * sizeof(int));
    checkCUDAErrorFn("cudaMalloc dev_data failed!");
    // Zero the padded tail so it contributes nothing to the sums.
    cudaMemset(dev_data, 0, nPOT * sizeof(int));
    checkCUDAErrorFn("cudaMemset dev_data failed!");
    cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
    checkCUDAErrorFn("cudaMemcpy to device failed!");

    timer().startGpuTimer();

    // Up sweep (reduce): build partial sums up the implicit binary tree.
    int nSwept = nPOT;
    for (int d = 0; d < level; ++d)
    {
        nSwept /= 2;  // half as many active nodes at each deeper level
        dim3 blocksPerGrid((nSwept + threadsPerBlock - 1) / threadsPerBlock);
        int scaleIndex = 1 << (d + 1);
        int offsetLeft = (1 << d) - 1;
        int offsetRight = (1 << (d + 1)) - 1;
        kernUpSweep<<<blocksPerGrid, threadsPerBlock>>>(dev_data, nSwept, scaleIndex, offsetLeft, offsetRight);
    }
    // Set root to the identity (zero) before the down sweep.
    cudaMemset(dev_data + nPOT - 1, 0, sizeof(int));
    // Down sweep: propagate prefix sums back down the tree.
    nSwept = 1;
    for (int d = level - 1; d >= 0; --d)
    {
        dim3 blocksPerGrid((nSwept + threadsPerBlock - 1) / threadsPerBlock);
        int scaleIndex = 1 << (d + 1);
        int offsetLeft = (1 << d) - 1;
        int offsetRight = (1 << (d + 1)) - 1;
        kernDownSweep<<<blocksPerGrid, threadsPerBlock>>>(dev_data, nSwept, scaleIndex, offsetLeft, offsetRight);
        nSwept *= 2;
    }

    timer().endGpuTimer();

    // Only the first n entries are meaningful; the rest is padding.
    cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAErrorFn("cudaMemcpy to host failed!");
    cudaFree(dev_data);
}

/**
Expand All @@ -31,10 +102,71 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
    if (n <= 0)
    {
        return 0;  // nothing to compact
    }
    int *dev_bools;
    int *dev_indices;
    int *dev_idata;
    int *dev_odata;

    int level = ilog2ceil(n);
    int nPOT = 1 << level;  // pad to the next power of two for the scan sweeps

    cudaMalloc((void**)&dev_bools, nPOT * sizeof(int));
    checkCUDAErrorFn("cudaMalloc dev_bools failed!");
    cudaMalloc((void**)&dev_indices, nPOT * sizeof(int));
    checkCUDAErrorFn("cudaMalloc dev_indices failed!");
    cudaMalloc((void**)&dev_idata, nPOT * sizeof(int));
    checkCUDAErrorFn("cudaMalloc dev_idata failed!");
    // Separate output buffer: scattering in place (odata == idata) races —
    // a thread may overwrite dev_idata[p] (p = indices[j]) while thread p
    // has not yet read dev_idata[p] (e.g. input [0,1,1,1]).
    cudaMalloc((void**)&dev_odata, nPOT * sizeof(int));
    checkCUDAErrorFn("cudaMalloc dev_odata failed!");

    // Zero-pad the tail so padded elements map to bool 0 and are dropped.
    cudaMemset(dev_idata, 0, nPOT * sizeof(int));
    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
    checkCUDAErrorFn("cudaMemcpy to device failed!");

    timer().startGpuTimer();
    // Step 1: map input to 0/1 keep flags.
    dim3 blocksPerGridnPOT((nPOT + threadsPerBlock - 1) / threadsPerBlock);
    Common::kernMapToBoolean<<<blocksPerGridnPOT, threadsPerBlock>>>(nPOT, dev_bools, dev_idata);
    cudaMemcpy(dev_indices, dev_bools, nPOT * sizeof(int), cudaMemcpyDeviceToDevice);
    // Step 2: exclusive scan of the flags (up sweep, zero root, down sweep).
    int nSwept = nPOT;
    for (int d = 0; d < level; ++d)
    {
        nSwept /= 2;
        dim3 blocksPerGrid((nSwept + threadsPerBlock - 1) / threadsPerBlock);
        int scaleIndex = 1 << (d + 1);
        int offsetLeft = (1 << d) - 1;
        int offsetRight = (1 << (d + 1)) - 1;
        kernUpSweep<<<blocksPerGrid, threadsPerBlock>>>(dev_indices, nSwept, scaleIndex, offsetLeft, offsetRight);
    }
    cudaMemset(dev_indices + nPOT - 1, 0, sizeof(int));
    nSwept = 1;
    for (int d = level - 1; d >= 0; --d)
    {
        dim3 blocksPerGrid((nSwept + threadsPerBlock - 1) / threadsPerBlock);
        int scaleIndex = 1 << (d + 1);
        int offsetLeft = (1 << d) - 1;
        int offsetRight = (1 << (d + 1)) - 1;
        kernDownSweep<<<blocksPerGrid, threadsPerBlock>>>(dev_indices, nSwept, scaleIndex, offsetLeft, offsetRight);
        nSwept *= 2;
    }
    // Step 3: scatter kept elements into the separate output buffer.
    Common::kernScatter<<<blocksPerGridnPOT, threadsPerBlock>>>(nPOT, dev_odata, dev_idata, dev_bools, dev_indices);

    timer().endGpuTimer();

    // The original's stale `return -1;` here made the copy-back, the count
    // computation, and the cudaFree calls unreachable (device memory leak).
    cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAErrorFn("cudaMemcpy to host failed!");

    // Kept count = last exclusive-scan value + last flag.
    int lastIndex = 0;
    cudaMemcpy(&lastIndex, dev_indices + nPOT - 1, sizeof(int), cudaMemcpyDeviceToHost);
    int lastBool = 0;
    cudaMemcpy(&lastBool, dev_bools + nPOT - 1, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(dev_bools);
    cudaFree(dev_indices);
    cudaFree(dev_idata);
    cudaFree(dev_odata);
    return lastIndex + lastBool;
}
}
}
Loading