diff --git a/README.md b/README.md index 0e38ddb..8f3e042 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,83 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Szeyu Chan + * [LinkedIn](https://www.linkedin.com/in/szeyuchan11/) +* Tested on: Windows 10, i7-10510U @ 1.80GHz 16GB, MX250 2048MB (Personal Laptop) -### (TODO: Your README) +### Features +* CPU Scan & Stream Compaction +* Naive GPU Scan Algorithm +* Work-Efficient GPU Scan & Stream Compaction -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Analysis +#### Block Size (Threads per Block) +![](results/blockSize.png) +According to the experiment, I chose 128 as the block size for both Naive implementation and work-efficient implementation. + +#### Array Size +![](results/arraySize.png) +The Thrust implementation is so efficient. According to the Nsight Timeline, I guess shared memory may be used for optimization. + +#### Performance Bottleneck +![](results/timeline.png) +For a Naive scan implementation, the performance bottleneck is computation (the dark red part). While for a work-efficient scan implementation, memory copy costs much more than computation. (Array Size = 2^24) + +### Output +``` +**************** +** SCAN TESTS ** +**************** + [ 25 8 8 44 34 15 13 24 12 37 33 49 10 ... 17 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 65.7953ms (std::chrono Measured) + [ 0 25 33 41 85 119 134 147 171 183 220 253 302 ... 411028981 411028998 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 23.7746ms (std::chrono Measured) + [ 0 25 33 41 85 119 134 147 171 183 220 253 302 ... 
411028920 411028958 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 88.662ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 88.7263ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 35.2655ms (CUDA Measured) + [ 0 25 33 41 85 119 134 147 171 183 220 253 302 ... 411028981 411028998 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 35.2287ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 3.67821ms (CUDA Measured) + [ 0 25 33 41 85 119 134 147 171 183 220 253 302 ... 411028981 411028998 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 3.54509ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 3 1 3 2 2 0 2 3 3 1 3 3 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 35.6661ms (std::chrono Measured) + [ 1 3 1 3 2 2 2 3 3 1 3 3 2 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 35.8474ms (std::chrono Measured) + [ 1 3 1 3 2 2 2 3 3 1 3 3 2 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 134.251ms (std::chrono Measured) + [ 1 3 1 3 2 2 2 3 3 1 3 3 2 ... 
2 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 47.6788ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 47.8573ms (CUDA Measured) + passed +``` diff --git a/profile.xlsx b/profile.xlsx new file mode 100644 index 0000000..312559a Binary files /dev/null and b/profile.xlsx differ diff --git a/results/arraySize.png b/results/arraySize.png new file mode 100644 index 0000000..4725b76 Binary files /dev/null and b/results/arraySize.png differ diff --git a/results/blockSize.png b/results/blockSize.png new file mode 100644 index 0000000..e03f106 Binary files /dev/null and b/results/blockSize.png differ diff --git a/results/naiveScan.png b/results/naiveScan.png new file mode 100644 index 0000000..b639b7e Binary files /dev/null and b/results/naiveScan.png differ diff --git a/results/thrust.png b/results/thrust.png new file mode 100644 index 0000000..817d7d8 Binary files /dev/null and b/results/thrust.png differ diff --git a/results/timeline.png b/results/timeline.png new file mode 100644 index 0000000..e08175a Binary files /dev/null and b/results/timeline.png differ diff --git a/results/workEfficientScan.png b/results/workEfficientScan.png new file mode 100644 index 0000000..8d7c967 Binary files /dev/null and b/results/workEfficientScan.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..a62e3e2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 24; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -71,7 +71,7 @@ int main(int argc, char* argv[]) { printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - 
//printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); @@ -85,7 +85,7 @@ int main(int argc, char* argv[]) { printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..08c31a4 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,13 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + // DONE + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) + { + return; + } + bools[index] = (idata[index] == 0) ? 0 : 1; } /** @@ -32,7 +38,16 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + // DONE + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) + { + return; + } + if (bools[index] == 1) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..ea059b4 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -13,6 +13,8 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +const int threadsPerBlock = 128; + /** * Check for CUDA errors; print and exit if there was a problem. 
*/ diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..f9bfcd6 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,12 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + // DONE. Simple for loop scan + odata[0] = 0; + for (int i = 1; i < n; ++i) + { + odata[i] = idata[i - 1] + odata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +35,18 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + // DONE + int outi = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) + { + odata[outi] = idata[i]; + outi++; + } + } timer().endCpuTimer(); - return -1; + return outi; } /** @@ -41,10 +55,35 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int *iMap = new int[n]; + int *oMap = new int[n]; timer().startCpuTimer(); - // TODO + // DONE + // Map idata to an array with 0s and 1s + for (int i = 0; i < n; ++i) + { + iMap[i] = (idata[i] == 0) ? 
0 : 1; + } + // Simple CPU scan + oMap[0] = 0; + for (int i = 1; i < n; ++i) + { + oMap[i] = iMap[i - 1] + oMap[i - 1]; + } + // Scatter + int outi = 0; + for (int i = 0; i < n; ++i) + { + if (iMap[i] == 1) + { + odata[oMap[i]] = idata[i]; + outi++; + } + } timer().endCpuTimer(); - return -1; + delete[] iMap; + delete[] oMap; + return outi; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..ae46278 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -11,14 +11,85 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + + // n: number of blocks that need to be swept + // scaleIndex: 2^(d + 1) + // offsetLeft: 2^(d) - 1 + // offsetRight: 2^(d + 1) - 1 + __global__ void kernUpSweep(int* oData, int nSwept, int scaleIndex, int offsetLeft, int offsetRight) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= nSwept) + { + return; + } + int k = index * scaleIndex; + oData[k + offsetRight] += oData[k + offsetLeft]; + } + + // n: number of blocks that need to be swept + // scaleIndex: 2^(d + 1) + // offsetLeft: 2^(d) - 1 + // offsetRight: 2^(d + 1) - 1 + __global__ void kernDownSweep(int* oData, int nSwept, int scaleIndex, int offsetLeft, int offsetRight) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= nSwept) + { + return; + } + int k = index * scaleIndex; + int t = oData[k + offsetLeft]; + oData[k + offsetLeft] = oData[k + offsetRight]; + oData[k + offsetRight] += t; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + + int *dev_odata; + int level = ilog2ceil(n); + int nPOT = 1 << level; // Clamp n to power-of-two + cudaMalloc((void**)&dev_odata, nPOT * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_odata1 failed!"); + cudaMemset(dev_odata, 0, nPOT * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); - // TODO + // DONE + + // Up Sweep + int nSwept = nPOT; + for (int d = 0; d < level; ++d) + { + nSwept /= 2; + dim3 blocksPerGrid((nSwept + threadsPerBlock - 1) / threadsPerBlock); + int scaleIndex = 1 << (d + 1); + int offsetLeft = (1 << d) - 1; + int offsetRight = (1 << (d + 1)) - 1; + kernUpSweep <<<blocksPerGrid, threadsPerBlock>>> (dev_odata, nSwept, scaleIndex, offsetLeft, offsetRight); + } + // Set root to zero + cudaMemset(dev_odata + nPOT - 1, 0, sizeof(int)); + // Down Sweep + nSwept = 1; + for (int d = level - 1; d >= 0; --d) + { + dim3 blocksPerGrid((nSwept + threadsPerBlock - 1) / threadsPerBlock); + int scaleIndex = 1 << (d + 1); + int offsetLeft = (1 << d) - 1; + int offsetRight = (1 << (d + 1)) - 1; + kernDownSweep << < blocksPerGrid, threadsPerBlock >> > (dev_odata, nSwept, scaleIndex, offsetLeft, offsetRight); + nSwept *= 2; + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_odata); } /** @@ -31,10 +102,71 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + + int *dev_bools; + int *dev_indices; + int *dev_idata; + + int level = ilog2ceil(n); + int nPOT = 1 << level; // Clamp n to power-of-two + + cudaMalloc((void**)&dev_bools, nPOT * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_bools failed!"); + cudaMalloc((void**)&dev_indices, nPOT * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_indices failed!"); + cudaMalloc((void**)&dev_idata, nPOT * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_idata failed!"); + + cudaMemset(dev_idata, 0, nPOT * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); - // TODO + // DONE + // Step1: Map idata to bools + dim3 blocksPerGridnPOT((nPOT + threadsPerBlock - 1) / threadsPerBlock); + Common::kernMapToBoolean <<<blocksPerGridnPOT, threadsPerBlock>>> (nPOT, dev_bools, dev_idata); + cudaMemcpy(dev_indices, dev_bools, nPOT * sizeof(int), cudaMemcpyDeviceToDevice); + // Step2: Scan indices + // Up Sweep + int nSwept = nPOT; + for (int d = 0; d < level; ++d) + { + nSwept /= 2; + dim3 blocksPerGrid((nSwept + threadsPerBlock - 1) / threadsPerBlock); + int scaleIndex = 1 << (d + 1); + int offsetLeft = (1 << d) - 1; + int offsetRight = (1 << (d + 1)) - 1; + kernUpSweep <<<blocksPerGrid, threadsPerBlock>>> (dev_indices, nSwept, scaleIndex, offsetLeft, offsetRight); + } + // Set root to zero + cudaMemset(dev_indices + nPOT - 1, 0, sizeof(int)); + // Down Sweep + nSwept = 1; + for (int d = level - 1; d >= 0; --d) + { + dim3 blocksPerGrid((nSwept + threadsPerBlock - 1) / threadsPerBlock); + int scaleIndex = 1 << (d + 1); + int offsetLeft = (1 << d) - 1; + int offsetRight = (1 << (d + 1)) - 1; + kernDownSweep << < blocksPerGrid, threadsPerBlock >> > (dev_indices, nSwept, scaleIndex, offsetLeft, offsetRight); + nSwept *= 2; + } + // Step3: Scatter + Common::kernScatter <<<blocksPerGridnPOT, threadsPerBlock>>> (nPOT, dev_idata, dev_idata, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + + 
int lastIndex = 0; + cudaMemcpy(&lastIndex, dev_indices + nPOT - 1, sizeof(int), cudaMemcpyDeviceToHost); + int lastBool = 0; + cudaMemcpy(&lastBool, dev_bools + nPOT - 1, sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_idata); + return lastIndex + lastBool; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..4fe56ab 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,56 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + // DONE: __global__ + __global__ void kernNaiveScan(int *odata, const int *idata, int offset, int n) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) + { + return; + } + // offset = 2^(d-1) + if (index >= offset) + { + odata[index] = idata[index - offset] + idata[index]; + } + else + { + odata[index] = idata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int *dev_idata; + int *dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_odata0 failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_odata1 failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + // DONE + dim3 blocksPerGrid((n + threadsPerBlock - 1) / threadsPerBlock); + int dmax = ilog2ceil(n); + for (int d = 1; d <= dmax; ++d) + { + int offset = 1 << (d - 1); + kernNaiveScan <<<blocksPerGrid, threadsPerBlock>>> (dev_odata, dev_idata, offset, n); + std::swap(dev_odata, dev_idata); + } + timer().endGpuTimer(); + odata[0] = 0; + cudaMemcpy(odata + 1, dev_idata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..9882060 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,18 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::host_vector<int> host_data(n); + thrust::copy(idata, idata + n, host_data.begin()); + thrust::device_vector<int> dev_idata = host_data; + thrust::device_vector<int> dev_odata(n); timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` + // DONE use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + //thrust::exclusive_scan(idata, idata + n, odata); timer().endGpuTimer(); + host_data = dev_odata; + thrust::copy(host_data.begin(), host_data.begin() + n, odata); } } }