From 082930ee60c5a88878c3da1770d544ee6b87402f Mon Sep 17 00:00:00 2001
From: spencerwb <websp@seas.upenn.edu>
Date: Sun, 20 Sep 2020 23:34:41 -0400
Subject: [PATCH 01/16] 1st commit

---
 stream_compaction/cpu.cu | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..9b49032 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,4 +1,5 @@
 #include <cstdio>
+#include <vector>
 #include "cpu.h"
 
 #include "common.h"
@@ -20,6 +21,9 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; i++)
+                odata[i] = odata[i - 1] + idata[i - 1];
             timer().endCpuTimer();
         }
 
@@ -31,8 +35,13 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            std::vector<int> o = std::vector<int>();
+            for (int i = 0; i < n; i++)
+                if (idata[i])
+                    o.push_back(idata[i]);
+            odata = o.data();
             timer().endCpuTimer();
-            return -1;
+            return o.size();
         }
 
         /**
@@ -43,6 +52,25 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            // step 1: compute bit mask
+            std::vector<int> mask(n);
+            for (int i = 0; i < n; i++) {
+                if (idata[i] == 0) {
+                    mask.at(i) = 0;
+                }
+                else {
+                    mask.at(i) = 1;
+                }
+            }
+
+            // step 2: exclusive scan 
+            scan(n, odata, mask.data());
+
+            // step 3: scatter
+            for (int i = 0; i < n; i++) {
+
+            }
+
             timer().endCpuTimer();
             return -1;
         }

From 456c62f5d01d9d5b573f43a14f6b6725749d9a60 Mon Sep 17 00:00:00 2001
From: spencerwb <websp@seas.upenn.edu>
Date: Tue, 22 Sep 2020 05:12:22 -0400
Subject: [PATCH 02/16] part 1 done

---
 stream_compaction/cpu.cu   | 19 +++++++++++++++----
 stream_compaction/naive.cu |  4 ++++
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 9b49032..2b6b05e 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,12 +19,12 @@ namespace StreamCompaction {
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
+            // timer().startCpuTimer();
             // TODO
             odata[0] = 0;
             for (int i = 1; i < n; i++)
                 odata[i] = odata[i - 1] + idata[i - 1];
-            timer().endCpuTimer();
+            // timer().endCpuTimer();
         }
 
         /**
@@ -66,13 +66,24 @@ namespace StreamCompaction {
             // step 2: exclusive scan 
             scan(n, odata, mask.data());
 
+            timer().endCpuTimer();
+            return -1;
+
             // step 3: scatter
+            int m = odata[n - 1];
+            std::vector<int> ovec(m);
+            m = 0;
             for (int i = 0; i < n; i++) {
-
+                if (mask[i]) {
+                    ovec[odata[i]] = idata[i];
+                    m++;
+                }
             }
 
+            odata = ovec.data();
+
             timer().endCpuTimer();
-            return -1;
+            return m;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..46cc4fc 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,7 +11,11 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
+
         // TODO: __global__
+        __global__ void kernScan() {
+
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.

From 02aa87b93a5b73446fbe4832de7175a1f1a150cf Mon Sep 17 00:00:00 2001
From: spencerwb <websp@seas.upenn.edu>
Date: Tue, 22 Sep 2020 12:58:58 -0400
Subject: [PATCH 03/16] save

---
 stream_compaction/efficient.cu | 27 ++++++++++++++++++++
 stream_compaction/naive.cu     | 45 +++++++++++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..ae4901b 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,40 @@ namespace StreamCompaction {
             return timer;
         }
 
+
+        __global__ void prescan(float* g_odata, float* g_idata, int n) {
+            extern __shared__ float temp[];
+            // allocated on invocation 
+            int thid = threadIdx.x; int offset = 1;
+        }
+
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_idata failed!");
+
+            // for most gpus there are 1024 threads per block
+            int threadsPerBlock = 1024;
+            int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceiling of n / threadsPerBlock
+            dim3 blockDim(threadsPerBlock, 0, 0);
+            dim3 gridDim(blocksPerGrid, 0, 0);
+
+
             timer().startGpuTimer();
             // TODO
+            int k = ilog2ceil(n);
+            kernScan << <gridDim, blockDim >> > ();
+
             timer().endGpuTimer();
+
+            cudaFree(dev_idata);
         }
 
         /**
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 46cc4fc..43b2967 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -13,17 +13,60 @@ namespace StreamCompaction {
         }
 
         // TODO: __global__
-        __global__ void kernScan() {
+        __global__ void scan(float* g_odata, float* g_idata, int n) {
+            extern __shared__ float temp[]; 
+            // allocated on invocation    
+            int thid = threadIdx.x;   int pout = 0, pin = 1;   
+            // Load input into shared memory.   
+            // This is exclusive scan, so shift right by one    
+            // and set first element to 0
+            temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
+            __syncthreads(); 
+            for (int offset = 1; offset < n; offset *= 2)   
+            {     
+                pout = 1 - pout; 
+                // swap double buffer indices     
+                pin = 1 - pout;     
+                if (thid >= offset)       
+                    temp[pout*n+thid] += temp[pin*n+thid - offset];     
+                else       
+                    temp[pout*n+thid] = temp[pin*n+thid];     
+                __syncthreads();   
+            }   
+            g_odata[thid] = temp[pout*n+thid]; 
+            // write output 
+        } 
 
+        __global__ void kernScan() {
+            int id = threadIdx.x + blockIdx.x;
         }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_idata failed!");
+
+            // for most gpus there are 1024 threads per block
+            int threadsPerBlock = 1024;
+            int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceiling of n / threadsPerBlock
+            dim3 blockDim(threadsPerBlock, 0, 0);
+            dim3 gridDim(blocksPerGrid, 0, 0);
+
+            
             timer().startGpuTimer();
             // TODO
+            int k = ilog2ceil(n);
+            kernScan<<<gridDim, blockDim >>>();
+
             timer().endGpuTimer();
+
+            cudaFree(dev_idata);
         }
     }
 }

From 7f5c8533fb836a9ecda12d9d11761339c117fbfb Mon Sep 17 00:00:00 2001
From: spencerwb <websp@seas.upenn.edu>
Date: Tue, 22 Sep 2020 14:53:07 -0400
Subject: [PATCH 04/16] save

---
 stream_compaction/common.cu    | 12 +++++
 stream_compaction/efficient.cu | 82 ++++++++++++++++++++++++++++++++--
 stream_compaction/naive.cu     |  2 +-
 3 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..222ee88 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,13 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+                return;
+            if (idata[index])
+                bools[index] = 1;
+            else
+                bools[index] = 0;
         }
 
         /**
@@ -33,6 +40,11 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+                return;
+            if (bools[index])
+                odata[indices[index]] = idata[index];
         }
 
     }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index ae4901b..5d5da9c 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -13,11 +13,43 @@ namespace StreamCompaction {
         }
 
 
-        __global__ void prescan(float* g_odata, float* g_idata, int n) {
+        __global__ void prescan(int n, float* g_odata, float* g_idata) {
             extern __shared__ float temp[];
             // allocated on invocation 
-            int thid = threadIdx.x; int offset = 1;
-        }
+            int thid = threadIdx.x;
+            int offset = 1;
+            temp[2 * thid] = g_idata[2 * thid]; // load input into shared memory
+            temp[2*thid+1] = g_idata[2*thid+1];
+
+            // build sum in place up the tree
+            for (int d = n >> 1; d > 0; d >>= 1) {
+                __syncthreads();    
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    temp[bi] += temp[ai];
+                }
+                offset *= 2;
+            }
+            if (thid == 0) { temp[n - 1] = 0; } // clear the last element
+            
+            // traverse down tree & build scan
+            for (int d = 1; d < n; d *= 2) {
+                offset >>= 1;
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    float t = temp[ai];
+                    temp[ai] = temp[bi];
+                    temp[bi] += t;
+                }
+            }
+            __syncthreads();
+            g_odata[2 * thid] = temp[2 * thid];
+            // write results to device memory
+            g_odata[2*thid+1] = temp[2*thid+1];
+        } 
 
 
         /**
@@ -41,7 +73,7 @@ namespace StreamCompaction {
             timer().startGpuTimer();
             // TODO
             int k = ilog2ceil(n);
-            kernScan << <gridDim, blockDim >> > ();
+            // kernScan << <gridDim, blockDim >> > ();
 
             timer().endGpuTimer();
 
@@ -57,10 +89,52 @@ namespace StreamCompaction {
          * @param idata  The array of elements to compact.
          * @returns      The number of elements remaining after compaction.
          */
+        using namespace StreamCompaction::Common;
         int compact(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_odata;
+            bool* dev_bools;
+            int* dev_indices;
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_idata failed!");
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+
+            cudaMalloc((void**)&dev_bools, n * sizeof(bool));
+            checkCUDAError("cudaMalloc dev_mask failed!");
+
+            cudaMalloc((void**)&dev_indices, n * sizeof(bool));
+            checkCUDAError("cudaMalloc dev_indices failed!");
+
+            // for most gpus there are 1024 threads per block
+            int threadsPerBlock = 1024;
+            int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceiling of n / threadsPerBlock
+            dim3 blockDim(threadsPerBlock, 0, 0);
+            dim3 gridDim(blocksPerGrid, 0, 0);
+
+
             timer().startGpuTimer();
             // TODO
+            int k = ilog2ceil(n);
+            // step 1: compute dev_bools = determine which elements should be purged
+            kernMapToBoolean<<<gridDim, blockDim>>>(n, dev_bools, dev_idata);
+            // step 2: exclusive scan on dev_bools
+            kernScan<<<gridDim, blockDim>>>(n, dev_indices, dev_bools);
+            // step 3: reduce the array based on bools
+            kernScatter<<<gridDim, blockDim>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+
             timer().endGpuTimer();
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices)
+
             return -1;
         }
     }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 43b2967..840d12f 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -62,7 +62,7 @@ namespace StreamCompaction {
             timer().startGpuTimer();
             // TODO
             int k = ilog2ceil(n);
-            kernScan<<<gridDim, blockDim >>>();
+            // kernScan<<<gridDim, blockDim >>>();
 
             timer().endGpuTimer();
 

From 8f94ca720e4183652dacb190ff9e7e5db3be1091 Mon Sep 17 00:00:00 2001
From: spencerwb <websp@seas.upenn.edu>
Date: Tue, 22 Sep 2020 15:12:45 -0400
Subject: [PATCH 05/16] breakthrough in algorithmic and syntatic understanding

---
 stream_compaction/naive.cu | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 840d12f..192c1da 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -13,6 +13,7 @@ namespace StreamCompaction {
         }
 
         // TODO: __global__
+        // This version can handle arrays only as large as can be processed by a single thread block running on one multiprocessor of a GPU.
         __global__ void scan(float* g_odata, float* g_idata, int n) {
             extern __shared__ float temp[]; 
             // allocated on invocation    
@@ -46,24 +47,27 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             int* dev_idata;
+            int* dev_odata;
+
             cudaMalloc((void**)&dev_idata, n * sizeof(int));
             checkCUDAError("cudaMalloc dev_idata failed!");
 
             cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
             checkCUDAError("cudaMemcpy idata to dev_idata failed!");
 
-            // for most gpus there are 1024 threads per block
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+
+            // for most gpus there 1024 is the maximum number of threads per block
             int threadsPerBlock = 1024;
             int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceiling of n / threadsPerBlock
             dim3 blockDim(threadsPerBlock, 0, 0);
             dim3 gridDim(blocksPerGrid, 0, 0);
 
-            
             timer().startGpuTimer();
-            // TODO
-            int k = ilog2ceil(n);
-            // kernScan<<<gridDim, blockDim >>>();
-
+            int depth = ilog2ceil(n);
+            for (int d = 0; d < depth; d++)
+                kernScan<<<gridDim, blockDim>>>(n, odata, idata);
             timer().endGpuTimer();
 
             cudaFree(dev_idata);

From f85f0d82d2bd10e804e440e8f8da6ef98a66a899 Mon Sep 17 00:00:00 2001
From: spencerwb <websp@seas.upenn.edu>
Date: Tue, 22 Sep 2020 15:16:13 -0400
Subject: [PATCH 06/16] save

---
 stream_compaction/naive.cu | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 192c1da..7c327bc 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -32,14 +32,15 @@ namespace StreamCompaction {
                     temp[pout*n+thid] += temp[pin*n+thid - offset];     
                 else       
                     temp[pout*n+thid] = temp[pin*n+thid];     
-                __syncthreads();   
+                __syncthreads();
             }   
             g_odata[thid] = temp[pout*n+thid]; 
             // write output 
         } 
 
-        __global__ void kernScan() {
-            int id = threadIdx.x + blockIdx.x;
+        __global__ void kernScan(int n, int* odata, int*idata, int offset, int pingpong) {
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+
         }
 
         /**

From 75ccf8113f22f78d6b0ed1ec45d90487a049b0d6 Mon Sep 17 00:00:00 2001
From: spencerwb <websp@seas.upenn.edu>
Date: Tue, 22 Sep 2020 18:03:44 -0400
Subject: [PATCH 07/16] save

---
 notes.txt                      |   5 ++
 src/main.cpp                   |   7 ++-
 stream_compaction/efficient.cu |  13 ++--
 stream_compaction/naive.cu     | 107 +++++++++++++++++++++++----------
 4 files changed, 92 insertions(+), 40 deletions(-)
 create mode 100644 notes.txt

diff --git a/notes.txt b/notes.txt
new file mode 100644
index 0000000..e2dd362
--- /dev/null
+++ b/notes.txt
@@ -0,0 +1,5 @@
+How does dim3 work? and dimensions in CUDA. Does dim3 set unspecified arguments to 1 or 0?
+Does CUDA expect unused dimensions to be 1 or 0?
+
+How do these checkCUDAError work? They sometimes say that an entire is occurring at an incorrect location if 
+I dont have one at every CUDA function call.
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..61bd8ea 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -51,14 +51,15 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, a, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan */
     onesArray(SIZE, c);
     printDesc("1s array for finding bugs");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
+    printArray(SIZE, c, true);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 5d5da9c..045380f 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -56,6 +56,9 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+
+            return;
+
             int* dev_idata;
             cudaMalloc((void**)&dev_idata, n * sizeof(int));
             checkCUDAError("cudaMalloc dev_idata failed!");
@@ -91,6 +94,8 @@ namespace StreamCompaction {
          */
         using namespace StreamCompaction::Common;
         int compact(int n, int *odata, const int *idata) {
+            return -1;
+
             int* dev_idata;
             int* dev_odata;
             bool* dev_bools;
@@ -122,18 +127,18 @@ namespace StreamCompaction {
             // TODO
             int k = ilog2ceil(n);
             // step 1: compute dev_bools = determine which elements should be purged
-            kernMapToBoolean<<<gridDim, blockDim>>>(n, dev_bools, dev_idata);
+            // kernMapToBoolean<<<gridDim, blockDim>>>(n, dev_bools, dev_idata);
             // step 2: exclusive scan on dev_bools
-            kernScan<<<gridDim, blockDim>>>(n, dev_indices, dev_bools);
+            // kernScan<<<gridDim, blockDim>>>(n, dev_indices, dev_bools);
             // step 3: reduce the array based on bools
-            kernScatter<<<gridDim, blockDim>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+            // kernScatter<<<gridDim, blockDim>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
 
             timer().endGpuTimer();
 
             cudaFree(dev_idata);
             cudaFree(dev_odata);
             cudaFree(dev_bools);
-            cudaFree(dev_indices)
+            cudaFree(dev_indices);
 
             return -1;
         }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 7c327bc..f53c7c0 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+#include <string>
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -14,64 +16,103 @@ namespace StreamCompaction {
 
         // TODO: __global__
         // This version can handle arrays only as large as can be processed by a single thread block running on one multiprocessor of a GPU.
-        __global__ void scan(float* g_odata, float* g_idata, int n) {
-            extern __shared__ float temp[]; 
-            // allocated on invocation    
-            int thid = threadIdx.x;   int pout = 0, pin = 1;   
-            // Load input into shared memory.   
-            // This is exclusive scan, so shift right by one    
-            // and set first element to 0
-            temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
-            __syncthreads(); 
-            for (int offset = 1; offset < n; offset *= 2)   
-            {     
-                pout = 1 - pout; 
-                // swap double buffer indices     
-                pin = 1 - pout;     
-                if (thid >= offset)       
-                    temp[pout*n+thid] += temp[pin*n+thid - offset];     
-                else       
-                    temp[pout*n+thid] = temp[pin*n+thid];     
-                __syncthreads();
-            }   
-            g_odata[thid] = temp[pout*n+thid]; 
-            // write output 
-        } 
+        //__global__ void scan(float* g_odata, float* g_idata, int n) {
+        //    extern __shared__ float temp[]; 
+        //    // allocated on invocation    
+        //    int thid = threadIdx.x;   int pout = 0, pin = 1;   
+        //    // Load input into shared memory.   
+        //    // This is exclusive scan, so shift right by one    
+        //    // and set first element to 0
+        //    temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
+        //    __syncthreads(); 
+        //    for (int offset = 1; offset < n; offset *= 2)   
+        //    {     
+        //        pout = 1 - pout; 
+        //        // swap double buffer indices     
+        //        pin = 1 - pout;     
+        //        if (thid >= offset)       
+        //            temp[pout*n+thid] += temp[pin*n+thid - offset];     
+        //        else       
+        //            temp[pout*n+thid] = temp[pin*n+thid];     
+        //        __syncthreads();
+        //    }   
+        //    g_odata[thid] = temp[pout*n+thid]; 
+        //    // write output 
+        //} 
+
+        __global__ void kernInitExScan(int n, int* temp, int* idata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= n)
+                idata[idx] = 0;
+            // shift the array to the right by one for exclusive scan
+            // the initializing the padding of idata inn the above line is not guaranteed to be 
+            // completed for all threads by the time the next line is reached
+            // so just initialize all of the padding in the temp to 0 here
+            temp[idx] = (idx > 0 && idx < n) ? idata[idx - 1] : 0;
+        }
 
-        __global__ void kernScan(int n, int* odata, int*idata, int offset, int pingpong) {
-            int index = threadIdx.x + blockIdx.x * blockDim.x;
+        __global__ void kernExScan(int pN, int* temp, int* odata, const int*idata, int offset, int pingpong) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
 
+            if (idx >= offset)
+                temp[pingpong * pN + idx] += temp[(1 - pingpong) * pN + idx - offset];
+            else
+                temp[pingpong * pN + idx] = temp[(1 - pingpong) * pN + idx];
         }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
+        using namespace std;
         void scan(int n, int *odata, const int *idata) {
             int* dev_idata;
             int* dev_odata;
+            int* dev_temp;
 
-            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            int depth = ilog2ceil(n);
+            // remember numbers are read from right to left
+            int pN = 1 << depth;    // n rounded to the next power of 2 = n after padding
+
+            // allocating memory for dev_idata and copying memory over from idata
+            cudaMalloc((void**)&dev_idata, pN * sizeof(int));
             checkCUDAError("cudaMalloc dev_idata failed!");
 
+            // std::unique_ptr<int[]>intKeys{ new int[N] };
             cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
             checkCUDAError("cudaMemcpy idata to dev_idata failed!");
 
+            // allocating memory for dev_odata
             cudaMalloc((void**)&dev_odata, n * sizeof(int));
             checkCUDAError("cudaMalloc dev_odata failed!");
 
-            // for most gpus there 1024 is the maximum number of threads per block
+            // allocating memory for dev_temp
+            cudaMalloc((void**)&dev_temp, 2 * pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_temp failed!");
+
+            // for most gpus 1024 is the maximum number of threads per block
             int threadsPerBlock = 1024;
-            int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceiling of n / threadsPerBlock
-            dim3 blockDim(threadsPerBlock, 0, 0);
-            dim3 gridDim(blocksPerGrid, 0, 0);
+            int blocksPerGrid = (pN + threadsPerBlock - 1) / threadsPerBlock; // ceiling of ( pN / threadsPerBlock )
+            dim3 blockDim(threadsPerBlock);
+            dim3 gridDim(blocksPerGrid);
 
             timer().startGpuTimer();
-            int depth = ilog2ceil(n);
-            for (int d = 0; d < depth; d++)
-                kernScan<<<gridDim, blockDim>>>(n, odata, idata);
+            // initializes buffers necessary for naive exclusive scan
+            kernInitExScan<<<gridDim, blockDim>>>(n, dev_temp, dev_idata);
+            checkCUDAError("kernInitExScan failed!");
+            // execution of naive exclusive scan in parallel
+            // uses global memory instead of shared memory for ping pong buffers
+            // so that the data can be of arbitrary size
+            int pingpong = 0;
+            for (int offset = 1; offset < pN; offset *= 2) {
+                kernExScan<<<gridDim, blockDim>>>(pN, dev_temp, dev_odata, dev_idata, offset, pingpong);
+                checkCUDAError("kernExScan failed!");
+                pingpong = 1 - pingpong;
+            }
             timer().endGpuTimer();
 
             cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_temp);
         }
     }
 }

From 55b7e98cd9bc665b5008f4e5c33aa5b322b51dc9 Mon Sep 17 00:00:00 2001
From: spencerwb <websp@seas.upenn.edu>
Date: Tue, 22 Sep 2020 20:24:03 -0400
Subject: [PATCH 08/16] naive scan is finally working properly

---
 notes.txt                  | 31 ++++++++++++++-
 src/main.cpp               |  8 ++--
 stream_compaction/naive.cu | 79 ++++++++++++++++++++++++++++++++++----
 3 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/notes.txt b/notes.txt
index e2dd362..c1f8511 100644
--- a/notes.txt
+++ b/notes.txt
@@ -1,5 +1,34 @@
 How does dim3 work? and dimensions in CUDA. Does dim3 set unspecified arguments to 1 or 0?
 Does CUDA expect unused dimensions to be 1 or 0?
 
+
+
 How do these checkCUDAError work? They sometimes say that an entire is occurring at an incorrect location if 
-I dont have one at every CUDA function call.
\ No newline at end of file
+I dont have one at every CUDA function call.
+
+
+
+Do we need to include new functions in header files?
+
+
+
+Inside of CUDA files are we using c or C++
+
+Dont have classes. The coding style might be closer to C instead of C++. You can pass structs to CUDA kernels.
+
+When using the memory window is it showing you gpu or cpu memory when you copy and paste an address
+from the locals or autos window?
+
+Can memcpy from the device back to the host.
+And start with a smaller sized buffer so that you can check the values in the buffers by hand.
+
+
+dev_data1, dev_data2;
+// cudamalloc, memcpy, etc
+// for eah iteration, launch kernels on dev_data1 and dev_data2
+int* temp = dev_data1
+dev_dta1 = dev_data2
+dev_data2 = temp
+
+
+The weird alternating thing where every other value was zero was due to the book's funky way of ping ponging.
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 61bd8ea..13791af 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 4; //8; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -49,23 +49,25 @@ int main(int argc, char* argv[]) {
 
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
+    printArray(SIZE, a, true);
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    printArray(SIZE, a, true);
     printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan */
     onesArray(SIZE, c);
     printDesc("1s array for finding bugs");
+    printArray(SIZE, a, true);
     StreamCompaction::Naive::scan(SIZE, c, a);
     printArray(SIZE, c, true);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
+    printArray(SIZE, a, true);
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index f53c7c0..235d47b 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -4,6 +4,8 @@
 #include "naive.h"
 
 #include <string>
+#include <iostream>
+#include <vector>
 
 namespace StreamCompaction {
     namespace Naive {
@@ -40,34 +42,71 @@ namespace StreamCompaction {
         //    // write output 
         //} 
 
-        __global__ void kernInitExScan(int n, int* temp, int* idata) {
+        __global__ void kernInitExScan(int n, int pN, int* temp, int* idata, int* pong) {
             int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= pN)
+                return;
+
             if (idx >= n)
                 idata[idx] = 0;
             // shift the array to the right by one for exclusive scan
             // the initializing the padding of idata inn the above line is not guaranteed to be 
             // completed for all threads by the time the next line is reached
             // so just initialize all of the padding in the temp to 0 here
+            pong[idx] = (idx > 0 && idx < n) ? idata[idx - 1] : 0;
+            return;
+
             temp[idx] = (idx > 0 && idx < n) ? idata[idx - 1] : 0;
         }
 
-        __global__ void kernExScan(int pN, int* temp, int* odata, const int*idata, int offset, int pingpong) {
+        __global__ void kernExScan(int pN, int* temp, int* odata, const int*idata, int* ping, int* pong, int offset, int pingpong) {
             int idx = threadIdx.x + (blockIdx.x * blockDim.x);
 
+            if (idx >= pN)
+                return;
+
+            if (idx >= offset)
+                ping[idx] = pong[idx] + pong[idx - offset];
+            else
+                ping[idx] = pong[idx];
+
+            return;
+
+            if (idx >= offset)
+                ping[idx] += pong[idx - offset];
+            else
+                ping[idx] = pong[idx];
+
+            return;
+
             if (idx >= offset)
                 temp[pingpong * pN + idx] += temp[(1 - pingpong) * pN + idx - offset];
             else
                 temp[pingpong * pN + idx] = temp[(1 - pingpong) * pN + idx];
         }
 
+        using namespace std;
+        void printArray(int n, const int* a, bool abridged = false) {
+            cout << "    [ ";
+            for (int i = 0; i < n; i++) {
+                if (abridged && i + 2 == 15 && n > 16) {
+                    i = n - 2;
+                    cout << "... ";
+                }
+                cout << a[i] << " ";
+            }
+            cout << "]\n";
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        using namespace std;
         void scan(int n, int *odata, const int *idata) {
             int* dev_idata;
             int* dev_odata;
             int* dev_temp;
+            int* dev_ping;
+            int* dev_pong;
 
             int depth = ilog2ceil(n);
             // remember numbers are read from right to left
@@ -77,7 +116,6 @@ namespace StreamCompaction {
             cudaMalloc((void**)&dev_idata, pN * sizeof(int));
             checkCUDAError("cudaMalloc dev_idata failed!");
 
-            // std::unique_ptr<int[]>intKeys{ new int[N] };
             cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
             checkCUDAError("cudaMemcpy idata to dev_idata failed!");
 
@@ -89,6 +127,14 @@ namespace StreamCompaction {
             cudaMalloc((void**)&dev_temp, 2 * pN * sizeof(int));
             checkCUDAError("cudaMalloc dev_temp failed!");
 
+            // allocating memory for dev_ping
+            cudaMalloc((void**)&dev_ping, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_ping failed!");
+
+            // allocating memory for dev_pong
+            cudaMalloc((void**)&dev_pong, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_pong failed!");
+
             // for most gpus 1024 is the maximum number of threads per block
             int threadsPerBlock = 1024;
             int blocksPerGrid = (pN + threadsPerBlock - 1) / threadsPerBlock; // ceiling of ( pN / threadsPerBlock )
@@ -96,23 +142,42 @@ namespace StreamCompaction {
             dim3 gridDim(blocksPerGrid);
 
             timer().startGpuTimer();
-            // initializes buffers necessary for naive exclusive scan
-            kernInitExScan<<<gridDim, blockDim>>>(n, dev_temp, dev_idata);
+            // launches a kernel that initializes buffers necessary for naive exclusive scan
+            kernInitExScan<<<gridDim, blockDim>>>(n, pN, dev_temp, dev_idata, dev_pong);
             checkCUDAError("kernInitExScan failed!");
+
+            printArray(n, idata, false);
+
             // execution of naive exclusive scan in parallel
             // uses global memory instead of shared memory for ping pong buffers
             // so that the data can be of arbitrary size
             int pingpong = 0;
             for (int offset = 1; offset < pN; offset *= 2) {
-                kernExScan<<<gridDim, blockDim>>>(pN, dev_temp, dev_odata, dev_idata, offset, pingpong);
+                kernExScan<<<gridDim, blockDim>>>(pN, dev_temp, dev_odata, dev_idata, dev_ping, dev_pong, offset, pingpong);
                 checkCUDAError("kernExScan failed!");
+
+                vector<int> temp_test(pN);
+                cudaMemcpy(temp_test.data(), dev_ping, sizeof(int) * pN, cudaMemcpyDeviceToHost);
+                checkCUDAError("cudaMemcpy dev_temp to temp_test failed!");
+                printArray(pN, temp_test.data(), false);
+
                 pingpong = 1 - pingpong;
+                /*cudaMemcpy(dev_temp, dev_ping, pN * sizeof(int), cudaMemcpyDeviceToDevice);
+                cudaMemcpy(dev_ping, dev_pong, pN * sizeof(int), cudaMemcpyDeviceToDevice);
+                cudaMemcpy(dev_pong, dev_temp, pN * sizeof(int), cudaMemcpyDeviceToDevice);*/
+                int* temp = dev_ping;
+                dev_ping = dev_pong;
+                dev_pong = temp;
             }
+            cudaMemcpy(odata, dev_pong, n * sizeof(int), cudaMemcpyDeviceToHost);
+            // cudaMemcpy(odata, dev_temp, n * sizeof(int), cudaMemcpyDeviceToHost);
             timer().endGpuTimer();
 
             cudaFree(dev_idata);
             cudaFree(dev_odata);
             cudaFree(dev_temp);
+            cudaFree(dev_ping);
+            cudaFree(dev_pong);
         }
     }
 }

From b76a34d63c2d48c10040fbde63f4b631aaec14b4 Mon Sep 17 00:00:00 2001
From: spencerwb <websp@seas.upenn.edu>
Date: Tue, 22 Sep 2020 21:19:16 -0400
Subject: [PATCH 09/16] exclusive scan thrust is working

---
 stream_compaction/efficient.cu | 16 +++++++++++++++-
 stream_compaction/thrust.cu    | 20 ++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 045380f..e40e1b3 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -49,7 +49,21 @@ namespace StreamCompaction {
             g_odata[2 * thid] = temp[2 * thid];
             // write results to device memory
             g_odata[2*thid+1] = temp[2*thid+1];
-        } 
+        }
+
+        __device__ void kernUpSweep() {
+
+        }
+
+        __device__ void kernDownSweep() {
+
+        }
+
+        __global__ void kernExScan() {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+
+
+        }
 
 
         /**
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..07fea05 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,31 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_odata;
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            thrust::device_ptr<int> thrust_dev_idata(dev_idata);
+            thrust::device_ptr<int> thrust_dev_odata(dev_odata);
+
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(thrust_dev_idata, thrust_dev_idata + n, thrust_dev_odata);
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
         }
     }
 }

From 0d3836874f4f2835d33fcfaf8069f9386243beba Mon Sep 17 00:00:00 2001
From: spencerwb <websp@seas.upenn.edu>
Date: Tue, 22 Sep 2020 22:08:47 -0400
Subject: [PATCH 10/16] save

---
 stream_compaction/efficient.cu | 43 ++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index e40e1b3..cb799f3 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -51,17 +51,24 @@ namespace StreamCompaction {
             g_odata[2*thid+1] = temp[2*thid+1];
         }
 
-        __device__ void kernUpSweep() {
+        __device__ void kernUpSweep(int n, int pN, int* idata, int offset, int d) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (idx >= pN)
+                return;
 
+            if (idx < d) {
+                int ai = offset * (2 * thid + 1) - 1;
+                int bi = offset * (2 * thid + 2) - 1;
+                temp[bi] += temp[ai];
+            }
         }
 
         __device__ void kernDownSweep() {
 
         }
 
-        __global__ void kernExScan() {
-            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
-
+        __global__ void kernExScan(int n, int pN) {
 
         }
 
@@ -74,6 +81,11 @@ namespace StreamCompaction {
             return;
 
             int* dev_idata;
+
+            int depth = ilog2ceil(n);
+            // remember numbers are read from right to left
+            int pN = 1 << depth;    // n rounded to the next power of 2 = n after padding
+
             cudaMalloc((void**)&dev_idata, n * sizeof(int));
             checkCUDAError("cudaMalloc dev_idata failed!");
 
@@ -89,9 +101,30 @@ namespace StreamCompaction {
 
             timer().startGpuTimer();
             // TODO
-            int k = ilog2ceil(n);
             // kernScan << <gridDim, blockDim >> > ();
+            // upsweep
+            int offset = 1;
+            for (int d = n >> 1; d > 0; d >>= 1) {
+                kernUpSweep<<<gridDim, blockDim>>>();
+                offset *= 2;
+
 
+                kernExScan << <gridDim, blockDim >> > (pN, dev_temp, dev_odata, dev_idata, dev_ping, dev_pong, offset, pingpong);
+                checkCUDAError("kernExScan failed!");
+
+                vector<int> temp_test(pN);
+                cudaMemcpy(temp_test.data(), dev_ping, sizeof(int) * pN, cudaMemcpyDeviceToHost);
+                checkCUDAError("cudaMemcpy dev_temp to temp_test failed!");
+                printArray(pN, temp_test.data(), false);
+
+                pingpong = 1 - pingpong;
+                /*cudaMemcpy(dev_temp, dev_ping, pN * sizeof(int), cudaMemcpyDeviceToDevice);
+                cudaMemcpy(dev_ping, dev_pong, pN * sizeof(int), cudaMemcpyDeviceToDevice);
+                cudaMemcpy(dev_pong, dev_temp, pN * sizeof(int), cudaMemcpyDeviceToDevice);*/
+                int* temp = dev_ping;
+                dev_ping = dev_pong;
+                dev_pong = temp;
+            }
             timer().endGpuTimer();
 
             cudaFree(dev_idata);

From 9becd83cea4867412f5faf1b988e2430514cd427 Mon Sep 17 00:00:00 2001
From: Spencer Webster-Bass <39770751+spencerwb@users.noreply.github.com>
Date: Tue, 22 Sep 2020 23:46:18 -0400
Subject: [PATCH 11/16] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 0e38ddb..1a98207 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,9 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* SPENCER WEBSTER-BASS
+  * [LinkedIn](https://www.linkedin.com/in/spencer-webster-bass/)
+* Tested on: (TODO) Windows 10, i7-6700 @ 3.40GHz 16GB, Quadro P1000 222MB (MOR100B-19)
 
 ### (TODO: Your README)
 

From 1d20d772b9be5c565758fe8eb2fabcad1a263244 Mon Sep 17 00:00:00 2001
From: Spencer Webster-Bass <39770751+spencerwb@users.noreply.github.com>
Date: Tue, 22 Sep 2020 23:46:52 -0400
Subject: [PATCH 12/16] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1a98207..84f2181 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ CUDA Stream Compaction
 
 * SPENCER WEBSTER-BASS
   * [LinkedIn](https://www.linkedin.com/in/spencer-webster-bass/)
-* Tested on: (TODO) Windows 10, i7-6700 @ 3.40GHz 16GB, Quadro P1000 222MB (MOR100B-19)
+* Tested on: Windows 10, i7-6700 @ 3.40GHz 16GB, Quadro P1000 222MB (MOR100B-19)
 
 ### (TODO: Your README)
 

From d7a23030543d21e3f5851e7c01bd1836b04ba653 Mon Sep 17 00:00:00 2001
From: Spencer Webster-Bass <39770751+spencerwb@users.noreply.github.com>
Date: Tue, 22 Sep 2020 23:58:49 -0400
Subject: [PATCH 13/16] Update README.md

---
 README.md | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 84f2181..421b49e 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,96 @@ CUDA Stream Compaction
   * [LinkedIn](https://www.linkedin.com/in/spencer-webster-bass/)
 * Tested on: Windows 10, i7-6700 @ 3.40GHz 16GB, Quadro P1000 222MB (MOR100B-19)
 
-### (TODO: Your README)
+### DESCRIPTION
+
+This project is an implementation of the stream compaction parallel algorithm on the GPU using CUDA and C++.
+
+Features:
+* Serial implementation of scan and stream compaction algorithms on the CPU
+* Naive, parallel implementation of scan and stream compaction algorithms on the GPU
+* Atepted work-efficient, parallel implementation of scan and stream compaction algorithms on the GPU
+* Comparison between my implementations' efficiency and thrust's implementation of exclusive scan algorithm
 
 Include analysis, etc. (Remember, this is public, so don't put
 anything here that you don't want to share with the world.)
 
+****************
+** SCAN TESTS **
+****************
+    [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0ms    (std::chrono Measured)
+    [   0  49  57  59  86 106 150 171 198 247 250 270 273 289 292 323 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0ms    (std::chrono Measured)
+    [   0  49  57  59  86 106 150 171 198 247 250 270 273 ]
+    passed
+==== naive scan, power-of-two ====
+    [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 0 49 57 10 29 47 64 65 48 76 52 23 23 19 19 34 ]
+    [ 0 49 57 59 86 57 93 112 112 141 100 99 75 42 42 53 ]
+    [ 0 49 57 59 86 106 150 171 198 198 193 211 187 183 142 152 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+   elapsed time: 7.58922ms    (CUDA Measured)
+    [   0  49  57  59  86 106 150 171 198 247 250 270 273 289 292 323 ]
+    passed
+==== 1s array for finding bugs ====
+    [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 0 49 57 10 29 47 64 65 48 76 52 23 23 19 19 34 ]
+    [ 0 49 57 59 86 57 93 112 112 141 100 99 75 42 42 53 ]
+    [ 0 49 57 59 86 106 150 171 198 198 193 211 187 183 142 152 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+    [   0  49  57  59  86 106 150 171 198 247 250 270 273 289 292 323 ]
+==== naive scan, non-power-of-two ====
+    [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 ]
+    [ 0 49 57 10 29 47 64 65 48 76 52 23 23 3 0 0 ]
+    [ 0 49 57 59 86 57 93 112 112 141 100 99 75 26 23 3 ]
+    [ 0 49 57 59 86 106 150 171 198 198 193 211 187 167 123 102 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 273 273 273 ]
+   elapsed time: 15.0825ms    (CUDA Measured)
+    [   0  49  57  59  86 106 150 171 198 247 250 270 273   0   0   0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0ms    (CUDA Measured)
+    a[1] = 49, b[1] = 0
+    FAIL VALUE
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0ms    (CUDA Measured)
+    a[1] = 49, b[1] = 0
+    FAIL VALUE
+==== thrust scan, power-of-two ====
+   elapsed time: 0.083008ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.069632ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   1   2   2   1   2   0   1   3   1   3   0   3   0   3   3   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.0034ms    (std::chrono Measured)
+    [   0   0   0   0   0   0   0   0   0   0   0   0 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.004ms    (std::chrono Measured)
+    [   0   0   0   0   0   0   0   0   0   0 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.0023ms    (std::chrono Measured)
+    [ ]
+    expected 12 elements, got -1
+    FAIL COUNT
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0ms    (CUDA Measured)
+    expected 12 elements, got -1
+    FAIL COUNT
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0ms    (CUDA Measured)
+    expected 10 elements, got -1
+    FAIL COUNT
+Press any key to continue . . .

From 888910b8b14fab3ee89544ee1c7406b4048a7612 Mon Sep 17 00:00:00 2001
From: Spencer Webster-Bass <39770751+spencerwb@users.noreply.github.com>
Date: Fri, 20 Aug 2021 14:05:22 -0400
Subject: [PATCH 14/16] Update README.md

---
 README.md | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 421b49e..1a60f74 100644
--- a/README.md
+++ b/README.md
@@ -17,20 +17,23 @@ Features:
 * Atepted work-efficient, parallel implementation of scan and stream compaction algorithms on the GPU
 * Comparison between my implementations' efficiency and thrust's implementation of exclusive scan algorithm
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+TODOs:
+Include Analysis
 
 ****************
 ** SCAN TESTS **
 ****************
     [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
+    
 ==== cpu scan, power-of-two ====
    elapsed time: 0ms    (std::chrono Measured)
     [   0  49  57  59  86 106 150 171 198 247 250 270 273 289 292 323 ]
+    
 ==== cpu scan, non-power-of-two ====
    elapsed time: 0ms    (std::chrono Measured)
     [   0  49  57  59  86 106 150 171 198 247 250 270 273 ]
     passed
+    
 ==== naive scan, power-of-two ====
     [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
     [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
@@ -41,6 +44,7 @@ anything here that you don't want to share with the world.)
    elapsed time: 7.58922ms    (CUDA Measured)
     [   0  49  57  59  86 106 150 171 198 247 250 270 273 289 292 323 ]
     passed
+    
 ==== 1s array for finding bugs ====
     [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
     [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
@@ -49,6 +53,7 @@ anything here that you don't want to share with the world.)
     [ 0 49 57 59 86 106 150 171 198 198 193 211 187 183 142 152 ]
     [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
     [   0  49  57  59  86 106 150 171 198 247 250 270 273 289 292 323 ]
+    
 ==== naive scan, non-power-of-two ====
     [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
     [ 49 8 2 27 20 44 21 27 49 3 20 3 16 ]
@@ -59,17 +64,21 @@ anything here that you don't want to share with the world.)
    elapsed time: 15.0825ms    (CUDA Measured)
     [   0  49  57  59  86 106 150 171 198 247 250 270 273   0   0   0 ]
     passed
+    
 ==== work-efficient scan, power-of-two ====
    elapsed time: 0ms    (CUDA Measured)
     a[1] = 49, b[1] = 0
     FAIL VALUE
+    
 ==== work-efficient scan, non-power-of-two ====
    elapsed time: 0ms    (CUDA Measured)
     a[1] = 49, b[1] = 0
     FAIL VALUE
+    
 ==== thrust scan, power-of-two ====
    elapsed time: 0.083008ms    (CUDA Measured)
     passed
+    
 ==== thrust scan, non-power-of-two ====
    elapsed time: 0.069632ms    (CUDA Measured)
     passed
@@ -78,25 +87,30 @@ anything here that you don't want to share with the world.)
 ** STREAM COMPACTION TESTS **
 *****************************
     [   1   2   2   1   2   0   1   3   1   3   0   3   0   3   3   0 ]
+    
 ==== cpu compact without scan, power-of-two ====
    elapsed time: 0.0034ms    (std::chrono Measured)
     [   0   0   0   0   0   0   0   0   0   0   0   0 ]
     passed
+    
 ==== cpu compact without scan, non-power-of-two ====
    elapsed time: 0.004ms    (std::chrono Measured)
     [   0   0   0   0   0   0   0   0   0   0 ]
     passed
+    
 ==== cpu compact with scan ====
    elapsed time: 0.0023ms    (std::chrono Measured)
     [ ]
     expected 12 elements, got -1
     FAIL COUNT
+    
 ==== work-efficient compact, power-of-two ====
    elapsed time: 0ms    (CUDA Measured)
     expected 12 elements, got -1
     FAIL COUNT
+    
 ==== work-efficient compact, non-power-of-two ====
    elapsed time: 0ms    (CUDA Measured)
     expected 10 elements, got -1
     FAIL COUNT
-Press any key to continue . . .
+

From b86d28b960708df4a466b2be4e8de89a92bdcf2f Mon Sep 17 00:00:00 2001
From: Spencer Webster-Bass <39770751+spencerwb@users.noreply.github.com>
Date: Fri, 20 Aug 2021 14:06:18 -0400
Subject: [PATCH 15/16] Update README.md

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 1a60f74..b28b412 100644
--- a/README.md
+++ b/README.md
@@ -36,12 +36,19 @@ Include Analysis
     
 ==== naive scan, power-of-two ====
     [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
+    
     [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    
     [ 0 49 57 10 29 47 64 65 48 76 52 23 23 19 19 34 ]
+    
     [ 0 49 57 59 86 57 93 112 112 141 100 99 75 42 42 53 ]
+    
     [ 0 49 57 59 86 106 150 171 198 198 193 211 187 183 142 152 ]
+    
     [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+    
    elapsed time: 7.58922ms    (CUDA Measured)
+   
     [   0  49  57  59  86 106 150 171 198 247 250 270 273 289 292 323 ]
     passed
     

From db89bce82a8f844cb4c064770c50f10a7c606a06 Mon Sep 17 00:00:00 2001
From: Spencer Webster-Bass <39770751+spencerwb@users.noreply.github.com>
Date: Fri, 20 Aug 2021 14:08:48 -0400
Subject: [PATCH 16/16] Update README.md

---
 README.md | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/README.md b/README.md
index b28b412..b7acfec 100644
--- a/README.md
+++ b/README.md
@@ -26,15 +26,21 @@ Include Analysis
     [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
     
 ==== cpu scan, power-of-two ====
+
    elapsed time: 0ms    (std::chrono Measured)
+   
     [   0  49  57  59  86 106 150 171 198 247 250 270 273 289 292 323 ]
     
 ==== cpu scan, non-power-of-two ====
+
    elapsed time: 0ms    (std::chrono Measured)
+   
     [   0  49  57  59  86 106 150 171 198 247 250 270 273 ]
+    
     passed
     
 ==== naive scan, power-of-two ====
+
     [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
     
     [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
@@ -50,74 +56,118 @@ Include Analysis
    elapsed time: 7.58922ms    (CUDA Measured)
    
     [   0  49  57  59  86 106 150 171 198 247 250 270 273 289 292 323 ]
+    
     passed
     
 ==== 1s array for finding bugs ====
+
     [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
+    
     [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    
     [ 0 49 57 10 29 47 64 65 48 76 52 23 23 19 19 34 ]
+    
     [ 0 49 57 59 86 57 93 112 112 141 100 99 75 42 42 53 ]
+    
     [ 0 49 57 59 86 106 150 171 198 198 193 211 187 183 142 152 ]
+    
     [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+    
     [   0  49  57  59  86 106 150 171 198 247 250 270 273 289 292 323 ]
     
 ==== naive scan, non-power-of-two ====
+
     [  49   8   2  27  20  44  21  27  49   3  20   3  16   3  31   0 ]
+    
     [ 49 8 2 27 20 44 21 27 49 3 20 3 16 ]
+    
     [ 0 49 57 10 29 47 64 65 48 76 52 23 23 3 0 0 ]
+    
     [ 0 49 57 59 86 57 93 112 112 141 100 99 75 26 23 3 ]
+    
     [ 0 49 57 59 86 106 150 171 198 198 193 211 187 167 123 102 ]
+    
     [ 0 49 57 59 86 106 150 171 198 247 250 270 273 273 273 273 ]
+    
    elapsed time: 15.0825ms    (CUDA Measured)
+   
     [   0  49  57  59  86 106 150 171 198 247 250 270 273   0   0   0 ]
+    
     passed
     
 ==== work-efficient scan, power-of-two ====
+
    elapsed time: 0ms    (CUDA Measured)
+   
     a[1] = 49, b[1] = 0
+    
     FAIL VALUE
     
 ==== work-efficient scan, non-power-of-two ====
+
    elapsed time: 0ms    (CUDA Measured)
+   
     a[1] = 49, b[1] = 0
+    
     FAIL VALUE
     
 ==== thrust scan, power-of-two ====
+
    elapsed time: 0.083008ms    (CUDA Measured)
+   
     passed
     
 ==== thrust scan, non-power-of-two ====
+
    elapsed time: 0.069632ms    (CUDA Measured)
+   
     passed
 
 *****************************
 ** STREAM COMPACTION TESTS **
 *****************************
+
     [   1   2   2   1   2   0   1   3   1   3   0   3   0   3   3   0 ]
     
 ==== cpu compact without scan, power-of-two ====
+
    elapsed time: 0.0034ms    (std::chrono Measured)
+   
     [   0   0   0   0   0   0   0   0   0   0   0   0 ]
+    
     passed
     
 ==== cpu compact without scan, non-power-of-two ====
+
    elapsed time: 0.004ms    (std::chrono Measured)
+   
     [   0   0   0   0   0   0   0   0   0   0 ]
+    
     passed
     
 ==== cpu compact with scan ====
+
    elapsed time: 0.0023ms    (std::chrono Measured)
+   
     [ ]
+    
     expected 12 elements, got -1
+    
     FAIL COUNT
     
 ==== work-efficient compact, power-of-two ====
+
    elapsed time: 0ms    (CUDA Measured)
+   
     expected 12 elements, got -1
+    
     FAIL COUNT
     
 ==== work-efficient compact, non-power-of-two ====
+
    elapsed time: 0ms    (CUDA Measured)
+   
     expected 10 elements, got -1
+    
     FAIL COUNT