diff --git a/README.md b/README.md index b71c458..4de752d 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,31 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Fengkai Wu +* Tested on: Windows 10, i7-6700 @ 3.40GHz 16GB, Quadro K620 4095MB (Twn M70 Lab) -### (TODO: Your README) +### Analysis Include analysis, etc. (Remember, this is public, so don't put anything here that you don't want to share with the world.) +The running times of exclusive scan under different algorithms are as follows: +![img_1](https://github.com/wufk/Project2-Stream-Compaction/blob/master/img/Scan.png) + +![img_1](https://github.com/wufk/Project2-Stream-Compaction/blob/master/img/proj2table.PNG) + +The running time of stream compaction is as follows: +![img_2](https://github.com/wufk/Project2-Stream-Compaction/blob/master/img/cmpact.png) + +As the graph shows, naive scan takes an extremely long time to finish the job while the efficient way is much faster. However, the GPU performance is still not as good as CPU. In my implementation of efficient scan, for each downSweep/upSweep, the actual number of working threads is re-computed. The launching blocks are also derived from the number of threads to be used. Bits shifting and modulus operation are also avoided. Other possible factors that downplay the performance might be due to too many kernel calls when sweeping up and down, large use of global memory and too many threads required when the array size is large. + +Possible ways to further enhance the performance in the future include using shared memory and dividing and scanning the array by blocks. + +Another thing worth noticing is that thrust runs way faster when the array size is not a multiple of two. 
+ +The timeline of execution when the array size is 2^20 is as follows: +![img_2](https://github.com/wufk/Project2-Stream-Compaction/blob/master/img/proj2Perform.PNG) + +It shows that the CUDA library's memory manipulation is very expensive. Furthermore, we can see that using thrust and using our own algorithms of scanning are calling different CUDA runtime APIs. Thrust is calling the cudaDeviceSynchronize function while our algorithms call cudaEventSynchronize. This may partly explain why thrust runs way faster, in that it is optimized in device and hardware while our effort is just focusing on the algorithm and the high-level part. + +In summary, to get better performance in GPU computing, architecture makes a huge difference and optimization must focus on better allocating resources and making use of the speciality of GPU hardware. diff --git a/img/Scan.png b/img/Scan.png new file mode 100644 index 0000000..e90f03b Binary files /dev/null and b/img/Scan.png differ diff --git a/img/cmpact.png b/img/cmpact.png new file mode 100644 index 0000000..c9be32b Binary files /dev/null and b/img/cmpact.png differ diff --git a/img/proj2Perform.PNG b/img/proj2Perform.PNG new file mode 100644 index 0000000..ce413be Binary files /dev/null and b/img/proj2Perform.PNG differ diff --git a/img/proj2table.PNG b/img/proj2table.PNG new file mode 100644 index 0000000..3037da3 Binary files /dev/null and b/img/proj2table.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..fdd967f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,21 +13,36 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int a[SIZE], b[SIZE], c[SIZE]; + int main(int argc, char* argv[]) { // Scan tests + if (argc != 2) { + printf("test.exe [sizeOfArray, please input 1-25]"); + return 1; + } + const int SIZE = 1 << atoi(argv[1]); // feel free to change the size of array + const int NPOT = SIZE - 3; // Non-Power-Of-Two 
+ //int a[SIZE], b[SIZE], c[SIZE]; + int* b = new int[SIZE]; + int* a = new int[SIZE]; + int* c = new int[SIZE]; + + int *input_inclusive = new int[SIZE]; + int *b_inclusive = new int[SIZE]; + int *c_inclusive = new int[SIZE]; printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); + printf("********************\n"); + printf("** SCAN TESTS, %d **\n", atoi(argv[1])); + printf("********************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, 10); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, input_inclusive, 10); a[SIZE - 1] = 0; + input_inclusive[SIZE - 1] = 0; printArray(SIZE, a, true); + printArray(SIZE, input_inclusive, true); // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. @@ -38,6 +53,11 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(SIZE, b, true); + zeroArray(SIZE, b_inclusive); + printDesc("cpu inclusive scan, power of two"); + StreamCompaction::CPU::inScan(SIZE, b_inclusive, input_inclusive); + printArray(SIZE, b_inclusive, true); + zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); @@ -45,32 +65,44 @@ int main(int argc, char* argv[]) { printArray(NPOT, b, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("cpu inclusive scan, non power of two"); + StreamCompaction::CPU::inScan(SIZE, c, input_inclusive); + printArray(SIZE, c, true); + printCmpResult(NPOT, b_inclusive, c); + + zeroArray(SIZE, c); + printDesc("SM inclusive scan, non power of two"); + StreamCompaction::Efficient::scanSM(NPOT, c, input_inclusive); + printArray(SIZE, c, true); + printCmpResult(NPOT, b_inclusive, c); + zeroArray(SIZE, c); printDesc("naive scan, 
power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -129,15 +161,19 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - system("pause"); // stop 
Win32 console from closing on exit + delete[] a; + delete[] b; + delete[] c; + system + ("pause"); // stop Win32 console from closing on exit } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..e31ca3c 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_30 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..0347bec 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include "device_launch_parameters.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -23,7 +24,11 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx >= n) return; + + bools[idx] = (int)(idata[idx] != 0); } /** @@ -32,7 +37,14 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int idx = threadIdx.x + blockDim.x * blockIdx.x; + + if (idx >= n) return; + + if (bools[idx]) + { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..cc57cb9 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,17 +1,36 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + 
return timer; } + void scanImpl(int n, int *odata, const int *idata) { + + int pre; + + for (int i = 0; i < n; ++i) + { + + if (i == 0) { + pre = idata[i]; + odata[i] = 0; + } + else { + int temp = idata[i]; + odata[i] = odata[i - 1] + pre; + pre = temp; + } + } + } + /** * CPU scan (prefix sum). * For performance analysis, this is supposed to be a simple for loop. @@ -19,10 +38,18 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + scanImpl(n, odata, idata); timer().endCpuTimer(); } + void inScan(int n, int * odata, const int * idata) + { + odata[0] = idata[0]; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i]; + } + } + /** * CPU stream compaction without using the scan function. * @@ -30,9 +57,19 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int count = 0; + + int k = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) + { + count++; + odata[k++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return count; } /** @@ -42,9 +79,23 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + for (int i = 0; i < n; ++i) + { + odata[i] = (idata[i] != 0); + } + + scanImpl(n, odata, odata); + + int count = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) { + odata[odata[i]] = idata[i]; + count++; + } + } timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 236ce11..4e56da7 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -8,6 +8,8 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); + void inScan(int n, int *odata, const int *idata); + int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); diff --git 
a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..9f8926d 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,25 +2,241 @@ #include #include "common.h" #include "efficient.h" +#include "device_launch_parameters.h" namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } + __global__ void kernScanSMBC(int n, int *data) { + extern __shared__ int smem[]; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) { + return; + } + int padding = i / warpSize; + smem[threadIdx.x + padding] = data[i]; + int temp[1024]; + + + + for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) { + __syncthreads(); + for (int i = 0; i < 1024; i++) { + temp[i] = smem[i]; + } + + int index = (threadIdx.x + 1 ) * 2 * stride - 1; + int prev = index - stride; + index += index / warpSize; + prev += prev / warpSize; + if (index < blockDim.x) { + smem[index] += smem[prev]; + } + } + + for (int stride = 1024 / 4; stride > 0; stride /= 2) { + __syncthreads(); + for (int i = 0; i < 1024; i++) { + temp[i] = smem[i]; + } + int index = (threadIdx.x + 1 ) * stride * 2 - 1; + int next = index + stride; + index += index / warpSize; + next += next / warpSize; + if (next < blockDim.x) { + smem[next] += smem[index]; + } + } + + __syncthreads(); + for (int i = 0; i < 1024; i++) { + temp[i] = smem[i]; + } + data[i] = smem[threadIdx.x + padding]; + } + + __global__ void kernScanSM(int n, int *data) { + extern __shared__ int smem[]; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) { + return; + } + smem[threadIdx.x] = data[i]; + int temp[1024]; + + + for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) { + __syncthreads(); + for (int i = 0; i < 1024; 
i++) { + temp[i] = smem[i]; + } + int index = (threadIdx.x + 1) * 2 * stride - 1; + if (index < blockDim.x) { + smem[index] += smem[index - stride]; + } + } + + for (int stride = 1024 / 4; stride > 0; stride /= 2) { + __syncthreads(); + for (int i = 0; i < 1024; i++) { + temp[i] = smem[i]; + } + int index = (threadIdx.x + 1) * stride * 2 - 1; + if (index + stride < blockDim.x) { + smem[index + stride] += smem[index]; + } + } + + __syncthreads(); + for (int i = 0; i < 1024; i++) { + temp[i] = smem[i]; + } + data[i] = smem[threadIdx.x]; + } + + __global__ void kernUpSweep(int n, int offset, int *odata) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx >= n) return; + + if (n == 1) + { + odata[offset - 1] = 0; + return; + } + + int cur = (idx + 1) * offset - 1; + + int prev = cur - (offset / 2); + + odata[cur] += odata[prev]; + } + + __global__ void kernDownSweep(int n, int offset, int *odata) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx >= n) return; + + int cur = (idx + 1) * offset - 1; + + int prev = cur - (offset / 2); + + int temp = odata[prev]; + odata[prev] = odata[cur]; + odata[cur] += temp; + } + + int getPadded(int n) { + int countOfOnes = 0; + int ret = 1; + while (n != 1) + { + if (n & 1 == 1) + { + ++countOfOnes; + } + n >>= 1; + ret <<= 1; + } + if (countOfOnes == 0) return ret; + else return ret << 1; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + int *dev_odata; + dev_odata = nullptr; + + int numToCompute = getPadded(n); + + cudaMalloc(&dev_odata, numToCompute * sizeof(int)); + cudaMemset(dev_odata, 0, numToCompute * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int depth = ilog2ceil(n); + int blockSize = 1024; + int offset = 1; + + timer().startGpuTimer(); + + for (int i = 0; i < depth; ++i) + { + numToCompute /= 2; + offset *= 2; + int blocksPerGrid = (numToCompute + blockSize - 1) / blockSize; + kernUpSweep << > > (numToCompute, offset, dev_odata); + } + + numToCompute = 1; + for (int i = 0; i < depth; ++i) + { + int blocksPerGrid = (numToCompute + blockSize - 1) / blockSize; + kernDownSweep << > > (numToCompute, offset, dev_odata); + numToCompute *= 2; + offset /= 2; + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_odata); } + void scanSM(int n, int * odata, const int * idata) + { + int *dev_odata; + dev_odata = nullptr; + + int numToCompute = getPadded(n); + + cudaMalloc(&dev_odata, numToCompute * sizeof(int)); + cudaMemset(dev_odata, 0, numToCompute * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int blockSize = 1024; + int blocksPerGrid = (numToCompute + blockSize - 1) / blockSize; + kernScanSMBC<< > > (numToCompute, dev_odata); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_odata); + } + + //__global__ void kernScanEachBlock(int n, int *a) { + + //} + + //void scanUsingSharedMem(int n, int *odata, const int *idata) { + // int numPadded = getPadded(n); + + // int *dev_idata, *dev_odata; + + // dev_idata = nullptr; + // cudaMalloc(&dev_idata, numPadded * sizeof(int)); + // cudaMemset(dev_idata, 0, numPadded * sizeof(int)); + // cudaMemcpy(dev_idata, idata, n * sizeof(int), 
cudaMemcpyHostToDevice); + + // dev_odata = nullptr; + // cudaMalloc(&dev_odata, numPadded * sizeof(int)); + + // int blockSize = 1024; + // int numOfBlocks = (numPadded + blockSize - 1) / blockSize; + + // kernScanEachBlock << > > (blockSize, dev_idata, dev_odata); + + // cudaFree(dev_idata); + //} + + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -31,10 +247,67 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO + int numToCompute = getPadded(n); + + int *dev_odata, *dev_idata, *dev_bools, *dev_indices; + dev_odata = nullptr; + dev_idata = nullptr; + dev_bools = nullptr; + dev_indices = nullptr; + + cudaMalloc(&dev_bools, n * sizeof(int)); + cudaMalloc(&dev_idata, n * sizeof(int)); + cudaMalloc(&dev_odata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(dev_bools, 0, n * sizeof(int)); + + cudaMalloc(&dev_indices, numToCompute * sizeof(int)); + cudaMemset(dev_indices, 0, numToCompute * sizeof(int)); + + int depth = ilog2ceil(n); + int offset = 1; + int blockSize = 1024; + int blocksPerGrid = (n + blockSize - 1) / blockSize; + + timer().startGpuTimer(); + Common::kernMapToBoolean << > > (numToCompute, dev_bools, dev_idata); + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + + for (int i = 0; i < depth; ++i) + { + numToCompute /= 2; + offset *= 2; + blocksPerGrid = (numToCompute + blockSize - 1) / blockSize; + kernUpSweep << > > (numToCompute, offset, dev_indices); + + } + + numToCompute = 1; + for (int i = 0; i < depth; ++i) + { + blocksPerGrid = (numToCompute + blockSize - 1) / blockSize; + kernDownSweep << > > (numToCompute, offset, dev_indices); + numToCompute *= 2; + offset /= 2; + } + + int ret; + cudaMemcpy(&ret, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + ret += 
idata[n - 1] == 0 ? 0 : 1; + + + blocksPerGrid = (n + blockSize - 1) / blockSize; + Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, ret * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_odata); + cudaFree(dev_idata); + cudaFree(dev_bools); + cudaFree(dev_indices); + return ret; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..7615728 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -8,6 +8,8 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); + void scanSM(int n, int *odata, const int *idata); + int compact(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..8cce7c3 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,24 +2,90 @@ #include #include "common.h" #include "naive.h" +#include "device_launch_parameters.h" + +#define SECTION_SIZE 4 namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } - // TODO: __global__ + + __global__ void kernNaivSham(int *x, int*y, int n) + { + __shared__ int xy[SECTION_SIZE]; + + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) xy[threadIdx.x] = x[i]; + } + + __global__ void kernScanNaive(int N, int stride, int *odata, const int *idata) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx >= N) return; + + if (idx >= stride) + { + odata[idx] = idata[idx - stride] + idata[idx]; + } + else + { + odata[idx] = idata[idx]; + } + } + + __global__ void kernInclusiveToExclusive(int N, int *odata, const int *idata) + { + int idx = blockIdx.x * 
blockDim.x + threadIdx.x; + + if (idx >= N) return; + + odata[idx] = idx == 0 ? 0 : idata[idx - 1]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + //Dimensions + int blockSize = 1024; + int depth = ilog2ceil(n); + int blocksPerGrid = (n + blockSize - 1) / blockSize; + + //Memory allocation + int *dev_idata, *dev_odata; + + dev_idata = nullptr; + dev_odata = nullptr; + cudaMalloc(&dev_idata, n * sizeof(int)); + cudaMalloc(&dev_odata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + + timer().startGpuTimer(); + + int stride = 1; + for (int d = 0; d < depth; ++d) { + kernScanNaive << > >(n, stride, dev_odata, dev_idata); + int *temp = dev_odata; + dev_odata = dev_idata; + dev_idata = temp; + stride *= 2; + } + + kernInclusiveToExclusive << < blocksPerGrid, blockSize >> > (n, dev_odata, dev_idata); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(&dev_idata); + cudaFree(&dev_odata); + cudaDeviceSynchronize(); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..9ad8db0 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,21 +8,37 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + int *dev_idata, *dev_odata; + dev_idata = nullptr; + dev_odata = nullptr; + + cudaMalloc(&dev_idata, n * sizeof(int)); + cudaMalloc(&dev_odata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + thrust::device_ptr thrust_idata(dev_idata); + thrust::device_ptr thrust_odata(dev_odata); + + timer().startGpuTimer(); + thrust::exclusive_scan(thrust_idata, thrust_idata + n, thrust_odata); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); } } }