diff --git a/.gitignore b/.gitignore
index a59ec56..30af9f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,8 @@ build
.LSOverride

# Icon must end with two \r
-Icon
+Icon
+

# Thumbnails
._*
@@ -560,3 +561,5 @@ xcuserdata
*.xccheckout
*.moved-aside
*.xcuserstate
+
+.vscode
diff --git a/README.md b/README.md
index 0e38ddb..b33afca 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,143 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zhihao Ruan (ruanzh@seas.upenn.edu)
+  * [LinkedIn](https://www.linkedin.com/in/zhihao-ruan-29b29a13a/), [personal website](https://zhihaoruan.xyz/)
+* Tested on: Ubuntu 20.04 LTS, Ryzen 3700X @ 2.22GHz 48GB, RTX 2060 Super @ 7976MB

-### (TODO: Your README)
+## Highlights
+This project implements:
+- a naive parallel scan algorithm that supports arbitrarily sized input arrays;
+- a work-efficient parallel scan algorithm that supports arbitrarily sized input arrays;
+- a stream compaction algorithm, built upon the work-efficient parallel scan, that supports arbitrarily sized input arrays.
+
+The GPU stream compaction algorithm is demonstrated to be over 4x faster than the CPU version.
+
+A sample of test output with `block_size` = 1024 and `array_size` = 2^27 **(the largest array that fits on the local GPU)**:
+```
+****************
+** SCAN TESTS **
+****************
+ [ 5 33 25 22 48 26 23 19 36 32 2 17 45 ... 22 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 79.9027ms (std::chrono Measured)
+ [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515866 -1006515844 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 81.4093ms (std::chrono Measured)
+ [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515949 -1006515918 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 31.3315ms (CUDA Measured)
+ [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515866 -1006515844 ]
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 24.8398ms (CUDA Measured)
+ [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... 0 0 ]
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 37.6307ms (CUDA Measured)
+ [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515866 -1006515844 ]
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 37.6407ms (CUDA Measured)
+ [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515949 -1006515918 ]
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 3.16525ms (CUDA Measured)
+ [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515866 -1006515844 ]
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 3.12653ms (CUDA Measured)
+ [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515949 -1006515918 ]
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 0 3 1 1 1 3 0 1 2 1 1 1 2 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 250.09ms (std::chrono Measured)
+ [ 3 1 1 1 3 1 2 1 1 1 2 2 2 ... 3 3 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 247.095ms (std::chrono Measured)
+ [ 3 1 1 1 3 1 2 1 1 1 2 2 2 ... 2 3 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 886.643ms (std::chrono Measured)
+ [ 3 1 1 1 3 1 2 1 1 1 2 2 2 ... 3 3 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 58.4025ms (CUDA Measured)
+ [ 3 1 1 1 3 1 2 1 1 1 2 2 2 ... 3 3 ]
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 48.5331ms (CUDA Measured)
+ [ 3 1 1 1 3 1 2 1 1 1 2 2 2 ... 2 3 ]
+ passed
+```
+
+## Introduction: Stream Compaction
+Stream compaction is a technique that removes from a list (a.k.a. a stream) all elements that fail to satisfy some criterion. For example, if we have a stream of integers `[1 2 3 2 1 5 23 4 0 0 3 4 2 0 3 8 0]` and we wish to remove "all elements that are 0" (the *criterion*), we get the remaining compact list `[1 2 3 2 1 5 23 4 3 4 2 3 8]`.
+
+Stream compaction is widely used in rendering & ray tracing. Although it may seem straightforward at first, implementing it on the GPU with parallel algorithms so that its performance is actually boosted is non-trivial. This project discusses a method for parallel stream compaction, along with its essential building block: the parallel scan algorithm.
+
+**For a more detailed description of the project, please refer to the [project instruction](INSTRUCTION.md).**
+
+## Parallel Scan
+Parallel scan, a.k.a. parallel prefix sum, is the task of generating from an input list a new list in which each element is the sum of all input elements that come before it. There are two types of parallel scan: *exclusive* scan and *inclusive* scan. The former inserts a 0 at the beginning of the output and discards the total sum at the end of the list, while the latter keeps the total sum at the end of the list and does not introduce the leading 0. For example, the inclusive scan of `[1 2 3 4]` is `[1 3 6 10]`, while the exclusive scan is `[0 1 3 6]`.
+
+![](img/scan_inclusive_exclusive.png)
+
+### Naive Parallel Scan
+A naive algorithm for parallel scan is shown below. In each iteration, a subset of the threads each add up a pair of elements in the list, producing the final result after O(log n) iterations.
+
+![](img/naive_scan.png)
+
+### Work-Efficient Parallel Scan
+There is also a much more efficient version of parallel scan, which involves 1) a list reduction and 2) a down-sweep. The list reduction, also called the "up-sweep", produces the total sum of the list along with partial sums of the intermediate elements. The down-sweep then fills in exactly the partial sums that the intermediate elements are missing, completing the entire parallel scan.
+
+![](img/upsweep.png)
+
+![](img/downsweep.png)
+
+## Parallel Stream Compaction
+With parallel scan solved, we can now get to the real algorithm for parallel stream compaction. Essentially, an effective stream compaction procedure consists of the following steps (a host-side sketch follows the figure below):
+1. Generate a boolean array marking the validity of each element: mark elements to remove as "0" and all others as "1".
+2. Compute an exclusive parallel scan of the boolean array.
+3. Scatter the desired elements into the output array: if an element is marked "1" in the boolean array, store it at the output index given by the corresponding scan result.
+
+![](img/stream_compaction.png)
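+
+To make the three steps concrete, here is a minimal host-side sketch of the pipeline using the `kernMapToBoolean` and `kernScatter` kernels defined in `stream_compaction/common.cu`. The device-buffer management and the `scanExclusive` helper are simplified placeholders for illustration, not the exact implementation in `efficient.cu`.
+
+```cpp
+// Sketch: compact n ints from dev_in into dev_out; returns the compacted length.
+// Assumes dev_bools and dev_indices are preallocated device buffers of size n,
+// and that scanExclusive() stands in for the work-efficient exclusive scan.
+int compactSketch(int n, int *dev_out, const int *dev_in,
+                  int *dev_bools, int *dev_indices) {
+  const int blockSize = 128;
+  dim3 grid((n + blockSize - 1) / blockSize);
+
+  // 1) Map each element to 0 (remove) or 1 (keep).
+  StreamCompaction::Common::kernMapToBoolean<<<grid, blockSize>>>(n, dev_bools, dev_in);
+  // 2) An exclusive scan of the booleans gives each kept element its output index.
+  scanExclusive(n, dev_indices, dev_bools);
+  // 3) Scatter the kept elements to their scanned indices.
+  StreamCompaction::Common::kernScatter<<<grid, blockSize>>>(n, dev_out, dev_in, dev_bools, dev_indices);
+
+  // Compacted length = last scan value + last boolean flag.
+  int lastIndex = 0, lastBool = 0;
+  cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+  return lastIndex + lastBool;
+}
+```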
+
+## Performance Analysis
+**All the tests are conducted with random input arrays with `srand(0)` on a local desktop.**
+
+I found the roughly optimal block size to be 256 for the naive scan and 128 for the work-efficient scan. With these block sizes tuned, I ran the program against multiple input array sizes to evaluate the performance.
+
+After careful evaluation, the current performance bottlenecks are most likely:
+1. Warp divergence and `__syncthreads()`. For both the naive scan and the work-efficient scan, the threads are utilized in an interleaved pattern, which leads to a huge amount of warp divergence.
+2. Global memory accesses are not coalesced. This is for the same reason as (1): we access global memory in an interleaved fashion.
+
+Further improvements to the kernels include re-indexing the active threads to minimize warp divergence (see the sketch at the end of this README), as well as breaking the work-efficient scan kernel into two smaller kernels (up-sweep and down-sweep) to reduce the cost of `__syncthreads()` and warp divergence.
+
+### Parallel Scan, Array Size Power-of-Two
+In this diagram we can see that for large input data, the CPU scan takes the longest to run, while the naive scan and the work-efficient scan perform similarly. When the data size is small, all four methods take roughly the same amount of time. Thrust outperforms the other three methods on large input data.
+
+![](profiling/img/Figure_1.png)
+
+### Parallel Scan, Array Size Non-Power-of-Two
+In this diagram the four methods show roughly the same behavior as with an [array size that is a power of two.](#parallel-scan-array-size-power-of-two)
+
+![](profiling/img/Figure_2.png)
+
+### Stream Compaction, Array Size Power-of-Two
+We can see that when the data size is small, CPU compaction performs roughly the same as work-efficient GPU compaction. However, as the data size increases, GPU compaction outperforms CPU compaction.
+
+![](profiling/img/Figure_3.png)
+
+### Stream Compaction, Array Size Non-Power-of-Two
+This diagram shows similar behavior as with an [array size that is a power of two.](#stream-compaction-array-size-power-of-two)
+
+![](profiling/img/Figure_4.png)
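+
+As a concrete illustration of the re-indexing improvement proposed above, the sketch below contrasts the current interleaved up-sweep indexing with a compacted-thread variant launched once per depth `d`. The kernel name and launch bookkeeping here are assumptions for illustration, not code from this repository.
+
+```cpp
+// Interleaved pattern (current): only threads whose id is a multiple of
+// 2^(d+1) do work, so the active threads stay scattered across all warps:
+//   if (id % (1 << (d + 1)) == 0) {
+//     data[id + (1 << (d + 1)) - 1] += data[id + (1 << d) - 1];
+//   }
+//
+// Re-indexed pattern: thread i handles the i-th active pair, so the
+// n >> (d + 1) active threads occupy consecutive warps, and fully idle
+// warps can retire early instead of diverging.
+__global__ void kernUpSweepStep(int n, int d, int *data) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;  // dense thread index
+  int stride = 1 << (d + 1);
+  if (i < (n >> (d + 1))) {
+    int right = (i + 1) * stride - 1;            // last element of the i-th pair
+    data[right] += data[right - (stride >> 1)];  // add the left partial sum
+  }
+}
+```
+With this indexing, the grid for each up-sweep step can also shrink to `n >> (d + 1)` threads, so deeper levels launch far fewer blocks.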
-Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/cmake/.clang-format b/cmake/.clang-format new file mode 100644 index 0000000..618314c --- /dev/null +++ b/cmake/.clang-format @@ -0,0 +1,9 @@ +--- +BasedOnStyle: Google +--- +Language: Cpp +AccessModifierOffset: -2 +AlignConsecutiveAssignments: true +AlignConsecutiveMacros: true +--- + diff --git a/cmake/cuda_compute_capability.cpp b/cmake/cuda_compute_capability.cpp index ef589a9..0af4901 100644 --- a/cmake/cuda_compute_capability.cpp +++ b/cmake/cuda_compute_capability.cpp @@ -1,58 +1,59 @@ /* -* Copyright (C) 2011 Florian Rathgeber, florian.rathgeber@gmail.com -* -* This code is licensed under the MIT License. See the FindCUDA.cmake script -* for the text of the license. -* -* Based on code by Christopher Bruns published on Stack Overflow (CC-BY): -* http://stackoverflow.com/questions/2285185 -*/ + * Copyright (C) 2011 Florian Rathgeber, florian.rathgeber@gmail.com + * + * This code is licensed under the MIT License. See the FindCUDA.cmake script + * for the text of the license. + * + * Based on code by Christopher Bruns published on Stack Overflow (CC-BY): + * http://stackoverflow.com/questions/2285185 + */ -#include #include +#include + +#include #include int main() { - int deviceCount; - int gpuDeviceCount = 0; - struct cudaDeviceProp properties; - if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) - { - printf("Couldn't get device count: %s\n", cudaGetErrorString(cudaGetLastError())); - return 1; - } + int deviceCount; + int gpuDeviceCount = 0; + struct cudaDeviceProp properties; + if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) { + printf("Couldn't get device count: %s\n", + cudaGetErrorString(cudaGetLastError())); + return 1; + } - std::set<int> computes; - typedef std::set<int>::iterator iter; + std::set<int> computes; + typedef std::set<int>::iterator iter; - // machines with no GPUs can still report one emulation device - for (int device = 0; device < deviceCount; ++device) { - int major = 9999, minor = 9999; - cudaGetDeviceProperties(&properties, device); - if (properties.major != 9999) { // 9999 means emulation only - ++gpuDeviceCount; - major = properties.major; - minor = properties.minor; - if ((major == 2 && minor == 1)) { - // There is no --arch compute_21 flag for nvcc, so force minor to 0 - minor = 0; - } - computes.insert(10 * major + minor); - } - } - int i = 0; - for(iter it = computes.begin(); it != computes.end(); it++, i++) { - if(i > 0) { - printf(" "); - } - printf("%d", *it); + // machines with no GPUs can still report one emulation device + for (int device = 0; device < deviceCount; ++device) { + int major = 9999, minor = 9999; + cudaGetDeviceProperties(&properties, device); + if (properties.major != 9999) { // 9999 means emulation only + ++gpuDeviceCount; + major = properties.major; + minor = properties.minor; + if ((major == 2 && minor == 1)) { + // There is no --arch compute_21 flag for nvcc, so force minor to 0 + minor = 0; + } + computes.insert(10 * major + minor); } - /* don't just return the number of gpus, because other runtime cuda - errors can also yield non-zero return values */ - if (gpuDeviceCount <= 0 || computes.size() <= 0) { - return 1; // failure + } + int i = 0; + for (iter it = computes.begin(); it != computes.end(); it++, i++) { + if (i > 0) { + printf(" "); } - return 0; // success + printf("%d", *it); + } + /* don't just return the number of gpus, because other runtime cuda + errors can also yield non-zero return values */ + if (gpuDeviceCount <= 0 || computes.size() <= 0) { + return 1; // failure + } + return 0; // success } diff --git a/img/downsweep.png b/img/downsweep.png new file mode 100644 index 0000000..2a57e20 Binary files /dev/null and b/img/downsweep.png differ diff --git a/img/naive_scan.png b/img/naive_scan.png new file mode 100644 index 0000000..0e9900c Binary files /dev/null and b/img/naive_scan.png differ diff --git a/img/scan_inclusive_exclusive.png b/img/scan_inclusive_exclusive.png new file mode 100644 index 0000000..0ad3461 Binary files /dev/null and b/img/scan_inclusive_exclusive.png differ diff --git a/img/stream_compaction.png b/img/stream_compaction.png new file mode 100644 index 0000000..39e67d9 Binary files /dev/null and b/img/stream_compaction.png differ diff --git a/img/upsweep.png b/img/upsweep.png new file mode 100644 index 0000000..5c0c86f Binary files /dev/null and b/img/upsweep.png differ diff --git a/profiling/.gitignore b/profiling/.gitignore new file mode 100644 index 0000000..c5adf1a --- /dev/null +++ b/profiling/.gitignore @@ -0,0 +1,441 @@ + +# Created by 
https://www.toptal.com/developers/gitignore/api/python,latex +# Edit at https://www.toptal.com/developers/gitignore?templates=python,latex + +### LaTeX ### +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb + +## Intermediate documents: +*.dvi +*.xdv +*-converted-to.* +# these rules might exclude image files for figures etc. +# *.ps +# *.eps +# *.pdf + +## Generated if empty string is given at "Please type another file name for output:" +.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Build tool directories for auxiliary files +# latexrun +latex.out/ + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.pre +*.snm +*.vrb + +# changes +*.soc + +# comment +*.cut + +# cprotect +*.cpt + +# elsarticle (documentclass of Elsevier journals) +*.spl + +# endnotes +*.ent + +# fixme +*.lox + +# feynmf/feynmp +*.mf +*.mp +*.t[1-9] +*.t[1-9][0-9] +*.tfm + +#(r)(e)ledmac/(r)(e)ledpar +*.end +*.?end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls +*.glsdefs +*.lzo +*.lzs + +# uncomment this for glossaries-extra (will ignore makeindex's style files!) +# *.ist + +# gnuplottex +*-gnuplottex-* + +# gregoriotex +*.gaux +*.glog +*.gtex + +# htlatex +*.4ct +*.4tc +*.idv +*.lg +*.trc +*.xref + +# hyperref +*.brf + +# knitr +*-concordance.tex +# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files +# *.tikz +*-tikzDictionary + +# listings +*.lol + +# luatexja-ruby +*.ltjruby + +# makeidx +*.idx +*.ilg +*.ind + +# minitoc +*.maf +*.mlf +*.mlt +*.mtc[0-9]* +*.slf[0-9]* +*.slt[0-9]* +*.stc[0-9]* + +# minted +_minted* +*.pyg + +# morewrites +*.mw + +# newpax +*.newpax + +# nomencl +*.nlg +*.nlo +*.nls + +# pax +*.pax + +# pdfpcnotes +*.pdfpc + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# scrwfile +*.wrt + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +# pythontex +*.pytxcode +pythontex-files-*/ + +# tcolorbox +*.listing + +# thmtools +*.loe + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# todonotes +*.tdo + +# vhistory +*.hst +*.ver + +# easy-todo +*.lod + +# xcolor +*.xcp + +# xmpincl +*.xmpi + +# xindy +*.xdy + +# xypic precompiled matrices and outlines +*.xyc +*.xyd + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +## Editors: +# WinEdt +*.bak +*.sav + +# Texpad +.texpadtmp + +# LyX +*.lyx~ + +# Kile +*.backup + +# gummi +.*.swp + +# KBibTeX +*~[0-9]* + +# TeXnicCenter +*.tps + +# auto folder when using emacs and auctex +./auto/* +*.el + +# expex forward references with \gathertags +*-tags.tex + +# standalone packages +*.sta + +# Makeindex log files +*.lpz + +# xwatermark package +*.xwm + +# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib +# option is specified. Footnotes are the stored in a file with suffix Notes.bib. +# Uncomment the next line to have this generated file ignored. 
+#*Notes.bib + +### LaTeX Patch ### +# LIPIcs / OASIcs +*.vtc + +# glossaries +*.glstex + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# End of https://www.toptal.com/developers/gitignore/api/python,latex diff --git a/profiling/.style.yapf b/profiling/.style.yapf new file mode 100644 index 0000000..fdd0723 --- /dev/null +++ b/profiling/.style.yapf @@ -0,0 +1,2 @@ +[style] +based_on_style = yapf diff --git a/profiling/img/Figure_1.png b/profiling/img/Figure_1.png new file mode 100644 index 0000000..e19a351 Binary files /dev/null and b/profiling/img/Figure_1.png differ diff --git a/profiling/img/Figure_2.png b/profiling/img/Figure_2.png new file mode 100644 index 0000000..83a164a Binary files /dev/null and b/profiling/img/Figure_2.png differ diff --git a/profiling/img/Figure_3.png b/profiling/img/Figure_3.png new file mode 100644 index 0000000..a222263 Binary files /dev/null and b/profiling/img/Figure_3.png differ diff --git a/profiling/img/Figure_4.png b/profiling/img/Figure_4.png new file mode 100644 index 0000000..5f9d65f Binary files /dev/null and b/profiling/img/Figure_4.png differ diff --git a/profiling/profile.csv b/profiling/profile.csv new file mode 100644 index 0000000..b770825 --- /dev/null +++ b/profiling/profile.csv @@ -0,0 +1,9 @@ +Array Size,CPU Scan (power of two),CPU Scan (non power of two),Naive Scan (power of two),Naive Scan (non power of two),Work Efficient Scan (power of two),Work 
Efficient Scan (non power of two),Thrust Scan (power of two),Thrust Scan (non power of two),CPU Stream Compaction without Scan (power of two),CPU Stream Compaction without Scan (non power of two),CPU Stream Compaction with Scan,Work Efficient Stream Compaction (power of two),Work Efficient Stream Compaction (non power of two) +2,7e-05,2e-05,0.028416,0.011296,0.016384,0.01616,0.022272,0.018464,5e-05,2e-05,0.000291,0.02304,0.022688 +4,9.1e-05,5e-05,0.021952,0.012672,0.016544,0.016192,0.022816,0.018112,8e-05,6e-05,0.000681,0.0232,0.0224 +8,0.00034,0.0001,0.019808,0.013184,0.030048,0.027808,0.021184,0.01776,0.000651,0.000471,0.001903,0.03424,0.033408 +10,0.000651,0.000421,0.029184,0.021184,0.027456,0.026464,0.022432,0.018464,0.002185,0.001853,0.00522,0.034048,0.033472 +15,0.017503,0.01548,0.027328,0.026848,0.03984,0.03904,0.023104,0.020384,0.062198,0.058741,0.183698,0.047296,0.045472 +20,0.653047,0.636355,0.214848,0.214144,0.269568,0.272928,0.302208,0.182688,1.85969,1.8767,6.14675,0.368096,0.454112 +25,26.3133,20.3762,5.80374,7.06064,6.83888,6.2832,1.02957,1.02858,61.6844,62.6342,235.577,9.63056,9.95472 +27,81.1722,83.0083,23.0215,18.5987,24.2505,24.2091,3.15331,3.1527,230.532,239.606,902.052,39.9164,34.9404 diff --git a/profiling/profile_raw.txt b/profiling/profile_raw.txt new file mode 100644 index 0000000..2dcd65d --- /dev/null +++ b/profiling/profile_raw.txt @@ -0,0 +1,112 @@ +2 +7e-05 +2e-05 +0.028416 +0.011296 +0.016384 +0.01616 +0.022272 +0.018464 +5e-05 +2e-05 +0.000291 +0.02304 +0.022688 +4 +9.1e-05 +5e-05 +0.021952 +0.012672 +0.016544 +0.016192 +0.022816 +0.018112 +8e-05 +6e-05 +0.000681 +0.0232 +0.0224 +8 +0.00034 +0.0001 +0.019808 +0.013184 +0.030048 +0.027808 +0.021184 +0.01776 +0.000651 +0.000471 +0.001903 +0.03424 +0.033408 +10 +0.000651 +0.000421 +0.029184 +0.021184 +0.027456 +0.026464 +0.022432 +0.018464 +0.002185 +0.001853 +0.00522 +0.034048 +0.033472 +15 +0.017503 +0.01548 +0.027328 +0.026848 +0.03984 +0.03904 +0.023104 +0.020384 +0.062198 +0.058741 +0.183698 +0.047296 +0.045472 +20 +0.653047 +0.636355 +0.214848 +0.214144 +0.269568 +0.272928 +0.302208 +0.182688 +1.85969 +1.8767 +6.14675 +0.368096 +0.454112 +25 +26.3133 +20.3762 +5.80374 +7.06064 +6.83888 +6.2832 +1.02957 +1.02858 +61.6844 +62.6342 +235.577 +9.63056 +9.95472 +27 +81.1722 +83.0083 +23.0215 +18.5987 +24.2505 +24.2091 +3.15331 +3.1527 +230.532 +239.606 +902.052 +39.9164 +34.9404 diff --git a/profiling/sample_log_b-1024_N-27.txt b/profiling/sample_log_b-1024_N-27.txt new file mode 100644 index 0000000..4c82ccb --- /dev/null +++ b/profiling/sample_log_b-1024_N-27.txt @@ -0,0 +1,60 @@ +**************** +** SCAN TESTS ** +**************** + [ 5 33 25 22 48 26 23 19 36 32 2 17 45 ... 22 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 79.9027ms (std::chrono Measured) + [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515866 -1006515844 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 81.4093ms (std::chrono Measured) + [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515949 -1006515918 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 31.3315ms (CUDA Measured) + [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515866 -1006515844 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 24.8398ms (CUDA Measured) + [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 37.6307ms (CUDA Measured) + [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... 
-1006515866 -1006515844 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 37.6407ms (CUDA Measured) + [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515949 -1006515918 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 3.16525ms (CUDA Measured) + [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515866 -1006515844 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 3.12653ms (CUDA Measured) + [ 0 5 38 63 85 133 159 182 201 237 269 271 288 ... -1006515949 -1006515918 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 3 1 1 1 3 0 1 2 1 1 1 2 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 250.09ms (std::chrono Measured) + [ 3 1 1 1 3 1 2 1 1 1 2 2 2 ... 3 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 247.095ms (std::chrono Measured) + [ 3 1 1 1 3 1 2 1 1 1 2 2 2 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 886.643ms (std::chrono Measured) + [ 3 1 1 1 3 1 2 1 1 1 2 2 2 ... 3 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 58.4025ms (CUDA Measured) + [ 3 1 1 1 3 1 2 1 1 1 2 2 2 ... 3 3 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 48.5331ms (CUDA Measured) + [ 3 1 1 1 3 1 2 1 1 1 2 2 2 ... 2 3 ] + passed diff --git a/profiling/visualize.py b/profiling/visualize.py new file mode 100644 index 0000000..a00906d --- /dev/null +++ b/profiling/visualize.py @@ -0,0 +1,168 @@ +import csv +from matplotlib import pyplot as plt +import numpy as np +from numpy.core.defchararray import array + + +def process_profile_raw(): + file = open('profile_raw.txt') + + # initialize arrays + array_size = [] # in log2 scale + cpu_scan_power_of_two = [] + cpu_scan_non_power_of_two = [] + naive_scan_power_of_two = [] + naive_scan_non_power_of_two = [] + work_efficient_scan_power_of_two = [] + work_efficient_scan_non_power_of_two = [] + thrust_scan_power_of_two = [] + thrust_scan_non_power_of_two = [] + cpu_compact_power_of_two = [] + cpu_compact_non_power_of_two = [] + cpu_compact_scan = [] + work_efficient_compact_power_of_two = [] + work_efficient_compact_non_power_of_two = [] + + while True: + line = file.readline() + if not line: + break + array_size.append(int(line)) + cpu_scan_power_of_two.append(float(file.readline())) + cpu_scan_non_power_of_two.append(float(file.readline())) + naive_scan_power_of_two.append(float(file.readline())) + naive_scan_non_power_of_two.append(float(file.readline())) + work_efficient_scan_power_of_two.append(float(file.readline())) + work_efficient_scan_non_power_of_two.append(float(file.readline())) + thrust_scan_power_of_two.append(float(file.readline())) + thrust_scan_non_power_of_two.append(float(file.readline())) + cpu_compact_power_of_two.append(float(file.readline())) + cpu_compact_non_power_of_two.append(float(file.readline())) + cpu_compact_scan.append(float(file.readline())) + work_efficient_compact_power_of_two.append(float(file.readline())) + work_efficient_compact_non_power_of_two.append(float(file.readline())) + + # write to CSV for better data management + with open("profile.csv", 'w') as csv_file: + csv_writer = csv.writer(csv_file) + csv_writer.writerow([ + 'Array Size', 'CPU Scan (power of two)', 'CPU Scan (non power of two)', + 'Naive Scan (power of two)', 'Naive Scan (non power of two)', + 'Work Efficient Scan (power of two)', + 'Work Efficient Scan (non power of two)', 'Thrust Scan (power of 
two)', + 'Thrust Scan (non power of two)', + 'CPU Stream Compaction without Scan (power of two)', + 'CPU Stream Compaction without Scan (non power of two)', + 'CPU Stream Compaction with Scan', + 'Work Efficient Stream Compaction (power of two)', + 'Work Efficient Stream Compaction (non power of two)' + ]) + csv_writer.writerows( + zip(array_size, cpu_scan_power_of_two, cpu_scan_non_power_of_two, + naive_scan_power_of_two, naive_scan_non_power_of_two, + work_efficient_scan_power_of_two, + work_efficient_scan_non_power_of_two, thrust_scan_power_of_two, + thrust_scan_non_power_of_two, cpu_compact_power_of_two, + cpu_compact_non_power_of_two, cpu_compact_scan, + work_efficient_compact_power_of_two, + work_efficient_compact_non_power_of_two)) + + return (array_size, cpu_scan_power_of_two, cpu_scan_non_power_of_two, + naive_scan_power_of_two, naive_scan_non_power_of_two, + work_efficient_scan_power_of_two, + work_efficient_scan_non_power_of_two, thrust_scan_power_of_two, + thrust_scan_non_power_of_two, cpu_compact_power_of_two, + cpu_compact_non_power_of_two, cpu_compact_scan, + work_efficient_compact_power_of_two, + work_efficient_compact_non_power_of_two) + + +def main(): + # varying block size + efficient_block_size = [1024, 512, 256, 128, 64, 32] # pick: 128 + efficient_time = [0.714144, 0.78016, 1.33078, 0.275328, 0.48048, 0.402848] + + naive_block_size = [1024, 512, 256, 128, 64, 32] # pick: 256 + naive_time = [0.276192, 0.24032, 0.21904, 0.28976, 0.228416, 0.33616] + + (array_size, cpu_scan_power_of_two, cpu_scan_non_power_of_two, + naive_scan_power_of_two, naive_scan_non_power_of_two, + work_efficient_scan_power_of_two, work_efficient_scan_non_power_of_two, + thrust_scan_power_of_two, thrust_scan_non_power_of_two, + cpu_compact_power_of_two, cpu_compact_non_power_of_two, cpu_compact_scan, + work_efficient_compact_power_of_two, + work_efficient_compact_non_power_of_two) = process_profile_raw() + + # visualization + # power-of-two + plt.figure() + plt.plot( + np.array(array_size[4:]), np.log10(np.array(cpu_scan_power_of_two[4:])), + '.-') + plt.plot( + np.array(array_size[4:]), np.log10(np.array(naive_scan_power_of_two[4:])), + '.-') + plt.plot( + np.array(array_size[4:]), + np.log10(np.array(work_efficient_scan_power_of_two[4:])), '.-') + plt.plot( + np.array(array_size[4:]), + np.log10(np.array(thrust_scan_power_of_two[4:])), '.-') + plt.legend(['CPU', 'Naive', 'Work Efficient', 'Thrust']) + plt.xticks(array_size[4:]) + plt.xlabel('Array Size [Log2 Scale]') + plt.ylabel('Time [Log10 ms]') + plt.title('Scan Runtime vs. Array Size (power-of-two)') + + # non-power-of-two + plt.figure() + plt.plot( + np.array(array_size[4:]), + np.log10(np.array(cpu_scan_non_power_of_two[4:])), '.-') + plt.plot( + np.array(array_size[4:]), + np.log10(np.array(naive_scan_non_power_of_two[4:])), '.-') + plt.plot( + np.array(array_size[4:]), + np.log10(np.array(work_efficient_scan_non_power_of_two[4:])), '.-') + plt.plot( + np.array(array_size[4:]), + np.log10(np.array(thrust_scan_non_power_of_two[4:])), '.-') + plt.legend(['CPU', 'Naive', 'Work Efficient', 'Thrust']) + plt.xticks(array_size[4:]) + plt.xlabel('Array Size [Log2 Scale]') + plt.ylabel('Time [Log10 ms]') + plt.title('Scan Runtime vs. 
Array Size (non-power-of-two)') + + # compact, power-of-two + plt.figure() + plt.plot( + np.array(array_size[4:]), + np.log10(np.array(cpu_compact_power_of_two[4:])), '.-') + plt.plot( + np.array(array_size[4:]), + np.log10(np.array(work_efficient_compact_power_of_two[4:])), '.-') + plt.legend(['CPU', 'Work Efficient']) + plt.xticks(array_size[4:]) + plt.xlabel('Array Size [Log2 Scale]') + plt.ylabel('Time [Log10 ms]') + plt.title('Compaction Runtime vs. Array Size (power-of-two)') + + # compact, non-power-of-two + plt.figure() + plt.plot( + np.array(array_size[4:]), + np.log10(np.array(cpu_compact_non_power_of_two[4:])), '.-') + plt.plot( + np.array(array_size[4:]), + np.log10(np.array(work_efficient_compact_non_power_of_two[4:])), '.-') + plt.legend(['CPU', 'Work Efficient']) + plt.xticks(array_size[4:]) + plt.xlabel('Array Size [Log2 Scale]') + plt.ylabel('Time [Log10 ms]') + plt.title('Compaction Runtime vs. Array Size (non-power-of-two)') + plt.show() + + +if __name__ == '__main__': + main() diff --git a/src/.clang-format b/src/.clang-format new file mode 100644 index 0000000..618314c --- /dev/null +++ b/src/.clang-format @@ -0,0 +1,9 @@ +--- +BasedOnStyle: Google +--- +Language: Cpp +AccessModifierOffset: -2 +AlignConsecutiveAssignments: true +AlignConsecutiveMacros: true +--- + diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..4da08a3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,149 +6,180 @@ * @copyright University of Pennsylvania */ -#include #include -#include #include +#include #include + +#include + #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; - -int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. 
- zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
- zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; +const int SIZE = 1 << 20; // feel free to change the size of array +const int NPOT = SIZE - 3; // Non-Power-Of-Two +int *a = new int[SIZE]; +int *b = new int[SIZE]; +int *c = new int[SIZE]; + +int main(int argc, char *argv[]) { + // Scan tests + + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + // initialize b using StreamCompaction::CPU::scan you implement + // We use b for further comparison. Make sure your StreamCompaction::CPU::scan + // is correct. At first all cases passed because b && c are all zeroes. 
+ zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printElapsedTime( + StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printElapsedTime( + StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printElapsedTime( + StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + /* For bug-finding only: Array of 1s to help find bugs in stream compaction or + scan onesArray(SIZE, c); printDesc("1s array for finding bugs"); + StreamCompaction::Naive::scan(SIZE, c, a); + printArray(SIZE, c, true); */ + + zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + printElapsedTime( + StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer() + .getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer() + .getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + printElapsedTime( + StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + printElapsedTime( + StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** STREAM COMPACTION TESTS **\n"); + printf("*****************************\n"); + + // Compaction tests + + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + int count, expectedCount, expectedNPOT; + + // initialize b using StreamCompaction::CPU::compactWithoutScan you implement + // We use b for further comparison. Make sure your + // StreamCompaction::CPU::compactWithoutScan is correct. 
+ zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + printElapsedTime( + StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + expectedCount = count; + printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + printElapsedTime( + StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printElapsedTime( + StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer() + .getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer() + .getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + +#if defined(_WIN32) + system("pause"); // stop Win32 console from closing on exit +#endif + + delete[] a; + delete[] b; + delete[] c; } diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94a..bd560a4 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -1,76 +1,72 @@ #pragma once -#include #include +#include +#include #include #include -#include -template<typename T> +template <typename T> int cmpArrays(int n, T *a, T *b) { - for (int i = 0; i < n; i++) { - if (a[i] != b[i]) { - printf(" a[%d] = %d, b[%d] = %d\n", i, a[i], i, b[i]); - return 1; - } + for (int i = 0; i < n; i++) { + if (a[i] != b[i]) { + printf(" a[%d] = %d, b[%d] = %d\n", i, a[i], i, b[i]); + return 1; } - return 0; + } + return 0; } -void printDesc(const char *desc) { - printf("==== %s ====\n", desc); -} +void printDesc(const char *desc) { printf("==== %s ====\n", desc); } -template<typename T> +template <typename T> void printCmpResult(int n, T *a, T *b) { - printf(" %s \n", - cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); + printf(" %s \n", cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); } -template<typename T> +template <typename T> void printCmpLenResult(int n, int expN, T *a, T *b) { - if (n != expN) { - printf(" expected %d elements, got %d\n", expN, n); - } - printf(" %s \n", - (n == -1 || n != expN) ? "FAIL COUNT" : - cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); + if (n != expN) { + printf(" expected %d elements, got %d\n", expN, n); + } + printf(" %s \n", (n == -1 || n != expN) ? "FAIL COUNT" + : cmpArrays(n, a, b) ? 
"FAIL VALUE" + : "passed"); } void zeroArray(int n, int *a) { - for (int i = 0; i < n; i++) { - a[i] = 0; - } + for (int i = 0; i < n; i++) { + a[i] = 0; + } } void onesArray(int n, int *a) { - for (int i = 0; i < n; i++) { - a[i] = 1; - } + for (int i = 0; i < n; i++) { + a[i] = 1; + } } void genArray(int n, int *a, int maxval) { - srand(time(nullptr)); + srand(time(nullptr)); - for (int i = 0; i < n; i++) { - a[i] = rand() % maxval; - } + for (int i = 0; i < n; i++) { + a[i] = rand() % maxval; + } } void printArray(int n, int *a, bool abridged = false) { - printf(" [ "); - for (int i = 0; i < n; i++) { - if (abridged && i + 2 == 15 && n > 16) { - i = n - 2; - printf("... "); - } - printf("%3d ", a[i]); + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); } - printf("]\n"); + printf("%3d ", a[i]); + } + printf("]\n"); } -template -void printElapsedTime(T time, std::string note = "") -{ - std::cout << " elapsed time: " << time << "ms " << note << std::endl; +template +void printElapsedTime(T time, std::string note = "") { + std::cout << " elapsed time: " << time << "ms " << note << std::endl; } diff --git a/stream_compaction/.clang-format b/stream_compaction/.clang-format new file mode 100644 index 0000000..618314c --- /dev/null +++ b/stream_compaction/.clang-format @@ -0,0 +1,9 @@ +--- +BasedOnStyle: Google +--- +Language: Cpp +AccessModifierOffset: -2 +AlignConsecutiveAssignments: true +AlignConsecutiveMacros: true +--- + diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..d9e0a5d 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,39 +1,79 @@ #include "common.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { - cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) { - return; - } + cudaError_t err = cudaGetLastError(); + if (cudaSuccess == err) { + return; + } - fprintf(stderr, "CUDA error"); - if (file) { - fprintf(stderr, " (%s:%d)", file, line); - } - fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); - exit(EXIT_FAILURE); + fprintf(stderr, "CUDA error"); + if (file) { + fprintf(stderr, " (%s:%d)", file, line); + } + fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); } - namespace StreamCompaction { - namespace Common { - - /** - * Maps an array to an array of 0s and 1s for stream compaction. Elements - * which map to 0 will be removed, and elements which map to 1 will be kept. - */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO - } - - /** - * Performs scatter on an array. That is, for each element in idata, - * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. 
- */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO - } +namespace Common { + +const unsigned int block_size_efficient = 128; +const unsigned int block_size_naive = 256; +__global__ void kernExtractLastElementPerBlock(int n, int *odata, + const int *idata) { + int bid = blockIdx.x; + int tid = threadIdx.x; + int id = bid * blockDim.x + tid; + if (id < n) { + if (tid == blockDim.x - 1 || id == n - 1) { + odata[bid] = idata[id]; } + } } + +__global__ void kernAddOffsetPerBlock(int n, int *odata, + const int *block_offset, + const int *idata) { + int bid = blockIdx.x; + int id = bid * blockDim.x + threadIdx.x; + if (id < n) { + odata[id] = idata[id] + block_offset[bid]; + } +} + +__global__ void kernShiftToExclusive(int n, int *odata, const int *idata) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < n) { + if (id == 0) + odata[id] = 0; + else + odata[id] = idata[id - 1]; + } +} + +/** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ +__global__ void kernMapToBoolean(int n, int *bools, const int *idata) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < n) { + bools[id] = (idata[id] == 0) ? 0 : 1; + } +} + +/** + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. + */ +__global__ void kernScatter(int n, int *odata, const int *idata, + const int *bools, const int *indices) { + int id = blockDim.x * blockIdx.x + threadIdx.x; + if (id < n) { + if (bools[id] == 1) odata[indices[id]] = idata[id]; + } +} + +} // namespace Common +} // namespace StreamCompaction diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..22597fa 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -2,15 +2,17 @@ #include #include +#include -#include -#include -#include #include #include +#include +#include +#include #include -#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) +#define FILENAME \ + (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) /** @@ -18,115 +20,132 @@ */ void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); -inline int ilog2(int x) { - int lg = 0; - while (x >>= 1) { - ++lg; - } - return lg; +inline __device__ __host__ int ilog2(int x) { + int lg = 0; + while (x >>= 1) { + ++lg; + } + return lg; } -inline int ilog2ceil(int x) { - return x == 1 ? 0 : ilog2(x - 1) + 1; +inline __device__ __host__ int ilog2ceil(int x) { + return x == 1 ? 
0 : ilog2(x - 1) + 1; } namespace StreamCompaction { - namespace Common { - __global__ void kernMapToBoolean(int n, int *bools, const int *idata); - - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); - - /** - * This class is used for timing the performance - * Uncopyable and unmovable - * - * Adapted from WindyDarian(https://github.com/WindyDarian) - */ - class PerformanceTimer - { - public: - PerformanceTimer() - { - cudaEventCreate(&event_start); - cudaEventCreate(&event_end); - } - - ~PerformanceTimer() - { - cudaEventDestroy(event_start); - cudaEventDestroy(event_end); - } - - void startCpuTimer() - { - if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } - cpu_timer_started = true; - - time_start_cpu = std::chrono::high_resolution_clock::now(); - } - - void endCpuTimer() - { - time_end_cpu = std::chrono::high_resolution_clock::now(); - - if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } - - std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu; - prev_elapsed_time_cpu_milliseconds = - static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count()); - - cpu_timer_started = false; - } - - void startGpuTimer() - { - if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } - gpu_timer_started = true; - - cudaEventRecord(event_start); - } - - void endGpuTimer() - { - cudaEventRecord(event_end); - cudaEventSynchronize(event_end); - - if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } - - cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); - gpu_timer_started = false; - } - - float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 - { - return prev_elapsed_time_cpu_milliseconds; - } - - float getGpuElapsedTimeForPreviousOperation() //noexcept - { - return prev_elapsed_time_gpu_milliseconds; - } - - // remove copy and move functions - PerformanceTimer(const PerformanceTimer&) = delete; - PerformanceTimer(PerformanceTimer&&) = delete; - PerformanceTimer& operator=(const PerformanceTimer&) = delete; - PerformanceTimer& operator=(PerformanceTimer&&) = delete; - - private: - cudaEvent_t event_start = nullptr; - cudaEvent_t event_end = nullptr; - - using time_point_t = std::chrono::high_resolution_clock::time_point; - time_point_t time_start_cpu; - time_point_t time_end_cpu; - - bool cpu_timer_started = false; - bool gpu_timer_started = false; - - float prev_elapsed_time_cpu_milliseconds = 0.f; - float prev_elapsed_time_gpu_milliseconds = 0.f; - }; +namespace Common { + +extern const unsigned int block_size_efficient; +extern const unsigned int block_size_naive; + +__global__ void kernExtractLastElementPerBlock(int n, int *odata, + const int *idata); + +__global__ void kernAddOffsetPerBlock(int n, int *odata, + const int *block_offset, + const int *idata); + +__global__ void kernShiftToExclusive(int n, int *odata, const int *idata); + +__global__ void kernMapToBoolean(int n, int *bools, const int *idata); + +__global__ void kernScatter(int n, int *odata, const int *idata, + const int *bools, const int *indices); + +/** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from WindyDarian(https://github.com/WindyDarian) + */ +class PerformanceTimer { +public: + PerformanceTimer() { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() { 
if (cpu_timer_started) { + throw std::runtime_error("CPU timer already started"); } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); + } + + void endCpuTimer() { + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { + throw std::runtime_error("CPU timer not started"); + } + + std::chrono::duration<double, std::milli> duro = + time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = + static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count()); + + cpu_timer_started = false; + } + + void startGpuTimer() { + if (gpu_timer_started) { + throw std::runtime_error("GPU timer already started"); + } + gpu_timer_started = true; + + cudaEventRecord(event_start); + } + + void endGpuTimer() { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + + if (!gpu_timer_started) { + throw std::runtime_error("GPU timer not started"); + } + + cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, + event_end); + gpu_timer_started = false; + } + + float + getCpuElapsedTimeForPreviousOperation() // noexcept //(damn I need VS 2015 + { + return prev_elapsed_time_cpu_milliseconds; + } + + float getGpuElapsedTimeForPreviousOperation() // noexcept + { + return prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer &) = delete; + PerformanceTimer(PerformanceTimer &&) = delete; + PerformanceTimer &operator=(const PerformanceTimer &) = delete; + PerformanceTimer &operator=(PerformanceTimer &&) = delete; + +private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; + float prev_elapsed_time_gpu_milliseconds = 0.f; +}; +} // namespace Common +} // namespace StreamCompaction diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..5c8d6e6 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,50 +1,87 @@ #include -#include "cpu.h" #include "common.h" +#include "cpu.h" namespace StreamCompaction { - namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * CPU scan (prefix sum). - * For performance analysis, this is supposed to be a simple for loop. - * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. - */ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - } - - /** - * CPU stream compaction without using the scan function. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } - - /** - * CPU stream compaction using scan and scatter, like the parallel version. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } +namespace CPU { +using StreamCompaction::Common::PerformanceTimer; +PerformanceTimer &timer() { + static PerformanceTimer timer; + return timer; +} + +/** + * CPU scan core function. 
+ * It deliberately does not start the CPU timer, so that compactWithScan can
+ * call it inside its own timed region without tripping the nested-timer
+ * check.
+ */
+void scan_core(int n, int *odata, const int *idata) {
+  int sum = 0;
+  for (int i = 0; i < n; ++i) {
+    odata[i] = sum;
+    sum += idata[i];
+  }
+}
+
+/**
+ * CPU scan (prefix sum).
+ * For performance analysis, this is supposed to be a simple for loop.
+ * (Optional) For better understanding before starting moving to GPU, you can
+ * simulate your GPU scan in this function first.
+ */
+void scan(int n, int *odata, const int *idata) {
+  timer().startCpuTimer();
+  scan_core(n, odata, idata);
+  timer().endCpuTimer();
+}
+
+/**
+ * CPU stream compaction without using the scan function.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+int compactWithoutScan(int n, int *odata, const int *idata) {
+  timer().startCpuTimer();
+  int outarray_len = 0;
+  for (int i = 0; i < n; ++i) {
+    if (idata[i] != 0) {
+      odata[outarray_len++] = idata[i];
+    }
+  }
+  timer().endCpuTimer();
+  return outarray_len;
+}
+
+/**
+ * CPU stream compaction using scan and scatter, like the parallel version.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+int compactWithScan(int n, int *odata, const int *idata) {
+  timer().startCpuTimer();
+
+  // NOTE: the temporary allocations below sit inside the timed region, which
+  // is part of why this variant measures much slower than compactWithoutScan.
+
+  // create bool array b
+  int *b = (int *)malloc(n * sizeof(int));
+  std::memset(b, 0, n * sizeof(int));
+  for (int i = 0; i < n; ++i) {
+    if (idata[i] != 0) b[i] = 1;
+  }
+
+  // exclusive scan bool array
+  int *scan_b = (int *)malloc(n * sizeof(int));
+  scan_core(n, scan_b, b);
+  // count = (survivors before the last slot) + (1 if the last slot is kept)
+  int outarray_len = b[n - 1] + scan_b[n - 1];
+
+  // copy selected array into out array
+  for (int i = 0; i < n; ++i) {
+    if (b[i]) odata[scan_b[i]] = idata[i];
+  }
+
+  free(b);
+  free(scan_b);
+
+  timer().endCpuTimer();
+  return outarray_len;
+}
+} // namespace CPU
+} // namespace StreamCompaction
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
index 873c047..7258098 100644
--- a/stream_compaction/cpu.h
+++ b/stream_compaction/cpu.h
@@ -3,13 +3,13 @@
 #include "common.h"
 
 namespace StreamCompaction {
-    namespace CPU {
-        StreamCompaction::Common::PerformanceTimer& timer();
+namespace CPU {
+StreamCompaction::Common::PerformanceTimer &timer();
 
-        void scan(int n, int *odata, const int *idata);
+void scan(int n, int *odata, const int *idata);
 
-        int compactWithoutScan(int n, int *odata, const int *idata);
+int compactWithoutScan(int n, int *odata, const int *idata);
 
-        int compactWithScan(int n, int *odata, const int *idata);
-    }
-}
+int compactWithScan(int n, int *odata, const int *idata);
+} // namespace CPU
+} // namespace StreamCompaction
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..3a6fe5e 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -1,40 +1,260 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
+
 #include "common.h"
 #include "efficient.h"
 
 namespace StreamCompaction {
-    namespace Efficient {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
-
-        /**
-         * Performs stream compaction on idata, storing the result into odata.
-         * All zeroes are discarded.
-         *
-         * @param n The number of elements in idata.
-         * @param odata The array into which to store elements.
-         * @param idata The array of elements to compact.
-         * @returns The number of elements remaining after compaction.
-         */
-        int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-            return -1;
-        }
-    }
-}
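+// Overview of the hierarchical scan below (illustrative numbers only): with
+// block_size = 128 and n_pad = 2^20, there are num_scans = 3 levels of sizes
+// 2^20, 2^13, and 2^6. Each level scans its blocks independently, the last
+// element of each block (the block total) becomes the next level's input,
+// and the scanned totals are later added back as per-block offsets.
+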
+namespace Efficient {
+using StreamCompaction::Common::PerformanceTimer;
+PerformanceTimer &timer() {
+  static PerformanceTimer timer;
+  return timer;
+}
+
+/**
+ * Work-efficient inclusive scan (Blelloch up-sweep + down-sweep), applied
+ * independently within each block.
+ * Input: idata
+ * Output: odata --- per-block inclusive scan; as a side effect, idata is
+ * overwritten with the per-block exclusive scan.
+ */
+__global__ void kernScanInclusive(int n, int *odata, int *idata) {
+  int tid = threadIdx.x;
+  int bdim = blockDim.x;
+  int id = blockIdx.x * bdim + tid;
+  int log2n = ilog2ceil((n < bdim) ? n : bdim);
+
+  // upsweep (per-block reduction)
+  // __syncthreads() must be reached by every thread of the block, so the
+  // bounds check guards only the memory accesses, not the loop itself.
+  for (int d = 0; d < log2n; ++d) {
+    if (id < n && id % (1 << (d + 1)) == 0) {
+      idata[id + (1 << (d + 1)) - 1] += idata[id + (1 << d) - 1];
+    }
+    __syncthreads();
+  }
+
+  // last thread remembers the reduction sum and zeroes the root
+  int reduction_sum = 0;
+  if (id < n && (tid == bdim - 1 || id == n - 1)) {
+    reduction_sum = idata[id];
+    idata[id] = 0;
+  }
+  __syncthreads();
+
+  // downsweep
+  for (int d = log2n - 1; d >= 0; --d) {
+    if (id < n && id % (1 << (d + 1)) == 0) {
+      int temp = idata[id + (1 << d) - 1];
+      idata[id + (1 << d) - 1] = idata[id + (1 << (d + 1)) - 1];
+      idata[id + (1 << (d + 1)) - 1] += temp;
+    }
+    __syncthreads();
+  }
+
+  // turn exclusive scan into inclusive scan
+  if (id < n) {
+    if (tid == bdim - 1 || id == n - 1) {
+      odata[id] = reduction_sum;
+    } else {
+      odata[id] = idata[id + 1];
+    }
+  }
+}
+
+/**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+void scan(int n, int *odata, const int *idata) {
+  if (n <= 0) return;
+  const int n_pad = 1 << (ilog2ceil(n));
+
+  // number of scan levels: each level scans the previous level's block totals
+  const unsigned int block_size = Common::block_size_efficient;
+  int num_scans = 1;
+  int len = n_pad;
+  while ((len + block_size - 1) / block_size > 1) {
+    ++num_scans;
+    len = (len + block_size - 1) / block_size;
+  }
+
+  int **dev_idata = (int **)malloc(num_scans * sizeof(int *));
+  int **dev_odata = (int **)malloc(num_scans * sizeof(int *));
+  int **dev_buffer = (int **)malloc(num_scans * sizeof(int *));
+  int *array_sizes = (int *)malloc(num_scans * sizeof(int));
+  int *grid_sizes = (int *)malloc(num_scans * sizeof(int));
+
+  len = n_pad;
+  for (int i = 0; i < num_scans; ++i) {
+    cudaMalloc((void **)&dev_idata[i], len * sizeof(int));
+    cudaMalloc((void **)&dev_odata[i], len * sizeof(int));
+    cudaMalloc((void **)&dev_buffer[i], len * sizeof(int));
+    checkCUDAError("cudaMalloc failed for dev_idata, dev_odata, dev_buffer!");
+    array_sizes[i] = len;
+    len = (len + block_size - 1) / block_size;
+    grid_sizes[i] = len;
+  }
+
+  cudaMemcpy(dev_idata[0], idata, n * sizeof(int), cudaMemcpyHostToDevice);
+  checkCUDAError("cudaMemcpy failed for idata --> dev_idata[0]!");
+
+  /******* KERNEL INVOCATIONS *******/
+  dim3 dimBlock{block_size};
+  timer().startGpuTimer();
+  // First pass, finest level first: scan each level's blocks, then collect
+  // every block's total into the next (coarser) level's input.
+  for (int i = 0; i < num_scans; ++i) {
+    dim3 dimGrid{(unsigned int)grid_sizes[i]};
+    kernScanInclusive<<<dimGrid, dimBlock>>>(array_sizes[i], dev_buffer[i],
+                                             dev_idata[i]);
+    // dev_buffer[i] now holds the per-block inclusive scan; mirror it into
+    // dev_idata[i] so the block totals can be extracted from it below
+    cudaMemcpy(dev_idata[i], dev_buffer[i], array_sizes[i] * sizeof(int),
+               cudaMemcpyDeviceToDevice);
+    if (i < num_scans - 1) {
+      Common::kernExtractLastElementPerBlock<<<dimGrid, dimBlock>>>(
+          array_sizes[i], dev_idata[i + 1], dev_idata[i]);
+    }
+  }
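+  // Second pass, coarsest level first: convert each level's inclusive scan
+  // to exclusive form, then add the scanned block totals back onto the next
+  // finer level as per-block offsets.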
+  for (int i = num_scans - 1; i >= 0; --i) {
+    dim3 dimGrid{(unsigned int)grid_sizes[i]};
+    Common::kernShiftToExclusive<<<dimGrid, dimBlock>>>(
+        array_sizes[i], dev_odata[i], dev_buffer[i]);
+    if (i >= 1) {
+      dim3 next_dimGrid{(unsigned int)grid_sizes[i - 1]};
+      Common::kernAddOffsetPerBlock<<<next_dimGrid, dimBlock>>>(
+          array_sizes[i - 1], dev_buffer[i - 1], dev_odata[i],
+          dev_idata[i - 1]);
+    }
+  }
+  cudaDeviceSynchronize();
+  timer().endGpuTimer();
+  /**********************************/
+
+  cudaMemcpy(odata, dev_odata[0], n * sizeof(int), cudaMemcpyDeviceToHost);
+
+  // Free all memory allocations
+  for (int i = 0; i < num_scans; ++i) {
+    cudaFree(dev_idata[i]);
+    cudaFree(dev_odata[i]);
+    cudaFree(dev_buffer[i]);
+  }
+  free(grid_sizes);
+  free(array_sizes);
+  free(dev_idata);
+  free(dev_odata);
+  free(dev_buffer);
+}
+
+/**
+ * Performs stream compaction on idata, storing the result into odata.
+ * All zeroes are discarded.
+ *
+ * @param n The number of elements in idata.
+ * @param odata The array into which to store elements.
+ * @param idata The array of elements to compact.
+ * @returns The number of elements remaining after compaction.
+ */
+int compact(int n, int *odata, const int *idata) {
+  if (n <= 0) return 0;
+  const int n_pad = 1 << (ilog2ceil(n));
+
+  const unsigned int block_size = Common::block_size_efficient;
+  int num_scans = 1;
+  int len = n_pad;
+  while ((len + block_size - 1) / block_size > 1) {
+    ++num_scans;
+    len = (len + block_size - 1) / block_size;
+  }
+
+  // input data device allocation; the padding tail is zeroed so that
+  // compaction discards it
+  int *dev_idata, *dev_odata, *dev_bools;
+  cudaMalloc((void **)&dev_idata, n_pad * sizeof(int));
+  cudaMalloc((void **)&dev_odata, n_pad * sizeof(int));
+  cudaMalloc((void **)&dev_bools, n_pad * sizeof(int));
+  checkCUDAError("cudaMalloc dev_idata, dev_odata, dev_bools failed!");
+  cudaMemset(dev_idata, 0, n_pad * sizeof(int));
+  checkCUDAError("cudaMemset dev_idata to 0 failed!");
+  cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+  checkCUDAError("cudaMemcpy failed for idata --> dev_idata!");
+
+  int **dev_iIndices = (int **)malloc(num_scans * sizeof(int *));
+  int **dev_oIndices = (int **)malloc(num_scans * sizeof(int *));
+  int **dev_buffer = (int **)malloc(num_scans * sizeof(int *));
+  int *array_sizes = (int *)malloc(num_scans * sizeof(int));
+  int *grid_sizes = (int *)malloc(num_scans * sizeof(int));
+
+  len = n_pad;
+  for (int i = 0; i < num_scans; ++i) {
+    cudaMalloc((void **)&dev_iIndices[i], len * sizeof(int));
+    cudaMalloc((void **)&dev_oIndices[i], len * sizeof(int));
+    cudaMalloc((void **)&dev_buffer[i], len * sizeof(int));
+    checkCUDAError(
+        "cudaMalloc failed for dev_iIndices, dev_oIndices, dev_buffer!");
+    array_sizes[i] = len;
+    len = (len + block_size - 1) / block_size;
+    grid_sizes[i] = len;
+  }
+
+  /******* KERNEL INVOCATION *******/
+  dim3 dimGrid{(unsigned int)grid_sizes[0]}, dimBlock{block_size};
+  timer().startGpuTimer();
+
+  // map to booleans, then scan the booleans exactly as in scan() above
+  Common::kernMapToBoolean<<<dimGrid, dimBlock>>>(n_pad, dev_bools,
+                                                  dev_idata);
+  cudaMemcpy(dev_iIndices[0], dev_bools, n_pad * sizeof(int),
+             cudaMemcpyDeviceToDevice);
+  for (int i = 0; i < num_scans; ++i) {
+    dim3 dimGrid{(unsigned int)grid_sizes[i]};
+    kernScanInclusive<<<dimGrid, dimBlock>>>(array_sizes[i], dev_buffer[i],
+                                             dev_iIndices[i]);
+    cudaMemcpy(dev_iIndices[i], dev_buffer[i], array_sizes[i] * sizeof(int),
+               cudaMemcpyDeviceToDevice);
+    if (i < num_scans - 1) {
+      Common::kernExtractLastElementPerBlock<<<dimGrid, dimBlock>>>(
+          array_sizes[i], dev_iIndices[i + 1], dev_iIndices[i]);
+    }
+  }
+  for (int i = num_scans - 1; i >= 0; --i) {
+    dim3 dimGrid{(unsigned int)grid_sizes[i]};
+    Common::kernShiftToExclusive<<<dimGrid, dimBlock>>>(
+        array_sizes[i], dev_oIndices[i], dev_buffer[i]);
+    if (i >= 1) {
+      dim3 next_dimGrid{(unsigned int)grid_sizes[i - 1]};
+      Common::kernAddOffsetPerBlock<<<next_dimGrid, dimBlock>>>(
+          array_sizes[i - 1], dev_buffer[i - 1], dev_oIndices[i],
+          dev_iIndices[i - 1]);
+    }
+  }
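+  // dev_oIndices[0] now holds the exclusive scan of the bool array, i.e.
+  // each surviving element's destination index; scatter the survivors there
+  // in a single pass.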
+  Common::kernScatter<<<dimGrid, dimBlock>>>(n_pad, dev_odata, dev_idata,
+                                             dev_bools, dev_oIndices[0]);
+
+  cudaDeviceSynchronize();
+  timer().endGpuTimer();
+  /*********************************/
+
+  // transfer output data to CPU & analyze
+  cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+  checkCUDAError("cudaMemcpy odata from dev_odata failed!");
+
+  // calculate num. of elements after compaction (only the last entries of
+  // the index and bool arrays are actually needed)
+  int *indices = (int *)malloc(n * sizeof(int));
+  int *bools = (int *)malloc(n * sizeof(int));
+  cudaMemcpy(indices, dev_oIndices[0], n * sizeof(int),
+             cudaMemcpyDeviceToHost);
+  cudaMemcpy(bools, dev_bools, n * sizeof(int), cudaMemcpyDeviceToHost);
+  checkCUDAError(
+      "cudaMemcpy indices from dev_oIndices, bools from dev_bools failed!");
+  int compact_len = indices[n - 1] + bools[n - 1];
+
+  // Free all memory allocations
+  free(indices);
+  free(bools);
+
+  for (int i = 0; i < num_scans; ++i) {
+    cudaFree(dev_iIndices[i]);
+    cudaFree(dev_oIndices[i]);
+    cudaFree(dev_buffer[i]);
+  }
+  cudaFree(dev_bools);
+  cudaFree(dev_idata);
+  cudaFree(dev_odata);
+  free(grid_sizes);
+  free(array_sizes);
+  free(dev_iIndices);
+  free(dev_oIndices);
+  free(dev_buffer);
+
+  return compact_len;
+}
+} // namespace Efficient
+} // namespace StreamCompaction
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..0638439 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -3,11 +3,11 @@
 #include "common.h"
 
 namespace StreamCompaction {
-    namespace Efficient {
-        StreamCompaction::Common::PerformanceTimer& timer();
+namespace Efficient {
+StreamCompaction::Common::PerformanceTimer &timer();
 
-        void scan(int n, int *odata, const int *idata);
+void scan(int n, int *odata, const int *idata);
 
-        int compact(int n, int *odata, const int *idata);
-    }
-}
+int compact(int n, int *odata, const int *idata);
+} // namespace Efficient
+} // namespace StreamCompaction
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..34a1609 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -1,25 +1,123 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
+
 #include "common.h"
 #include "naive.h"
 
 namespace StreamCompaction {
-    namespace Naive {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-        // TODO: __global__
-
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
-    }
-}
+namespace Naive {
+using StreamCompaction::Common::PerformanceTimer;
+PerformanceTimer &timer() {
+  static PerformanceTimer timer;
+  return timer;
+}
+
+// NOTE: currently unused
+template <typename T>
+__device__ void inline swap(T &a, T &b) {
+  T c(a);
+  a = b;
+  b = c;
+}
+
+/**
+ * Naive per-block inclusive scan (Hillis-Steele): on pass d, each thread
+ * adds in the element 2^(d-1) positions to its left.
+ * Input must be stored in `data`.
+ * Output is stored both in `data` and `buffer`.
+ */
+__global__ void kernScanInclusive(int n, int *data, int *buffer) {
+  int id = blockDim.x * blockIdx.x + threadIdx.x;
+  int tx = threadIdx.x;
+  int bdim = blockDim.x;
+  int log2n = ilog2ceil((n < bdim) ? n : bdim);
+
+  // As in the work-efficient kernel in efficient.cu, every thread must reach
+  // each __syncthreads(), so the bounds checks guard only the memory
+  // accesses.
+  for (int d = 1; d <= log2n; ++d) {
+    if (id < n) buffer[id] = data[id];
+    __syncthreads();
+    if (id < n && tx >= (1 << (d - 1))) {
+      buffer[id] = data[id - (1 << (d - 1))] + data[id];
+    }
+    __syncthreads();
+    if (id < n) data[id] = buffer[id];
+    __syncthreads();
+  }
+}
+
+/**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
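+ * Unlike the work-efficient version, the input is not padded to a power of
+ * two; the kernels handle the ragged tail through their bounds checks.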
+ */
+void scan(int n, int *odata, const int *idata) {
+  if (n <= 0) return;
+
+  const unsigned int block_size = Common::block_size_naive;
+  int num_scans = 1;
+  int len = n;
+  while ((len + block_size - 1) / block_size > 1) {
+    ++num_scans;
+    len = (len + block_size - 1) / block_size;
+  }
+
+  int **dev_idata = (int **)malloc(num_scans * sizeof(int *));
+  int **dev_odata = (int **)malloc(num_scans * sizeof(int *));
+  int **dev_buffer = (int **)malloc(num_scans * sizeof(int *));
+  int *array_sizes = (int *)malloc(num_scans * sizeof(int));
+  int *grid_sizes = (int *)malloc(num_scans * sizeof(int));
+
+  len = n;
+  for (int i = 0; i < num_scans; ++i) {
+    cudaMalloc((void **)&dev_idata[i], len * sizeof(int));
+    cudaMalloc((void **)&dev_odata[i], len * sizeof(int));
+    cudaMalloc((void **)&dev_buffer[i], len * sizeof(int));
+    checkCUDAError("cudaMalloc failed for dev_idata, dev_odata, dev_buffer!");
+    array_sizes[i] = len;
+    len = (len + block_size - 1) / block_size;
+    grid_sizes[i] = len;
+  }
+
+  cudaMemcpy(dev_idata[0], idata, n * sizeof(int), cudaMemcpyHostToDevice);
+  checkCUDAError("cudaMemcpy failed for idata --> dev_idata[0]!");
+
+  /******* KERNEL INVOCATIONS *******/
+  dim3 dimBlock{block_size};
+  timer().startGpuTimer();
+  for (int i = 0; i < num_scans; ++i) {
+    dim3 dimGrid{(unsigned int)grid_sizes[i]};
+    kernScanInclusive<<<dimGrid, dimBlock>>>(array_sizes[i], dev_idata[i],
+                                             dev_buffer[i]);
+    if (i < num_scans - 1) {
+      Common::kernExtractLastElementPerBlock<<<dimGrid, dimBlock>>>(
+          array_sizes[i], dev_idata[i + 1], dev_idata[i]);
+    }
+  }
+  for (int i = num_scans - 1; i >= 0; --i) {
+    dim3 dimGrid{(unsigned int)grid_sizes[i]};
+    Common::kernShiftToExclusive<<<dimGrid, dimBlock>>>(
+        array_sizes[i], dev_odata[i], dev_buffer[i]);
+    if (i >= 1) {
+      dim3 next_dimGrid{(unsigned int)grid_sizes[i - 1]};
+      Common::kernAddOffsetPerBlock<<<next_dimGrid, dimBlock>>>(
+          array_sizes[i - 1], dev_buffer[i - 1], dev_odata[i],
+          dev_idata[i - 1]);
+    }
+  }
+  cudaDeviceSynchronize();
+  timer().endGpuTimer();
+  /**********************************/
+
+  cudaMemcpy(odata, dev_odata[0], n * sizeof(int), cudaMemcpyDeviceToHost);
+
+  // Free all memory allocations
+  for (int i = 0; i < num_scans; ++i) {
+    cudaFree(dev_idata[i]);
+    cudaFree(dev_odata[i]);
+    cudaFree(dev_buffer[i]);
+  }
+  free(grid_sizes);
+  free(array_sizes);
+  free(dev_idata);
+  free(dev_odata);
+  free(dev_buffer);
+}
+} // namespace Naive
+} // namespace StreamCompaction
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
index 37dcb06..9351fa9 100644
--- a/stream_compaction/naive.h
+++ b/stream_compaction/naive.h
@@ -3,9 +3,9 @@
 #include "common.h"
 
 namespace StreamCompaction {
-    namespace Naive {
-        StreamCompaction::Common::PerformanceTimer& timer();
+namespace Naive {
+StreamCompaction::Common::PerformanceTimer &timer();
 
-        void scan(int n, int *odata, const int *idata);
-    }
-}
+void scan(int n, int *odata, const int *idata);
+} // namespace Naive
+} // namespace StreamCompaction
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..5cce77d 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -3,26 +3,31 @@
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/scan.h>
+
 #include "common.h"
 #include "thrust.h"
 
 namespace StreamCompaction {
-    namespace Thrust {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
-        }
-    }
-}
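+// Thrust baseline. Building the device_vectors (host-to-device copies) stays
+// outside the timed region, so only exclusive_scan itself is measured,
+// roughly comparable to how the hand-written kernels are timed.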
+namespace Thrust {
+using StreamCompaction::Common::PerformanceTimer;
+PerformanceTimer &timer() {
+  static PerformanceTimer timer;
+  return timer;
+}
+/**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+void scan(int n, int *odata, const int *idata) {
+  // example: for device_vectors dv_in and dv_out:
+  // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+  thrust::device_vector<int> dv_in(idata, idata + n);
+  // dv_out only needs device storage; the scan below overwrites it entirely
+  thrust::device_vector<int> dv_out(n);
+
+  timer().startGpuTimer();
+  thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+  timer().endGpuTimer();
+
+  thrust::copy(dv_out.begin(), dv_out.end(), odata);
+}
+} // namespace Thrust
+} // namespace StreamCompaction
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index fe98206..697a202 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -3,9 +3,9 @@
 #include "common.h"
 
 namespace StreamCompaction {
-    namespace Thrust {
-        StreamCompaction::Common::PerformanceTimer& timer();
+namespace Thrust {
+StreamCompaction::Common::PerformanceTimer &timer();
 
-        void scan(int n, int *odata, const int *idata);
-    }
-}
+void scan(int n, int *odata, const int *idata);
+} // namespace Thrust
+} // namespace StreamCompaction