diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed74e88..8c282c3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.0)
 
 project(cis565_rasterizer)
 
+# Crucial magic for CUDA linking
+find_package(Threads REQUIRED)
+find_package(CUDA 8.0 REQUIRED)
+
 set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
 
 # Set up include and lib paths
@@ -76,8 +80,6 @@ if (WIN32)
 endif()
 
 # CUDA linker options
-find_package(Threads REQUIRED)
-find_package(CUDA 8.0 REQUIRED)
 set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
 set(CUDA_SEPARABLE_COMPILATION ON)
 
diff --git a/README.md b/README.md
index cad1abd..d24a17e 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,149 @@
-CUDA Rasterizer
-===============
-
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
-
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
-
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
-
-
-### Credits
-
-* [tinygltfloader](https://github.com/syoyo/tinygltfloader) by [@soyoyo](https://github.com/syoyo)
-* [glTF Sample Models](https://github.com/KhronosGroup/glTF/blob/master/sampleModels/README.md)
+# **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4:**
+
+# **CUDA Rasterizer**
+
+
+
+
+
+Tested on: Windows 10, Intel Core i7-7700HQ CPU @ 2.80 GHz, 8GB RAM, NVidia GeForce GTX 1050
+
+ ![Built](https://img.shields.io/appveyor/ci/gruntjs/grunt.svg) ![Issues](https://img.shields.io/github/issues-raw/badges/shields/website.svg) ![CUDA 8.0](https://img.shields.io/badge/CUDA-8.0-green.svg?style=flat)  ![Platform](https://img.shields.io/badge/platform-Desktop-bcbcbc.svg)  ![Developer](https://img.shields.io/badge/Developer-Youssef%20Victor-0f97ff.svg?style=flat)
+
+
+
+
+- [Features](#features)
+
+- [In-Depth](#indepth)
+
+- [Time Analysis](#time_analysis)
+
+- [Bloopers](#bloopers)
+ 
+
+ 
+
+____________________________________________________
+
+
+ 
+The goal of this project was to run an algorithm that clears out all zeros from an array on the GPU using CUDA. This parallel reduction is done using the scan algorithm that computes the exclusive prefix sum. I also implemented a parallelized Radix Sort using the exclusive prefix sum algorithm developed.
+
+
+
+### Things Done
+
+#### Core Features
+
+ - [x] Everything
+
+ #### What Makes Me Special
+ 
+  - [x] Perspective Correct Color Interpolation
+  - [x] Instancing
+  - [x] Super-Sampled Anti-Aliasing
+
+![SSAA Cow Instanced](img/ssaa_instacow.gif)
+
+
+### In-Depth:
+
+#### Perspective Correct Color Interpolation
+
+It is important to always correctly interpolate the color at each vertex, and so I did.
+
+Here is what a triangle that is leaning back from the camera looks like without perspective correct interpolation and with (side-by-side):
+
+![Without Perspective-Correct Interpolation](img/not-persp-correct.PNG) ![With Perspective-Correct Interpolation](img/persp-correct.PNG)
+
+Here they are in a nice GIF format that shows them back to back. (As with most color-dense GIFs, the colors are reduced for recording purposes, the colors in real life are exactly as they in the pictures above)
+
+![Back-To-Back](img/persp-correction.gif)
+
+As you can see in this very quantized GIF, there is a lot more blue in the triangle without the interpolation because the Z depth is not correctly interpolated!
+
+#### Instancing
+
+For instancing I added a preprocessor macro, `num_instances`, that defines the number of times that you want a mesh to be instanced. I then have a hard-coded array of transformation matrices that represent the transformations of each instance.
+
+In the vertex shader, I loop over each instance and then transform each vertex `num_instances` times to correspond to the appropriate instance transformation.
+
+I then go through the rest of the primitive rasterization as normal, with `num_instances * numPrimitives` primitives instead of the usual `numPrimitives`.
+
+Here is what it looks like with the cow instanced 9 times:
+
+![instaced_cow_9](img/instanced_cow.gif)
+
+and here is what it looks like with it instanced 27 times!!
+
+![instaced_cow_27](img/instanced_cow_27.gif)
+
+#### Super-Sampled Anti-Aliasing:
+
+For the super-sampled anti-aliasing (SSAA), multiply the fragment buffer by the preprocessor macro `SSAA_RES` which defines the scaling that each axis (`width`, `height`) is scaled by.
+
+Here are the results, with FPS listed in the window title.
+
+##### AA 1x1 (No anti-aliasing)
+
+![anti-aliasing](img/aa_1.PNG)
+
+##### AA 2x2
+
+![anti-aliasing](img/aa_2.PNG)
+
+##### AA 4x4
+
+![anti-aliasing](img/aa_4.PNG)
+
+##### AA 8x8
+
+![anti-aliasing](img/aa_8.PNG)
+
+
+### Time Analysis
+
+I have time analyses for two major aspects that I implemented AA and Instancing. Instancing scaled pretty much very nicely. Considering I was scaling by O(n^3) each time, the time almost doubled every time, so that's a linear increase in time, which actually means my instancing is actually < O(n). That is cool.
+
+Here is a stacked graph also showing the absolute time (in ms):
+
+![timed_insta](img/timed_inst.PNG)
+
+As you can see, the time doubles even though I'm tripling the number of cows in each time step.
+
+Here is that same graph showing just percentages
+
+![timed_insta](img/timed_inst_100.PNG)
+
+As you can see, because all the work is done int the vertex shader (vertex transform and assembly) stage, the time it takes to transform the vertices gradually becomes the bottleneck.
+
+With AA, the time scaled up very evenly and the bottleneck here was rasterization of course. Here is an absolute time (in ms) stacked graph of that:
+
+![timed_insta](img/timed_aa.PNG)
+
+The time increases evenly as I double my n every time, as such this means my AA is actually O(n^2), which is what you'd expect since I am literally sampling n^2 every time. In the 100% stacked bar graph it is very clear how much the rasterization becomes a factor as we scale. (Also note that the vertex shader becomes more and more irrelevant)
+
+![timed_insta](img/timed_aa_100.PNG)
+
+### Bloopers
+
+My best bloopers were mainly while creating the base render. I tried rendering using many debug views with varying levels of success. Here are some of my favorite.
+
+##### "Neon Cow"
+
+I was trying to rasterize the cow with normals showing, a depth buffer bug caused this beauty:
+
+![neon_cow](img/neon_cow2.gif)
+
+##### "The Cow Sees All"
+
+I was trying to again modify the depth buffer, reversed the depth check and got this creepy situation where the cow follows you around if you move the camera in a certain angle:
+
+![follow-cow](img/follow_cow.gif)
+
+##### "Debug View: Eye Space Normal"
+
+Here is a nice debug view of the cow with camera-space normals, the color segmentation is due to the GIF recording software reducing the color palette:
+
+![irridescent](img/iridescent_cow.gif)
diff --git a/gltfs/triangle/triangle.obj b/gltfs/triangle/triangle.obj
index f64cd9b..9366030 100644
--- a/gltfs/triangle/triangle.obj
+++ b/gltfs/triangle/triangle.obj
@@ -4,6 +4,6 @@ mtllib triangle.mtl
 o Cube
 v 0.000000 0.000000 0.000000
 v 0.500000 0.000000 0.000000
-v 0.000000 1.000000 0.000000
+v 0.000000 1.000000 0.00000
 vn 0.0000  0.000000 1.000000
 f 1//1 2//1 3//1
diff --git a/img/aa_1.PNG b/img/aa_1.PNG
new file mode 100644
index 0000000..c5a2cbf
Binary files /dev/null and b/img/aa_1.PNG differ
diff --git a/img/aa_2.PNG b/img/aa_2.PNG
new file mode 100644
index 0000000..1cf0a7c
Binary files /dev/null and b/img/aa_2.PNG differ
diff --git a/img/aa_4.PNG b/img/aa_4.PNG
new file mode 100644
index 0000000..fbd0dc3
Binary files /dev/null and b/img/aa_4.PNG differ
diff --git a/img/aa_8.PNG b/img/aa_8.PNG
new file mode 100644
index 0000000..5d2a6f6
Binary files /dev/null and b/img/aa_8.PNG differ
diff --git a/img/follow_cow.gif b/img/follow_cow.gif
new file mode 100644
index 0000000..ff6053c
Binary files /dev/null and b/img/follow_cow.gif differ
diff --git a/img/instanced_cow.gif b/img/instanced_cow.gif
new file mode 100644
index 0000000..25ed1dc
Binary files /dev/null and b/img/instanced_cow.gif differ
diff --git a/img/instanced_cow_27.gif b/img/instanced_cow_27.gif
new file mode 100644
index 0000000..329c67e
Binary files /dev/null and b/img/instanced_cow_27.gif differ
diff --git a/img/iridescent_cow.gif b/img/iridescent_cow.gif
new file mode 100644
index 0000000..beda835
Binary files /dev/null and b/img/iridescent_cow.gif differ
diff --git a/img/neon_cow.gif b/img/neon_cow.gif
new file mode 100644
index 0000000..4269ac3
Binary files /dev/null and b/img/neon_cow.gif differ
diff --git a/img/neon_cow2.gif b/img/neon_cow2.gif
new file mode 100644
index 0000000..850060d
Binary files /dev/null and b/img/neon_cow2.gif differ
diff --git a/img/no-persp-correct-titled.png b/img/no-persp-correct-titled.png
new file mode 100644
index 0000000..2df398c
Binary files /dev/null and b/img/no-persp-correct-titled.png differ
diff --git a/img/not-persp-correct.PNG b/img/not-persp-correct.PNG
new file mode 100644
index 0000000..59310f1
Binary files /dev/null and b/img/not-persp-correct.PNG differ
diff --git a/img/persp-correct-titled.png b/img/persp-correct-titled.png
new file mode 100644
index 0000000..b43b45c
Binary files /dev/null and b/img/persp-correct-titled.png differ
diff --git a/img/persp-correct.PNG b/img/persp-correct.PNG
new file mode 100644
index 0000000..2a837df
Binary files /dev/null and b/img/persp-correct.PNG differ
diff --git a/img/persp-correction.gif b/img/persp-correction.gif
new file mode 100644
index 0000000..50e0dfc
Binary files /dev/null and b/img/persp-correction.gif differ
diff --git a/img/perspcorretion.psd b/img/perspcorretion.psd
new file mode 100644
index 0000000..0225819
Binary files /dev/null and b/img/perspcorretion.psd differ
diff --git a/img/single_instanced_cow.gif b/img/single_instanced_cow.gif
new file mode 100644
index 0000000..da9c793
Binary files /dev/null and b/img/single_instanced_cow.gif differ
diff --git a/img/ssaa_instacow.gif b/img/ssaa_instacow.gif
new file mode 100644
index 0000000..85068d9
Binary files /dev/null and b/img/ssaa_instacow.gif differ
diff --git a/img/timed_aa.PNG b/img/timed_aa.PNG
new file mode 100644
index 0000000..d7d6ba6
Binary files /dev/null and b/img/timed_aa.PNG differ
diff --git a/img/timed_aa_100.PNG b/img/timed_aa_100.PNG
new file mode 100644
index 0000000..bf29e11
Binary files /dev/null and b/img/timed_aa_100.PNG differ
diff --git a/img/timed_inst.PNG b/img/timed_inst.PNG
new file mode 100644
index 0000000..243dd7d
Binary files /dev/null and b/img/timed_inst.PNG differ
diff --git a/img/timed_inst_100.PNG b/img/timed_inst_100.PNG
new file mode 100644
index 0000000..e6cb042
Binary files /dev/null and b/img/timed_inst_100.PNG differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a57f69f..00edee0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,5 +6,5 @@ set(SOURCE_FILES
 
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 1262a09..067886d 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -7,6 +7,7 @@
  */
 
 #include <cmath>
+#include <chrono>
 #include <cstdio>
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -18,6 +19,14 @@
 #include <glm/gtc/quaternion.hpp>
 #include <glm/gtc/matrix_transform.hpp>
 
+//Number of mesh instances
+#define num_instances 27
+
+//Number of pixel samples taken
+#define SSAA_RES 1
+
+#define TIMED 1
+
 namespace {
 
 	typedef unsigned short VertexIndex;
@@ -43,7 +52,7 @@ namespace {
 
 		 glm::vec3 eyePos;	// eye space position used for shading
 		 glm::vec3 eyeNor;	// eye space normal used for shading, cuz normal will go wrong after perspective transformation
-		// glm::vec3 col;
+		 glm::vec3 color;
 		 glm::vec2 texcoord0;
 		 TextureData* dev_diffuseTex = NULL;
 		// int texWidth, texHeight;
@@ -111,6 +120,8 @@ static glm::vec3 *dev_framebuffer = NULL;
 
 static int * dev_depth = NULL;	// you might need this buffer when doing depth test
 
+static glm::mat4* dev_instance_transforms = NULL;
+
 /**
  * Kernel that writes the image to the OpenGL PBO directly.
  */
@@ -140,13 +151,40 @@ __global__
 void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
+
+	//SUPER SAMPLING ANTI ALIASING
+	glm::vec3 avg_color = glm::vec3(0.f);
+	float n_samples = 0;
 
     if (x < w && y < h) {
-        framebuffer[index] = fragmentBuffer[index].color;
+#if SSAA_RES > 1
+		//Get fragment avg's
+		int res_bound = SSAA_RES / 2;
+		for (int sx = -res_bound; sx < res_bound; sx++) {
+			for (int sy = -res_bound; sy < res_bound; sy++) {
+				int xsx = x*SSAA_RES + sx;
+				int ysy = y*SSAA_RES + sy;
+
+				if (xsx < 0 || xsx >= (w * SSAA_RES) || (ysy < 0) || (ysy >= h * SSAA_RES)) {
+					continue;
+				}
+
+				int index = xsx + (ysy * (w * SSAA_RES));
+				avg_color += fragmentBuffer[index].color;
+				n_samples++;
+			}
+		}
 
-		// TODO: add your fragment shader code here
+		avg_color /= (n_samples);
 
+		int frame_idx = x + (y * w);
+        framebuffer[frame_idx] = avg_color;
+#else
+		int index = x + (y * w);
+		framebuffer[index] = fragmentBuffer[index].color;
+#endif
+
+		// TODO: add your fragment shader code here
     }
 }
 
@@ -154,19 +192,59 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
  * Called once at the beginning of the program to allocate memory.
  */
 void rasterizeInit(int w, int h) {
-    width = w;
-    height = h;
+	width = w * SSAA_RES;
+	height = h * SSAA_RES;
 	cudaFree(dev_fragmentBuffer);
 	cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment));
 	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
-    cudaFree(dev_framebuffer);
-    cudaMalloc(&dev_framebuffer,   width * height * sizeof(glm::vec3));
-    cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
-    
+	cudaFree(dev_framebuffer);
+	cudaMalloc(&dev_framebuffer,   (width / SSAA_RES) * (height / SSAA_RES) * sizeof(glm::vec3));
+	cudaMemset(dev_framebuffer, 0, (width / SSAA_RES) * (height / SSAA_RES) * sizeof(glm::vec3));
+
 	cudaFree(dev_depth);
 	cudaMalloc(&dev_depth, width * height * sizeof(int));
-
 	checkCUDAError("rasterizeInit");
+
+	//Create Instances
+	glm::mat4 instance_transforms[num_instances];
+
+	const float delta = 1.5f;
+	
+	  instance_transforms[0] = glm::translate(glm::mat4(1.f), glm::vec3(-delta,-delta, 0));
+	  instance_transforms[1] = glm::translate(glm::mat4(1.f), glm::vec3(-delta, delta, 0));
+	  instance_transforms[2] = glm::translate(glm::mat4(1.f), glm::vec3(-delta,     0, 0));
+	  instance_transforms[3] = glm::translate(glm::mat4(1.f), glm::vec3( delta,-delta, 0));
+	  instance_transforms[4] = glm::translate(glm::mat4(1.f), glm::vec3( delta, delta, 0));
+	  instance_transforms[5] = glm::translate(glm::mat4(1.f), glm::vec3( delta,     0, 0));
+	  instance_transforms[6] = glm::translate(glm::mat4(1.f), glm::vec3(     0,-delta, 0));
+	  instance_transforms[7] = glm::translate(glm::mat4(1.f), glm::vec3(     0, delta, 0));
+	  instance_transforms[8] = glm::translate(glm::mat4(1.f), glm::vec3(     0,     0, 0));
+	  
+	  instance_transforms[9]  = glm::translate(glm::mat4(1.f), glm::vec3(-delta, -delta,-delta));
+	  instance_transforms[10] = glm::translate(glm::mat4(1.f), glm::vec3(-delta,  delta,-delta));
+	  instance_transforms[11] = glm::translate(glm::mat4(1.f), glm::vec3(-delta,  0,    -delta));
+	  instance_transforms[12] = glm::translate(glm::mat4(1.f), glm::vec3( delta, -delta,-delta));
+	  instance_transforms[13] = glm::translate(glm::mat4(1.f), glm::vec3( delta,  delta,-delta));
+	  instance_transforms[14] = glm::translate(glm::mat4(1.f), glm::vec3( delta,      0,-delta));
+	  instance_transforms[15] = glm::translate(glm::mat4(1.f), glm::vec3(     0, -delta,-delta));
+	  instance_transforms[16] = glm::translate(glm::mat4(1.f), glm::vec3(     0,  delta,-delta));
+	  instance_transforms[17] = glm::translate(glm::mat4(1.f), glm::vec3(     0,      0,-delta));
+	  
+	  
+	  instance_transforms[18] = glm::translate(glm::mat4(1.f), glm::vec3(-delta, -delta, delta));
+	  instance_transforms[19] = glm::translate(glm::mat4(1.f), glm::vec3(-delta, delta, delta));
+	  instance_transforms[20] = glm::translate(glm::mat4(1.f), glm::vec3(-delta, 0, delta));
+	  instance_transforms[21] = glm::translate(glm::mat4(1.f), glm::vec3(delta, -delta, delta));
+	  instance_transforms[22] = glm::translate(glm::mat4(1.f), glm::vec3(delta, delta, delta));
+	  instance_transforms[23] = glm::translate(glm::mat4(1.f), glm::vec3(delta, 0, delta));
+	  instance_transforms[24] = glm::translate(glm::mat4(1.f), glm::vec3(0, -delta, delta));
+	  instance_transforms[25] = glm::translate(glm::mat4(1.f), glm::vec3(0, delta, delta));
+	  instance_transforms[26] = glm::translate(glm::mat4(1.f), glm::vec3(0, 0, delta));
+
+	cudaMalloc(&dev_instance_transforms, sizeof(glm::mat4) * num_instances);
+	cudaMemcpy(dev_instance_transforms, instance_transforms, sizeof(glm::mat4) * num_instances, cudaMemcpyHostToDevice);
+
+	checkCUDAError("rasterizeInit - Instance Cpying");
 }
 
 __global__
@@ -389,6 +467,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 					// assume type is SCALAR for indices
 					int n = 1;
 					int numIndices = indexAccessor.count;
+
 					int componentTypeByteSize = sizeof(VertexIndex);
 					int byteLength = numIndices * n * componentTypeByteSize;
 
@@ -513,7 +592,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 
 					// malloc for VertexOut
 					VertexOut* dev_vertexOut;
-					cudaMalloc(&dev_vertexOut, numVertices * sizeof(VertexOut));
+					cudaMalloc(&dev_vertexOut, numVertices * num_instances * sizeof(VertexOut));
 					checkCUDAError("Malloc VertexOut Buffer");
 
 					// ----------Materials-------------
@@ -595,6 +674,8 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 		} // for each node
 
 	}
+
+	totalNumPrimitives *= num_instances;
 	
 
 	// 3. Malloc for dev_primitives
@@ -628,20 +709,33 @@ void _vertexTransformAndAssembly(
 	int numVertices, 
 	PrimitiveDevBufPointers primitive, 
 	glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, 
-	int width, int height) {
+	int width, int height, const glm::mat4* dev_instance_transforms) {
 
 	// vertex id
 	int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
 	if (vid < numVertices) {
-
-		// TODO: Apply vertex transformation here
-		// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
-		// Then divide the pos by its w element to transform into NDC space
-		// Finally transform x and y to viewport space
-
-		// TODO: Apply vertex assembly here
-		// Assemble all attribute arraies into the primitive array
 		
+		//INSTANCING:
+		for (int i = 0; i < num_instances; i++) {
+			// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
+			const glm::vec4 instance_pos = dev_instance_transforms[i] * glm::vec4(primitive.dev_position[vid],1);
+			glm::vec4 new_pos = MVP * instance_pos;
+
+			// Then divide the pos by its w element to transform into NDC space
+			new_pos *= (1.f / new_pos.w);
+
+			// Finally transform x and y to viewport space
+			new_pos.x = 0.5f * width  * (1.f - new_pos.x);
+			new_pos.y = 0.5f * height * (1.f - new_pos.y);
+
+			// Assemble all attribute arrays into the primitive array
+			int instance_idx = vid + (primitive.numVertices * i);
+			primitive.dev_verticesOut[instance_idx].pos = new_pos;
+			primitive.dev_verticesOut[instance_idx].color = glm::vec3(0, 1, 1);
+			primitive.dev_verticesOut[instance_idx].eyePos = glm::vec3(MV * instance_pos);
+			primitive.dev_verticesOut[instance_idx].eyeNor = glm::normalize(MV_normal * primitive.dev_normal[vid]);
+		}
+
 	}
 }
 
@@ -660,12 +754,18 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
 		// TODO: uncomment the following code for a start
 		// This is primitive assembly for triangles
 
-		//int pid;	// id for cur primitives vector
-		//if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
-		//	pid = iid / (int)primitive.primitiveType;
-		//	dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
-		//		= primitive.dev_verticesOut[primitive.dev_indices[iid]];
-		//}
+		int pid;	// id for cur primitives vector
+		if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+			pid = iid / (int)primitive.primitiveType;
+
+			//Primitive Instancing
+			for (int i = 0; i < num_instances; i++) {
+				int inst_pid = pid + curPrimitiveBeginId + primitive.numPrimitives * i;
+				int vid = primitive.dev_indices[iid] + primitive.numVertices * i;
+
+				dev_primitives[inst_pid].v[iid % (int)primitive.primitiveType]  = primitive.dev_verticesOut[vid];
+			}
+		}
 
 
 		// TODO: other primitive types (point, line)
@@ -673,7 +773,80 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
 	
 }
 
+__global__
+void _kern_rasterize(const int numPrims, Primitive* dev_primitives,
+	int* dev_depth, const int width, const int height,
+	Fragment* dev_fragments) {
+
+	// Prim Bounds Check
+	const int pid = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (pid >= numPrims) return;
+
+	Primitive prim = dev_primitives[pid];
+
+	const glm::vec3 p1 = glm::vec3(prim.v[0].pos);
+	const glm::vec3 p2 = glm::vec3(prim.v[1].pos);
+	const glm::vec3 p3 = glm::vec3(prim.v[2].pos);
+
+	//Backface Culling
+	const glm::vec3 tri_nor = glm::cross(p2 - p1, p3 - p1);
+	if (tri_nor.z < 0) { return; }
 
+	const glm::vec3 tri[3] = { p1, p2, p3 };
+
+	const AABB bounds = getAABBForTriangleAndClamp(tri,width,height);
+
+	for (int x = bounds.min.x; x < bounds.max.x; x++) {
+		for (int y = bounds.min.y; y < bounds.max.y; y++) {
+			if (x >= width || y >= height) { break; }
+			
+			//Get Barycentric coordinate
+			const glm::vec3 baryc = calculateBarycentricCoordinate(tri, glm::vec2(x, y));
+			
+			//Only color in fragments that are within tri
+			if (isBarycentricCoordInBounds(baryc)) {
+				glm::vec3 one_over_tri[3] = { glm::vec3(tri[0].x, tri[0].y, 1.0 / tri[0].z),
+											  glm::vec3(tri[1].x, tri[1].y, 1.0 / tri[1].z),
+											  glm::vec3(tri[2].x, tri[2].y, 1.0 / tri[2].z) };
+
+				//Perspective correct interpolation of z in screen space
+				const float z_interpolated = -1.0 / getZAtCoordinate(baryc, one_over_tri);
+
+				const glm::vec3 frag_eyeNor = baryc[0] * prim.v[0].eyeNor +
+					                          baryc[1] * prim.v[1].eyeNor +
+					                          baryc[2] * prim.v[2].eyeNor;
+
+				//Correct color interpolation between points on a primitive
+				const glm::vec3 frag_col = baryc[0] * prim.v[0].color +
+					                       baryc[1] * prim.v[1].color +
+					                       baryc[2] * prim.v[2].color;
+
+				//Depth Buffer Precision Factor
+				const float dbuf_int_precision = 100000;
+
+				const int z_int = z_interpolated  * dbuf_int_precision;
+
+				int index = x + (y * width);
+
+				int* frag_depth = &dev_depth[index];
+				int  old_value = *frag_depth;
+
+				atomicMin(frag_depth, z_int);
+
+				__syncthreads();
+
+				//Which pixel won? +Determined Here.
+				//This is much cleaner code than using atomicCAS
+				if (*frag_depth == z_int) {
+				  //If this is less than current_depth
+					dev_fragments[index].color = glm::dot(frag_eyeNor, glm::normalize(tri_nor)) * frag_col;
+				}
+
+			}
+		}
+	}
+
+}
 
 /**
  * Perform rasterization.
@@ -684,9 +857,13 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
     dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
 		(height - 1) / blockSize2d.y + 1);
 
+#if TIMED
+	using time_point_t = std::chrono::high_resolution_clock::time_point;
+	time_point_t time_start_cpu = std::chrono::high_resolution_clock::now();
+#endif
+
 	// Execute your rasterization pipeline here
 	// (See README for rasterization pipeline outline.)
-
 	// Vertex Process & primitive assembly
 	{
 		curPrimitiveBeginId = 0;
@@ -702,7 +879,8 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 				dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 				dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 
-				_vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height);
+				_vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height,
+					                                                                   dev_instance_transforms);
 				checkCUDAError("Vertex Processing");
 				cudaDeviceSynchronize();
 				_primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
@@ -718,20 +896,72 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 
 		checkCUDAError("Vertex Processing and Primitive Assembly");
 	}
+
+#if TIMED
+	cudaDeviceSynchronize();
+	time_point_t time_end_cpu = std::chrono::high_resolution_clock::now();
+	std::chrono::duration<double, std::milli> duration = time_end_cpu - time_start_cpu;
+	float elapsed_time = static_cast<decltype(elapsed_time)>(duration.count());
+	printTime(elapsed_time, "VertexTransformAndAssembly (measured using std::chrono).");
+#endif
 	
 	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
 	initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth);
-	
-	// TODO: rasterize
 
+#if TIMED
+	using time_point_t = std::chrono::high_resolution_clock::time_point;
+	time_start_cpu = std::chrono::high_resolution_clock::now();
+#endif
+
+	const int numThreadsPerBlock = 128;
+	const int numBlocksForPrimitives = (totalNumPrimitives + numThreadsPerBlock - 1) / numThreadsPerBlock;
+	_kern_rasterize << <numBlocksForPrimitives, numThreadsPerBlock >> >(totalNumPrimitives, dev_primitives,
+		dev_depth, width, height, dev_fragmentBuffer);
+	checkCUDAError("_kern_rasterize");
+
+#if TIMED
+	cudaDeviceSynchronize();
+	time_end_cpu = std::chrono::high_resolution_clock::now();
+	duration = time_end_cpu - time_start_cpu;
+	elapsed_time = static_cast<decltype(elapsed_time)>(duration.count());
+	printTime(elapsed_time, "Rasterization (measured using std::chrono).");
+#endif
+
+
+#if TIMED
+	using time_point_t = std::chrono::high_resolution_clock::time_point;
+	time_start_cpu = std::chrono::high_resolution_clock::now();
+#endif
+    
+	// Copy depthbuffer colors into framebuffer
+	render << <blockCount2d, blockSize2d >> >(width/SSAA_RES, height/SSAA_RES, dev_fragmentBuffer, dev_framebuffer);
+	checkCUDAError("fragment shader");
+
+#if TIMED
+	cudaDeviceSynchronize();
+	time_end_cpu = std::chrono::high_resolution_clock::now();
+	duration = time_end_cpu - time_start_cpu;
+	elapsed_time = static_cast<decltype(elapsed_time)>(duration.count());
+	printTime(elapsed_time, "Rendering / FragmentBuffer to FrameBuffer (measured using std::chrono).");
+#endif
 
 
-    // Copy depthbuffer colors into framebuffer
-	render << <blockCount2d, blockSize2d >> >(width, height, dev_fragmentBuffer, dev_framebuffer);
-	checkCUDAError("fragment shader");
+#if TIMED
+	using time_point_t = std::chrono::high_resolution_clock::time_point;
+	time_start_cpu = std::chrono::high_resolution_clock::now();
+#endif
+
     // Copy framebuffer into OpenGL buffer for OpenGL previewing
-    sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
+    sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width / SSAA_RES, height / SSAA_RES, dev_framebuffer);
     checkCUDAError("copy render result to pbo");
+
+#if TIMED
+	cudaDeviceSynchronize();
+	time_end_cpu = std::chrono::high_resolution_clock::now();
+	duration = time_end_cpu - time_start_cpu;
+	elapsed_time = static_cast<decltype(elapsed_time)>(duration.count());
+	printTime(elapsed_time, "Image To PBO (measured using std::chrono).\n");
+#endif
 }
 
 /**
@@ -753,7 +983,7 @@ void rasterizeFree() {
 
 			cudaFree(p->dev_verticesOut);
 
-			
+
 			//TODO: release other attributes and materials
 		}
 	}
@@ -772,5 +1002,8 @@ void rasterizeFree() {
 	cudaFree(dev_depth);
 	dev_depth = NULL;
 
+	cudaFree(dev_instance_transforms);
+	dev_instance_transforms = NULL;
+
     checkCUDAError("rasterize Free");
 }
diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h
index 46c701e..039482f 100644
--- a/src/rasterizeTools.h
+++ b/src/rasterizeTools.h
@@ -30,7 +30,7 @@ glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) {
  * Finds the axis aligned bounding box for a given triangle.
  */
 __host__ __device__ static
-AABB getAABBForTriangle(const glm::vec3 tri[3]) {
+AABB getAABBForTriangleAndClamp(const glm::vec3 tri[3], const int width, const int height) {
     AABB aabb;
     aabb.min = glm::vec3(
             min(min(tri[0].x, tri[1].x), tri[2].x),
@@ -40,6 +40,9 @@ AABB getAABBForTriangle(const glm::vec3 tri[3]) {
             max(max(tri[0].x, tri[1].x), tri[2].x),
             max(max(tri[0].y, tri[1].y), tri[2].y),
             max(max(tri[0].z, tri[1].z), tri[2].z));
+
+	aabb.min = glm::clamp(aabb.min, glm::vec3(0.f), glm::vec3(width, height, 1));
+	aabb.max = glm::clamp(aabb.max, glm::vec3(0.f), glm::vec3(width, height, 1));
     return aabb;
 }
 
@@ -90,7 +93,7 @@ bool isBarycentricCoordInBounds(const glm::vec3 barycentricCoord) {
 
 // CHECKITOUT
 /**
- * For a given barycentric coordinate, compute the corresponding z position
+ * For a give barycentric coordinate, compute the corresponding z position
  * (i.e. depth) on the triangle.
  */
 __host__ __device__ static
@@ -99,3 +102,8 @@ float getZAtCoordinate(const glm::vec3 barycentricCoord, const glm::vec3 tri[3])
            + barycentricCoord.y * tri[1].z
            + barycentricCoord.z * tri[2].z);
 }
+
+
+void printTime(double elapsed_time, const char* message) {
+	printf("%f ms, %s \n", elapsed_time, message);
+}