diff --git a/README.md b/README.md
index cad1abd..88716e0 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,46 @@
 CUDA Rasterizer
 ===============
 
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Wenli Zhao
+* Tested on: Windows 10 Pro, Intel Xeon CPU CPU E5-1630 v4 @ 3.70GHz 32GB, NVIDIA GeForce GTX 24465MB (Sig Lab)
 
-### (TODO: Your README)
+### README
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+![](renders/Capture2.PNG)
 
+In this project, I implemented a simplified graphics rasterizer pipeline which includes vertex shading, primitive assembly, rasterization, fragment shading, and a framebuffer.
+
+The core features I implemented included:
+* Vertex shading. 
+* Primitive assembly with support for triangles read from buffers of index and
+  vertex data.
+* Rasterization.
+* Fragment shading.
+* A depth buffer for storing and depth testing fragments.
+* Fragment-to-depth-buffer writing with atomics
+* A fragment shader with simple Blinn-Phong lighting scheme.
+
+In addition to the basic rasterizer, I implemented UV texture mapping and support for rasterization with points and lines.
+
+#### Texture Mapping
+![](renders/Capture3.PNG)
+
+#### Points
+![](renders/points.PNG)
+
+#### Lines
+![](renders/lines.PNG)
+
+### Analysis
+
+![](renders/chart.png)
+
+![](renders/image.png)
+
+The features I implemented didn't have too much of a performance impact on the models I tested. For example, the first three bars of the chart have a similar distribution. For rasterization of points and lines, I didn't change vertex assembly very much, so the bottleneck remained there. Vertex assembly contains a lot of global memory calls that slow down the pipeline. In general, the fragment shading and rasterization were pretty quick. I think I might have corrupted the cow model since it gave a very different distribution. I'm still not exactly sure why. There is a lot more I could do to accelerate various parts of the rasterization pipeline. I could potentially use shared memory for texture sampling and used tile-based rendering to accelerate my pipeline.
 
 ### Credits
 
diff --git a/renders/Capture.PNG b/renders/Capture.PNG
new file mode 100644
index 0000000..b6ad63d
Binary files /dev/null and b/renders/Capture.PNG differ
diff --git a/renders/Capture2.PNG b/renders/Capture2.PNG
new file mode 100644
index 0000000..dffc089
Binary files /dev/null and b/renders/Capture2.PNG differ
diff --git a/renders/Capture3.PNG b/renders/Capture3.PNG
new file mode 100644
index 0000000..043767f
Binary files /dev/null and b/renders/Capture3.PNG differ
diff --git a/renders/chart.png b/renders/chart.png
new file mode 100644
index 0000000..9357160
Binary files /dev/null and b/renders/chart.png differ
diff --git a/renders/image.png b/renders/image.png
new file mode 100644
index 0000000..64bcaf2
Binary files /dev/null and b/renders/image.png differ
diff --git a/renders/lines.PNG b/renders/lines.PNG
new file mode 100644
index 0000000..4131945
Binary files /dev/null and b/renders/lines.PNG differ
diff --git a/renders/points.PNG b/renders/points.PNG
new file mode 100644
index 0000000..4510419
Binary files /dev/null and b/renders/points.PNG differ
diff --git a/src/main.cpp b/src/main.cpp
index 7986959..aed3732 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -108,6 +108,8 @@ void runCuda() {
 		scale * ((float)width / (float)height),
 		-scale, scale, 1.0, 1000.0);
 
+	P = glm::perspective(45.0f, scale*(float)width / (float)height, 1.0f, 1000.0f);
+
 	glm::mat4 V = glm::mat4(1.0f);
 
 	glm::mat4 M =
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 1262a09..6b13074 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -18,6 +18,11 @@
 #include <glm/gtc/quaternion.hpp>
 #include <glm/gtc/matrix_transform.hpp>
 
+#define ATOMIC 1
+#define POINT 0
+#define LINE 0
+#define TRIANGLE 1
+
 namespace {
 
 	typedef unsigned short VertexIndex;
@@ -46,7 +51,7 @@ namespace {
 		// glm::vec3 col;
 		 glm::vec2 texcoord0;
 		 TextureData* dev_diffuseTex = NULL;
-		// int texWidth, texHeight;
+		 int texWidth, texHeight;
 		// ...
 	};
 
@@ -62,11 +67,11 @@ namespace {
 		// The attributes listed below might be useful, 
 		// but always feel free to modify on your own
 
-		// glm::vec3 eyePos;	// eye space position used for shading
-		// glm::vec3 eyeNor;
-		// VertexAttributeTexcoord texcoord0;
-		// TextureData* dev_diffuseTex;
-		// ...
+		 glm::vec3 eyePos;	// eye space position used for shading
+		 glm::vec3 eyeNor;
+		 VertexAttributeTexcoord texcoord0;
+		 TextureData* dev_diffuseTex;
+		 int texWidth, texHeight;
 	};
 
 	struct PrimitiveDevBufPointers {
@@ -94,10 +99,17 @@ namespace {
 		VertexOut* dev_verticesOut;
 
 		// TODO: add more attributes when needed
+
 	};
 
 }
 
+PerformanceTimer& timer()
+{
+	static PerformanceTimer timer;
+	return timer;
+}
+
 static std::map<std::string, std::vector<PrimitiveDevBufPointers>> mesh2PrimitivesMap;
 
 
@@ -110,6 +122,7 @@ static Fragment *dev_fragmentBuffer = NULL;
 static glm::vec3 *dev_framebuffer = NULL;
 
 static int * dev_depth = NULL;	// you might need this buffer when doing depth test
+static unsigned int * dev_mutex = NULL;
 
 /**
  * Kernel that writes the image to the OpenGL PBO directly.
@@ -133,6 +146,16 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
     }
 }
 
+__device__
+glm::vec3 getColor(TextureData* dev_texcoord0, int u, int v, int width) {
+
+	int	index = u + v*width;
+	float r = (float) dev_texcoord0[index * 3] / 255.0f;
+	float g = (float) dev_texcoord0[index * 3 + 1] / 255.0f;
+	float b = (float) dev_texcoord0[index * 3 + 2] / 255.0f;
+	return glm::vec3(r, g, b);
+}
+
 /** 
 * Writes fragment colors to the framebuffer
 */
@@ -146,8 +169,40 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
         framebuffer[index] = fragmentBuffer[index].color;
 
 		// TODO: add your fragment shader code here
+		Fragment frag = fragmentBuffer[index];
 
+		glm::vec3 lightPos = glm::vec3(60,10,10);
+		glm::vec3 ambientCol = frag.color;
+
+#if TRIANGLE
+		if (frag.color != glm::vec3(0.0, 0.0, 0.0)) {
+			ambientCol = getColor(frag.dev_diffuseTex, frag.texcoord0.x * frag.texWidth, frag.texcoord0.y * frag.texHeight, frag.texWidth);
+		}
+#endif
+
+		glm::vec3 diffuseCol = glm::vec3(0.5f, 0.0f, 0.0f);
+		glm::vec3 specColor = glm::vec3(1.0f, 1.0f, 1.0f);
+		const float shininess = 5.0;
+		const float screenGamma = 1.2;
+
+		glm::vec3 lightDir = normalize(lightPos - fragmentBuffer[index].eyePos);
+		glm::vec3 normal = fragmentBuffer[index].eyeNor;
+
+		float lambertian = max(dot(lightDir, normal), 0.0f);
+		float specular = 0.0;
+		if (lambertian > 0.0) {
+			glm::vec3 viewDir = normalize(-fragmentBuffer[index].eyePos);
+
+			glm::vec3 halfDir = normalize(lightDir + viewDir);
+			float specAngle = max(dot(halfDir, normal), 0.0f);
+			specular = pow(specAngle, shininess);
+		}
+		glm::vec3 colorLinear = ambientCol + lambertian * diffuseCol + specular*specColor;
+
+		glm::vec3 colorGammaCorrected = pow(colorLinear, glm::vec3(1.0f / screenGamma));
+		framebuffer[index] = colorGammaCorrected;
     }
+
 }
 
 /**
@@ -166,11 +221,14 @@ void rasterizeInit(int w, int h) {
 	cudaFree(dev_depth);
 	cudaMalloc(&dev_depth, width * height * sizeof(int));
 
+	cudaFree(dev_mutex);
+	cudaMalloc(&dev_mutex, width*height * sizeof(unsigned int));
+
 	checkCUDAError("rasterizeInit");
 }
 
 __global__
-void initDepth(int w, int h, int * depth)
+void initDepth(int w, int h, int * depth, unsigned int * mutex)
 {
 	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
 	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
@@ -179,6 +237,7 @@ void initDepth(int w, int h, int * depth)
 	{
 		int index = x + (y * w);
 		depth[index] = INT_MAX;
+		mutex[index] = 0;
 	}
 }
 
@@ -294,6 +353,7 @@ void traverseNode (
 	}
 }
 
+// Called once in main init function.
 void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 
 	totalNumPrimitives = 0;
@@ -331,7 +391,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 
 	// 2. for each mesh: 
 	//		for each primitive: 
-	//			build device buffer of indices, materail, and each attributes
+	//			build device buffer of indices, material, and each attributes
 	//			and store these pointers in a map
 	{
 
@@ -638,9 +698,34 @@ void _vertexTransformAndAssembly(
 		// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
 		// Then divide the pos by its w element to transform into NDC space
 		// Finally transform x and y to viewport space
+		VertexIndex vIndex = primitive.dev_indices[vid];
+		VertexAttributePosition vPos = primitive.dev_position[vid];
+		VertexAttributeNormal vNormal = primitive.dev_normal[vid];
+		VertexAttributeTexcoord vTexcoord = primitive.dev_texcoord0[vid];
+
+		// NDC
+		glm::vec4 vOutPos = MVP * glm::vec4(vPos, 1.0f);
+		vOutPos = vOutPos / vOutPos.w;
+
+		// Screen Space
+		vOutPos = glm::vec4(0.5f * (float) width * (vOutPos.x + 1.0f), 0.5f * (float) height * (1.0f - vOutPos.y), vOutPos.z, 1.0f);
+
+		glm::vec3 eyePos = glm::vec3(MV * glm::vec4(vPos, 0));
+		
+		VertexOut vOut;
+		vOut.pos = vOutPos;
+		vOut.eyeNor = glm::vec3(MV * glm::vec4(vNormal,0.0f));
+		vOut.texcoord0 = vTexcoord;
+		vOut.eyePos = eyePos;
+		vOut.texHeight = primitive.diffuseTexHeight;
+		vOut.texWidth = primitive.diffuseTexWidth;
+		vOut.dev_diffuseTex = primitive.dev_diffuseTex;
+
+
+		primitive.dev_verticesOut[vid] = vOut;
 
 		// TODO: Apply vertex assembly here
-		// Assemble all attribute arraies into the primitive array
+		// Assemble all attribute arrays into the primitive array
 		
 	}
 }
@@ -660,12 +745,15 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
 		// TODO: uncomment the following code for a start
 		// This is primitive assembly for triangles
 
-		//int pid;	// id for cur primitives vector
-		//if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
-		//	pid = iid / (int)primitive.primitiveType;
-		//	dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
-		//		= primitive.dev_verticesOut[primitive.dev_indices[iid]];
-		//}
+		int pid;	// id for cur primitives vector
+		if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+			// id divided by 3 for triangles
+			pid = iid / (int)primitive.primitiveType;
+
+			// every primitive has a vertex, 
+			dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+				= primitive.dev_verticesOut[primitive.dev_indices[iid]];
+		}
 
 
 		// TODO: other primitive types (point, line)
@@ -673,6 +761,151 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
 	
 }
 
+__device__
+glm::vec3 barycentricInterpolation(const glm::vec3 tri[3], const glm::vec3 coord) {
+	return coord.x * tri[0] + coord.y * tri[1] + coord.z * tri[2];
+}
+
+__device__
+glm::vec2 perspectiveInterp(const glm::vec2 texcoord[3], const glm::vec3 coord) {
+	return (float) coord.x * texcoord[0] + (float) coord.y * texcoord[1] + (float) coord.z * texcoord[2];
+}
+
+__global__
+void _rasterizeTriangles(int numPrimitives, int numFragments, int width, Fragment* dev_fragments, Primitive* dev_primitives, int* dev_depth, unsigned int * dev_mutex) {
+	int iid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (iid < numPrimitives) {
+		Primitive triangle = dev_primitives[iid];
+		VertexOut vert0 = triangle.v[0];
+		VertexOut vert1 = triangle.v[1];
+		VertexOut vert2 = triangle.v[2];
+		glm::vec3 tri[3] = { glm::vec3(vert0.pos), 
+							 glm::vec3(vert1.pos), 
+							 glm::vec3(vert2.pos) };
+		AABB boundingBox = getAABBForTriangle(tri);
+
+		for (int y = boundingBox.min.y; y < boundingBox.max.y; y++) {
+			for (int x = boundingBox.min.x; x < boundingBox.max.x; x++) {
+				Fragment f;
+				glm::vec3 bary = calculateBarycentricCoordinate(tri, glm::vec2((float)x, (float)y));
+				if (isBarycentricCoordInBounds(bary)) {
+					// eyePos
+					glm::vec3 eyePos[3] = { vert0.eyePos, vert1.eyePos, vert2.eyePos };
+					f.eyePos = barycentricInterpolation(eyePos, bary);
+
+					// eyeNormals
+					glm::vec3 eyeNormals[3] = { vert0.eyeNor, vert1.eyeNor, vert2.eyeNor };
+					f.eyeNor = barycentricInterpolation(eyeNormals, bary);
+
+					// pos
+					glm::vec3 pos = barycentricInterpolation(tri, bary);
+
+					// textures: 
+
+					glm::vec2 texCoord[3] = { vert0.texcoord0/vert0.pos.z, vert1.texcoord0/vert1.pos.z, vert2.texcoord0/vert2.pos.z };
+					glm::vec2 texCoordinate = pos.z * glm::vec2(perspectiveInterp(texCoord, bary));
+
+					f.texcoord0 = glm::vec2(texCoordinate.x, texCoordinate.y);
+					
+					f.color = glm::vec3(0.01f, 0.0f, 0.0f);
+					f.dev_diffuseTex = vert0.dev_diffuseTex;
+					f.texWidth = vert0.texWidth;
+					f.texHeight = vert0.texHeight;
+					
+					// Depth Buffering
+					int old = dev_depth[y*width + x];
+					bool isSet;
+					do {
+						isSet = (atomicCAS(&dev_mutex[y*width + x], 0, 1) == 0);
+						if (isSet) {
+							// Critical section goes here.
+							// The critical section MUST be inside the wait loop;
+							// if it is afterward, a deadlock will occur.
+							dev_depth[y*width + x] = min(old, (int)(pos.z * INT_MAX));
+							if (old > dev_depth[y*width + x]) {
+								dev_fragments[y * width + x] = f;
+							}
+
+						}
+						if (isSet) {
+							dev_mutex[y*width + x] = 0;
+						}
+					} while (!isSet);
+
+				}
+			}
+		}
+	}
+}
+
+__global__
+void _rasterizePoints(int numPrimitives, int numFragments, int width, Fragment* dev_fragments, Primitive* dev_primitives, int* dev_depth, unsigned int * dev_mutex) {
+	int iid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (iid < numPrimitives) {
+		Primitive triangle = dev_primitives[iid];
+
+		for (int i = 0; i < 3; i++) {
+			Fragment f;
+			int x = triangle.v[i].pos.x;
+			int y = triangle.v[i].pos.y;
+			f.color = triangle.v[i].eyeNor;
+			dev_fragments[y*width + x] = f;
+		}
+		
+	}
+}
+
+__global__
+void _rasterizeLines(int numPrimitives, int numFragments, int width, Fragment* dev_fragments, Primitive* dev_primitives, int* dev_depth, unsigned int * dev_mutex) {
+	int iid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (iid < numPrimitives) {
+		Primitive triangle = dev_primitives[iid];
+		for (int i = 0; i < 3; i++) {
+			Fragment f;
+			f.color = triangle.v[i].eyeNor;
+			int x1 = triangle.v[i].pos.x;
+			int y1 = triangle.v[i].pos.y;
+			int x2 = triangle.v[(i + 1) % 3].pos.x;
+			int y2 = triangle.v[(i + 1) % 3].pos.y;
+			int y = min(y1, y2);
+
+			glm::vec2 p1 = glm::vec2(x1, y1);
+			glm::vec2 p2 = glm::vec2(x2, y2);
+
+			// swap if necessary
+			if (x1 > x2) {
+				glm::vec2 temp = p1;
+				p1 = p2;
+				p2 = temp;
+			}
+
+			// get slope
+			float m = (float)(p2.y - p1.y) / (float)(p2.x - p1.x);
+			if (m == INFINITY) {
+				int ymin = y;
+				int ymax = max(y1, y2);
+				for (int qy = ymin; qy < ymax; qy++) {
+					dev_fragments[y*width + x1] = f;
+				}
+			}
+			else {
+				int dx = p2.x - p1.x; 
+				for (int dxe = 0; dxe < dx - 1; dxe++) {
+					int dye = (int)( dxe * m + y);
+					int index = dye*width + dxe + p1.x;
+					dev_fragments[index] = f;
+				}
+			}
+
+		}
+	}
+}
+
+
+
 
 
 /**
@@ -687,6 +920,7 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 	// Execute your rasterization pipeline here
 	// (See README for rasterization pipeline outline.)
 
+	timer().startCpuTimer();
 	// Vertex Process & primitive assembly
 	{
 		curPrimitiveBeginId = 0;
@@ -702,9 +936,12 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 				dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 				dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 
+				// 1. Vertex Assembly
 				_vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height);
 				checkCUDAError("Vertex Processing");
 				cudaDeviceSynchronize();
+
+				// 2. Primitive Assembly
 				_primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
 					(p->numIndices, 
 					curPrimitiveBeginId, 
@@ -718,17 +955,34 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 
 		checkCUDAError("Vertex Processing and Primitive Assembly");
 	}
+	timer().endCpuTimer();
+	printElapsedTime(timer().getCpuElapsedTimeForPreviousOperation(), "VERTEX ASSEMBLY");
 	
 	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
-	initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth);
+	initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth, dev_mutex);
 	
 	// TODO: rasterize
-
-
-
+	dim3 numThreadsPerBlock(128);
+	dim3 numBlocksForPrimitives((curPrimitiveBeginId + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+	timer().startCpuTimer();
+#if TRIANGLE
+	_rasterizeTriangles << < numBlocksForPrimitives, numThreadsPerBlock >> > (curPrimitiveBeginId, width*height, width, dev_fragmentBuffer, dev_primitives, dev_depth, dev_mutex);
+#elif POINT
+	_rasterizePoints << < numBlocksForPrimitives, numThreadsPerBlock >> > (curPrimitiveBeginId, width*height, width, dev_fragmentBuffer, dev_primitives, dev_depth, dev_mutex);
+#elif LINE
+	_rasterizeLines << < numBlocksForPrimitives, numThreadsPerBlock >> > (curPrimitiveBeginId, width*height, width, dev_fragmentBuffer, dev_primitives, dev_depth, dev_mutex);
+#endif
+	timer().endCpuTimer();
+	printElapsedTime(timer().getCpuElapsedTimeForPreviousOperation(), "RASTERIZATION");
+
+
+	timer().startCpuTimer();
     // Copy depthbuffer colors into framebuffer
 	render << <blockCount2d, blockSize2d >> >(width, height, dev_fragmentBuffer, dev_framebuffer);
 	checkCUDAError("fragment shader");
+	timer().endCpuTimer();
+	printElapsedTime(timer().getCpuElapsedTimeForPreviousOperation(), "FRAGMENT SHADER");
+
     // Copy framebuffer into OpenGL buffer for OpenGL previewing
     sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
     checkCUDAError("copy render result to pbo");
@@ -772,5 +1026,8 @@ void rasterizeFree() {
 	cudaFree(dev_depth);
 	dev_depth = NULL;
 
+	cudaFree(dev_mutex);
+	dev_mutex = NULL;
+
     checkCUDAError("rasterize Free");
 }
diff --git a/src/rasterize.h b/src/rasterize.h
index 560aae9..c20c040 100644
--- a/src/rasterize.h
+++ b/src/rasterize.h
@@ -12,6 +12,9 @@
 #include <glm/gtc/matrix_transform.hpp>
 #include <glm/gtx/transform.hpp>
 
+#include <chrono>
+#include <ctime>
+
 namespace tinygltf{
 	class Scene;
 }
@@ -22,3 +25,72 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene);
 
 void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal);
 void rasterizeFree();
+
+class PerformanceTimer
+{
+public:
+	PerformanceTimer()
+	{
+		cudaEventCreate(&event_start);
+		cudaEventCreate(&event_end);
+	}
+
+	~PerformanceTimer()
+	{
+		cudaEventDestroy(event_start);
+		cudaEventDestroy(event_end);
+	}
+
+	void startCpuTimer()
+	{
+		if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+		cpu_timer_started = true;
+
+		time_start_cpu = std::chrono::high_resolution_clock::now();
+	}
+
+	void endCpuTimer()
+	{
+		time_end_cpu = std::chrono::high_resolution_clock::now();
+
+		if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+		std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+		prev_elapsed_time_cpu_milliseconds =
+			static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
+
+		cpu_timer_started = false;
+	}
+
+	float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
+	{
+		return prev_elapsed_time_cpu_milliseconds;
+	}
+
+
+	// remove copy and move functions
+	PerformanceTimer(const PerformanceTimer&) = delete;
+	PerformanceTimer(PerformanceTimer&&) = delete;
+	PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+	PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+private:
+	cudaEvent_t event_start = nullptr;
+	cudaEvent_t event_end = nullptr;
+
+	using time_point_t = std::chrono::high_resolution_clock::time_point;
+	time_point_t time_start_cpu;
+	time_point_t time_end_cpu;
+
+	bool cpu_timer_started = false;
+
+	float prev_elapsed_time_cpu_milliseconds = 0.f;
+};
+
+PerformanceTimer& timer();
+
+template<typename T>
+void printElapsedTime(T time, std::string note = "")
+{
+	std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
+}
\ No newline at end of file