diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed74e88..6882561 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.0)
 
 project(cis565_rasterizer)
 
+find_package(Threads REQUIRED)
+find_package(CUDA 8.0 REQUIRED)
+
 set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
 
 # Set up include and lib paths
@@ -76,8 +79,6 @@ if (WIN32)
 endif()
 
 # CUDA linker options
-find_package(Threads REQUIRED)
-find_package(CUDA 8.0 REQUIRED)
 set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
 set(CUDA_SEPARABLE_COMPILATION ON)
diff --git a/README.md b/README.md
index cad1abd..6365dcb 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,66 @@
 CUDA Rasterizer
 ===============
 
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
-
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Charles Wang
+* Tested on: Windows 10, i7-6700K @ 4.00GHz 16GB, GTX 1060 6GB (Personal Computer)
+
+![](img/truck.gif)
+
+## **Project Overview and Goals**
+
+The goal of this project was to write (most of) the graphics pipeline with CUDA kernels. The project exercises an understanding of what OpenGL and other graphics APIs typically do under the hood. There is also an ongoing discussion about the shift between hardware-implemented and software-implemented pipeline components, so this project is, in a way, an investigation of what a software implementation can do.
+
+I implemented:
+- Vertex assembly
+  - storing the proper information into VertexOut objects after the model and camera transformations
+- Rasterization
+  - storing fragments, with barycentric interpolation for normals and UVs
+- Rendering (essentially fragment shading)
+  - using the stored information to compute Lambertian lighting
+
+### Different Rasterization Modes
+
+I implemented three different rasterization modes: Lambert-shaded, wireframe, and point cloud. All three use the same geometry but present the data in slightly different ways.
+
+| ![](img/duck.PNG) | ![](img/ducklines.PNG) | ![](img/duckpoints.PNG) |
+|---|---|---|
+| Lambert shading with textures | triangle wireframe | point cloud |
+
+### Perspective-Correct Interpolation
+
+Naively interpolating UV coordinates with screen-space barycentric weights results in visible skewing at certain viewing angles. Perspective-correct interpolation remedies this at a small computational cost.
+
+| ![](img/notperspectivecorrect.PNG) | ![](img/perspectivecorrect.PNG) |
+|---|---|
+| no perspective correction | perspective correction |
+
+### Bilinear Texture Filtering
+
+Casting UV coordinates from floats to ints and fetching a single texel directly commonly leads to aliasing when the texture resolution is low. Bilinear texture filtering averages the nearest texels so the resulting surface appears smoother.
+
+| ![](img/nobilinear.PNG) | ![](img/bilinear.PNG) |
+|---|---|
+| no bilinear filtering | bilinear filtering |
+
+### Performance Analysis
+
+Below are charts that break down where the compute time goes in my graphics pipeline.
+
+For some reason, vertex and primitive assembly takes up a huge chunk of the total render time. This needs further investigation, since there is not much complexity in what has been implemented; it may be a hardware quirk.
+
+![](img/chart.PNG)
+
+In any case, the vertex and primitive assembly stages are the same for every configuration of my rasterizer, so it is more meaningful to compare the remaining stages:
+
-### (TODO: Your README)
+![](img/chartnovertex.PNG)
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+
+Rasterization is slightly more costly for shaded geometry because we need to test every fragment inside a triangle's bounding box for coverage, rather than just plotting lines or points. Textured geometry also takes longer to rasterize because we need to compute perspective-correct interpolated UV values.
+Fragment shading for shaded geometry is also significantly slower because we need to compute Lambertian shading and fetch texture data rather than simply passing the fragment color through. Introducing bilinear texture filtering slows things down further because each fragment color requires several texture reads instead of one.
 
 ### Credits
diff --git a/img/bilinear.PNG b/img/bilinear.PNG
new file mode 100644
index 0000000..23c5953
Binary files /dev/null and b/img/bilinear.PNG differ
diff --git a/img/chart.PNG b/img/chart.PNG
new file mode 100644
index 0000000..abd17cc
Binary files /dev/null and b/img/chart.PNG differ
diff --git a/img/chartnovertex.PNG b/img/chartnovertex.PNG
new file mode 100644
index 0000000..45e4ba7
Binary files /dev/null and b/img/chartnovertex.PNG differ
diff --git a/img/duck.PNG b/img/duck.PNG
new file mode 100644
index 0000000..0aa1d75
Binary files /dev/null and b/img/duck.PNG differ
diff --git a/img/ducklines.PNG b/img/ducklines.PNG
new file mode 100644
index 0000000..d1d77ef
Binary files /dev/null and b/img/ducklines.PNG differ
diff --git a/img/duckpoints.PNG b/img/duckpoints.PNG
new file mode 100644
index 0000000..8abc806
Binary files /dev/null and b/img/duckpoints.PNG differ
diff --git a/img/nobilinear.PNG b/img/nobilinear.PNG
new file mode 100644
index 0000000..d67ac06
Binary files /dev/null and b/img/nobilinear.PNG differ
diff --git a/img/notperspectivecorrect.PNG b/img/notperspectivecorrect.PNG
new file mode 100644
index 0000000..816c4ad
Binary files /dev/null and b/img/notperspectivecorrect.PNG differ
diff --git a/img/perspectivecorrect.PNG b/img/perspectivecorrect.PNG
new file mode 100644
index 0000000..ec3d16d
Binary files /dev/null and b/img/perspectivecorrect.PNG differ
diff --git a/img/truck.gif b/img/truck.gif
new file mode 100644
index 0000000..393d412
Binary files /dev/null and b/img/truck.gif differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a57f69f..d7be2b6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,9 +2,12 @@ set(SOURCE_FILES
     "rasterize.cu"
     "rasterize.h"
     "rasterizeTools.h"
+    "timer.h"
     )
 
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
+
+LIST(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_61,code=sm_61")
\ No newline at end of file
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 1262a09..1dc6256 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -18,6 +18,13 @@
 #include 
 #include 
+#include "timer.h"
+
+#define TEXTURE 1
+#define PERSPECTIVECORRECT 1
+#define BILINEAR 1
+#define DRAWMODE 2
+
 namespace {
 
     typedef unsigned short VertexIndex;
@@ -43,10 +50,10 @@
         glm::vec3 eyePos;   // eye space position used for shading
         glm::vec3 eyeNor;   // eye space normal used for shading, cuz normal will go wrong after perspective transformation
-        // glm::vec3 col;
+        glm::vec3 col;
         glm::vec2 texcoord0;
         TextureData* dev_diffuseTex = NULL;
-        // int texWidth, texHeight;
+        int texWidth, texHeight;
         // ...
     };
 
@@ -62,11 +69,12 @@
         // The attributes listed below might be useful,
         // but always feel free to modify on your own
-        // glm::vec3 eyePos;   // eye space position used for shading
-        // glm::vec3 eyeNor;
-        // VertexAttributeTexcoord texcoord0;
-        // TextureData* dev_diffuseTex;
-        // ...
+        glm::vec3 eyePos;   // eye space position used for shading
+        glm::vec3 eyeNor;
+        VertexAttributeTexcoord texcoord0;
+        TextureData* dev_diffuseTex;
+        glm::vec2 uv;
+        int texWidth, texHeight;
     };
 
     struct PrimitiveDevBufPointers {
@@ -109,7 +117,8 @@
 static Primitive *dev_primitives = NULL;
 static Fragment *dev_fragmentBuffer = NULL;
 static glm::vec3 *dev_framebuffer = NULL;
 
-static int * dev_depth = NULL;    // you might need this buffer when doing depth test
+static float * dev_depth = NULL;    // you might need this buffer when doing depth test
+static int * dev_mutex = NULL;
 
 /**
  * Kernel that writes the image to the OpenGL PBO directly.
@@ -133,6 +142,33 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
     }
 }
 
+__device__
+glm::vec3 getTexColor(TextureData* texture, int u, int v, int width, int height) {
+    int index = u + v * width;
+    float r = (float)texture[index * 3] / 255.0f;
+    float g = (float)texture[(index * 3) + 1] / 255.0f;
+    float b = (float)texture[(index * 3) + 2] / 255.0f;
+    return glm::vec3(r, g, b);
+}
+
+__device__
+glm::vec3 getBilinearFilterColor(TextureData* texture, float u, float v, int width, int height) {
+    float dX = u - glm::floor(u);
+    float dY = v - glm::floor(v);
+    int x = (int)u;
+    int y = (int)v;
+
+    glm::vec3 p0 = getTexColor(texture, x, y, width, height);
+    glm::vec3 p1 = getTexColor(texture, glm::clamp(x+1, 0, width - 1), y, width, height);
+    glm::vec3 p2 = getTexColor(texture, x, glm::clamp(y + 1, 0, height - 1), width, height);
+    glm::vec3 p3 = getTexColor(texture, glm::clamp(x + 1, 0, width - 1), glm::clamp(y + 1, 0, height - 1), width, height);
+
+    glm::vec3 interp1 = glm::mix(p0, p1, dX);
+    glm::vec3 interp2 = glm::mix(p2, p3, dX);
+    return glm::mix(interp1, interp2, dY);
+
+}
+
 /**
  * Writes fragment colors to the framebuffer
  */
@@ -143,7 +179,28 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
     int index = x + (y * w);
 
     if (x < w && y < h) {
-        framebuffer[index] = fragmentBuffer[index].color;
+#if DRAWMODE == 2
+        //LAMBERT SHADING
+        Fragment f = fragmentBuffer[index];
+        glm::vec3 col(1.0f, 1.0f, 1.0f);
+#if TEXTURE
+        if (f.dev_diffuseTex != NULL) {
+            float u = f.uv[0] * (float)f.texWidth;
+            float v = f.uv[1] * (float)f.texHeight;
+            int color_idx = u + v*f.texWidth;
+#if BILINEAR
+            col = getBilinearFilterColor(f.dev_diffuseTex, u, v, f.texWidth, f.texHeight);
+#else
+            col = getTexColor(f.dev_diffuseTex, u, v, f.texWidth, f.texHeight);
+#endif
+        }
+#endif
+        glm::vec3 light_direction = glm::normalize(-f.eyePos);
+        float dot = glm::dot(light_direction, f.eyeNor);
+        framebuffer[index] = col * dot;
+#else DRAWMODE == 0 //POINTS OR LINES
+        framebuffer[index] = fragmentBuffer[index].color;
+#endif
 
         // TODO: add your fragment shader code here
@@ -164,13 +221,17 @@ void rasterizeInit(int w, int h) {
     cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
 
     cudaFree(dev_depth);
-    cudaMalloc(&dev_depth, width * height * sizeof(int));
+    cudaMalloc(&dev_depth, width * height * sizeof(float));
+
+    cudaFree(dev_mutex);
+    cudaMalloc(&dev_mutex, width * height * sizeof(int));
+    cudaMemset(dev_mutex, 0, width * height * sizeof(int));
 
     checkCUDAError("rasterizeInit");
 }
 
 __global__
-void initDepth(int w, int h, int * depth)
+void initDepth(int w, int h, float * depth)
 {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
@@ -178,7 +239,7 @@
     if (x < w && y < h) {
         int index = x + (y * w);
-        depth[index] = INT_MAX;
+        depth[index] = INFINITY;
     }
 }
@@ -636,9 +697,23 @@ void _vertexTransformAndAssembly(
     // TODO: Apply vertex transformation here
     // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
+    glm::vec4 clip_pos = MVP * glm::vec4(primitive.dev_position[vid], 1.0f);
     // Then divide the pos by its w element to transform into NDC space
+    clip_pos /= clip_pos.w;
     // Finally transform x and y to viewport space
-
+    float out_x = (0.5f * (float)width) * (clip_pos.x + 1.0f);
+    float out_y = (0.5f * (float)height) * (1.0f - clip_pos.y);
+    VertexOut vOut;
+    vOut.pos = glm::vec4(out_x, out_y, -clip_pos.z, 1.0f);
+    vOut.eyePos = glm::vec3(MV * glm::vec4(primitive.dev_position[vid], 1.0f));
+    vOut.eyeNor = glm::normalize(MV_normal * primitive.dev_normal[vid]);
+    vOut.texcoord0 = primitive.dev_texcoord0[vid];
+    vOut.dev_diffuseTex = primitive.dev_diffuseTex;
+    vOut.texWidth = primitive.diffuseTexWidth;
+    vOut.texHeight = primitive.diffuseTexHeight;
+
+
+    primitive.dev_verticesOut[vid] = vOut;
 
     // TODO: Apply vertex assembly here
     // Assemble all attribute arraies into the primitive array
@@ -660,12 +735,12 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
     // TODO: uncomment the following code for a start
     // This is primitive assembly for triangles
-    //int pid; // id for cur primitives vector
-    //if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
-    //    pid = iid / (int)primitive.primitiveType;
-    //    dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
-    //        = primitive.dev_verticesOut[primitive.dev_indices[iid]];
-    //}
+    int pid; // id for cur primitives vector
+    if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+        pid = iid / (int)primitive.primitiveType;
+        dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+            = primitive.dev_verticesOut[primitive.dev_indices[iid]];
+    }
 
     // TODO: other primitive types (point, line)
@@ -674,6 +749,125 @@
    }
 }
 
+__global__
+void kernRasterize(int n, int width, int height, Primitive *primitives, Fragment *fragment_buffer, float * depth_buffer, int * mutex) {
+
+    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (idx >= n) { return; }
+    Primitive p = primitives[idx];
+#if DRAWMODE == 2
+    glm::vec3 tri[3];
+    tri[0] = glm::vec3(p.v[0].pos);
+    tri[1] = glm::vec3(p.v[1].pos);
+    tri[2] = glm::vec3(p.v[2].pos);
+    AABB aabb = getAABBForTriangle(tri);
+    int minX = glm::clamp((int)aabb.min.x, 0, width);
+    int maxX = glm::clamp((int)aabb.max.x, 0, width);
+    int minY = glm::clamp((int)aabb.min.y, 0, height);
+    int maxY = glm::clamp((int)aabb.max.y, 0, height);
+    for (int i = minX; i <= maxX; i++) {
+        for (int j = minY; j <= maxY; j++) {
+            int id1d = (j*width) + i;
+            glm::vec3 bary = calculateBarycentricCoordinate(tri, glm::vec2(i, j));
+            if (isBarycentricCoordInBounds(bary)) {
+                VertexOut v0 = p.v[0];
+                VertexOut v1 = p.v[1];
+                VertexOut v2 = p.v[2];
+                glm::vec3 normal = glm::normalize(bary.x * v0.eyeNor + bary.y * v1.eyeNor + bary.z * v2.eyeNor);
+                glm::vec3 pos = bary.x * v0.eyePos + bary.y * v1.eyePos + bary.z * v2.eyePos;
+                Fragment f;
+                f.color = glm::vec3(1.0f, 1.0f, 1.0f);
+                f.eyeNor = normal;
+                f.eyePos = pos;
+#if TEXTURE
+                glm::vec2 uv;
+#if PERSPECTIVECORRECT
+                float z0 = 1.0f / v0.eyePos.z;
+                float z1 = 1.0f / v1.eyePos.z;
+                float z2 = 1.0f / v2.eyePos.z;
+                uv = (bary.x * v0.texcoord0 * z0) + (bary.y * v1.texcoord0 * z1) + (bary.z * v2.texcoord0 * z2);
+                uv /= (bary.x * z0 + bary.y * z1 + bary.z *z2);
+#else
+                uv = bary.x * v0.texcoord0 + bary.y * v1.texcoord0 + bary.z * v2.texcoord0;
+#endif
+                f.uv = uv;
+                f.texWidth = v0.texWidth;
+                f.texHeight = v0.texHeight;
+                f.dev_diffuseTex = v0.dev_diffuseTex;
+#endif
+
+                float curr_depth = getZAtCoordinate(bary, tri); //calculate value
+                bool isSet;
+                do {
+                    isSet = (atomicCAS(&mutex[id1d], 0, 1) == 0);
+                    if (isSet) {
+                        if (curr_depth < depth_buffer[id1d]) {
+                            depth_buffer[id1d] = curr_depth;
+                            fragment_buffer[id1d] = f;
+                        }
+                        mutex[id1d] = 0;
+                    }
+
+                } while (!isSet);
+            }
+        }
+    }
+#elif DRAWMODE == 0 // POINTS
+    for (int i = 0; i < 3; i++) {
+        int x = p.v[i].pos.x;
+        int y = p.v[i].pos.y;
+        int id1d = (y*width) + x;
+        if ((x >= 0 && x < width) && (y >= 0 && y < height)) {
+            fragment_buffer[id1d].color = glm::vec3(1.0f, 1.0f, 1.0f);
+        }
+    }
+#else //LINES
+    for (int i = 0; i < 3; i++) {
+        int j = i + 1;
+        if (j > 2) { j = 0; }
+        glm::vec4 origin;
+        glm::vec4 dest;
+        if (p.v[i].pos.x <= p.v[j].pos.x) {
+            origin = p.v[i].pos;
+            dest = p.v[j].pos;
+        }
+        else {
+            dest = p.v[i].pos;
+            origin = p.v[j].pos;
+        }
+        int dY = (int)dest.y - (int)origin.y;
+        int dX = (int)dest.x - (int)origin.x;
+        int prevY = origin.y;
+        for (int x = (int)origin.x; x <= (int)dest.x; x++) {
+            int y = (int)origin.y + (dY * (x - (int)origin.x ) /dX);
+            int oriY;
+            int destY;
+            if (prevY <= y) {
+                oriY = prevY;
+                destY = y;
+            }
+            else {
+                oriY = y;
+                destY = prevY;
+            }
+            for (int fillY = oriY; fillY <= destY; fillY++) {
+                int id1d = (fillY*width) + x;
+                if ((x >= 0 && x < width) && (fillY >= 0 && fillY < height)) {
+                    fragment_buffer[id1d].color = glm::vec3(1.0f, 1.0f, 1.0f);
+                }
+            }
+            prevY = y;
+            //int id1d = (y*width) + x;
+            //if ((x >= 0 && x < width) && (y >= 0 && y < height)) {
+            //    fragment_buffer[id1d].color = glm::vec3(1.0f, 1.0f, 1.0f);
+            //}
+        }
+    }
+#endif
+}
+
+bool timerOnce = false;
+int count = 0;
 /**
  * Perform rasterization.
  */
@@ -687,7 +881,17 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
    // Execute your rasterization pipeline here
    // (See README for rasterization pipeline outline.)
+    count++;
+    if (count > 1000) {
+        count = 0;
+        timerOnce = false;
+    }
+
    // Vertex Process & primitive assembly
+    if (!timerOnce) {
+        startCpuTimer();
+    }
+
    {
        curPrimitiveBeginId = 0;
        dim3 numThreadsPerBlock(128);
@@ -718,20 +922,51 @@
        checkCUDAError("Vertex Processing and Primitive Assembly");
    }
+
+    if (!timerOnce) {
+        endCpuTimer();
+        printf("Vertex and Primitive Assembly: \n");
+        printTime();
+    }
+
    cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
    initDepth << > >(width, height, dev_depth);
 
    // TODO: rasterize
+    if (!timerOnce) {
+        startCpuTimer();
+    }
+    cudaMemset(dev_mutex, 0, sizeof(int));
+    int numThreadsPerBlock = 128;
+    dim3 numBlocksForVertices((totalNumPrimitives + numThreadsPerBlock - 1) / numThreadsPerBlock);
+    kernRasterize << > > (totalNumPrimitives, width, height, dev_primitives, dev_fragmentBuffer, dev_depth, dev_mutex);
+    if (!timerOnce) {
+        endCpuTimer();
+        printf("Rasterization: \n");
+        printTime();
+    }
+
+    checkCUDAError("resterization");
 
    // Copy depthbuffer colors into framebuffer
+    if (!timerOnce) {
+        startCpuTimer();
+    }
    render << > >(width, height, dev_fragmentBuffer, dev_framebuffer);
    checkCUDAError("fragment shader");
+
+    if (!timerOnce) {
+        endCpuTimer();
+        printf("Render (Fragment Shading): \n");
+        printTime();
+    }
+
    // Copy framebuffer into OpenGL buffer for OpenGL previewing
    sendImageToPBO<<>>(pbo, width, height, dev_framebuffer);
    checkCUDAError("copy render result to pbo");
+    timerOnce = true;
 }
 
 /**
@@ -772,5 +1007,7 @@ void rasterizeFree() {
    cudaFree(dev_depth);
    dev_depth = NULL;
 
+    cudaFree(dev_mutex);
+    dev_mutex = NULL;
    checkCUDAError("rasterize Free");
 }
diff --git a/src/timer.h b/src/timer.h
new file mode 100644
index 0000000..50d5736
--- /dev/null
+++ b/src/timer.h
@@ -0,0 +1,69 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+
+using time_point_t = std::chrono::high_resolution_clock::time_point;
+bool cpuTimerStarted = false;
+
+cudaEvent_t event_start = nullptr;
+cudaEvent_t event_end = nullptr;
+
+time_point_t time_start_cpu;
+time_point_t time_end_cpu;
+
+bool cpu_timer_started = false;
+bool gpu_timer_started = false;
+
+float prev_elapsed_time_cpu_milliseconds = 0.f;
+float prev_elapsed_time_gpu_milliseconds = 0.f;
+
+void startCpuTimer() {
+    if (cpuTimerStarted) {
+        throw std::runtime_error("CPU timer already started");
+    }
+
+    cpuTimerStarted = true;
+    time_start_cpu = std::chrono::high_resolution_clock::now();
+}
+
+void endCpuTimer() {
+    time_end_cpu = std::chrono::high_resolution_clock::now();
+
+    if (!cpuTimerStarted) {
+        throw std::runtime_error("CPU timer not started");
+    }
+
+    std::chrono::duration duration = time_end_cpu - time_start_cpu;
+    prev_elapsed_time_cpu_milliseconds = static_cast(duration.count());
+
+    cpuTimerStarted = false;
+}
+
+void startGpuTimer()
+{
+    if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+    gpu_timer_started = true;
+
+    cudaEventRecord(event_start);
+}
+
+void endGpuTimer()
+{
+    cudaEventRecord(event_end);
+    cudaEventSynchronize(event_end);
+
+    if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+    cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+    gpu_timer_started = false;
+}
+
+void printTime() {
+    printf("Time Elapsed: %f milliseconds\n", prev_elapsed_time_cpu_milliseconds);
+}
+
+void printGPUTime() {
+    printf("Time Elapsed: %f milliseconds\n", prev_elapsed_time_gpu_milliseconds);
+}
\ No newline at end of file