diff --git a/README.md b/README.md index cad1abd..88716e0 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,46 @@ CUDA Rasterizer =============== -[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md) **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Wenli Zhao +* Tested on: Windows 10 Pro, Intel Xeon CPU CPU E5-1630 v4 @ 3.70GHz 32GB, NVIDIA GeForce GTX 24465MB (Sig Lab) -### (TODO: Your README) +### README -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. +![](renders/Capture2.PNG) +In this project, I implemented a simplified graphics rasterizer pipeline which includes vertex shading, primitive assembly, rasterization, fragment shading, and a framebuffer. + +The core features I implemented included: +* Vertex shading. +* Primitive assembly with support for triangles read from buffers of index and + vertex data. +* Rasterization. +* Fragment shading. +* A depth buffer for storing and depth testing fragments. +* Fragment-to-depth-buffer writing with atomics +* A fragment shader with simple Blinn-Phong lighting scheme. + +In addition to the basic rasterizer, I implemented UV texture mapping and support for rasterization with points and lines. + +#### Texture Mapping +![](renders/Capture3.PNG) + +#### Points +![](renders/points.PNG) + +#### Lines +![](renders/lines.PNG) + +### Analysis + +![](renders/chart.png) + +![](renders/image.png) + +The features I implemented didn't have too much of a performance impact on the models I tested. For example, the first three bars of the chart have a similar distribution. For rasterization of points and lines, I didn't change vertex assembly very much, so the bottleneck remained there. Vertex assembly contains a lot of global memory calls that slow down the pipeline. In general, the fragment shading and rasterization were pretty quick. I think I might have corrupted the cow model since it gave a very different distribution. I'm still not exactly sure why. There is a lot more I could do to accelerate various parts of the rasterization pipeline. I could potentially use shared memory for texture sampling and used tile-based rendering to accelerate my pipeline. ### Credits diff --git a/renders/Capture.PNG b/renders/Capture.PNG new file mode 100644 index 0000000..b6ad63d Binary files /dev/null and b/renders/Capture.PNG differ diff --git a/renders/Capture2.PNG b/renders/Capture2.PNG new file mode 100644 index 0000000..dffc089 Binary files /dev/null and b/renders/Capture2.PNG differ diff --git a/renders/Capture3.PNG b/renders/Capture3.PNG new file mode 100644 index 0000000..043767f Binary files /dev/null and b/renders/Capture3.PNG differ diff --git a/renders/chart.png b/renders/chart.png new file mode 100644 index 0000000..9357160 Binary files /dev/null and b/renders/chart.png differ diff --git a/renders/image.png b/renders/image.png new file mode 100644 index 0000000..64bcaf2 Binary files /dev/null and b/renders/image.png differ diff --git a/renders/lines.PNG b/renders/lines.PNG new file mode 100644 index 0000000..4131945 Binary files /dev/null and b/renders/lines.PNG differ diff --git a/renders/points.PNG b/renders/points.PNG new file mode 100644 index 0000000..4510419 Binary files /dev/null and b/renders/points.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 7986959..aed3732 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -108,6 +108,8 @@ void runCuda() { scale * ((float)width / (float)height), -scale, scale, 1.0, 1000.0); + P = glm::perspective(45.0f, scale*(float)width / (float)height, 1.0f, 1000.0f); + glm::mat4 V = glm::mat4(1.0f); glm::mat4 M = diff --git a/src/rasterize.cu b/src/rasterize.cu index 1262a09..6b13074 100644 --- a/src/rasterize.cu +++ b/src/rasterize.cu @@ -18,6 +18,11 @@ #include #include +#define ATOMIC 1 +#define POINT 0 +#define LINE 0 +#define TRIANGLE 1 + namespace { typedef unsigned short VertexIndex; @@ -46,7 +51,7 @@ namespace { // glm::vec3 col; glm::vec2 texcoord0; TextureData* dev_diffuseTex = NULL; - // int texWidth, texHeight; + int texWidth, texHeight; // ... }; @@ -62,11 +67,11 @@ namespace { // The attributes listed below might be useful, // but always feel free to modify on your own - // glm::vec3 eyePos; // eye space position used for shading - // glm::vec3 eyeNor; - // VertexAttributeTexcoord texcoord0; - // TextureData* dev_diffuseTex; - // ... + glm::vec3 eyePos; // eye space position used for shading + glm::vec3 eyeNor; + VertexAttributeTexcoord texcoord0; + TextureData* dev_diffuseTex; + int texWidth, texHeight; }; struct PrimitiveDevBufPointers { @@ -94,10 +99,17 @@ namespace { VertexOut* dev_verticesOut; // TODO: add more attributes when needed + }; } +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + static std::map> mesh2PrimitivesMap; @@ -110,6 +122,7 @@ static Fragment *dev_fragmentBuffer = NULL; static glm::vec3 *dev_framebuffer = NULL; static int * dev_depth = NULL; // you might need this buffer when doing depth test +static unsigned int * dev_mutex = NULL; /** * Kernel that writes the image to the OpenGL PBO directly. @@ -133,6 +146,16 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { } } +__device__ +glm::vec3 getColor(TextureData* dev_texcoord0, int u, int v, int width) { + + int index = u + v*width; + float r = (float) dev_texcoord0[index * 3] / 255.0f; + float g = (float) dev_texcoord0[index * 3 + 1] / 255.0f; + float b = (float) dev_texcoord0[index * 3 + 2] / 255.0f; + return glm::vec3(r, g, b); +} + /** * Writes fragment colors to the framebuffer */ @@ -146,8 +169,40 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { framebuffer[index] = fragmentBuffer[index].color; // TODO: add your fragment shader code here + Fragment frag = fragmentBuffer[index]; + glm::vec3 lightPos = glm::vec3(60,10,10); + glm::vec3 ambientCol = frag.color; + +#if TRIANGLE + if (frag.color != glm::vec3(0.0, 0.0, 0.0)) { + ambientCol = getColor(frag.dev_diffuseTex, frag.texcoord0.x * frag.texWidth, frag.texcoord0.y * frag.texHeight, frag.texWidth); + } +#endif + + glm::vec3 diffuseCol = glm::vec3(0.5f, 0.0f, 0.0f); + glm::vec3 specColor = glm::vec3(1.0f, 1.0f, 1.0f); + const float shininess = 5.0; + const float screenGamma = 1.2; + + glm::vec3 lightDir = normalize(lightPos - fragmentBuffer[index].eyePos); + glm::vec3 normal = fragmentBuffer[index].eyeNor; + + float lambertian = max(dot(lightDir, normal), 0.0f); + float specular = 0.0; + if (lambertian > 0.0) { + glm::vec3 viewDir = normalize(-fragmentBuffer[index].eyePos); + + glm::vec3 halfDir = normalize(lightDir + viewDir); + float specAngle = max(dot(halfDir, normal), 0.0f); + specular = pow(specAngle, shininess); + } + glm::vec3 colorLinear = ambientCol + lambertian * diffuseCol + specular*specColor; + + glm::vec3 colorGammaCorrected = pow(colorLinear, glm::vec3(1.0f / screenGamma)); + framebuffer[index] = colorGammaCorrected; } + } /** @@ -166,11 +221,14 @@ void rasterizeInit(int w, int h) { cudaFree(dev_depth); cudaMalloc(&dev_depth, width * height * sizeof(int)); + cudaFree(dev_mutex); + cudaMalloc(&dev_mutex, width*height * sizeof(unsigned int)); + checkCUDAError("rasterizeInit"); } __global__ -void initDepth(int w, int h, int * depth) +void initDepth(int w, int h, int * depth, unsigned int * mutex) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; @@ -179,6 +237,7 @@ void initDepth(int w, int h, int * depth) { int index = x + (y * w); depth[index] = INT_MAX; + mutex[index] = 0; } } @@ -294,6 +353,7 @@ void traverseNode ( } } +// Called once in main init function. void rasterizeSetBuffers(const tinygltf::Scene & scene) { totalNumPrimitives = 0; @@ -331,7 +391,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { // 2. for each mesh: // for each primitive: - // build device buffer of indices, materail, and each attributes + // build device buffer of indices, material, and each attributes // and store these pointers in a map { @@ -638,9 +698,34 @@ void _vertexTransformAndAssembly( // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space // Then divide the pos by its w element to transform into NDC space // Finally transform x and y to viewport space + VertexIndex vIndex = primitive.dev_indices[vid]; + VertexAttributePosition vPos = primitive.dev_position[vid]; + VertexAttributeNormal vNormal = primitive.dev_normal[vid]; + VertexAttributeTexcoord vTexcoord = primitive.dev_texcoord0[vid]; + + // NDC + glm::vec4 vOutPos = MVP * glm::vec4(vPos, 1.0f); + vOutPos = vOutPos / vOutPos.w; + + // Screen Space + vOutPos = glm::vec4(0.5f * (float) width * (vOutPos.x + 1.0f), 0.5f * (float) height * (1.0f - vOutPos.y), vOutPos.z, 1.0f); + + glm::vec3 eyePos = glm::vec3(MV * glm::vec4(vPos, 0)); + + VertexOut vOut; + vOut.pos = vOutPos; + vOut.eyeNor = glm::vec3(MV * glm::vec4(vNormal,0.0f)); + vOut.texcoord0 = vTexcoord; + vOut.eyePos = eyePos; + vOut.texHeight = primitive.diffuseTexHeight; + vOut.texWidth = primitive.diffuseTexWidth; + vOut.dev_diffuseTex = primitive.dev_diffuseTex; + + + primitive.dev_verticesOut[vid] = vOut; // TODO: Apply vertex assembly here - // Assemble all attribute arraies into the primitive array + // Assemble all attribute arrays into the primitive array } } @@ -660,12 +745,15 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_ // TODO: uncomment the following code for a start // This is primitive assembly for triangles - //int pid; // id for cur primitives vector - //if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { - // pid = iid / (int)primitive.primitiveType; - // dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] - // = primitive.dev_verticesOut[primitive.dev_indices[iid]]; - //} + int pid; // id for cur primitives vector + if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { + // id divided by 3 for triangles + pid = iid / (int)primitive.primitiveType; + + // every primitive has a vertex, + dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] + = primitive.dev_verticesOut[primitive.dev_indices[iid]]; + } // TODO: other primitive types (point, line) @@ -673,6 +761,151 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_ } +__device__ +glm::vec3 barycentricInterpolation(const glm::vec3 tri[3], const glm::vec3 coord) { + return coord.x * tri[0] + coord.y * tri[1] + coord.z * tri[2]; +} + +__device__ +glm::vec2 perspectiveInterp(const glm::vec2 texcoord[3], const glm::vec3 coord) { + return (float) coord.x * texcoord[0] + (float) coord.y * texcoord[1] + (float) coord.z * texcoord[2]; +} + +__global__ +void _rasterizeTriangles(int numPrimitives, int numFragments, int width, Fragment* dev_fragments, Primitive* dev_primitives, int* dev_depth, unsigned int * dev_mutex) { + int iid = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (iid < numPrimitives) { + Primitive triangle = dev_primitives[iid]; + VertexOut vert0 = triangle.v[0]; + VertexOut vert1 = triangle.v[1]; + VertexOut vert2 = triangle.v[2]; + glm::vec3 tri[3] = { glm::vec3(vert0.pos), + glm::vec3(vert1.pos), + glm::vec3(vert2.pos) }; + AABB boundingBox = getAABBForTriangle(tri); + + for (int y = boundingBox.min.y; y < boundingBox.max.y; y++) { + for (int x = boundingBox.min.x; x < boundingBox.max.x; x++) { + Fragment f; + glm::vec3 bary = calculateBarycentricCoordinate(tri, glm::vec2((float)x, (float)y)); + if (isBarycentricCoordInBounds(bary)) { + // eyePos + glm::vec3 eyePos[3] = { vert0.eyePos, vert1.eyePos, vert2.eyePos }; + f.eyePos = barycentricInterpolation(eyePos, bary); + + // eyeNormals + glm::vec3 eyeNormals[3] = { vert0.eyeNor, vert1.eyeNor, vert2.eyeNor }; + f.eyeNor = barycentricInterpolation(eyeNormals, bary); + + // pos + glm::vec3 pos = barycentricInterpolation(tri, bary); + + // textures: + + glm::vec2 texCoord[3] = { vert0.texcoord0/vert0.pos.z, vert1.texcoord0/vert1.pos.z, vert2.texcoord0/vert2.pos.z }; + glm::vec2 texCoordinate = pos.z * glm::vec2(perspectiveInterp(texCoord, bary)); + + f.texcoord0 = glm::vec2(texCoordinate.x, texCoordinate.y); + + f.color = glm::vec3(0.01f, 0.0f, 0.0f); + f.dev_diffuseTex = vert0.dev_diffuseTex; + f.texWidth = vert0.texWidth; + f.texHeight = vert0.texHeight; + + // Depth Buffering + int old = dev_depth[y*width + x]; + bool isSet; + do { + isSet = (atomicCAS(&dev_mutex[y*width + x], 0, 1) == 0); + if (isSet) { + // Critical section goes here. + // The critical section MUST be inside the wait loop; + // if it is afterward, a deadlock will occur. + dev_depth[y*width + x] = min(old, (int)(pos.z * INT_MAX)); + if (old > dev_depth[y*width + x]) { + dev_fragments[y * width + x] = f; + } + + } + if (isSet) { + dev_mutex[y*width + x] = 0; + } + } while (!isSet); + + } + } + } + } +} + +__global__ +void _rasterizePoints(int numPrimitives, int numFragments, int width, Fragment* dev_fragments, Primitive* dev_primitives, int* dev_depth, unsigned int * dev_mutex) { + int iid = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (iid < numPrimitives) { + Primitive triangle = dev_primitives[iid]; + + for (int i = 0; i < 3; i++) { + Fragment f; + int x = triangle.v[i].pos.x; + int y = triangle.v[i].pos.y; + f.color = triangle.v[i].eyeNor; + dev_fragments[y*width + x] = f; + } + + } +} + +__global__ +void _rasterizeLines(int numPrimitives, int numFragments, int width, Fragment* dev_fragments, Primitive* dev_primitives, int* dev_depth, unsigned int * dev_mutex) { + int iid = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (iid < numPrimitives) { + Primitive triangle = dev_primitives[iid]; + for (int i = 0; i < 3; i++) { + Fragment f; + f.color = triangle.v[i].eyeNor; + int x1 = triangle.v[i].pos.x; + int y1 = triangle.v[i].pos.y; + int x2 = triangle.v[(i + 1) % 3].pos.x; + int y2 = triangle.v[(i + 1) % 3].pos.y; + int y = min(y1, y2); + + glm::vec2 p1 = glm::vec2(x1, y1); + glm::vec2 p2 = glm::vec2(x2, y2); + + // swap if necessary + if (x1 > x2) { + glm::vec2 temp = p1; + p1 = p2; + p2 = temp; + } + + // get slope + float m = (float)(p2.y - p1.y) / (float)(p2.x - p1.x); + if (m == INFINITY) { + int ymin = y; + int ymax = max(y1, y2); + for (int qy = ymin; qy < ymax; qy++) { + dev_fragments[y*width + x1] = f; + } + } + else { + int dx = p2.x - p1.x; + for (int dxe = 0; dxe < dx - 1; dxe++) { + int dye = (int)( dxe * m + y); + int index = dye*width + dxe + p1.x; + dev_fragments[index] = f; + } + } + + } + } +} + + + /** @@ -687,6 +920,7 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g // Execute your rasterization pipeline here // (See README for rasterization pipeline outline.) + timer().startCpuTimer(); // Vertex Process & primitive assembly { curPrimitiveBeginId = 0; @@ -702,9 +936,12 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); + // 1. Vertex Assembly _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height); checkCUDAError("Vertex Processing"); cudaDeviceSynchronize(); + + // 2. Primitive Assembly _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> > (p->numIndices, curPrimitiveBeginId, @@ -718,17 +955,34 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g checkCUDAError("Vertex Processing and Primitive Assembly"); } + timer().endCpuTimer(); + printElapsedTime(timer().getCpuElapsedTimeForPreviousOperation(), "VERTEX ASSEMBLY"); cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); - initDepth << > >(width, height, dev_depth); + initDepth << > >(width, height, dev_depth, dev_mutex); // TODO: rasterize - - - + dim3 numThreadsPerBlock(128); + dim3 numBlocksForPrimitives((curPrimitiveBeginId + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); + timer().startCpuTimer(); +#if TRIANGLE + _rasterizeTriangles << < numBlocksForPrimitives, numThreadsPerBlock >> > (curPrimitiveBeginId, width*height, width, dev_fragmentBuffer, dev_primitives, dev_depth, dev_mutex); +#elif POINT + _rasterizePoints << < numBlocksForPrimitives, numThreadsPerBlock >> > (curPrimitiveBeginId, width*height, width, dev_fragmentBuffer, dev_primitives, dev_depth, dev_mutex); +#elif LINE + _rasterizeLines << < numBlocksForPrimitives, numThreadsPerBlock >> > (curPrimitiveBeginId, width*height, width, dev_fragmentBuffer, dev_primitives, dev_depth, dev_mutex); +#endif + timer().endCpuTimer(); + printElapsedTime(timer().getCpuElapsedTimeForPreviousOperation(), "RASTERIZATION"); + + + timer().startCpuTimer(); // Copy depthbuffer colors into framebuffer render << > >(width, height, dev_fragmentBuffer, dev_framebuffer); checkCUDAError("fragment shader"); + timer().endCpuTimer(); + printElapsedTime(timer().getCpuElapsedTimeForPreviousOperation(), "FRAGMENT SHADER"); + // Copy framebuffer into OpenGL buffer for OpenGL previewing sendImageToPBO<<>>(pbo, width, height, dev_framebuffer); checkCUDAError("copy render result to pbo"); @@ -772,5 +1026,8 @@ void rasterizeFree() { cudaFree(dev_depth); dev_depth = NULL; + cudaFree(dev_mutex); + dev_mutex = NULL; + checkCUDAError("rasterize Free"); } diff --git a/src/rasterize.h b/src/rasterize.h index 560aae9..c20c040 100644 --- a/src/rasterize.h +++ b/src/rasterize.h @@ -12,6 +12,9 @@ #include #include +#include +#include + namespace tinygltf{ class Scene; } @@ -22,3 +25,72 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene); void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal); void rasterizeFree(); + +class PerformanceTimer +{ +public: + PerformanceTimer() + { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() + { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() + { + if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); + } + + void endCpuTimer() + { + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } + + std::chrono::duration duro = time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = + static_cast(duro.count()); + + cpu_timer_started = false; + } + + float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 + { + return prev_elapsed_time_cpu_milliseconds; + } + + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + +private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; +}; + +PerformanceTimer& timer(); + +template +void printElapsedTime(T time, std::string note = "") +{ + std::cout << " elapsed time: " << time << "ms " << note << std::endl; +} \ No newline at end of file