diff --git a/CMakeLists.txt b/CMakeLists.txt index ed74e88..32e360a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,12 @@ cmake_minimum_required(VERSION 3.0) project(cis565_rasterizer) +# CUDA linker options +find_package(Threads REQUIRED) +find_package(CUDA 8.0 REQUIRED) +set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON) +set(CUDA_SEPARABLE_COMPILATION ON) + set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) # Set up include and lib paths @@ -75,12 +81,6 @@ if (WIN32) list(APPEND CORELIBS legacy_stdio_definitions.lib) endif() -# CUDA linker options -find_package(Threads REQUIRED) -find_package(CUDA 8.0 REQUIRED) -set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON) -set(CUDA_SEPARABLE_COMPILATION ON) - #add_subdirectory(stream_compaction) # TODO: uncomment if using your own stream compaction add_subdirectory(src) add_subdirectory(util) diff --git a/README.md b/README.md index cad1abd..8487949 100644 --- a/README.md +++ b/README.md @@ -5,14 +5,51 @@ CUDA Rasterizer **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Joseph Klinger +* Tested on: Windows 10, i5-7300HQ (4 CPUs) @ ~2.50GHz, GTX 1050 6030MB (Personal Machine) -### (TODO: Your README) +### README -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. +This week, I took on the task of implementing a rasterizer in CUDA. I have already written a CPU rasterizer (almost 2 years ago, in the introductory +graphics course CIS 460), but implementing a basic graphics pipeline on the GPU was a different beast. +The features included in this rasterizer are: +- Texture mapping +- Supersampling Antialiasing +- Color interpolation across triangles + +[Demo video here.](https://vimeo.com/238849683) + +Rasterization, in very brief summary, is taking a 3d shape and deciding how to color the pixels that the object overlaps. In this project, that involves transforming +the input GLTF models' vertex data, creating triangles from that data, projecting the triangles into view->clip->NDC/screen->viewport space, computing line intersection +with the edges of the triangle, and shading the overlapping fragments. + +Here is an image of the given Duck GLTF model rasterized with texture mapping: + +![](/renders/duck_noaa.PNG) + +For comparison, here is the same Duck but rendered with SSAA (supersampling antialiasing). This process involves simply rendering to an image of higher resolution than the +screen, then downsampling that information into the final image: + +![](/renders/duck_ssaa.PNG) + +### Performance Analysis + +I benchmarked my rasterizer's performance using the Duck GLFT model, which has ~4000 tris, at a close up and far zoom level. Here are the results: + +![](/renders/graph1.png) + +![](/renders/graph2.png) + +As we can see, rasterization is by far the most expensive operation compared to vertex transform, primitive assembly, fragment shading and downsampling. +Additionally, SSAA, as expected, makes the rasterization process much more costly because we have to render to an image of twice the size of the final, +so more fragments must be computed and be checked with the depth test. Lastly, clearly far zoom makes the rasterization process more costly as there are +simply more fragments overlapping each triangle. + +One experiment I did try was comparing rasterization performance when computing line intersection with the triangle edges as opposed to simply checking all +fragments within the bounding box of the primitive. As expected, it did improve performance, as we were able to avoid computing barycentric weights for every +potential fragment, only having to replace that with a few lines of line intersection code, where the most expensive operation is a divide (as opposed to a +cross product). ### Credits diff --git a/renders/duck_noaa.PNG b/renders/duck_noaa.PNG new file mode 100644 index 0000000..c90e221 Binary files /dev/null and b/renders/duck_noaa.PNG differ diff --git a/renders/duck_ssaa.PNG b/renders/duck_ssaa.PNG new file mode 100644 index 0000000..c82e844 Binary files /dev/null and b/renders/duck_ssaa.PNG differ diff --git a/renders/graph1.png b/renders/graph1.png new file mode 100644 index 0000000..d09f911 Binary files /dev/null and b/renders/graph1.png differ diff --git a/renders/graph2.png b/renders/graph2.png new file mode 100644 index 0000000..fa9e57b Binary files /dev/null and b/renders/graph2.png differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a57f69f..00edee0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,5 +6,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_61 ) diff --git a/src/main.cpp b/src/main.cpp index 7986959..b69d44e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -104,9 +104,7 @@ void runCuda() { // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer dptr = NULL; - glm::mat4 P = glm::frustum(-scale * ((float)width) / ((float)height), - scale * ((float)width / (float)height), - -scale, scale, 1.0, 1000.0); + glm::mat4 P = glm::perspective(glm::radians(45.0f), ((float)width) / ((float)height), 0.1f, 10.0f); glm::mat4 V = glm::mat4(1.0f); diff --git a/src/rasterize.cu b/src/rasterize.cu index 1262a09..a6f7fed 100644 --- a/src/rasterize.cu +++ b/src/rasterize.cu @@ -20,101 +20,123 @@ namespace { - typedef unsigned short VertexIndex; - typedef glm::vec3 VertexAttributePosition; - typedef glm::vec3 VertexAttributeNormal; - typedef glm::vec2 VertexAttributeTexcoord; - typedef unsigned char TextureData; - - typedef unsigned char BufferByte; - - enum PrimitiveType{ - Point = 1, - Line = 2, - Triangle = 3 - }; - - struct VertexOut { - glm::vec4 pos; - - // TODO: add new attributes to your VertexOut - // The attributes listed below might be useful, - // but always feel free to modify on your own - - glm::vec3 eyePos; // eye space position used for shading - glm::vec3 eyeNor; // eye space normal used for shading, cuz normal will go wrong after perspective transformation - // glm::vec3 col; - glm::vec2 texcoord0; - TextureData* dev_diffuseTex = NULL; - // int texWidth, texHeight; - // ... - }; - - struct Primitive { - PrimitiveType primitiveType = Triangle; // C++ 11 init - VertexOut v[3]; - }; - - struct Fragment { - glm::vec3 color; - - // TODO: add new attributes to your Fragment - // The attributes listed below might be useful, - // but always feel free to modify on your own - - // glm::vec3 eyePos; // eye space position used for shading - // glm::vec3 eyeNor; - // VertexAttributeTexcoord texcoord0; - // TextureData* dev_diffuseTex; - // ... - }; - - struct PrimitiveDevBufPointers { - int primitiveMode; //from tinygltfloader macro - PrimitiveType primitiveType; - int numPrimitives; - int numIndices; - int numVertices; - - // Vertex In, const after loaded - VertexIndex* dev_indices; - VertexAttributePosition* dev_position; - VertexAttributeNormal* dev_normal; - VertexAttributeTexcoord* dev_texcoord0; - - // Materials, add more attributes when needed - TextureData* dev_diffuseTex; - int diffuseTexWidth; - int diffuseTexHeight; - // TextureData* dev_specularTex; - // TextureData* dev_normalTex; - // ... - - // Vertex Out, vertex used for rasterization, this is changing every frame - VertexOut* dev_verticesOut; - - // TODO: add more attributes when needed - }; + typedef unsigned short VertexIndex; + typedef glm::vec3 VertexAttributePosition; + typedef glm::vec3 VertexAttributeNormal; + typedef glm::vec2 VertexAttributeTexcoord; + typedef unsigned char TextureData; + + typedef unsigned char BufferByte; + + enum PrimitiveType { + Point = 1, + Line = 2, + Triangle = 3 + }; + + struct VertexOut { + glm::vec4 pos; + + // TODO: add new attributes to your VertexOut + // The attributes listed below might be useful, + // but always feel free to modify on your own + + glm::vec3 eyePos; // eye space position used for shading + glm::vec3 eyeNor; // eye space normal used for shading, cuz normal will go wrong after perspective transformation + glm::vec3 col; + TextureData* dev_diffuseTex = NULL; + glm::vec2 texcoord0; + int texWidth; + int texHeight; + // ... + }; + + struct Primitive { + PrimitiveType primitiveType = Triangle; // C++ 11 init + VertexOut v[3]; + }; + + struct Fragment { + glm::vec3 color; + + // TODO: add new attributes to your Fragment + // The attributes listed below might be useful, + // but always feel free to modify on your own + + // glm::vec3 eyePos; // eye space position used for shading + glm::vec3 eyeNor; + VertexAttributeTexcoord texcoord0; + TextureData* dev_diffuseTex = NULL; + int diffuseTexWidth; + int diffuseTexHeight; + // ... + }; + + struct PrimitiveDevBufPointers { + int primitiveMode; //from tinygltfloader macro + PrimitiveType primitiveType; + int numPrimitives; + int numIndices; + int numVertices; + + // Vertex In, const after loaded + VertexIndex* dev_indices; + VertexAttributePosition* dev_position; + VertexAttributeNormal* dev_normal; + VertexAttributeTexcoord* dev_texcoord0; + + // Materials, add more attributes when needed + TextureData* dev_diffuseTex; + int diffuseTexWidth; + int diffuseTexHeight; + // TextureData* dev_specularTex; + // TextureData* dev_normalTex; + // ... + + // Vertex Out, vertex used for rasterization, this is changing every frame + VertexOut* dev_verticesOut; + }; } -static std::map> mesh2PrimitivesMap; +// Pipeline Options +#define LINE_INTERSECTION_RASTERIZATION // If on, will compute line intersections with triangle edges for scan line conversion. If off, will naively check every pixel in the triangle's bounding box +#define DEPTH_TEST +#define TEXTURE_MAPPING +//#define SSAA +static std::map> mesh2PrimitivesMap; static int width = 0; static int height = 0; +// for mutex +static int* dev_mutex = NULL; +static int* dev_mutex_4X = NULL; + static int totalNumPrimitives = 0; static Primitive *dev_primitives = NULL; + +// All _4X versions are twice as large for the purposes of super sampling static Fragment *dev_fragmentBuffer = NULL; +static Fragment *dev_fragmentBuffer_4X = NULL; static glm::vec3 *dev_framebuffer = NULL; +static glm::vec3 *dev_framebuffer_4X = NULL; -static int * dev_depth = NULL; // you might need this buffer when doing depth test +static int * dev_depth = NULL; +static int * dev_depth_4X = NULL; + +// timer +#include +using time_point_t = std::chrono::high_resolution_clock::time_point; +time_point_t timeStartCpu; +time_point_t timeEndCpu; +float prevElapsedTime = 0.0f; /** * Kernel that writes the image to the OpenGL PBO directly. */ -__global__ +__global__ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; @@ -133,19 +155,99 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { } } -/** +__device__ +glm::vec3 BilinearFilter(const glm::vec3 color00, const glm::vec3 color01, const glm::vec3 color10, const glm::vec3 color11, const glm::vec2 weights) { + // Interpolate along the X-axis + const glm::vec3 colorX1 = glm::mix(color00, color01, weights.x); + const glm::vec3 colorX2 = glm::mix(color10, color11, weights.x); + + // Interpolate along the Y-axis + return glm::mix(colorX1, colorX2, weights.y); +} + +// Return the tightly packed chars as a color with its elements on the range [0, 1] +__device__ +glm::vec3 SampleTexture(const TextureData *texture, const int index) { + const int index_x3 = index * 3; + const int r = texture[index_x3]; + const int g = texture[index_x3 + 1]; + const int b = texture[index_x3 + 2]; + return glm::vec3(r, g, b) * 0.00392156862f; // 1 / 255 +} + +// Given a texture and UVs, samples the texture and returns a color using bilinear filtering +__device__ +glm::vec3 GetTextureColor(const TextureData *texture, const glm::vec2 uv, const int width, const int height) { + + // Compute the 4 indices into the texture + const glm::vec2 texIndex = uv * glm::vec2(width, height); + + const glm::vec2 sample00 = floor(texIndex); + const glm::vec2 sample01 = glm::vec2(sample00.x + 1.0f, floor(texIndex.y)); + const glm::vec2 sample10 = glm::vec2(floor(texIndex.x), sample01.y + 1.0f); + const glm::vec2 sample11 = ceil(texIndex); + + const glm::vec2 weights = glm::vec2((texIndex.x - sample10.x) / (sample01.x - sample10.x), + (texIndex.y - sample01.y) / (sample10.y - sample01.y)); + + // Sample the color + const glm::vec3 color00 = SampleTexture(texture, ((int)sample00.x) + ((int)sample00.y) * width); + const glm::vec3 color01 = SampleTexture(texture, ((int)sample01.x) + ((int)sample01.y) * width); + const glm::vec3 color10 = SampleTexture(texture, ((int)sample10.x) + ((int)sample10.y) * width); + const glm::vec3 color11 = SampleTexture(texture, ((int)sample11.x) + ((int)sample11.y) * width); + + return BilinearFilter(color00, color01, color10, color11, weights); +} + +// Downsample the high-res rendered image to the normal image resolution using some reconstruction filter - grid, RGSS, Quincunx, etc +__global__ +void downsample(const int w, const int h, glm::vec3 *framebuffer, const glm::vec3 *framebuffer_4X) { + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + int index = x + (y * w); + + if (x < w && y < h) { + // Downsample using uniform grid sampling + + // Compute index in the larger resolution image + const int index_4X = 2 * x + 4 * y * w; + + // Access colors and average + glm::vec3 color = framebuffer_4X[index_4X]; + color += framebuffer_4X[index_4X + 1]; + color += framebuffer_4X[index_4X + w * 2]; + color += framebuffer_4X[index_4X + 1 + w * 2]; + + framebuffer[index] = color * 0.25f; + } +} + +/** * Writes fragment colors to the framebuffer */ __global__ -void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { +void render(const int w, const int h, const Fragment *fragmentBuffer, glm::vec3 *framebuffer) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; int index = x + (y * w); if (x < w && y < h) { - framebuffer[index] = fragmentBuffer[index].color; - - // TODO: add your fragment shader code here + const Fragment &fragment = fragmentBuffer[index]; + + // Lighting - Lambertian Shading + const glm::vec3 lightVec = glm::normalize(glm::vec3(1.0f, 1.0f, 0.0f)); + const float lightIntensity = glm::dot(lightVec, fragment.eyeNor) * 0.5f + 0.5f; + + #ifdef TEXTURE_MAPPING + if (fragment.dev_diffuseTex) { + const glm::vec3 texCol = GetTextureColor(fragment.dev_diffuseTex, fragment.texcoord0, fragment.diffuseTexWidth, fragment.diffuseTexHeight); + framebuffer[index] = texCol * lightIntensity; + } else { + framebuffer[index] = fragment.color; + } + #else + framebuffer[index] = fragment.color; + #endif } } @@ -156,30 +258,66 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { void rasterizeInit(int w, int h) { width = w; height = h; - cudaFree(dev_fragmentBuffer); - cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment)); - cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); + + #ifndef SSAA + // Fragment Buffer + cudaFree(dev_fragmentBuffer); + cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment)); + cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); + #else + // Fragment Buffer 4X + cudaFree(dev_fragmentBuffer_4X); + cudaMalloc(&dev_fragmentBuffer_4X, 4 * width * height * sizeof(Fragment)); + cudaMemset(dev_fragmentBuffer_4X, 0, 4 * width * height * sizeof(Fragment)); + #endif + + // Frame Buffer cudaFree(dev_framebuffer); - cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3)); + cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3)); cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3)); - - cudaFree(dev_depth); - cudaMalloc(&dev_depth, width * height * sizeof(int)); - - checkCUDAError("rasterizeInit"); + #ifdef SSAA + // Frame Buffer 4X + cudaFree(dev_framebuffer_4X); + cudaMalloc(&dev_framebuffer_4X, 4 * width * height * sizeof(glm::vec3)); + cudaMemset(dev_framebuffer_4X, 0, 4 * width * height * sizeof(glm::vec3)); + #endif + + #ifndef SSAA + // Depth Buffer + cudaFree(dev_depth); + cudaMalloc(&dev_depth, width * height * sizeof(int)); + #else + // Depth Buffer 4X + cudaFree(dev_depth_4X); + cudaMalloc(&dev_depth_4X, 4 * width * height * sizeof(int)); + #endif + + #ifndef SSAA + // Mutex array + cudaFree(dev_mutex); + cudaMalloc(&dev_mutex, width * height * sizeof(int)); + cudaMemset(dev_mutex, 0, width * height * sizeof(int)); + #else + // Mutex array 4X + cudaFree(dev_mutex_4X); + cudaMalloc(&dev_mutex_4X, 4 * width * height * sizeof(int)); + cudaMemset(dev_mutex_4X, 0, 4 * width * height * sizeof(int)); + #endif + + checkCUDAError("rasterizeInit"); } __global__ -void initDepth(int w, int h, int * depth) +void initDepth(const int w, const int h, int * depth) { - int x = (blockIdx.x * blockDim.x) + threadIdx.x; - int y = (blockIdx.y * blockDim.y) + threadIdx.y; - - if (x < w && y < h) - { - int index = x + (y * w); - depth[index] = INT_MAX; - } + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < w && y < h) + { + const int index = x + (y * w); + depth[index] = INT_MAX; + } } @@ -187,550 +325,713 @@ void initDepth(int w, int h, int * depth) * kern function with support for stride to sometimes replace cudaMemcpy * One thread is responsible for copying one component */ -__global__ +__global__ void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, int n, int byteStride, int byteOffset, int componentTypeByteSize) { - - // Attribute (vec3 position) - // component (3 * float) - // byte (4 * byte) - - // id of component - int i = (blockIdx.x * blockDim.x) + threadIdx.x; - - if (i < N) { - int count = i / n; - int offset = i - count * n; // which component of the attribute - - for (int j = 0; j < componentTypeByteSize; j++) { - - dev_dst[count * componentTypeByteSize * n - + offset * componentTypeByteSize - + j] - - = - - dev_src[byteOffset - + count * (byteStride == 0 ? componentTypeByteSize * n : byteStride) - + offset * componentTypeByteSize - + j]; - } - } - + + // Attribute (vec3 position) + // component (3 * float) + // byte (4 * byte) + + // id of component + const int i = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (i < N) { + int count = i / n; + int offset = i - count * n; // which component of the attribute + + for (int j = 0; j < componentTypeByteSize; j++) { + + dev_dst[count * componentTypeByteSize * n + + offset * componentTypeByteSize + + j] + + = + + dev_src[byteOffset + + count * (byteStride == 0 ? componentTypeByteSize * n : byteStride) + + offset * componentTypeByteSize + + j]; + } + } + } __global__ void _nodeMatrixTransform( - int numVertices, - VertexAttributePosition* position, - VertexAttributeNormal* normal, - glm::mat4 MV, glm::mat3 MV_normal) { - - // vertex id - int vid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (vid < numVertices) { - position[vid] = glm::vec3(MV * glm::vec4(position[vid], 1.0f)); - normal[vid] = glm::normalize(MV_normal * normal[vid]); - } + int numVertices, + VertexAttributePosition* position, + VertexAttributeNormal* normal, + glm::mat4 MV, glm::mat3 MV_normal) { + + // vertex id + int vid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (vid < numVertices) { + position[vid] = glm::vec3(MV * glm::vec4(position[vid], 1.0f)); + normal[vid] = glm::normalize(MV_normal * normal[vid]); + } } glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) { - - glm::mat4 curMatrix(1.0); - - const std::vector &m = n.matrix; - if (m.size() > 0) { - // matrix, copy it - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - curMatrix[i][j] = (float)m.at(4 * i + j); - } - } - } else { - // no matrix, use rotation, scale, translation - - if (n.translation.size() > 0) { - curMatrix[3][0] = n.translation[0]; - curMatrix[3][1] = n.translation[1]; - curMatrix[3][2] = n.translation[2]; - } - - if (n.rotation.size() > 0) { - glm::mat4 R; - glm::quat q; - q[0] = n.rotation[0]; - q[1] = n.rotation[1]; - q[2] = n.rotation[2]; - - R = glm::mat4_cast(q); - curMatrix = curMatrix * R; - } - - if (n.scale.size() > 0) { - curMatrix = curMatrix * glm::scale(glm::vec3(n.scale[0], n.scale[1], n.scale[2])); - } - } - - return curMatrix; + + glm::mat4 curMatrix(1.0); + + const std::vector &m = n.matrix; + if (m.size() > 0) { + // matrix, copy it + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + curMatrix[i][j] = (float)m.at(4 * i + j); + } + } + } + else { + // no matrix, use rotation, scale, translation + + if (n.translation.size() > 0) { + curMatrix[3][0] = n.translation[0]; + curMatrix[3][1] = n.translation[1]; + curMatrix[3][2] = n.translation[2]; + } + + if (n.rotation.size() > 0) { + glm::mat4 R; + glm::quat q; + q[0] = n.rotation[0]; + q[1] = n.rotation[1]; + q[2] = n.rotation[2]; + + R = glm::mat4_cast(q); + curMatrix = curMatrix * R; + } + + if (n.scale.size() > 0) { + curMatrix = curMatrix * glm::scale(glm::vec3(n.scale[0], n.scale[1], n.scale[2])); + } + } + + return curMatrix; } -void traverseNode ( - std::map & n2m, - const tinygltf::Scene & scene, - const std::string & nodeString, - const glm::mat4 & parentMatrix - ) +void traverseNode( + std::map & n2m, + const tinygltf::Scene & scene, + const std::string & nodeString, + const glm::mat4 & parentMatrix +) { - const tinygltf::Node & n = scene.nodes.at(nodeString); - glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n); - n2m.insert(std::pair(nodeString, M)); + const tinygltf::Node & n = scene.nodes.at(nodeString); + glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n); + n2m.insert(std::pair(nodeString, M)); - auto it = n.children.begin(); - auto itEnd = n.children.end(); + auto it = n.children.begin(); + auto itEnd = n.children.end(); - for (; it != itEnd; ++it) { - traverseNode(n2m, scene, *it, M); - } + for (; it != itEnd; ++it) { + traverseNode(n2m, scene, *it, M); + } } void rasterizeSetBuffers(const tinygltf::Scene & scene) { - totalNumPrimitives = 0; - - std::map bufferViewDevPointers; - - // 1. copy all `bufferViews` to device memory - { - std::map::const_iterator it( - scene.bufferViews.begin()); - std::map::const_iterator itEnd( - scene.bufferViews.end()); + totalNumPrimitives = 0; - for (; it != itEnd; it++) { - const std::string key = it->first; - const tinygltf::BufferView &bufferView = it->second; - if (bufferView.target == 0) { - continue; // Unsupported bufferView. - } + std::map bufferViewDevPointers; - const tinygltf::Buffer &buffer = scene.buffers.at(bufferView.buffer); + // 1. copy all `bufferViews` to device memory + { + std::map::const_iterator it( + scene.bufferViews.begin()); + std::map::const_iterator itEnd( + scene.bufferViews.end()); - BufferByte* dev_bufferView; - cudaMalloc(&dev_bufferView, bufferView.byteLength); - cudaMemcpy(dev_bufferView, &buffer.data.front() + bufferView.byteOffset, bufferView.byteLength, cudaMemcpyHostToDevice); + for (; it != itEnd; it++) { + const std::string key = it->first; + const tinygltf::BufferView &bufferView = it->second; + if (bufferView.target == 0) { + continue; // Unsupported bufferView. + } - checkCUDAError("Set BufferView Device Mem"); + const tinygltf::Buffer &buffer = scene.buffers.at(bufferView.buffer); - bufferViewDevPointers.insert(std::make_pair(key, dev_bufferView)); + BufferByte* dev_bufferView; + cudaMalloc(&dev_bufferView, bufferView.byteLength); + cudaMemcpy(dev_bufferView, &buffer.data.front() + bufferView.byteOffset, bufferView.byteLength, cudaMemcpyHostToDevice); - } - } + checkCUDAError("Set BufferView Device Mem"); + bufferViewDevPointers.insert(std::make_pair(key, dev_bufferView)); + } + } - // 2. for each mesh: - // for each primitive: - // build device buffer of indices, materail, and each attributes - // and store these pointers in a map - { - - std::map nodeString2Matrix; - auto rootNodeNamesList = scene.scenes.at(scene.defaultScene); - - { - auto it = rootNodeNamesList.begin(); - auto itEnd = rootNodeNamesList.end(); - for (; it != itEnd; ++it) { - traverseNode(nodeString2Matrix, scene, *it, glm::mat4(1.0f)); - } - } - // parse through node to access mesh - - auto itNode = nodeString2Matrix.begin(); - auto itEndNode = nodeString2Matrix.end(); - for (; itNode != itEndNode; ++itNode) { - - const tinygltf::Node & N = scene.nodes.at(itNode->first); - const glm::mat4 & matrix = itNode->second; - const glm::mat3 & matrixNormal = glm::transpose(glm::inverse(glm::mat3(matrix))); - - auto itMeshName = N.meshes.begin(); - auto itEndMeshName = N.meshes.end(); - - for (; itMeshName != itEndMeshName; ++itMeshName) { - - const tinygltf::Mesh & mesh = scene.meshes.at(*itMeshName); - - auto res = mesh2PrimitivesMap.insert(std::pair>(mesh.name, std::vector())); - std::vector & primitiveVector = (res.first)->second; - - // for each primitive - for (size_t i = 0; i < mesh.primitives.size(); i++) { - const tinygltf::Primitive &primitive = mesh.primitives[i]; - - if (primitive.indices.empty()) - return; - - // TODO: add new attributes for your PrimitiveDevBufPointers when you add new attributes - VertexIndex* dev_indices = NULL; - VertexAttributePosition* dev_position = NULL; - VertexAttributeNormal* dev_normal = NULL; - VertexAttributeTexcoord* dev_texcoord0 = NULL; - - // ----------Indices------------- - - const tinygltf::Accessor &indexAccessor = scene.accessors.at(primitive.indices); - const tinygltf::BufferView &bufferView = scene.bufferViews.at(indexAccessor.bufferView); - BufferByte* dev_bufferView = bufferViewDevPointers.at(indexAccessor.bufferView); - - // assume type is SCALAR for indices - int n = 1; - int numIndices = indexAccessor.count; - int componentTypeByteSize = sizeof(VertexIndex); - int byteLength = numIndices * n * componentTypeByteSize; - - dim3 numThreadsPerBlock(128); - dim3 numBlocks((numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); - cudaMalloc(&dev_indices, byteLength); - _deviceBufferCopy << > > ( - numIndices, - (BufferByte*)dev_indices, - dev_bufferView, - n, - indexAccessor.byteStride, - indexAccessor.byteOffset, - componentTypeByteSize); - - - checkCUDAError("Set Index Buffer"); - - - // ---------Primitive Info------- - - // Warning: LINE_STRIP is not supported in tinygltfloader - int numPrimitives; - PrimitiveType primitiveType; - switch (primitive.mode) { - case TINYGLTF_MODE_TRIANGLES: - primitiveType = PrimitiveType::Triangle; - numPrimitives = numIndices / 3; - break; - case TINYGLTF_MODE_TRIANGLE_STRIP: - primitiveType = PrimitiveType::Triangle; - numPrimitives = numIndices - 2; - break; - case TINYGLTF_MODE_TRIANGLE_FAN: - primitiveType = PrimitiveType::Triangle; - numPrimitives = numIndices - 2; - break; - case TINYGLTF_MODE_LINE: - primitiveType = PrimitiveType::Line; - numPrimitives = numIndices / 2; - break; - case TINYGLTF_MODE_LINE_LOOP: - primitiveType = PrimitiveType::Line; - numPrimitives = numIndices + 1; - break; - case TINYGLTF_MODE_POINTS: - primitiveType = PrimitiveType::Point; - numPrimitives = numIndices; - break; - default: - // output error - break; - }; - - - // ----------Attributes------------- - - auto it(primitive.attributes.begin()); - auto itEnd(primitive.attributes.end()); - - int numVertices = 0; - // for each attribute - for (; it != itEnd; it++) { - const tinygltf::Accessor &accessor = scene.accessors.at(it->second); - const tinygltf::BufferView &bufferView = scene.bufferViews.at(accessor.bufferView); - - int n = 1; - if (accessor.type == TINYGLTF_TYPE_SCALAR) { - n = 1; - } - else if (accessor.type == TINYGLTF_TYPE_VEC2) { - n = 2; - } - else if (accessor.type == TINYGLTF_TYPE_VEC3) { - n = 3; - } - else if (accessor.type == TINYGLTF_TYPE_VEC4) { - n = 4; - } - - BufferByte * dev_bufferView = bufferViewDevPointers.at(accessor.bufferView); - BufferByte ** dev_attribute = NULL; - - numVertices = accessor.count; - int componentTypeByteSize; - - // Note: since the type of our attribute array (dev_position) is static (float32) - // We assume the glTF model attribute type are 5126(FLOAT) here - - if (it->first.compare("POSITION") == 0) { - componentTypeByteSize = sizeof(VertexAttributePosition) / n; - dev_attribute = (BufferByte**)&dev_position; - } - else if (it->first.compare("NORMAL") == 0) { - componentTypeByteSize = sizeof(VertexAttributeNormal) / n; - dev_attribute = (BufferByte**)&dev_normal; - } - else if (it->first.compare("TEXCOORD_0") == 0) { - componentTypeByteSize = sizeof(VertexAttributeTexcoord) / n; - dev_attribute = (BufferByte**)&dev_texcoord0; - } - - std::cout << accessor.bufferView << " - " << it->second << " - " << it->first << '\n'; - - dim3 numThreadsPerBlock(128); - dim3 numBlocks((n * numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); - int byteLength = numVertices * n * componentTypeByteSize; - cudaMalloc(dev_attribute, byteLength); - - _deviceBufferCopy << > > ( - n * numVertices, - *dev_attribute, - dev_bufferView, - n, - accessor.byteStride, - accessor.byteOffset, - componentTypeByteSize); - - std::string msg = "Set Attribute Buffer: " + it->first; - checkCUDAError(msg.c_str()); - } - - // malloc for VertexOut - VertexOut* dev_vertexOut; - cudaMalloc(&dev_vertexOut, numVertices * sizeof(VertexOut)); - checkCUDAError("Malloc VertexOut Buffer"); - - // ----------Materials------------- - - // You can only worry about this part once you started to - // implement textures for your rasterizer - TextureData* dev_diffuseTex = NULL; - int diffuseTexWidth = 0; - int diffuseTexHeight = 0; - if (!primitive.material.empty()) { - const tinygltf::Material &mat = scene.materials.at(primitive.material); - printf("material.name = %s\n", mat.name.c_str()); - - if (mat.values.find("diffuse") != mat.values.end()) { - std::string diffuseTexName = mat.values.at("diffuse").string_value; - if (scene.textures.find(diffuseTexName) != scene.textures.end()) { - const tinygltf::Texture &tex = scene.textures.at(diffuseTexName); - if (scene.images.find(tex.source) != scene.images.end()) { - const tinygltf::Image &image = scene.images.at(tex.source); - - size_t s = image.image.size() * sizeof(TextureData); - cudaMalloc(&dev_diffuseTex, s); - cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice); - - diffuseTexWidth = image.width; - diffuseTexHeight = image.height; - - checkCUDAError("Set Texture Image data"); - } - } - } - - // TODO: write your code for other materails - // You may have to take a look at tinygltfloader - // You can also use the above code loading diffuse material as a start point - } - - - // ---------Node hierarchy transform-------- - cudaDeviceSynchronize(); - - dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); - _nodeMatrixTransform << > > ( - numVertices, - dev_position, - dev_normal, - matrix, - matrixNormal); - - checkCUDAError("Node hierarchy transformation"); - - // at the end of the for loop of primitive - // push dev pointers to map - primitiveVector.push_back(PrimitiveDevBufPointers{ - primitive.mode, - primitiveType, - numPrimitives, - numIndices, - numVertices, - - dev_indices, - dev_position, - dev_normal, - dev_texcoord0, - - dev_diffuseTex, - diffuseTexWidth, - diffuseTexHeight, - - dev_vertexOut //VertexOut - }); - - totalNumPrimitives += numPrimitives; - - } // for each primitive - - } // for each mesh - - } // for each node - - } - - - // 3. Malloc for dev_primitives - { - cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive)); - } - - - // Finally, cudaFree raw dev_bufferViews - { - - std::map::const_iterator it(bufferViewDevPointers.begin()); - std::map::const_iterator itEnd(bufferViewDevPointers.end()); - - //bufferViewDevPointers - - for (; it != itEnd; it++) { - cudaFree(it->second); - } - - checkCUDAError("Free BufferView Device Mem"); - } + // 2. for each mesh: + // for each primitive: + // build device buffer of indices, materail, and each attributes + // and store these pointers in a map + { + + std::map nodeString2Matrix; + auto rootNodeNamesList = scene.scenes.at(scene.defaultScene); + + { + auto it = rootNodeNamesList.begin(); + auto itEnd = rootNodeNamesList.end(); + for (; it != itEnd; ++it) { + traverseNode(nodeString2Matrix, scene, *it, glm::mat4(1.0f)); + } + } + + + // parse through node to access mesh + + auto itNode = nodeString2Matrix.begin(); + auto itEndNode = nodeString2Matrix.end(); + for (; itNode != itEndNode; ++itNode) { + + const tinygltf::Node & N = scene.nodes.at(itNode->first); + const glm::mat4 & matrix = itNode->second; + const glm::mat3 & matrixNormal = glm::transpose(glm::inverse(glm::mat3(matrix))); + + auto itMeshName = N.meshes.begin(); + auto itEndMeshName = N.meshes.end(); + + for (; itMeshName != itEndMeshName; ++itMeshName) { + + const tinygltf::Mesh & mesh = scene.meshes.at(*itMeshName); + + auto res = mesh2PrimitivesMap.insert(std::pair>(mesh.name, std::vector())); + std::vector & primitiveVector = (res.first)->second; + + // for each primitive + for (size_t i = 0; i < mesh.primitives.size(); i++) { + const tinygltf::Primitive &primitive = mesh.primitives[i]; + + if (primitive.indices.empty()) + return; + + // TODO: add new attributes for your PrimitiveDevBufPointers when you add new attributes + VertexIndex* dev_indices = NULL; + VertexAttributePosition* dev_position = NULL; + VertexAttributeNormal* dev_normal = NULL; + VertexAttributeTexcoord* dev_texcoord0 = NULL; + + // ----------Indices------------- + + const tinygltf::Accessor &indexAccessor = scene.accessors.at(primitive.indices); + const tinygltf::BufferView &bufferView = scene.bufferViews.at(indexAccessor.bufferView); + BufferByte* dev_bufferView = bufferViewDevPointers.at(indexAccessor.bufferView); + + // assume type is SCALAR for indices + int n = 1; + int numIndices = indexAccessor.count; + int componentTypeByteSize = sizeof(VertexIndex); + int byteLength = numIndices * n * componentTypeByteSize; + + dim3 numThreadsPerBlock(128); + dim3 numBlocks((numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); + cudaMalloc(&dev_indices, byteLength); + _deviceBufferCopy << > > ( + numIndices, + (BufferByte*)dev_indices, + dev_bufferView, + n, + indexAccessor.byteStride, + indexAccessor.byteOffset, + componentTypeByteSize); + + + checkCUDAError("Set Index Buffer"); + + + // ---------Primitive Info------- + + // Warning: LINE_STRIP is not supported in tinygltfloader + int numPrimitives; + PrimitiveType primitiveType; + switch (primitive.mode) { + case TINYGLTF_MODE_TRIANGLES: + primitiveType = PrimitiveType::Triangle; + numPrimitives = numIndices / 3; + break; + case TINYGLTF_MODE_TRIANGLE_STRIP: + primitiveType = PrimitiveType::Triangle; + numPrimitives = numIndices - 2; + break; + case TINYGLTF_MODE_TRIANGLE_FAN: + primitiveType = PrimitiveType::Triangle; + numPrimitives = numIndices - 2; + break; + case TINYGLTF_MODE_LINE: + primitiveType = PrimitiveType::Line; + numPrimitives = numIndices / 2; + break; + case TINYGLTF_MODE_LINE_LOOP: + primitiveType = PrimitiveType::Line; + numPrimitives = numIndices + 1; + break; + case TINYGLTF_MODE_POINTS: + primitiveType = PrimitiveType::Point; + numPrimitives = numIndices; + break; + default: + // output error + break; + }; + + + // ----------Attributes------------- + + auto it(primitive.attributes.begin()); + auto itEnd(primitive.attributes.end()); + + int numVertices = 0; + // for each attribute + for (; it != itEnd; it++) { + const tinygltf::Accessor &accessor = scene.accessors.at(it->second); + const tinygltf::BufferView &bufferView = scene.bufferViews.at(accessor.bufferView); + + int n = 1; + if (accessor.type == TINYGLTF_TYPE_SCALAR) { + n = 1; + } + else if (accessor.type == TINYGLTF_TYPE_VEC2) { + n = 2; + } + else if (accessor.type == TINYGLTF_TYPE_VEC3) { + n = 3; + } + else if (accessor.type == TINYGLTF_TYPE_VEC4) { + n = 4; + } + + BufferByte * dev_bufferView = bufferViewDevPointers.at(accessor.bufferView); + BufferByte ** dev_attribute = NULL; + + numVertices = accessor.count; + int componentTypeByteSize; + + // Note: since the type of our attribute array (dev_position) is static (float32) + // We assume the glTF model attribute type are 5126(FLOAT) here + + if (it->first.compare("POSITION") == 0) { + componentTypeByteSize = sizeof(VertexAttributePosition) / n; + dev_attribute = (BufferByte**)&dev_position; + } + else if (it->first.compare("NORMAL") == 0) { + componentTypeByteSize = sizeof(VertexAttributeNormal) / n; + dev_attribute = (BufferByte**)&dev_normal; + } + else if (it->first.compare("TEXCOORD_0") == 0) { + componentTypeByteSize = sizeof(VertexAttributeTexcoord) / n; + dev_attribute = (BufferByte**)&dev_texcoord0; + } + + std::cout << accessor.bufferView << " - " << it->second << " - " << it->first << '\n'; + + dim3 numThreadsPerBlock(128); + dim3 numBlocks((n * numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); + int byteLength = numVertices * n * componentTypeByteSize; + cudaMalloc(dev_attribute, byteLength); + + _deviceBufferCopy << > > ( + n * numVertices, + *dev_attribute, + dev_bufferView, + n, + accessor.byteStride, + accessor.byteOffset, + componentTypeByteSize); + + std::string msg = "Set Attribute Buffer: " + it->first; + checkCUDAError(msg.c_str()); + } + + // malloc for VertexOut + VertexOut* dev_vertexOut; + cudaMalloc(&dev_vertexOut, numVertices * sizeof(VertexOut)); + checkCUDAError("Malloc VertexOut Buffer"); + + // ----------Materials------------- + + // You can only worry about this part once you started to + // implement textures for your rasterizer + TextureData* dev_diffuseTex = NULL; + int diffuseTexWidth = 0; + int diffuseTexHeight = 0; + if (!primitive.material.empty()) { + const tinygltf::Material &mat = scene.materials.at(primitive.material); + printf("material.name = %s\n", mat.name.c_str()); + + if (mat.values.find("diffuse") != mat.values.end()) { + std::string diffuseTexName = mat.values.at("diffuse").string_value; + if (scene.textures.find(diffuseTexName) != scene.textures.end()) { + const tinygltf::Texture &tex = scene.textures.at(diffuseTexName); + if (scene.images.find(tex.source) != scene.images.end()) { + const tinygltf::Image &image = scene.images.at(tex.source); + + size_t s = image.image.size() * sizeof(TextureData); + cudaMalloc(&dev_diffuseTex, s); + cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice); + + diffuseTexWidth = image.width; + diffuseTexHeight = image.height; + + checkCUDAError("Set Texture Image data"); + } + } + } + + // TODO: write your code for other materails + // You may have to take a look at tinygltfloader + // You can also use the above code loading diffuse material as a start point + } + + + // ---------Node hierarchy transform-------- + cudaDeviceSynchronize(); + + dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); + _nodeMatrixTransform << > > ( + numVertices, + dev_position, + dev_normal, + matrix, + matrixNormal); + + checkCUDAError("Node hierarchy transformation"); + + // at the end of the for loop of primitive + // push dev pointers to map + primitiveVector.push_back(PrimitiveDevBufPointers{ + primitive.mode, + primitiveType, + numPrimitives, + numIndices, + numVertices, + + dev_indices, + dev_position, + dev_normal, + dev_texcoord0, + + dev_diffuseTex, + diffuseTexWidth, + diffuseTexHeight, + + dev_vertexOut //VertexOut + }); + + totalNumPrimitives += numPrimitives; + + } // for each primitive + + } // for each mesh + + } // for each node + } -} + // 3. Malloc for dev_primitives + { + cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive)); + } -__global__ -void _vertexTransformAndAssembly( - int numVertices, - PrimitiveDevBufPointers primitive, - glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, - int width, int height) { - - // vertex id - int vid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (vid < numVertices) { - - // TODO: Apply vertex transformation here - // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space - // Then divide the pos by its w element to transform into NDC space - // Finally transform x and y to viewport space - - // TODO: Apply vertex assembly here - // Assemble all attribute arraies into the primitive array - - } -} + // Finally, cudaFree raw dev_bufferViews + { + std::map::const_iterator it(bufferViewDevPointers.begin()); + std::map::const_iterator itEnd(bufferViewDevPointers.end()); + //bufferViewDevPointers -static int curPrimitiveBeginId = 0; + for (; it != itEnd; it++) { + cudaFree(it->second); + } -__global__ -void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_primitives, PrimitiveDevBufPointers primitive) { + checkCUDAError("Free BufferView Device Mem"); + } - // index id - int iid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (iid < numIndices) { +} - // TODO: uncomment the following code for a start - // This is primitive assembly for triangles +__global__ +void _vertexTransformAndAssembly( + const int numVertices, + PrimitiveDevBufPointers primitive, + const glm::mat4 MVP, const glm::mat4 MV, const glm::mat3 MV_normal, + const int width, const int height) { + + // vertex id + int vid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (vid < numVertices) { + + // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space + // Then divide the pos by its w element to transform into NDC space + // Finally transform x and y to viewport space + + // Create reference to the out vertex + VertexOut &vOut = primitive.dev_verticesOut[vid]; + vOut.pos = MVP * glm::vec4(primitive.dev_position[vid], 1); + + // Perspective divide + vOut.pos /= vOut.pos.w; + + // Viewport transformation + vOut.pos.x = (vOut.pos.x * 0.5f + 0.5f) * ((float)width); + vOut.pos.y = (1.0f - (vOut.pos.y * 0.5f + 0.5f)) * ((float)height); + + // Eye space attributes + vOut.eyePos = glm::vec3(MV * glm::vec4(primitive.dev_position[vid], 1)); + vOut.eyeNor = glm::normalize(MV_normal * primitive.dev_normal[vid]); + + #ifdef TEXTURE_MAPPING + vOut.dev_diffuseTex = primitive.dev_diffuseTex; + vOut.texcoord0 = primitive.dev_texcoord0[vid]; + vOut.texWidth = primitive.diffuseTexWidth; + vOut.texHeight = primitive.diffuseTexHeight; + #else + int colorChannel = vid % 3; + vOut.col = glm::vec3(colorChannel == 0, colorChannel == 1, colorChannel == 2); + #endif + } +} - //int pid; // id for cur primitives vector - //if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { - // pid = iid / (int)primitive.primitiveType; - // dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] - // = primitive.dev_verticesOut[primitive.dev_indices[iid]]; - //} +static int curPrimitiveBeginId = 0; +__global__ +void _primitiveAssembly(const int numIndices, const int curPrimitiveBeginId, Primitive* dev_primitives, const PrimitiveDevBufPointers primitive) { + + // index id + int iid = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (iid < numIndices) { + // This is primitive assembly for triangles + int pid; // id for cur primitives vector + if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { + pid = iid / (int)primitive.primitiveType; + dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] + = primitive.dev_verticesOut[primitive.dev_indices[iid]]; + } + } - // TODO: other primitive types (point, line) - } - } - +__global__ +void rasterize(const int nPrims, const Primitive *primitives, Fragment *fragmentBuffer, int *depthBuffer, int* mutex_array, const int width, const int height) { + const int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < nPrims) { + // Need an array containing the triangle vertices for helper functions in rasterizeTools.h. + const auto &primVerts = primitives[tid].v; + const glm::vec3 triVerts[] = { glm::vec3(primVerts[0].pos), glm::vec3(primVerts[1].pos), glm::vec3(primVerts[2].pos) }; + + // Compute the bouding box for this primitive + const AABB aabb = getAABBForTriangle(triVerts); + + // Transform the aabb's min/max x and y values into pixel coordinates + const int aabbMin[] = { (int)floor(aabb.min.x), + glm::clamp((int)floor(aabb.min.y), 0, height - 1) }; + const int aabbMax[] = { (int)ceil(aabb.max.x), + glm::clamp((int)ceil(aabb.max.y), 0, height - 1) }; + + for (int y = aabbMin[1]; y <= aabbMax[1]; ++y) { + + float xIsectMin, xIsectMax; + #ifdef LINE_INTERSECTION_RASTERIZATION + + // Compute intersection with each line segment pair + xIsectMin = 1000000000.0f; + xIsectMax = -1000000000.0f; + for (int l = 0; l < 3; ++l) { + // Edge cases + if (triVerts[(l + 1) % 3].y == triVerts[l].y) continue; // horizontal line + if (triVerts[(l + 1) % 3].x == triVerts[l].x) { // vertical line + xIsectMin = glm::min(xIsectMin, triVerts[l].x); + xIsectMax = glm::max(xIsectMax, triVerts[l].x); + continue; + } + + // Compute intersection with line + const float slope = (triVerts[(l + 1) % 3].y - triVerts[l].y) / ((triVerts[(l + 1) % 3].x - triVerts[l].x)); + float xIsect = (y - triVerts[l].y + slope * triVerts[l].x) / slope; + + // Ignore intersections outside of bounding boxes + if (xIsect < aabbMin[0] || xIsect > aabbMax[0]) continue; + + xIsectMin = glm::min(xIsectMin, xIsect); + xIsectMax = glm::max(xIsectMax, xIsect); + } + #else + xIsectMin = aabbMin[0]; + xIsectMax = aabbMax[0]; + #endif + + xIsectMin = glm::clamp(xIsectMin, 0.0f, (float)width - 1); + xIsectMax = glm::clamp(xIsectMax, 0.0f, (float)width - 1); + + // Set interpolated values for all overlapping fragments + for (int x = (int)xIsectMin; x <= (int)xIsectMax; ++x) { + const int fragIndex = x + y * width; + const glm::vec3 baryWeights = calculateBarycentricCoordinate(triVerts, glm::vec2(x, y)); + if (isBarycentricCoordInBounds(baryWeights)) { + // Compute the z value using perspective correct interpolation. Note the depth buffer expects ints + const int zDepth = (int)((10000000.0f) / (baryWeights.x / triVerts[0].z + baryWeights.y / triVerts[1].z + baryWeights.z / triVerts[2].z)); + + // Using a mutex + atomic functions to avoid race conditions, compare with and potentially write to the depth buffer + bool shouldSetFragment = true; + + #ifdef DEPTH_TEST + shouldSetFragment = false; + int *mutex = mutex_array + fragIndex; // pointer arithmetic + // Loop- wait until this thread is able to execute its critical section. + bool isSet = false; + do { + isSet = (atomicCAS(mutex, 0, 1) == 0); + if (isSet) { + if (zDepth < depthBuffer[fragIndex]) { + depthBuffer[fragIndex] = zDepth; + shouldSetFragment = true; + } + *mutex = 0; + } + } while (!isSet); + #endif + // Compute the perspective-correct-ly-interpolated fragment parameters + if (shouldSetFragment) { + #ifdef TEXTURE_MAPPING + // Texture Data - assume the texture data is the same for each vertex + fragmentBuffer[fragIndex].dev_diffuseTex = primVerts[0].dev_diffuseTex; + fragmentBuffer[fragIndex].diffuseTexWidth = primVerts[0].texWidth; + fragmentBuffer[fragIndex].diffuseTexHeight = primVerts[0].texHeight; + + // UVs + const float zDepth_f = ((float)zDepth) / 10000000.0f; + fragmentBuffer[fragIndex].texcoord0 = zDepth_f * (primVerts[0].texcoord0 * baryWeights.x / triVerts[0].z + + primVerts[1].texcoord0 * baryWeights.y / triVerts[1].z + + primVerts[2].texcoord0 * baryWeights.z / triVerts[2].z); + + /*fragmentBuffer[fragIndex].texcoord0 = primVerts[0].texcoord0 * baryWeights.x + + primVerts[1].texcoord0 * baryWeights.y + + primVerts[2].texcoord0 * baryWeights.z;*/ + #else + fragmentBuffer[fragIndex].color = glm::abs(zDepth_f * ((primVerts[0].col * baryWeights.x / triVerts[0].z + + primVerts[1].col * baryWeights.y / triVerts[1].z + + primVerts[2].col * baryWeights.z / triVerts[2].z))); + #endif + + // Normal + fragmentBuffer[fragIndex].eyeNor = glm::normalize(zDepth_f * ((primVerts[0].eyeNor * baryWeights.x / triVerts[0].z + + primVerts[1].eyeNor * baryWeights.y / triVerts[1].z + + primVerts[2].eyeNor * baryWeights.z / triVerts[2].z))); + + /*fragmentBuffer[fragIndex].eyeNor = glm::normalize(primVerts[0].eyeNor * baryWeights.x + + primVerts[1].eyeNor * baryWeights.y + + primVerts[2].eyeNor * baryWeights.z);*/ + } + } + } + } + } +} /** * Perform rasterization. */ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) { - int sideLength2d = 8; - dim3 blockSize2d(sideLength2d, sideLength2d); - dim3 blockCount2d((width - 1) / blockSize2d.x + 1, - (height - 1) / blockSize2d.y + 1); - - // Execute your rasterization pipeline here - // (See README for rasterization pipeline outline.) - - // Vertex Process & primitive assembly - { - curPrimitiveBeginId = 0; - dim3 numThreadsPerBlock(128); - - auto it = mesh2PrimitivesMap.begin(); - auto itEnd = mesh2PrimitivesMap.end(); - - for (; it != itEnd; ++it) { - auto p = (it->second).begin(); // each primitive - auto pEnd = (it->second).end(); - for (; p != pEnd; ++p) { - dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); - dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); - - _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height); - checkCUDAError("Vertex Processing"); - cudaDeviceSynchronize(); - _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> > - (p->numIndices, - curPrimitiveBeginId, - dev_primitives, - *p); - checkCUDAError("Primitive Assembly"); - - curPrimitiveBeginId += p->numPrimitives; - } - } - - checkCUDAError("Vertex Processing and Primitive Assembly"); - } - - cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); - initDepth << > >(width, height, dev_depth); - - // TODO: rasterize + const int sideLength2d = 8; + const dim3 blockSize2d(sideLength2d, sideLength2d); + dim3 blockCount2d; + + #ifdef SSAA + blockCount2d = dim3((width * 2 - 1) / blockSize2d.x + 1, + (height * 2 - 1) / blockSize2d.y + 1); + #else + blockCount2d = dim3((width - 1) / blockSize2d.x + 1, + (height - 1) / blockSize2d.y + 1); + #endif + + // Execute your rasterization pipeline here + // (See README for rasterization pipeline outline.) + + // Vertex Process & primitive assembly + { + curPrimitiveBeginId = 0; + const dim3 numThreadsPerBlock(128); + + auto it = mesh2PrimitivesMap.begin(); + auto itEnd = mesh2PrimitivesMap.end(); + + for (; it != itEnd; ++it) { + auto p = (it->second).begin(); // each primitive + auto pEnd = (it->second).end(); + for (; p != pEnd; ++p) { + dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); + dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); + + #ifdef SSAA + _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> > (p->numVertices, *p, MVP, MV, MV_normal, width * 2, height * 2); + #else + _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> > (p->numVertices, *p, MVP, MV, MV_normal, width, height); + #endif + checkCUDAError("Vertex Processing"); + cudaDeviceSynchronize(); + + _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> > + (p->numIndices, + curPrimitiveBeginId, + dev_primitives, + *p); + checkCUDAError("Primitive Assembly"); + + curPrimitiveBeginId += p->numPrimitives; + } + } + + checkCUDAError("Vertex Processing and Primitive Assembly"); + } + // Parameters for rasterization kernel + const int blockSize1d(totalNumPrimitives); + const dim3 blockCount1d((totalNumPrimitives + blockSize1d - 1) / blockSize1d); + + #ifdef SSAA + cudaMemset(dev_fragmentBuffer_4X, 0, 4 * width * height * sizeof(Fragment)); + initDepth << > > (width * 2, height * 2, dev_depth_4X); + + // Compute attributes for fragments overlapping geometry + rasterize << > > (totalNumPrimitives, dev_primitives, dev_fragmentBuffer_4X, dev_depth_4X, dev_mutex_4X, width * 2, height * 2); + cudaDeviceSynchronize(); + checkCUDAError("Rasterization"); + + // Copy depthbuffer colors into framebuffer + render << > > (width * 2, height * 2, dev_fragmentBuffer_4X, dev_framebuffer_4X); + cudaDeviceSynchronize(); + checkCUDAError("fragment shader"); + // Downsample the rendered image to the final image size using bilinear filtering + downsample << > > (width, height, dev_framebuffer, dev_framebuffer_4X); + checkCUDAError("downsample SSAA"); + + #else + cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); + initDepth << > > (width, height, dev_depth); + + rasterize << > > (totalNumPrimitives, dev_primitives, dev_fragmentBuffer, dev_depth, dev_mutex, width, height); + cudaDeviceSynchronize(); + checkCUDAError("Rasterization"); + + render << > > (width, height, dev_fragmentBuffer, dev_framebuffer); + cudaDeviceSynchronize(); + checkCUDAError("fragment shader"); + #endif - // Copy depthbuffer colors into framebuffer - render << > >(width, height, dev_fragmentBuffer, dev_framebuffer); - checkCUDAError("fragment shader"); // Copy framebuffer into OpenGL buffer for OpenGL previewing - sendImageToPBO<<>>(pbo, width, height, dev_framebuffer); + sendImageToPBO << > > (pbo, width, height, dev_framebuffer); checkCUDAError("copy render result to pbo"); } @@ -741,36 +1042,47 @@ void rasterizeFree() { // deconstruct primitives attribute/indices device buffer - auto it(mesh2PrimitivesMap.begin()); - auto itEnd(mesh2PrimitivesMap.end()); - for (; it != itEnd; ++it) { - for (auto p = it->second.begin(); p != it->second.end(); ++p) { - cudaFree(p->dev_indices); - cudaFree(p->dev_position); - cudaFree(p->dev_normal); - cudaFree(p->dev_texcoord0); - cudaFree(p->dev_diffuseTex); + auto it(mesh2PrimitivesMap.begin()); + auto itEnd(mesh2PrimitivesMap.end()); + for (; it != itEnd; ++it) { + for (auto p = it->second.begin(); p != it->second.end(); ++p) { + cudaFree(p->dev_indices); + cudaFree(p->dev_position); + cudaFree(p->dev_normal); + cudaFree(p->dev_texcoord0); + cudaFree(p->dev_diffuseTex); - cudaFree(p->dev_verticesOut); + cudaFree(p->dev_verticesOut); - - //TODO: release other attributes and materials - } - } - //////////// + //TODO: release other attributes and materials + } + } + + //////////// cudaFree(dev_primitives); dev_primitives = NULL; - cudaFree(dev_fragmentBuffer); - dev_fragmentBuffer = NULL; + cudaFree(dev_fragmentBuffer); + dev_fragmentBuffer = NULL; + cudaFree(dev_fragmentBuffer_4X); + dev_fragmentBuffer_4X = NULL; cudaFree(dev_framebuffer); dev_framebuffer = NULL; - - cudaFree(dev_depth); - dev_depth = NULL; + cudaFree(dev_framebuffer_4X); + dev_framebuffer_4X = NULL; + + cudaFree(dev_mutex); + dev_mutex = NULL; + cudaFree(dev_mutex_4X); + dev_mutex_4X = NULL; + + cudaFree(dev_depth); + dev_depth = NULL; + cudaFree(dev_depth_4X); + dev_depth_4X = NULL; checkCUDAError("rasterize Free"); }