diff --git a/README.md b/README.md index cad1abd..0f12725 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,228 @@ -CUDA Rasterizer -=============== +University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4 CUDA Rasterizer +====================== +* Ziyu Li +* Tested on: Windows 7, Intel Core i7-3840QM @2.80GHz 16GB, Nvidia Quadro K4000M 4GB -[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md) +## Features +#### All Features + - All Basic Graphics Pipeline + - Vertex Shading + - Primitive assembly with support for triangles read from buffers of index and vertex data. + - Rasterization + - Fragment shading + - Depth Buffer and Depth Test + - Lambert and Blinn shading Model + - Other Features + - Screen Space Ambient Occlusion (SSAO) + - Super-Sampling Anti Aliasing (SSAA) + - Tile-Based Pipeline + - Back-face Culling (optimized using stream compaction) + - Display Wire-frame / Points with line width and point size adjustment + - UV Texture Mapping with Perspective Correction + - Bilinear Texture Filter + - Toon Shading -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +For better result, all the images demonstrate below are using SSAA 2x -### (TODO: Your README) +#### Shading Models -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. +| Lambert | Blinn | +| ----- | ----- | +| ![lambert](img/duck_lambert.gif) | ![blinn](img/duck_blinn.gif) | +To switch the shading model to Blinn, change the *BLINN* to 1 +```c++ + #define BLINN 1 +``` -### Credits +#### Tile-Based Pipeline + +![tile_imp](img/tile_imp.png) +(Image from *OpenGLInsights, TileBasedArchitectures*) + +Break framebuffer into several tiles, and render one tile per thread instead of one primitive per thread. + +| Primitive-Based | Tile-Based | +| ----- | ----- | +| ![prim](img/truck_prim_base.gif) | ![tile](img/truck_tile_base_backface.gif) | + +By using the Tile-Based Pipeline, the interaction is much smoother and GPU Load is siginificant reduced. + +To switch the pipeline to tile-based, change the *TILE_BASED_RENDER* to 1 +```c++ + #define TILE_BASED_RENDER 1 +``` + +For details performance comparison, please check *Performance* Section. + +#### Back-face Culling + +Remove the triangles which are not facing to the camera(eye) before rasterization. + +| Only Front Face | Only Back Face | +| ----- | ----- | +| ![front](img/duck_lambert.gif) | ![back](img/duck_backface.gif) | + +Here is the Back-face Culling performance comparision in Primitive-Based Pipeline. + +| No Culling | Back-face Culling | +| ----- | ----- | +| ![noculling](img/truck_prim_base.gif) | ![culling](img/truck_prim_base_backface.gif) | + + +To toggle the backface culling, change the *BACKFACE_CULLING* to 1 +```c++ + #define BACKFACE_CULLING 1 +``` + +For details performance comparison, please check *Performance* Section. + +#### Display Wire-frame / Points + +| Solid | Wire-frame | Points| +| ----- | ----- | ----- | +| ![solid](img/duck_no_wire_point.PNG) | ![wire](img/duck_wireframe.PNG) | ![point](img/duck_point.PNG) | + +To switch different mode of display, change the *SHADING* to 1 for rendering wireframe and 2 for rendering vertices +```c++ + #define SHADING 1 // 0: Solid, 1: Wireframe, 2: Vertices +``` + +#### UV Texture Mapping with Perspective Correction + +| Wrong | Correct| +| ----- | ----- | +| ![texw](img/tex_wrong.PNG) | ![texr](img/tex_right.PNG) | + +#### Bilinear Texture Filter + +| No Filter | Bilinear Filter | +| ----- | ----- | +| ![nof](img/truck_no_bilinear.PNG) | ![f](img/truck_bilinear.PNG) | + +To toggle the bilinear texture filter, change the *BILINEAR_INTERP* to 1 +```c++ + #define BILINEAR_INTERP 1 +``` + +#### Screen Space Ambient Occlusion (SSAO) + +| No AO | SSAO | SSAO Pass | +| ----- | ----- | ----- | +| ![no ao](img/truck_lambert_noao.PNG) | ![ssao](img/truck_lambert_ao.PNG) | ![ssaopass](img/truck_ao_pass.PNG) | + +Given any pixel in the scene, it is possible to calculate its ambient occlusion by treating all neighboring pixels as small spheres, and adding together their contributions. +The occlusion can be contributed by distance to occludee and angle between occludee's normal and occluder. + +![ao](img/ao.jpg) +(Image from *A Simple and Practical Approach to SSAO* https://www.gamedev.net/articles/programming/graphics/a-simple-and-practical-approach-to-ssao-r2753/) + +Because we need a random direction of occlusion ray. A cached or per-generated noise texture is necessary. If the SSAO feature is turn on a 16x16 noise texture will be generate rasterizer after the primitive has loaded. + +This screenshot below, shows the AO Pass of the model. + +![aoaa](img/truck_ao_pass_ani.gif) +To toggle the SSAO, change the *SSAO* to 1 +```c++ + #define SSAO 1 +``` + +#### Super-Sampling Anti Aliasing (SSAA) + +| No AA | SSAA x2 | SSAA x4| +| ----- | ----- | ----- | +| ![no aa](img/duck_no_aa.PNG) | ![ssaa2](img/duck_aa.PNG) | ![ssaa4](img/duck_aa4.PNG) | + +To toggle the SSAA, change the *SSAA* to 2 for 2x or 4 for 4x... +```c++ + #define SSAA 2 // A value greater than 1 +``` + +#### Toon Shading + +| Orignal | Toon Shading | +| ----- | ----- | +| ![lambert](img/duck_lambert.gif) | ![toon](img/duck_lambert_toon.gif) | + +To toggle the Toon Shading, change the *TOON* to 1 +```c++ + #define TOON 1 +``` + +#### Bloom (BUGGE......) + +There are still something not corrected yet. But the basic idea is to blend the framebuffer with different amount of blur. + +| Orignal | Bloom | +| ----- | ----- | +| ![blinn](img/duck_blinn.gif) | ![bloom](img/duck_blinn_bloom.gif) | + +To toggle the Bloom Effect, change the *BLOOM* to 1 +```c++ + #define BLOOM 1 +``` + +##Performance +#### Tile-Based Pipeline + +The benchmark is based on test model *CesiumMilkTruck* and use SSAA 2x feature. +And the results of graphics card gpu load and memory usage is recorded on GPU-Z. + +![tile_vs_prim_fps](img/tile_vs_prim_fps.PNG) + +![tile_vs_prim_load](img/tile_vs_prim_load.PNG) + +![tile_vs_prim_mem](img/tile_vs_prim_mem.PNG) + +Based on the result, the Tile-Based Pipeline reduces the GPU load and increases the framerate. It is much more efficient than Primitive-Based Pipeline, especially when less triangle occupy the the most of screen. + +#### BackFace Culling + +![bfc](img/bfc.PNG) + +The backface culling slightly increase the preformance under some scenarios, but it does not affect the performance when a triangle is almost occupy the screen. + +#### Runtime Analysis + +![cLaunch](img/cL.PNG) + +![cRuntime](img/cR.PNG) + +Based on the CUDA Device Functions, rasterization is the bottleneck and most intense function in the whole program. +For the primitive-based pipeline, the reason of that could caused by too much calculation redundancy and extremely unbalance the job schedualing to rasterize triangles. However, for the tile-based pipeline, There are some empty tiles which within the bounding box but outside the triangle still occupy the runtime of the kernel function. So I believe one of the optimization can be do a triangle-tile intersection test and use stream compaction to remove the empty tiles. + +| CUDA Device Functions | Primitive-Based (%)| Tile-Based (%)| +|----------------------------|-----------------|------------| +| initDepth | 0.2 | 0.4 | +| vertexTransformAndAssembly | 0.7 | 1.7 | +| primitiveAssembly | 0.9 | 2.2 | +| decompose | N/A | 0.5 | +| rasterization | 68.2 | 33.1 | +| render | 5.7 | 14.2 | +| sendImageToPBO | 0.6 | 1.6 | +| others | 23.7 | 46.3 | + + +| CUDA Runtime API | Primitive-Based (%)| Tile-Based (%)| +|-----------------------|-----------------|------------| +| cudaFree | 5.87 | 6.96 | +| cudaMalloc | 0.44 | 0.52 | +| cudaMemset | 0.11 | 0.34 | +| cudaMemcpy | 0.27 | 0.31 | +| cudaLaunch | 1.21 | 3.47 | +| cudaDeviceSynchronize | 83.85 | 70.4 | +| others | 8.25 | 18 | + +### References +* [Blinn Shading](https://en.wikipedia.org/wiki/Blinn%E2%80%93Phong_shading_model) +* [SSAO](https://www.gamedev.net/articles/programming/graphics/a-simple-and-practical-approach-to-ssao-r2753/) +* [Back-Face Culling](https://en.wikipedia.org/wiki/Back-face_culling) +* [Kernel Filter](https://www.slideshare.net/DarshanParsana/gaussian-image-blurring-in-cuda-c) + +### Credits +* [factomicMin](https://devtalk.nvidia.com/default/topic/492068/atomicmin-with-float/) by [@hyqneuron]() * [tinygltfloader](https://github.com/syoyo/tinygltfloader) by [@soyoyo](https://github.com/syoyo) * [glTF Sample Models](https://github.com/KhronosGroup/glTF/blob/master/sampleModels/README.md) +* [gpu-z](https://www.techpowerup.com/gpuz/) by [@TechPowerUp](https://www.techpowerup.com/) diff --git a/img/ao.jpg b/img/ao.jpg new file mode 100644 index 0000000..dba9a92 Binary files /dev/null and b/img/ao.jpg differ diff --git a/img/bfc.PNG b/img/bfc.PNG new file mode 100644 index 0000000..eb0cb62 Binary files /dev/null and b/img/bfc.PNG differ diff --git a/img/cL.PNG b/img/cL.PNG new file mode 100644 index 0000000..b3f3f2e Binary files /dev/null and b/img/cL.PNG differ diff --git a/img/cR.PNG b/img/cR.PNG new file mode 100644 index 0000000..f053c7a Binary files /dev/null and b/img/cR.PNG differ diff --git a/img/duck_aa.PNG b/img/duck_aa.PNG new file mode 100644 index 0000000..668c9ab Binary files /dev/null and b/img/duck_aa.PNG differ diff --git a/img/duck_aa4.PNG b/img/duck_aa4.PNG new file mode 100644 index 0000000..0e1443b Binary files /dev/null and b/img/duck_aa4.PNG differ diff --git a/img/duck_backface.gif b/img/duck_backface.gif new file mode 100644 index 0000000..1003786 Binary files /dev/null and b/img/duck_backface.gif differ diff --git a/img/duck_blinn.gif b/img/duck_blinn.gif new file mode 100644 index 0000000..cf35c5e Binary files /dev/null and b/img/duck_blinn.gif differ diff --git a/img/duck_blinn_bloom.gif b/img/duck_blinn_bloom.gif new file mode 100644 index 0000000..a605092 Binary files /dev/null and b/img/duck_blinn_bloom.gif differ diff --git a/img/duck_lambert.gif b/img/duck_lambert.gif new file mode 100644 index 0000000..e366696 Binary files /dev/null and b/img/duck_lambert.gif differ diff --git a/img/duck_lambert_toon.gif b/img/duck_lambert_toon.gif new file mode 100644 index 0000000..f60010a Binary files /dev/null and b/img/duck_lambert_toon.gif differ diff --git a/img/duck_no_aa.PNG b/img/duck_no_aa.PNG new file mode 100644 index 0000000..2d0081b Binary files /dev/null and b/img/duck_no_aa.PNG differ diff --git a/img/duck_no_wire_point.PNG b/img/duck_no_wire_point.PNG new file mode 100644 index 0000000..2108f4b Binary files /dev/null and b/img/duck_no_wire_point.PNG differ diff --git a/img/duck_point.PNG b/img/duck_point.PNG new file mode 100644 index 0000000..5ce319e Binary files /dev/null and b/img/duck_point.PNG differ diff --git a/img/duck_wireframe.PNG b/img/duck_wireframe.PNG new file mode 100644 index 0000000..7bf0bc1 Binary files /dev/null and b/img/duck_wireframe.PNG differ diff --git a/img/noise.PNG b/img/noise.PNG new file mode 100644 index 0000000..6540e5e Binary files /dev/null and b/img/noise.PNG differ diff --git a/img/tex_right.PNG b/img/tex_right.PNG new file mode 100644 index 0000000..64964b2 Binary files /dev/null and b/img/tex_right.PNG differ diff --git a/img/tex_wrong.PNG b/img/tex_wrong.PNG new file mode 100644 index 0000000..3162307 Binary files /dev/null and b/img/tex_wrong.PNG differ diff --git a/img/tile_imp.png b/img/tile_imp.png new file mode 100644 index 0000000..c896563 Binary files /dev/null and b/img/tile_imp.png differ diff --git a/img/tile_vs_prim_fps.PNG b/img/tile_vs_prim_fps.PNG new file mode 100644 index 0000000..8c76476 Binary files /dev/null and b/img/tile_vs_prim_fps.PNG differ diff --git a/img/tile_vs_prim_load.PNG b/img/tile_vs_prim_load.PNG new file mode 100644 index 0000000..ef08112 Binary files /dev/null and b/img/tile_vs_prim_load.PNG differ diff --git a/img/tile_vs_prim_mem.PNG b/img/tile_vs_prim_mem.PNG new file mode 100644 index 0000000..37dc37c Binary files /dev/null and b/img/tile_vs_prim_mem.PNG differ diff --git a/img/truck_ao_pass.PNG b/img/truck_ao_pass.PNG new file mode 100644 index 0000000..e8657dc Binary files /dev/null and b/img/truck_ao_pass.PNG differ diff --git a/img/truck_ao_pass_ani.gif b/img/truck_ao_pass_ani.gif new file mode 100644 index 0000000..56fcc6d Binary files /dev/null and b/img/truck_ao_pass_ani.gif differ diff --git a/img/truck_bilinear.PNG b/img/truck_bilinear.PNG new file mode 100644 index 0000000..8805ca2 Binary files /dev/null and b/img/truck_bilinear.PNG differ diff --git a/img/truck_lambert_ao.PNG b/img/truck_lambert_ao.PNG new file mode 100644 index 0000000..c051c59 Binary files /dev/null and b/img/truck_lambert_ao.PNG differ diff --git a/img/truck_lambert_noao.PNG b/img/truck_lambert_noao.PNG new file mode 100644 index 0000000..30b918d Binary files /dev/null and b/img/truck_lambert_noao.PNG differ diff --git a/img/truck_no_bilinear.PNG b/img/truck_no_bilinear.PNG new file mode 100644 index 0000000..5fbfab2 Binary files /dev/null and b/img/truck_no_bilinear.PNG differ diff --git a/img/truck_prim_base.gif b/img/truck_prim_base.gif new file mode 100644 index 0000000..f5082c5 Binary files /dev/null and b/img/truck_prim_base.gif differ diff --git a/img/truck_prim_base_backface.gif b/img/truck_prim_base_backface.gif new file mode 100644 index 0000000..f235b5a Binary files /dev/null and b/img/truck_prim_base_backface.gif differ diff --git a/img/truck_tile_base_backface.gif b/img/truck_tile_base_backface.gif new file mode 100644 index 0000000..f50752c Binary files /dev/null and b/img/truck_tile_base_backface.gif differ diff --git a/src/main.cpp b/src/main.cpp index 7986959..15e23bd 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -120,7 +120,7 @@ void runCuda() { glm::mat4 MVP = P * MV; cudaGLMapBufferObject((void **)&dptr, pbo); - rasterize(dptr, MVP, MV, MV_normal); + rasterize(dptr, MVP, MV, MV_normal, frame); cudaGLUnmapBufferObject(pbo); frame++; @@ -138,8 +138,8 @@ bool init(const tinygltf::Scene & scene) { return false; } - width = 800; - height = 800; + width = 512; + height = 512; window = glfwCreateWindow(width, height, "CIS 565 Pathtracer", NULL, NULL); if (!window) { glfwTerminate(); diff --git a/src/rasterize.cu b/src/rasterize.cu index 1262a09..17f0b68 100644 --- a/src/rasterize.cu +++ b/src/rasterize.cu @@ -6,18 +6,35 @@ * @copyright University of Pennsylvania & STUDENT */ +#include +#include +#include #include #include #include #include #include +#include +#include #include #include #include "rasterizeTools.h" #include "rasterize.h" +#include #include #include +#define BILINEAR_INTERP 0 +#define BLINN 0 +#define SSAO 0 +#define SSAA 0 +#define TOON 0 +#define BLOOM 0 +#define TILE_BASED_RENDER 1 +#define BACKFACE_CULLING 0 +#define SHADING 0 // 0: Solid, 1: Wireframe, 2: Point +#define DEMO 0 + namespace { typedef unsigned short VertexIndex; @@ -43,30 +60,34 @@ namespace { glm::vec3 eyePos; // eye space position used for shading glm::vec3 eyeNor; // eye space normal used for shading, cuz normal will go wrong after perspective transformation - // glm::vec3 col; + glm::vec3 col; glm::vec2 texcoord0; TextureData* dev_diffuseTex = NULL; - // int texWidth, texHeight; + int texWidth, texHeight; // ... }; struct Primitive { PrimitiveType primitiveType = Triangle; // C++ 11 init + TextureData* tex = NULL; VertexOut v[3]; + int size[2]; }; struct Fragment { glm::vec3 color; + float z; // TODO: add new attributes to your Fragment // The attributes listed below might be useful, // but always feel free to modify on your own - // glm::vec3 eyePos; // eye space position used for shading - // glm::vec3 eyeNor; - // VertexAttributeTexcoord texcoord0; - // TextureData* dev_diffuseTex; - // ... + glm::vec3 eyePos; // eye space position used for shading + glm::vec3 eyeNor; + VertexAttributeTexcoord texcoord0; + TextureData* dev_diffuseTex = NULL; + int size[2]; + }; struct PrimitiveDevBufPointers { @@ -103,13 +124,29 @@ static std::map> mesh2Primitiv static int width = 0; static int height = 0; +static int ow = 0; +static int oh = 0; +static int tilesize = 16; +static int numTilesW, numTilesH, numTiles; static int totalNumPrimitives = 0; static Primitive *dev_primitives = NULL; static Fragment *dev_fragmentBuffer = NULL; static glm::vec3 *dev_framebuffer = NULL; +static glm::vec2 *dev_noise = NULL; + +static int *dev_count = NULL; +static int *dev_tile = NULL; +static float * dev_depth = NULL; // you might need this buffer when doing depth test -static int * dev_depth = NULL; // you might need this buffer when doing depth test + +__constant__ int mx[5][5] = { + { 1, 4, 7, 4, 1 }, + { 4,16,26,16,4 }, + { 7,26,41,26,7 }, + { 4,16,26,16,4 }, + { 1,4,7,4,1 } +}; /** * Kernel that writes the image to the OpenGL PBO directly. @@ -122,9 +159,24 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { if (x < w && y < h) { glm::vec3 color; - color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0; - color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0; - color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0; + #if SSAA + int it = int(SSAA); + for (int i = 0; i < it; i++) + for (int j = 0; j < it; j++) + { + int sx = it * x; + int sy = it * y; + int sw = it * w; + color.x += glm::clamp(image[sx + i + (sy + j) * sw].x, 0.0f, 1.0f) * 255.0; + color.y += glm::clamp(image[sx + i + (sy + j) * sw].y, 0.0f, 1.0f) * 255.0; + color.z += glm::clamp(image[sx + i + (sy + j) * sw].z, 0.0f, 1.0f) * 255.0; + } + color /= float(1 << it); + #else + color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0; + color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0; + color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0; + #endif // Each thread writes one pixel location in the texture (textel) pbo[index].w = 0; pbo[index].x = color.x; @@ -133,20 +185,186 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { } } -/** -* Writes fragment colors to the framebuffer -*/ +#define COL(C) (C / 255.0) +#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) + + +__host__ __device__ +int clamp(int v, int a, int b) +{ + return MIN(MAX(a, v), b); +} + + +__host__ __device__ +glm::vec3 getTexColor(TextureData* tex, int stride, int u, int v) +{ + int idx = (u + v * stride) * 3; + return glm::vec3(COL(tex[idx + 0]), + COL(tex[idx + 1]), + COL(tex[idx + 2])); +} + +template +__host__ __device__ +T lerp(float v, T a, T b) +{ + return a * (1.0f - v) + v * b; +} + +__host__ __device__ +glm::vec2 getRandom(glm::vec2 *noise, int u, int v, int randomSize, int screenSize) +{ + int x = MIN(randomSize * ((float)u / screenSize), 7); + int y = MIN(randomSize * ((float)v / screenSize), 7); + glm::vec2 r = noise[x + y * randomSize]; + return glm::normalize(r * 2.0f - glm::vec2(1.0f)); +} + +__host__ __device__ +float AO(Fragment *fb, int w, int h, glm::vec2 coord, glm::vec3 *buffer, int ww, int hh, glm::vec2 tcoord, glm::vec2 uv, glm::vec3 p, glm::vec3 cnorm) +{ + int gx = coord.x + uv.x; + int gy = coord.y + uv.y; + int x = tcoord.x + uv.x; + int y = tcoord.y + uv.y; + float scale = 1.0f; + float bias = 0.05f; + float ao_a = 5.0f; + if (gx < w && gy < h && gx >=0 && gy >= 0) { + int index; + glm::vec3 diff; + if (x < ww && y < hh && x >= 0 && y >= 0) { + index = x + (y * ww); + diff = buffer[index] - p; + } else { + index = gx + (gy * w); + diff = fb[index].eyePos - p; + } + float d = glm::length(diff) * scale; + if (d == 0) return 0; + glm::vec3 v = glm::normalize(diff); + float r = MAX(0.0f, glm::dot(cnorm, v) - bias) * (1.0f / (1.0f + d)) * ao_a; + return r; + } + return 0; +} + __global__ -void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { +void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer, glm::vec2 *noise) { + extern __shared__ glm::vec3 buffer[]; + int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; int index = x + (y * w); + + // Cache position to shared memory + buffer[threadIdx.x + blockDim.x * threadIdx.y] = fragmentBuffer[index].eyePos; + __syncthreads(); + + if (x < w && y < h) { + + auto fb = fragmentBuffer[index]; + auto out = glm::vec3(0); + auto col = fb.color; + auto nor = fb.eyeNor; + auto pos = buffer[threadIdx.x + blockDim.x * threadIdx.y]; + + glm::vec3 lightPos = glm::vec3(60, 60, 60); + glm::vec3 lightDir = glm::normalize(lightPos - pos); + float ambient = 0.2f; + float shininess = 32.0f; + float diffuse = 0; + float specular = 0; + + // Blinn Shading Model + // https://en.wikipedia.org/wiki/Blinn%E2%80%93Phong_shading_model + + #if BLINN + glm::vec3 viewDir = glm::normalize(-pos); + glm::vec3 halfDir = glm::normalize(lightDir + viewDir); + float specAngle = glm::max(glm::dot(halfDir, nor), 0.0f); + specular = glm::pow(specAngle, shininess); + diffuse = glm::max(0.0f, glm::dot(nor, lightDir)); + #else + diffuse = glm::max(0.0f, glm::dot(nor, lightDir)); + #endif + + if (fb.dev_diffuseTex != NULL) { + float texWidth = fb.size[0]; + float texHeight = fb.size[1]; + auto tex = fb.dev_diffuseTex; + auto texcoord = fb.texcoord0; + +#if !BILINEAR_INTERP + int u = texcoord.x * texWidth; + int v = texcoord.y * texHeight; + out = getTexColor(tex, texWidth, u, v); +#else + // Bilinear + float fx = texcoord.x * texWidth; + float fy = texcoord.y * texHeight; + int cx = clamp((int)fx, 0, texWidth - 1); + int cy = clamp((int)fy, 0, texHeight - 1); + float dx = fx - cx; + float dy = fy - cy; + auto x0y0 = getTexColor(tex, texWidth, cx + 0, cy + 0); + auto x1y0 = getTexColor(tex, texWidth, cx + 1, cy + 0); + auto x0y1 = getTexColor(tex, texWidth, cx + 0, cy + 1); + auto x1y1 = getTexColor(tex, texWidth, cx + 1, cy + 1); + out = lerp(dy, lerp(dx, x0y0, x1y0), lerp(dx, x0y1, x1y1)); +#endif + } + else { + out = col; + } - if (x < w && y < h) { - framebuffer[index] = fragmentBuffer[index].color; +// Toon Shader +#if TOON - // TODO: add your fragment shader code here + ambient = 0.1f; + specular = specular > 0.75f ? 1.0f : 0.0f; + if (diffuse < 0.1f) diffuse = 0.0f; + else if (diffuse < 0.5f) diffuse = 0.5f; + else diffuse = 0.8f; + out = out * diffuse + glm::vec3(1, 1, 1) * specular + out * ambient + glm::vec3(1, 1, 1) * specular; +#else + out = out * diffuse + glm::vec3(1, 1, 1) * specular + out * ambient + glm::vec3(1, 1, 1) * specular; +#endif + + +// Default SSAO: 4 samples per fragment (4 iterations) +#if SSAO + // SSAO + // https://www.gamedev.net/articles/programming/graphics/a-simple-and-practical-approach-to-ssao-r2753/ + + float z = -glm::abs((float)fb.z) / glm::abs(INT_MIN); + int iterations = 4; + float ao = 0.0f; + glm::vec2 v[4] = { glm::vec2(1,0), glm::vec2(-1,0), glm::vec2(0,1), glm::vec2(0,-1) }; + glm::vec2 rand = getRandom(noise, x, y, 8, w); + glm::vec2 xy(x, y); + + #pragma unroll + for (int i = 0; i < iterations; i++) + { + if (z < 0.00000001f) continue; + float sampleR = clamp(1.f / z, 0.0f, 10.0f) * 3.0f; + + glm::vec2 coord1 = glm::reflect(v[i], rand) * sampleR; + glm::vec2 coord2 = glm::vec2(coord1.x * 0.707f - coord1.y * 0.707f, coord1.x * 0.707f + coord1.y * 0.707f); + ao += AO(fragmentBuffer, w, h, xy, buffer, blockDim.x, blockDim.y, glm::vec2(threadIdx.x, threadIdx.y), coord1 * 0.25f, pos, nor); + ao += AO(fragmentBuffer, w, h, xy, buffer, blockDim.x, blockDim.y, glm::vec2(threadIdx.x, threadIdx.y), coord2 * 0.50f, pos, nor); + ao += AO(fragmentBuffer, w, h, xy, buffer, blockDim.x, blockDim.y, glm::vec2(threadIdx.x, threadIdx.y), coord1 * 0.75f, pos, nor); + ao += AO(fragmentBuffer, w, h, xy, buffer, blockDim.x, blockDim.y, glm::vec2(threadIdx.x, threadIdx.y), coord2 * 1.00f, pos, nor); + } + + ao /= (float)iterations * 4.0f; + out *= (1 - ao); +#endif + framebuffer[index] = out; } } @@ -154,8 +372,20 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { * Called once at the beginning of the program to allocate memory. */ void rasterizeInit(int w, int h) { - width = w; - height = h; + ow = w; + oh = h; + #if SSAA + width = w * SSAA; + height = h * SSAA; + #else + width = w; + height = h; + #endif + + numTilesW = width / tilesize; + numTilesH = height / tilesize; + numTiles = numTilesW * numTilesH; + cudaFree(dev_fragmentBuffer); cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment)); cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); @@ -164,13 +394,18 @@ void rasterizeInit(int w, int h) { cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3)); cudaFree(dev_depth); - cudaMalloc(&dev_depth, width * height * sizeof(int)); + cudaMalloc(&dev_depth, width * height * sizeof(float)); + + cudaFree(dev_count); + cudaMalloc(&dev_count, numTiles * sizeof(int)); + + cudaFree(dev_tile); checkCUDAError("rasterizeInit"); } __global__ -void initDepth(int w, int h, int * depth) +void initDepth(int w, int h, float * depth) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; @@ -178,7 +413,7 @@ void initDepth(int w, int h, int * depth) if (x < w && y < h) { int index = x + (y * w); - depth[index] = INT_MAX; + depth[index] = float(INT_MAX); } } @@ -544,6 +779,24 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { checkCUDAError("Set Texture Image data"); } } + /* + if (mat.values.find("specular") != mat.values.end()) { + std::string diffuseTexName = mat.values.at("diffuse").string_value; + if (scene.textures.find(diffuseTexName) != scene.textures.end()) { + const tinygltf::Texture &tex = scene.textures.at(diffuseTexName); + if (scene.images.find(tex.source) != scene.images.end()) { + const tinygltf::Image &image = scene.images.at(tex.source); + + size_t s = image.image.size() * sizeof(TextureData); + cudaMalloc(&dev_diffuseTex, s); + cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice); + + diffuseTexWidth = image.width; + diffuseTexHeight = image.height; + + checkCUDAError("Set Texture Image data"); + } + }*/ } // TODO: write your code for other materails @@ -618,6 +871,30 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { checkCUDAError("Free BufferView Device Mem"); } + // Generate Noise Texture for AO + glm::vec2 *noise; + noise = (glm::vec2 *)malloc(8 * 8 * sizeof(glm::vec2)); + for (int i = 0; i < 64; i++) + { + std::mt19937_64 rng; + uint64_t timeSeed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + std::seed_seq ss{ uint32_t(timeSeed & 0xffffffff), uint32_t(timeSeed >> 32) }; + rng.seed(ss); + std::uniform_real_distribution unif(0, 1); + + float x = unif(rng); + float y = unif(rng); + noise[i] = glm::vec2(x, y); + } + + cudaMalloc(&dev_noise, 8 * 8 * sizeof(glm::vec2)); + cudaMemcpy(dev_noise, noise, 8 * 8 * sizeof(glm::vec2), cudaMemcpyHostToDevice); + + delete noise; + + + // Init Tiles + cudaMalloc(&dev_tile, numTiles * totalNumPrimitives * sizeof(int)); } @@ -628,7 +905,7 @@ void _vertexTransformAndAssembly( int numVertices, PrimitiveDevBufPointers primitive, glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, - int width, int height) { + int width, int height, int t) { // vertex id int vid = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -638,6 +915,35 @@ void _vertexTransformAndAssembly( // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space // Then divide the pos by its w element to transform into NDC space // Finally transform x and y to viewport space + glm::vec4 vertex_position4 = glm::vec4(primitive.dev_position[vid], 1.0f); + +#if DEMO + glm::mat4 rot(1); + rot = glm::rotate(rot, t / 64.0f, glm::vec3(0, 1, 0)); + vertex_position4 = vertex_position4 * rot; +#endif + + glm::vec4 vertex_proj_pos = MVP * vertex_position4; + vertex_proj_pos = vertex_proj_pos / vertex_proj_pos.w; + + float x = (1.0f - vertex_proj_pos.x) * 0.5f * width; + float y = (1.0f - vertex_proj_pos.y) * 0.5f * height; + float z = vertex_proj_pos.z; + primitive.dev_verticesOut[vid].pos = glm::vec4(x, y, z, 1.0f); + primitive.dev_verticesOut[vid].eyePos = multiplyMV(MV, vertex_position4); + primitive.dev_verticesOut[vid].eyeNor = glm::normalize(MV_normal * primitive.dev_normal[vid]); + primitive.dev_verticesOut[vid].texHeight = primitive.diffuseTexHeight; + primitive.dev_verticesOut[vid].texWidth = primitive.diffuseTexWidth; + primitive.dev_verticesOut[vid].dev_diffuseTex = primitive.dev_diffuseTex; + if (primitive.dev_texcoord0 != NULL) + primitive.dev_verticesOut[vid].texcoord0 = primitive.dev_texcoord0[vid]; + + if(vid % 3 == 0) + primitive.dev_verticesOut[vid].col = glm::vec3(0.8, 0.8, 0.8); + if (vid % 3 == 1) + primitive.dev_verticesOut[vid].col = glm::vec3(0.8, 0.8, 0.8); + if (vid % 3 == 2) + primitive.dev_verticesOut[vid].col = glm::vec3(0.8, 0.8, 0.8); // TODO: Apply vertex assembly here // Assemble all attribute arraies into the primitive array @@ -660,30 +966,320 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_ // TODO: uncomment the following code for a start // This is primitive assembly for triangles - //int pid; // id for cur primitives vector - //if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { - // pid = iid / (int)primitive.primitiveType; - // dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] - // = primitive.dev_verticesOut[primitive.dev_indices[iid]]; - //} - - + int pid; // id for cur primitives vector + if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { + pid = iid / (int)primitive.primitiveType; + dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] + = primitive.dev_verticesOut[primitive.dev_indices[iid]]; + dev_primitives[pid + curPrimitiveBeginId].tex = primitive.dev_diffuseTex; + dev_primitives[pid + curPrimitiveBeginId].size[0] = primitive.diffuseTexWidth; + dev_primitives[pid + curPrimitiveBeginId].size[1] = primitive.diffuseTexHeight; + } // TODO: other primitive types (point, line) } } +// BackFace Culling +// https://en.wikipedia.org/wiki/Back-face_culling + +struct backface { + __host__ __device__ + bool operator()(const Primitive &prim) + { + glm::vec4 tri[3] = { prim.v[0].pos, prim.v[1].pos, prim.v[2].pos }; + float z = (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y) - (tri[1].y - tri[0].y) * (tri[2].x - tri[0].x); + return z < 0.0f; + } +}; + +__host__ __device__ +bool front(glm::vec3 *tri, int mode = 1) +{ + float z = (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y) - (tri[1].y - tri[0].y) * (tri[2].x - tri[0].x); + return mode ? z > 0.0f : z < 0.0f; +} + +__host__ __device__ +float sum(glm::vec3 v) +{ + return v.x + v.y + v.z; +} + +template +__host__ __device__ +T eval(glm::vec3 barycentric, T *val) +{ + return barycentric.x * val[0] + barycentric.y * val[1] + barycentric.z * val[2]; +} + +__host__ __device__ +float getCorrectedZ(const glm::vec3 barycentric, const float *z) +{ + return 1.0f / (barycentric.x / z[0] + barycentric.y / z[1] + barycentric.z / z[2]); +} +__host__ __device__ +void drawWireframe(Fragment * dev_fragmentBuffer, int w, int h, glm::vec3 *tris) +{ + +} + +// atomicMin for Float value +// method 'factomicMin' from user hyqneuron +// https://devtalk.nvidia.com/default/topic/492068/atomicmin-with-float/ + +__device__ +float fatomicMin(float *addr, float value) +{ + float old = *addr, assumed; + if (old <= value) return old; + do { + assumed = old; + old = atomicCAS((unsigned int*)addr, __float_as_int(assumed), __float_as_int(value)); + } while (old != assumed); + return old; +} + +__global__ +void _rasterization(Fragment * dev_fragmentBuffer, Primitive * dev_primitives, int nPrimatives, float * dev_depth, int width, int height) +{ + int pid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (pid < nPrimatives) { + Primitive p = dev_primitives[pid]; + + glm::vec3 tri[3] = {glm::vec3(p.v[0].pos), glm::vec3(p.v[1].pos), glm::vec3(p.v[2].pos)}; + glm::vec2 tex[3] = {p.v[0].texcoord0 , p.v[1].texcoord0 , p.v[2].texcoord0}; + glm::vec3 col[3] = {p.v[0].col , p.v[1].col , p.v[2].col}; + glm::vec3 eyenor[3] = { p.v[0].eyeNor, p.v[1].eyeNor, p.v[2].eyeNor }; + glm::vec3 eyepos[3] = { p.v[0].eyePos, p.v[1].eyePos, p.v[2].eyePos }; + + AABB bbox = getAABBForTriangle(tri); + int maxx = MIN(bbox.max.x, width - 1); + int maxy = MIN(bbox.max.y, height - 1); + int minx = MAX(bbox.min.x, 0); + int miny = MAX(bbox.min.y, 0); + + #if !SHADING + for(int x = minx; x <= maxx; x++) + for(int y = miny; y <= maxy; y++) + { + glm::vec2 xy = glm::vec2(x, y); + glm::vec3 barycentric = calculateBarycentricCoordinate(tri, xy); + if (isBarycentricCoordInBounds(barycentric)) { + float depth = getZAtCoordinate(barycentric, tri) * (float)INT_MIN; + int index = x + y * width; + fatomicMin(&dev_depth[index], depth); + if (depth == dev_depth[index]) { + dev_fragmentBuffer[index].z = depth; // xxxx + dev_fragmentBuffer[index].color = eval(barycentric, col); + dev_fragmentBuffer[index].eyeNor = glm::normalize(eval(barycentric, eyenor)); + dev_fragmentBuffer[index].eyePos = eval(barycentric, eyepos); + if (dev_primitives[pid].tex != NULL) { + dev_fragmentBuffer[index].dev_diffuseTex = dev_primitives[pid].tex; + dev_fragmentBuffer[index].size[0] = dev_primitives[pid].size[0]; + dev_fragmentBuffer[index].size[1] = dev_primitives[pid].size[1]; + + // Perspective Correted Texture Coord + float c[3] = { eyepos[0].z, eyepos[1].z, eyepos[2].z }; + glm::vec2 ttex[3] = { tex[0] / eyepos[0].z, tex[1] / eyepos[1].z, tex[2] / eyepos[2].z }; + float cz = getCorrectedZ(barycentric, c); + dev_fragmentBuffer[index].texcoord0 = cz * eval(barycentric, ttex); + + } + } + + } + } + + #else + for (int x = minx; x <= maxx; x++) + for (int y = miny; y <= maxy; y++) + { + glm::vec2 xy = glm::vec2(x, y); + glm::vec3 barycentric = calculateBarycentricCoordinate(tri, xy); + #if SHADING == 1 + bool mode = isBarycentricCoordOnBounds(barycentric); + #endif + #if SHADING == 2 + bool mode = isBarycentricCoordOnCorner(barycentric); + #endif + if (mode) { + float depth = getZAtCoordinate(barycentric, tri) * (float)INT_MIN; + int index = x + y * width; + fatomicMin(&dev_depth[index], depth); + if (depth == dev_depth[index]) { + dev_fragmentBuffer[index].z = depth; + #if SHADING == 1 + dev_fragmentBuffer[index].color = glm::vec3(0.1, 1, 0.1); + #endif + #if SHADING == 2 + dev_fragmentBuffer[index].color = glm::vec3(1, 0.1, 0.1); + #endif + dev_fragmentBuffer[index].eyeNor = glm::normalize(eval(barycentric, eyenor)); + dev_fragmentBuffer[index].eyePos = eval(barycentric, eyepos); + dev_fragmentBuffer[index].dev_diffuseTex = NULL; + } + + } + } + #endif + } +} + +__host__ __device__ +void tileBound(AABB bbox, int *minmax, glm::vec3 *tri, glm::vec2 tilewh, int tilesize) +{ + int w = tilewh.x; + int h = tilewh.y; + + minmax[0] = MIN(floorf(bbox.max.x + 0.5f) / tilesize, w - 1); + minmax[1] = MIN(floorf(bbox.max.y + 0.5f) / tilesize, h - 1); + minmax[2] = MAX((floorf(bbox.min.x + 0.5f) / tilesize) - 0, 0); + minmax[3] = MAX((floorf(bbox.min.y + 0.5f) / tilesize) - 0, 0); +} + + + +__global__ +void _decompose(Primitive * dev_primitives, int * count, int nPrimatives, int *tile, glm::vec2 tileWidthHeight, int tilesize) +{ + int pid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (pid < nPrimatives) { + Primitive p = dev_primitives[pid]; + glm::vec3 tri[3] = { glm::vec3(p.v[0].pos), glm::vec3(p.v[1].pos), glm::vec3(p.v[2].pos) }; + +#if BACKFACE_CULLING + if (!front(tri)) return; +#endif + + int bound[4]; + AABB bbox = getAABBForTriangle(tri); + tileBound(bbox, bound, tri, tileWidthHeight, tilesize); + for (int i = bound[2]; i <= bound[0]; i++) + for (int j = bound[3]; j <= bound[1]; j++) + { + int w = tileWidthHeight.x; + int h = tileWidthHeight.y; + int off = atomicAdd(&count[i + j * w], 1); + tile[(i + j * w) * nPrimatives + off] = pid; + } + } +} + + +__global__ +void _rasterizationWithTiles(Fragment * dev_fragmentBuffer, Primitive * dev_primitives, int nPrimatives, float * dev_depth, + int width, int height, int *count, int *tile, glm::vec2 tileWidthHeight, int tilesize) +{ + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + if (x >= 0 && y >= 0 && x < width && y < height) + { + int idx = blockIdx.x + blockIdx.y * tileWidthHeight.x; // tile index + int index = x + y * width; // fragment index + + for (int i = 0; i < count[idx]; i++) { + int pid = tile[idx * nPrimatives + i]; + //printf("pid: %i\n", count[idx]); + Primitive p = dev_primitives[pid]; + glm::vec3 tri[3] = { glm::vec3(p.v[0].pos), glm::vec3(p.v[1].pos), glm::vec3(p.v[2].pos) }; + glm::vec2 tex[3] = { p.v[0].texcoord0 , p.v[1].texcoord0 , p.v[2].texcoord0 }; + glm::vec3 col[3] = { p.v[0].col , p.v[1].col , p.v[2].col }; + glm::vec3 eyenor[3] = { p.v[0].eyeNor, p.v[1].eyeNor, p.v[2].eyeNor }; + glm::vec3 eyepos[3] = { p.v[0].eyePos, p.v[1].eyePos, p.v[2].eyePos }; + + glm::vec2 xy = glm::vec2(x, y); + glm::vec3 barycentric = calculateBarycentricCoordinate(tri, xy); + if (isBarycentricCoordInBounds(barycentric)) { + float depth = getZAtCoordinate(barycentric, tri) * (float)INT_MIN; + + fatomicMin(&dev_depth[index], depth); + if (depth == dev_depth[index]) { + dev_fragmentBuffer[index].z = depth; + dev_fragmentBuffer[index].color = eval(barycentric, col); + dev_fragmentBuffer[index].eyeNor = glm::normalize(eval(barycentric, eyenor)); + dev_fragmentBuffer[index].eyePos = eval(barycentric, eyepos); + if (dev_primitives[pid].tex != NULL) { + dev_fragmentBuffer[index].dev_diffuseTex = dev_primitives[pid].tex; + dev_fragmentBuffer[index].size[0] = dev_primitives[pid].size[0]; + dev_fragmentBuffer[index].size[1] = dev_primitives[pid].size[1]; + + // Perspective Correted Texture Coord + float c[3] = { eyepos[0].z, eyepos[1].z, eyepos[2].z }; + glm::vec2 ttex[3] = { tex[0] / eyepos[0].z, tex[1] / eyepos[1].z, tex[2] / eyepos[2].z }; + float cz = getCorrectedZ(barycentric, c); + dev_fragmentBuffer[index].texcoord0 = cz * eval(barycentric, ttex); + + } + } + + } + } + } + +} + + +// Blur based on ... +// https://www.slideshare.net/DarshanParsana/gaussian-image-blurring-in-cuda-c +__global__ +void Blur(int width, int height, glm::vec3 *dev_framebuffer, float a = 1.0f, bool th = false) { + const int R = 2; + const int bw = 32; + const int bh = 32; + + int x = blockIdx.x * (bw - 2 * R) + threadIdx.x; + int y = blockIdx.y * (bh - 2 * R) + threadIdx.y; + + x = clamp(x, 0, width - 1); + y = clamp(y, 0, height - 1); + + int idx = threadIdx.x + threadIdx.y * blockDim.x; + int tidx = x + y * width; + + __shared__ glm::vec3 sm[bw * bh]; + sm[idx] = dev_framebuffer[tidx]; + __syncthreads(); + + if (threadIdx.x >= R && threadIdx.x < (bw - R) && threadIdx.y >= R && threadIdx.y < (bh - R)) { + glm::vec3 sum(0); + for (int dy = -R; dy <= R; dy++) + for (int dx = -R; dx <= R; dx++) + { + glm::vec3 i = sm[idx + (dy * blockDim.y) + dx]; + sum += float(mx[dy][dx]) * i; + } + if(th) dev_framebuffer[tidx] = (a * ((sm[idx].r + sm[idx].g + sm[idx].b) / 3.0f) * sum / 128.0f) + (1 - a) * dev_framebuffer[tidx]; + else dev_framebuffer[tidx] = a * sum / 128.0f; + } +} + +__global__ +void copyToChannel(int width, int height, glm::vec3 *frame, float * arr, int channel) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= 0 && y >= 0 && x < width && y < height) + { + if (channel == 0) arr[x + y * width] = frame[x + y * width].r; + if (channel == 1) arr[x + y * width] = frame[x + y * width].g; + if (channel == 2) arr[x + y * width] = frame[x + y * width].b; + //printf("r: %f\n", frame[x + y * width].r); + } +} /** * Perform rasterization. */ -void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) { - int sideLength2d = 8; +void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal, float time) { + int sideLength2d = 16; dim3 blockSize2d(sideLength2d, sideLength2d); - dim3 blockCount2d((width - 1) / blockSize2d.x + 1, + dim3 blockCount2d((width - 1) / blockSize2d.x + 1, (height - 1) / blockSize2d.y + 1); + _time = time; + // Execute your rasterization pipeline here // (See README for rasterization pipeline outline.) @@ -702,7 +1298,7 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); - _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height); + _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height, _time); checkCUDAError("Vertex Processing"); cudaDeviceSynchronize(); _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> > @@ -721,16 +1317,60 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); initDepth << > >(width, height, dev_depth); - - // TODO: rasterize +#if !TILE_BASED_RENDER + + int restPrims = totalNumPrimitives; + #if BACKFACE_CULLING + using namespace thrust; + Primitive *_dev_prim = remove_if(device, dev_primitives, dev_primitives + totalNumPrimitives, backface()); + restPrims = MAX(_dev_prim - dev_primitives, 1); + #endif + + const int THREADS = 128; + dim3 threadsPerBlock(THREADS); + dim3 blocksPerGrid((restPrims + THREADS - 1) / THREADS); + cudaFuncSetCacheConfig(_rasterization, cudaFuncCachePreferL1); + _rasterization << > > (dev_fragmentBuffer, dev_primitives, restPrims, dev_depth, width, height); + checkCUDAError("rasteration"); + +#else + cudaMemset(dev_count, 0, numTiles * sizeof(int)); + dim3 tsize(128); + dim3 bsize((totalNumPrimitives + 128 - 1) / 128); + _decompose <<>>(dev_primitives, dev_count, totalNumPrimitives, dev_tile, glm::vec2(numTilesW, numTilesH), tilesize); + checkCUDAError("Decompose"); + + bsize = dim3(numTilesW, numTilesH, 1); + tsize = dim3(tilesize, tilesize, 1); + cudaFuncSetCacheConfig(_rasterizationWithTiles, cudaFuncCachePreferL1); + _rasterizationWithTiles << >>(dev_fragmentBuffer, dev_primitives, totalNumPrimitives, dev_depth, width, height, dev_count, dev_tile, glm::vec2(numTilesW, numTilesH), tilesize); + checkCUDAError("Rasterization With Tiles"); +#endif // Copy depthbuffer colors into framebuffer - render << > >(width, height, dev_fragmentBuffer, dev_framebuffer); + int shareMemSize = sideLength2d * sideLength2d * sizeof(glm::vec3); + cudaFuncSetCacheConfig(render, cudaFuncCachePreferL1); + render <<>>(width, height, dev_fragmentBuffer, dev_framebuffer, dev_noise); checkCUDAError("fragment shader"); + +#if BLOOM + + const int t = 32; + const int _t = 32; + dim3 _tsize(_t, _t, 1); + dim3 _bsize(width / t, height / t, 1); + Blur <<<_bsize, _tsize >>> (width, height, dev_framebuffer, 0.5f, true); + Blur << <_bsize, _tsize >> > (width, height, dev_framebuffer, 0.75f, true); + Blur <<<_bsize, _tsize >>> (width, height, dev_framebuffer, 1.25f); + //Blur << <_bsize, _tsize >> > (width, height, dev_framebuffer, 1.25f); + +#endif + + // Copy framebuffer into OpenGL buffer for OpenGL previewing - sendImageToPBO<<>>(pbo, width, height, dev_framebuffer); + sendImageToPBO<<>>(pbo, ow, oh, dev_framebuffer); checkCUDAError("copy render result to pbo"); } diff --git a/src/rasterize.h b/src/rasterize.h index 560aae9..cdd8dc9 100644 --- a/src/rasterize.h +++ b/src/rasterize.h @@ -16,9 +16,10 @@ namespace tinygltf{ class Scene; } +static float _time; void rasterizeInit(int width, int height); void rasterizeSetBuffers(const tinygltf::Scene & scene); -void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal); +void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal, float time); void rasterizeFree(); diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h index 46c701e..df1e0b7 100644 --- a/src/rasterizeTools.h +++ b/src/rasterizeTools.h @@ -88,6 +88,23 @@ bool isBarycentricCoordInBounds(const glm::vec3 barycentricCoord) { barycentricCoord.z >= 0.0 && barycentricCoord.z <= 1.0; } +__host__ __device__ static +bool isBarycentricCoordOnBounds(const glm::vec3 barycentricCoord) { + if (glm::abs(1 - glm::abs(barycentricCoord.x + barycentricCoord.y)) <= 0.07f && glm::abs(barycentricCoord.z) <= 0.07f) return true; + if (glm::abs(1 - glm::abs(barycentricCoord.y + barycentricCoord.z)) <= 0.07f && glm::abs(barycentricCoord.x) <= 0.07f) return true; + if (glm::abs(1 - glm::abs(barycentricCoord.z + barycentricCoord.x)) <= 0.07f && glm::abs(barycentricCoord.y) <= 0.07f) return true; + return false; +} + +__host__ __device__ static +bool isBarycentricCoordOnCorner(const glm::vec3 barycentricCoord) { + if (glm::abs(1 - glm::abs(barycentricCoord.x)) < 0.11f && glm::abs(barycentricCoord.y) < 0.11 && glm::abs(barycentricCoord.z) < 0.11) return true; + if (glm::abs(1 - glm::abs(barycentricCoord.y)) < 0.11f && glm::abs(barycentricCoord.x) < 0.11 && glm::abs(barycentricCoord.z) < 0.11) return true; + if (glm::abs(1 - glm::abs(barycentricCoord.z)) < 0.11f && glm::abs(barycentricCoord.y) < 0.11 && glm::abs(barycentricCoord.x) < 0.11) return true; + return false; +} + + // CHECKITOUT /** * For a given barycentric coordinate, compute the corresponding z position