From c1f8097576354f6f1e94267b964a6f2955e4def2 Mon Sep 17 00:00:00 2001 From: juanchuletas Date: Thu, 26 Feb 2026 17:01:19 -0600 Subject: [PATCH 1/3] feat: adding hot cold data split --- PBR/Render/include/cpu_renderer.hpp | 4 +- PBR/Render/include/cuda_renderer.hpp | 4 +- PBR/Render/include/icompute_renderer.hpp | 2 + PBR/Render/include/sycl_renderer.hpp | 2 + PBR/Render/shared/core_renderer.hpp | 4 +- PBR/Render/src/cpu_renderer.cpp | 2 +- PBR/Render/src/cuda_renderer.cu | 2 + PBR/Render/src/sycl_renderer.cpp | 2 + PBR/Space/space.cpp | 22 +++++++++- PBR/Space/space.hpp | 4 +- Triangle/triangle.hpp | 22 +++++++++- Vector/vector4.hpp | 55 ++++++++++++++++++++++++ gpu/data/device_pod.hpp | 3 +- gpu/include/fgt_cpu_device.hpp | 2 + 14 files changed, 119 insertions(+), 11 deletions(-) create mode 100644 Vector/vector4.hpp diff --git a/PBR/Render/include/cpu_renderer.hpp b/PBR/Render/include/cpu_renderer.hpp index 34901ce..fffe1e8 100644 --- a/PBR/Render/include/cpu_renderer.hpp +++ b/PBR/Render/include/cpu_renderer.hpp @@ -21,9 +21,11 @@ class CPU_Renderer : public IComputeRenderer{ } std::vector RenderScene( - int width, + int width, int height, const std::vector& triangleList, + const std::vector &hotTriangles, + const std::vector &coldTriangles, const std::vector &nodes, const std::vector &lightsList, const std::vector& emissiveTriIndices, diff --git a/PBR/Render/include/cuda_renderer.hpp b/PBR/Render/include/cuda_renderer.hpp index c0ddbf1..b9991fc 100644 --- a/PBR/Render/include/cuda_renderer.hpp +++ b/PBR/Render/include/cuda_renderer.hpp @@ -34,9 +34,11 @@ class CUDA_Renderer : public IComputeRenderer{ CUDA_Renderer() = default; std::vector RenderScene( - int width, + int width, int height, const std::vector& triangleList, + const std::vector &hotTriangles, + const std::vector &coldTriangles, const std::vector &nodes, const std::vector &lightsList, const std::vector& emissiveTriIndices, diff --git a/PBR/Render/include/icompute_renderer.hpp b/PBR/Render/include/icompute_renderer.hpp index 7d0576a..3ddf43d 100644 --- a/PBR/Render/include/icompute_renderer.hpp +++ b/PBR/Render/include/icompute_renderer.hpp @@ -16,6 +16,8 @@ class IComputeRenderer{ virtual std::vector RenderScene( int width, int height, const std::vector &triangleList, + const std::vector &hotTriangles, + const std::vector &coldTriangles, const std::vector &nodes, const std::vector &lightsList, const std::vector& emissiveTriIndices, diff --git a/PBR/Render/include/sycl_renderer.hpp b/PBR/Render/include/sycl_renderer.hpp index 04b1dc4..cb59c02 100644 --- a/PBR/Render/include/sycl_renderer.hpp +++ b/PBR/Render/include/sycl_renderer.hpp @@ -32,6 +32,8 @@ class SYCL_Renderer : public IComputeRenderer { int width, int height, const std::vector& triangleList, + const std::vector &hotTriangles, + const std::vector &coldTriangles, const std::vector& nodes, const std::vector& lightsList, const std::vector& emissiveTriIndices, diff --git a/PBR/Render/shared/core_renderer.hpp b/PBR/Render/shared/core_renderer.hpp index e1ef645..7af627a 100644 --- a/PBR/Render/shared/core_renderer.hpp +++ b/PBR/Render/shared/core_renderer.hpp @@ -103,7 +103,7 @@ fgt_device_gpu bool traceShadowRayBVH( int numNodes, float maxDist) // Only check hits closer than this (distance to light) { - int stack[64]; + int stack[32]; int stackPtr = 0; stack[stackPtr++] = 0; // Start with root @@ -148,7 +148,7 @@ fgt_device_gpu bool inline traceRayBVH( float closest = FLT_MAX; // Stack-based traversal (no recursion on GPU!) - int stack[64]; // Stack to track nodes to visit + int stack[32]; // Stack to track nodes to visit int stackPtr = 0; stack[stackPtr++] = 0; // Start with root node (index 0) diff --git a/PBR/Render/src/cpu_renderer.cpp b/PBR/Render/src/cpu_renderer.cpp index 3b94427..a0094e6 100644 --- a/PBR/Render/src/cpu_renderer.cpp +++ b/PBR/Render/src/cpu_renderer.cpp @@ -2,7 +2,7 @@ #include "PBR/PBRCamera/pbr_camera.hpp" #include "cpu_renderer.hpp" -std::vector CPU_Renderer::RenderScene(int width, int height, const std::vector& triangleList, const std::vector& nodes, const std::vector& lightsList, const std::vector& emissiveTriIndices ,const PBRCamera& camera, int samplesPerPixel,int sampleOffset) +std::vector CPU_Renderer::RenderScene(int width, int height, const std::vector& triangleList, const std::vector& hotTriangles, const std::vector& coldTriangles, const std::vector& nodes, const std::vector& lightsList, const std::vector& emissiveTriIndices, const PBRCamera& camera, int samplesPerPixel, int sampleOffset) { return std::vector(); } diff --git a/PBR/Render/src/cuda_renderer.cu b/PBR/Render/src/cuda_renderer.cu index 380bc4e..e4e5b4e 100644 --- a/PBR/Render/src/cuda_renderer.cu +++ b/PBR/Render/src/cuda_renderer.cu @@ -195,6 +195,8 @@ fgt_global void render_kernel( std::vector CUDA_Renderer::RenderScene( int width, int height, const std::vector& triangleList, + const std::vector &hotTriangles, + const std::vector &coldTriangles, const std::vector &nodes, const std::vector &lightsList, const std::vector& emissiveTriIndices, diff --git a/PBR/Render/src/sycl_renderer.cpp b/PBR/Render/src/sycl_renderer.cpp index 1aa2cc7..c5c63da 100644 --- a/PBR/Render/src/sycl_renderer.cpp +++ b/PBR/Render/src/sycl_renderer.cpp @@ -132,6 +132,8 @@ std::vector SYCL_Renderer::RenderScene( int width, int height, const std::vector& triangleList, + const std::vector &hotTriangles, + const std::vector &coldTriangles, const std::vector& nodes, const std::vector& lightsList, const std::vector& emissiveTriIndices, diff --git a/PBR/Space/space.cpp b/PBR/Space/space.cpp index 5a5fb0d..40cfcc8 100644 --- a/PBR/Space/space.cpp +++ b/PBR/Space/space.cpp @@ -81,7 +81,8 @@ std::vector Space::Render(const int width, const int height,int sam << " Framebuffer: " << frameMem / (1024.0 * 1024.0) << " MB\n" << " Total: " << totalMem / (1024.0 * 1024.0) << " MB\n"; std::vector frameBuffer = m_computeRenderer->RenderScene( - width, height, m_triangles, m_bvh_nodes, m_lights,m_emissiveTriIndices, + width, height, m_triangles, m_hotTriangles, m_coldTriangles, + m_bvh_nodes, m_lights, m_emissiveTriIndices, m_camera, m_samplesPerPixel, sampleOffset); return frameBuffer; @@ -137,6 +138,24 @@ void Space::sendTexturesToRender() } +void Space::buildGPUDataStructures() +{ + m_hotTriangles.resize(m_triangles.size()); + m_coldTriangles.resize(m_triangles.size()); + + for (size_t i = 0; i < m_triangles.size(); i++) { + m_hotTriangles[i].v0 = fungt::Vec4(m_triangles[i].v0); + m_hotTriangles[i].v1 = fungt::Vec4(m_triangles[i].v1); + m_hotTriangles[i].v2 = fungt::Vec4(m_triangles[i].v2); + + m_coldTriangles[i].n0 = fungt::Vec4(m_triangles[i].n0); + m_coldTriangles[i].n1 = fungt::Vec4(m_triangles[i].n1); + m_coldTriangles[i].n2 = fungt::Vec4(m_triangles[i].n2); + m_coldTriangles[i].material = m_triangles[i].material; + memcpy(m_coldTriangles[i].uvs, m_triangles[i].uvs, sizeof(m_triangles[i].uvs)); + } +} + void Space::InitComputeRenderBackend() { if (!m_computeRenderer) { @@ -404,6 +423,7 @@ void Space::BuildBVH() } } std::cout << "Emissive triangles: " << m_emissiveTriIndices.size() << std::endl; + buildGPUDataStructures(); //build hot/cold triangle arrays for GPU rendering } void Space::setSamples(int numOfSamples) diff --git a/PBR/Space/space.hpp b/PBR/Space/space.hpp index 4524982..35d8abd 100644 --- a/PBR/Space/space.hpp +++ b/PBR/Space/space.hpp @@ -35,6 +35,8 @@ class Space { PBRCamera m_camera; std::vector m_triangles; + std::vector m_hotTriangles; + std::vector m_coldTriangles; std::unique_ptr m_computeRenderer; std::vector m_lights; int m_samplesPerPixel = 16; @@ -44,7 +46,7 @@ class Space { std::vector m_emissiveTriIndices; void sendTexturesToRender(); - + void buildGPUDataStructures(); public: Space(); Space(std::vector& triangleList); diff --git a/Triangle/triangle.hpp b/Triangle/triangle.hpp index bc20ba4..75112ba 100644 --- a/Triangle/triangle.hpp +++ b/Triangle/triangle.hpp @@ -1,7 +1,25 @@ #if !defined(_TRIANGLE_H_) #define _TRIANGLE_H_ -#include "../../Vector/vector3.hpp" -#include "../gpu/data/device_pod.hpp" +#include "Vector/vector4.hpp" +#include "gpu/data/device_pod.hpp" + +namespace gpu{ + + struct TriangleGeometry { + fungt::Vec4 v0, v1, v2; // 48 bytes + }; + + struct TriangleShadingData { + fungt::Vec4 n0, n1, n2; // 48 bytes + MaterialData material; // 32 bytes + float uvs[3][2]; // 24 bytes + float _pad[2]; // 8 bytes + }; // total 112 bytes + +} + + + struct Triangle { fungt::Vec3 v0, v1, v2; fungt::Vec3 n0, n1,n2; diff --git a/Vector/vector4.hpp b/Vector/vector4.hpp new file mode 100644 index 0000000..f576f7e --- /dev/null +++ b/Vector/vector4.hpp @@ -0,0 +1,55 @@ +#if !defined(_VEC4_H_) +#define _VEC4_H_ +#include "gpu/include/fgt_cpu_device.hpp" +#include "vector3.hpp" + +namespace fungt{ + + + class Vec4 { + public: + float x, y, z, w; + + fgt_device Vec4(float x = 0, float y = 0, float z = 0, float w = 0) : x(x), y(y), z(z), w(w) {} + fgt_device Vec4(const Vec3& v, float w = 0.0f) : x(v.x), y(v.y), z(v.z), w(w) {} + fgt_device fungt::Vec3 xyz() const { return Vec3(x, y, z); } + + // Arithmetic with Vec4 + fgt_device Vec4 operator+(const Vec4& o) const { return Vec4(x + o.x, y + o.y, z + o.z, w + o.w); } + fgt_device Vec4 operator-(const Vec4& o) const { return Vec4(x - o.x, y - o.y, z - o.z, w - o.w); } + fgt_device Vec4 operator*(float s) const { return Vec4(x * s, y * s, z * s, w * s); } + fgt_device Vec4 operator/(float s) const { return Vec4(x / s, y / s, z / s, w / s); } + fgt_device Vec4& operator+=(const Vec4& o) { x += o.x; y += o.y; z += o.z; w += o.w; return *this; } + fgt_device Vec4& operator-=(const Vec4& o) { x -= o.x; y -= o.y; z -= o.z; w -= o.w; return *this; } + + // Scalar multiply from left + fgt_device friend Vec4 operator*(float s, const Vec4& v) { return Vec4(v.x * s, v.y * s, v.z * s, v.w * s); } + + // Dot and length — w ignored for geometric ops + fgt_device float dot(const Vec4& o) const { return x * o.x + y * o.y + z * o.z; } + fgt_device float length() const { return FGT_SQRT(x * x + y * y + z * z); } + fgt_device Vec4 normalize() const { + float len = length(); + if (len > 0) return Vec4(x / len, y / len, z / len, w); + return Vec4(0, 0, 0, w); + } + + // Cross product — w ignored + fgt_device Vec4 cross(const Vec4& o) const { + return Vec4( + y * o.z - z * o.y, + z * o.x - x * o.z, + x * o.y - y * o.x, + 0.0f + ); + } + + // Index access + fgt_device float operator[](int i) const { if (i == 0) return x; if (i == 1) return y; if (i == 2) return z; return w; } + fgt_device float& operator[](int i) { if (i == 0) return x; if (i == 1) return y; if (i == 2) return z; return w; } + }; + +} + + +#endif // _VEC4_H_ diff --git a/gpu/data/device_pod.hpp b/gpu/data/device_pod.hpp index caf0236..f0badf8 100644 --- a/gpu/data/device_pod.hpp +++ b/gpu/data/device_pod.hpp @@ -1,10 +1,9 @@ #if !defined(_DEVIVE_POD_H_) #define _DEVIVE_POD_H_ #include "../include/fgt_cpu_device.hpp" -struct MaterialData { +struct fgt_align(16) MaterialData { float baseColor[3]; // Albedo in linear space (e.g. {0.8, 0.8, 0.8}) float metallic; // 0 = dielectric, 1 = fully metallic - float roughness; // 0 = mirror-smooth, 1 = rough float reflectance; // F0 for dielectrics (typ. 0.04) float emission; // Intensity if the material emits light diff --git a/gpu/include/fgt_cpu_device.hpp b/gpu/include/fgt_cpu_device.hpp index ae17439..39350fe 100644 --- a/gpu/include/fgt_cpu_device.hpp +++ b/gpu/include/fgt_cpu_device.hpp @@ -23,6 +23,7 @@ #if defined(__KERNEL_CUDA__) #define fgt_device __host__ __device__ #define fgt_device_gpu __device__ +#define fgt_gpu_noinline __noinline__ #define fgt_device_forceinline __host__ __device__ __forceinline__ #define fgt_device_constant __constant__ #define fgt_global __global__ @@ -31,6 +32,7 @@ #define fgt_device inline #define fgt_device_gpu inline #define fgt_device_forceinline inline +#define fgt_gpu_noinline [[intel::noinline]] #define fgt_device_constant const #define fgt_global #define fgt_shared /* use local_accessor in SYCL kernels */ From b74c266aaf2fae4556d5face495a9e3a5238aa79 Mon Sep 17 00:00:00 2001 From: juanchuletas Date: Thu, 26 Feb 2026 18:31:48 -0600 Subject: [PATCH 2/3] feat: passing hot and cold triangle data to core renderer --- PBR/Intersection/intersection.hpp | 38 ++++++++++++++++++++++++++--- PBR/Render/shared/core_renderer.hpp | 37 ++++++++++++++-------------- Vector/vector4.hpp | 9 +++++-- 3 files changed, 60 insertions(+), 24 deletions(-) diff --git a/PBR/Intersection/intersection.hpp b/PBR/Intersection/intersection.hpp index f2c0702..9d3c385 100644 --- a/PBR/Intersection/intersection.hpp +++ b/PBR/Intersection/intersection.hpp @@ -1,9 +1,9 @@ #if !defined(_INTERSECTION_H_) #define _INTERSECTION_H_ -#include "../Ray/ray.hpp" -#include "../HitData/hit_data.hpp" -#include "../../Triangle/triangle.hpp" -#include "../BVH/aabb.hpp" +#include "Ray/ray.hpp" +#include "HitData/hit_data.hpp" +#include "Triangle/triangle.hpp" +#include "BVH/aabb.hpp" class Intersection{ @@ -53,6 +53,36 @@ class Intersection{ } return true; } + static fgt_device inline bool MollerTrumbore( + const fungt::Ray& ray, + const gpu::TriangleGeometry& tri, + float tMin, float tMax, + HitData& rec) + { + const float EPSILON = 1e-8f; + fungt::Vec3 edge1 = tri.v1 - tri.v0; + fungt::Vec3 edge2 = tri.v2 - tri.v0; + fungt::Vec3 h = ray.m_dir.cross(edge2); + float a = edge1.dot(h); + if (fabs(a) < EPSILON) return false; + + float f = 1.0f / a; + fungt::Vec3 s = ray.m_origin - tri.v0.xyz(); + float u = f * s.dot(h); + if (u < 0.0f || u > 1.0f) return false; + + fungt::Vec3 q = s.cross(edge1); + float v = f * ray.m_dir.dot(q); + if (v < 0.0f || u + v > 1.0f) return false; + + float t = f * edge2.dot(q); + if (t < tMin || t > tMax) return false; + + rec.dis = t; + rec.point = ray.at(t); + rec.bary = fungt::Vec3(1.0f - u - v, u, v); + return true; + } static fgt_device bool intersectAABB( const fungt::Ray& ray, const AABB& box, diff --git a/PBR/Render/shared/core_renderer.hpp b/PBR/Render/shared/core_renderer.hpp index 7af627a..03244f8 100644 --- a/PBR/Render/shared/core_renderer.hpp +++ b/PBR/Render/shared/core_renderer.hpp @@ -137,6 +137,8 @@ fgt_device_gpu bool traceShadowRayBVH( fgt_device_gpu bool inline traceRayBVH( const fungt::Ray& ray, const Triangle* tris, + const gpu::TriangleGeometry *hotTris, + const gpu::TriangleGeometry *coldTris, const BVHNode* bvhNodes, int numNodes, const TextureDeviceObject* textures, @@ -169,36 +171,35 @@ fgt_device_gpu bool inline traceRayBVH( int triIdx = node.firstTriIdx + i; HitData temp; - if (Intersection::MollerTrumbore(ray, tris[triIdx], 0.001f, closest, temp)) { + if (Intersection::MollerTrumbore(ray, hotTris[triIdx], 0.001f, closest, temp)) { hitSomething = true; closest = temp.dis; hit = temp; - // Calculate geometric normal - fungt::Vec3 e1 = tris[triIdx].v1 - tris[triIdx].v0; - fungt::Vec3 e2 = tris[triIdx].v2 - tris[triIdx].v0; + const gpu::TriangleGeometry& hot = hotTris[triIdx]; + const gpu::TriangleShadingData& cold = coldTris[triIdx]; + + const float bx = temp.bary.x; + const float by = temp.bary.y; + const float bz = temp.bary.z; + + // Vec4 direct subtraction — no .xyz() temporaries + fungt::Vec3 e1 = hot.v1 - hot.v0; + fungt::Vec3 e2 = hot.v2 - hot.v0; hit.geometricNormal = e1.cross(e2).normalize(); - // Interpolate shading normal - hit.normal = (tris[triIdx].n0 * temp.bary.x + - tris[triIdx].n1 * temp.bary.y + - tris[triIdx].n2 * temp.bary.z).normalize(); + // Reuse cached barycentrics + hit.normal = (cold.n0 * bx + cold.n1 * by + cold.n2 * bz).normalize(); - // Ensure normal faces same hemisphere - if (hit.normal.dot(hit.geometricNormal) < 0.0f) { + if (hit.normal.dot(hit.geometricNormal) < 0.0f) hit.normal = hit.normal * -1.0f; - } - hit.material = tris[triIdx].material; + hit.material = cold.material; // Texture sampling (if applicable) if (hit.material.baseColorTexIdx >= 0 && textures != nullptr) { - float u = tris[triIdx].uvs[0][0] * temp.bary.x + - tris[triIdx].uvs[1][0] * temp.bary.y + - tris[triIdx].uvs[2][0] * temp.bary.z; - float v = tris[triIdx].uvs[0][1] * temp.bary.x + - tris[triIdx].uvs[1][1] * temp.bary.y + - tris[triIdx].uvs[2][1] * temp.bary.z; + float u = cold.uvs[0][0] * bx + cold.uvs[1][0] * by + cold.uvs[2][0] * bz; + float v = cold.uvs[0][1] * bx + cold.uvs[1][1] * by + cold.uvs[2][1] * bz; fungt::Vec3 texColor = sampleTexture2D(textures[hit.material.baseColorTexIdx], u, v); texColor.x = powf(texColor.x, 2.2f); diff --git a/Vector/vector4.hpp b/Vector/vector4.hpp index f576f7e..a4764b2 100644 --- a/Vector/vector4.hpp +++ b/Vector/vector4.hpp @@ -17,12 +17,17 @@ namespace fungt{ // Arithmetic with Vec4 fgt_device Vec4 operator+(const Vec4& o) const { return Vec4(x + o.x, y + o.y, z + o.z, w + o.w); } fgt_device Vec4 operator-(const Vec4& o) const { return Vec4(x - o.x, y - o.y, z - o.z, w - o.w); } + fgt_device Vec3 operator-(const Vec4& o) const { + return Vec3(x - o.x, y - o.y, z - o.z); + } fgt_device Vec4 operator*(float s) const { return Vec4(x * s, y * s, z * s, w * s); } fgt_device Vec4 operator/(float s) const { return Vec4(x / s, y / s, z / s, w / s); } fgt_device Vec4& operator+=(const Vec4& o) { x += o.x; y += o.y; z += o.z; w += o.w; return *this; } fgt_device Vec4& operator-=(const Vec4& o) { x -= o.x; y -= o.y; z -= o.z; w -= o.w; return *this; } - - // Scalar multiply from left + fgt_device Vec3 operator*(float s) const { + return Vec3(x * s, y * s, z * s); + } + // Scalar multiply from left fgt_device friend Vec4 operator*(float s, const Vec4& v) { return Vec4(v.x * s, v.y * s, v.z * s, v.w * s); } // Dot and length — w ignored for geometric ops From cb8bff9103ecf40117c05215ebf17247c3d64b8c Mon Sep 17 00:00:00 2001 From: juanchuletas Date: Thu, 5 Mar 2026 19:59:49 -0600 Subject: [PATCH 3/3] feat: adding performance --- PBR/Intersection/intersection.hpp | 10 +- PBR/Render/include/cpu_renderer.hpp | 1 - PBR/Render/include/cuda_renderer.hpp | 1 - PBR/Render/include/icompute_renderer.hpp | 1 - PBR/Render/include/sycl_renderer.hpp | 1 - PBR/Render/shared/core_renderer.hpp | 68 ++++-- PBR/Render/src/cpu_renderer.cpp | 2 +- PBR/Render/src/cuda_renderer.cu | 41 ++-- PBR/Render/src/sycl_renderer.cpp | 261 +++++++++++------------ PBR/Space/space.cpp | 2 +- PBR/main/CMakeLists.txt | 4 +- Vector/vector4.hpp | 14 +- gpu/include/fgt_cpu_device.hpp | 3 + 13 files changed, 228 insertions(+), 181 deletions(-) diff --git a/PBR/Intersection/intersection.hpp b/PBR/Intersection/intersection.hpp index 9d3c385..581306b 100644 --- a/PBR/Intersection/intersection.hpp +++ b/PBR/Intersection/intersection.hpp @@ -1,9 +1,9 @@ #if !defined(_INTERSECTION_H_) #define _INTERSECTION_H_ -#include "Ray/ray.hpp" -#include "HitData/hit_data.hpp" +#include "PBR/Ray/ray.hpp" +#include "PBR/HitData/hit_data.hpp" #include "Triangle/triangle.hpp" -#include "BVH/aabb.hpp" +#include "PBR/BVH/aabb.hpp" class Intersection{ @@ -60,8 +60,8 @@ class Intersection{ HitData& rec) { const float EPSILON = 1e-8f; - fungt::Vec3 edge1 = tri.v1 - tri.v0; - fungt::Vec3 edge2 = tri.v2 - tri.v0; + fungt::Vec3 edge1 = fungt::sub(tri.v1, tri.v0); + fungt::Vec3 edge2 = fungt::sub(tri.v2, tri.v0); fungt::Vec3 h = ray.m_dir.cross(edge2); float a = edge1.dot(h); if (fabs(a) < EPSILON) return false; diff --git a/PBR/Render/include/cpu_renderer.hpp b/PBR/Render/include/cpu_renderer.hpp index fffe1e8..dd321f3 100644 --- a/PBR/Render/include/cpu_renderer.hpp +++ b/PBR/Render/include/cpu_renderer.hpp @@ -23,7 +23,6 @@ class CPU_Renderer : public IComputeRenderer{ std::vector RenderScene( int width, int height, - const std::vector& triangleList, const std::vector &hotTriangles, const std::vector &coldTriangles, const std::vector &nodes, diff --git a/PBR/Render/include/cuda_renderer.hpp b/PBR/Render/include/cuda_renderer.hpp index b9991fc..a1d7fe6 100644 --- a/PBR/Render/include/cuda_renderer.hpp +++ b/PBR/Render/include/cuda_renderer.hpp @@ -36,7 +36,6 @@ class CUDA_Renderer : public IComputeRenderer{ std::vector RenderScene( int width, int height, - const std::vector& triangleList, const std::vector &hotTriangles, const std::vector &coldTriangles, const std::vector &nodes, diff --git a/PBR/Render/include/icompute_renderer.hpp b/PBR/Render/include/icompute_renderer.hpp index 3ddf43d..48b1881 100644 --- a/PBR/Render/include/icompute_renderer.hpp +++ b/PBR/Render/include/icompute_renderer.hpp @@ -15,7 +15,6 @@ class IComputeRenderer{ virtual ~IComputeRenderer() = default; virtual std::vector RenderScene( int width, int height, - const std::vector &triangleList, const std::vector &hotTriangles, const std::vector &coldTriangles, const std::vector &nodes, diff --git a/PBR/Render/include/sycl_renderer.hpp b/PBR/Render/include/sycl_renderer.hpp index cb59c02..45a433b 100644 --- a/PBR/Render/include/sycl_renderer.hpp +++ b/PBR/Render/include/sycl_renderer.hpp @@ -31,7 +31,6 @@ class SYCL_Renderer : public IComputeRenderer { std::vector RenderScene( int width, int height, - const std::vector& triangleList, const std::vector &hotTriangles, const std::vector &coldTriangles, const std::vector& nodes, diff --git a/PBR/Render/shared/core_renderer.hpp b/PBR/Render/shared/core_renderer.hpp index 03244f8..245b6ce 100644 --- a/PBR/Render/shared/core_renderer.hpp +++ b/PBR/Render/shared/core_renderer.hpp @@ -52,6 +52,43 @@ fgt_device_gpu void sampleEmissiveLight( if (area < 1e-8f) area = 1e-8f; pdf = 1.0f / (numEmissiveTris * area); } +fgt_device_gpu void sampleEmissiveLight( + const gpu::TriangleGeometry* hotTris, + const gpu::TriangleShadingData* coldTris, + const int* emissiveTris, + int numEmissiveTris, + fungt::RNG& rng, + fungt::Vec3& lightPos, + fungt::Vec3& lightNormal, + fungt::Vec3& lightEmission, + float& pdf) +{ + if (numEmissiveTris == 0) { pdf = 0.0f; return; } + + uint32_t randInt = rng.nextU32(); + int idx = randInt % numEmissiveTris; + int triIdx = emissiveTris[idx]; + + const gpu::TriangleGeometry& hot = hotTris[triIdx]; + const gpu::TriangleShadingData& cold = coldTris[triIdx]; + + float r1 = rng.nextFloat(); + float r2 = rng.nextFloat(); + if (r1 + r2 > 1.0f) { r1 = 1.0f - r1; r2 = 1.0f - r2; } + float r3 = 1.0f - r1 - r2; + + lightPos = fungt::multiply(hot.v0, r1) + fungt::multiply(hot.v1, r2) + fungt::multiply(hot.v2, r3); + lightNormal = (fungt::multiply(cold.n0, r1) + fungt::multiply(cold.n1, r2) + fungt::multiply(cold.n2, r3)).normalize(); + lightEmission = fungt::Vec3(cold.material.baseColor[0], + cold.material.baseColor[1], + cold.material.baseColor[2]) * cold.material.emission; + + fungt::Vec3 edge1 = fungt::sub(hot.v1, hot.v0); + fungt::Vec3 edge2 = fungt::sub(hot.v2, hot.v0); + float area = 0.5f * edge1.cross(edge2).length(); + if (area < 1e-8f) area = 1e-8f; + pdf = 1.0f / (numEmissiveTris * area); +} fgt_device_gpu inline fungt::Vec3 sampleHemisphere(const fungt::Vec3& normal, fungt::RNG& fgtRNG) { float u = fgtRNG.nextFloat(); @@ -96,9 +133,9 @@ fgt_device fungt::Vec3 skyColor(const fungt::Ray& ray) { } // Shadow ray traversal - returns TRUE if anything blocks the ray // Unlike traceRayBVH, this exits immediately on ANY hit (no closest hit needed) -fgt_device_gpu bool traceShadowRayBVH( +fgt_device_gpu_noinline bool traceShadowRayBVH( const fungt::Ray& ray, - const Triangle* tris, + const gpu::TriangleGeometry * tris, const BVHNode* bvhNodes, int numNodes, float maxDist) // Only check hits closer than this (distance to light) @@ -134,11 +171,10 @@ fgt_device_gpu bool traceShadowRayBVH( return false; // Nothing blocked the ray } -fgt_device_gpu bool inline traceRayBVH( +fgt_device_gpu_noinline bool traceRayBVH( const fungt::Ray& ray, - const Triangle* tris, const gpu::TriangleGeometry *hotTris, - const gpu::TriangleGeometry *coldTris, + const gpu::TriangleShadingData *coldTris, const BVHNode* bvhNodes, int numNodes, const TextureDeviceObject* textures, @@ -184,13 +220,16 @@ fgt_device_gpu bool inline traceRayBVH( const float bz = temp.bary.z; // Vec4 direct subtraction — no .xyz() temporaries - fungt::Vec3 e1 = hot.v1 - hot.v0; - fungt::Vec3 e2 = hot.v2 - hot.v0; + // fungt::Vec3 e1 = hot.v1 - hot.v0; + // fungt::Vec3 e2 = hot.v2 - hot.v0; + fungt::Vec3 e1 = fungt::sub(hot.v1, hot.v0); + fungt::Vec3 e2 = fungt::sub(hot.v2, hot.v0); hit.geometricNormal = e1.cross(e2).normalize(); // Reuse cached barycentrics - hit.normal = (cold.n0 * bx + cold.n1 * by + cold.n2 * bz).normalize(); - + hit.normal = (fungt::multiply(cold.n0, bx) + + fungt::multiply(cold.n1, by) + + fungt::multiply(cold.n2, bz)).normalize(); if (hit.normal.dot(hit.geometricNormal) < 0.0f) hit.normal = hit.normal * -1.0f; @@ -227,7 +266,8 @@ fgt_device_gpu bool inline traceRayBVH( } fgt_device_gpu fungt::Vec3 pathTracer_CookTorrance( const fungt::Ray& initialRay, - const Triangle* tris, + const gpu::TriangleGeometry* hotTris, + const gpu::TriangleShadingData* coldTris, const BVHNode* nodes, const Light* lights, const int* emissiveTris, @@ -245,7 +285,7 @@ fgt_device_gpu fungt::Vec3 pathTracer_CookTorrance( for (int bounce = 0; bounce < 6; ++bounce) { HitData hit; - bool hitAny = traceRayBVH(currRay, tris, nodes, numOfNodes, textures, hit); + bool hitAny = traceRayBVH(currRay, hotTris, coldTris, nodes, numOfNodes, textures, hit); if (!hitAny) { radiance += throughput * skyColor(currRay); @@ -281,7 +321,7 @@ fgt_device_gpu fungt::Vec3 pathTracer_CookTorrance( // OPTIMIZED: Early-exit shadow ray fungt::Ray shadowRay(hit.point + hit.geometricNormal * 0.001f, L); - if (traceShadowRayBVH(shadowRay, tris, nodes, numOfNodes, dist)) { + if (traceShadowRayBVH(shadowRay, hotTris, nodes, numOfNodes, dist)) { continue; // Blocked } @@ -295,7 +335,7 @@ fgt_device_gpu fungt::Vec3 pathTracer_CookTorrance( fungt::Vec3 lightPos, lightNormal, lightEmission; float lightPdf; - sampleEmissiveLight(tris, emissiveTris, numOfEmissiveTris, fgtRng, + sampleEmissiveLight(hotTris,coldTris, emissiveTris, numOfEmissiveTris, fgtRng, lightPos, lightNormal, lightEmission, lightPdf); if (lightPdf > 0.0f) { @@ -311,7 +351,7 @@ fgt_device_gpu fungt::Vec3 pathTracer_CookTorrance( // OPTIMIZED: Early-exit shadow ray fungt::Ray shadowRay(hit.point + hit.geometricNormal * 0.001f, L); - bool occluded = traceShadowRayBVH(shadowRay, tris, nodes, numOfNodes, + bool occluded = traceShadowRayBVH(shadowRay, hotTris, nodes, numOfNodes, distToLight - 0.001f); if (!occluded) { diff --git a/PBR/Render/src/cpu_renderer.cpp b/PBR/Render/src/cpu_renderer.cpp index a0094e6..c4d0780 100644 --- a/PBR/Render/src/cpu_renderer.cpp +++ b/PBR/Render/src/cpu_renderer.cpp @@ -2,7 +2,7 @@ #include "PBR/PBRCamera/pbr_camera.hpp" #include "cpu_renderer.hpp" -std::vector CPU_Renderer::RenderScene(int width, int height, const std::vector& triangleList, const std::vector& hotTriangles, const std::vector& coldTriangles, const std::vector& nodes, const std::vector& lightsList, const std::vector& emissiveTriIndices, const PBRCamera& camera, int samplesPerPixel, int sampleOffset) +std::vector CPU_Renderer::RenderScene(int width, int height, const std::vector& hotTriangles, const std::vector& coldTriangles, const std::vector& nodes, const std::vector& lightsList, const std::vector& emissiveTriIndices, const PBRCamera& camera, int samplesPerPixel, int sampleOffset) { return std::vector(); } diff --git a/PBR/Render/src/cuda_renderer.cu b/PBR/Render/src/cuda_renderer.cu index e4e5b4e..7e5a06f 100644 --- a/PBR/Render/src/cuda_renderer.cu +++ b/PBR/Render/src/cuda_renderer.cu @@ -151,7 +151,8 @@ fgt_device fungt::Vec3 shadeNormal(const fungt::Vec3& normal) { } fgt_global void render_kernel( fungt::Vec3* framebuffer, - const Triangle* triangles, + const gpu::TriangleGeometry *hotTris, + const gpu::TriangleShadingData *coldTris, const BVHNode * nodes, const Light *lights, const int *emissiveTris, @@ -163,7 +164,7 @@ fgt_global void render_kernel( int numOfEmissiveTris, int width, int height, - PBRCamera cam, + const PBRCamera* cam, int samplesPerPixel, int seed ) { @@ -181,9 +182,9 @@ fgt_global void render_kernel( //float v = (y + randomFloat(&randomState)) / (height - 1); float u = (x + rng.nextFloat()) / (width - 1); float v = (y + rng.nextFloat()) / (height - 1); - fungt::Ray ray = cam.getRay(u, v); + fungt::Ray ray = cam->getRay(u, v); - pixel += pathTracer_CookTorrance(ray, triangles,nodes, lights, emissiveTris, + pixel += pathTracer_CookTorrance(ray, hotTris,coldTris,nodes, lights, emissiveTris, textures,numTextures, numOfTriangles, numOfNodes, numOfLights,numOfEmissiveTris,rng); } @@ -194,7 +195,6 @@ fgt_global void render_kernel( } std::vector CUDA_Renderer::RenderScene( int width, int height, - const std::vector& triangleList, const std::vector &hotTriangles, const std::vector &coldTriangles, const std::vector &nodes, @@ -225,10 +225,16 @@ std::vector CUDA_Renderer::RenderScene( CUDA_CHECK(cudaMemcpy(device_emissiveTris, emissiveTriIndices.data(), emissiveLightsSize, cudaMemcpyHostToDevice)); } - Triangle* device_Tlist = nullptr; - size_t TlistSize = triangleList.size() * sizeof(Triangle); - CUDA_CHECK(cudaMalloc(&device_Tlist, TlistSize)); - CUDA_CHECK(cudaMemcpy(device_Tlist, triangleList.data(), TlistSize, cudaMemcpyHostToDevice)); + //Memory allocation for hot and cold triangles: + gpu::TriangleGeometry *device_hTList = nullptr; + size_t hotTlistSize = hotTriangles.size() * sizeof(gpu::TriangleGeometry); + CUDA_CHECK(cudaMalloc(&device_hTList,hotTlistSize)); + CUDA_CHECK(cudaMemcpy(device_hTList,hotTriangles.data(),hotTlistSize,cudaMemcpyHostToDevice)); + + gpu::TriangleShadingData *devive_cTList = nullptr; + size_t coldTlistSize = coldTriangles.size() * sizeof(gpu::TriangleShadingData); + CUDA_CHECK(cudaMalloc(&devive_cTList,coldTlistSize)); + CUDA_CHECK(cudaMemcpy(devive_cTList,coldTriangles.data(),coldTlistSize,cudaMemcpyHostToDevice)); BVHNode* device_bvhNode = nullptr; size_t BvhNodeSize = nodes.size()*sizeof(BVHNode); @@ -255,23 +261,25 @@ std::vector CUDA_Renderer::RenderScene( else{ std::cout << "WARNING: CUDA Textures ptr is NUL " << std::endl; } - - + PBRCamera *dCamera = nullptr; + cudaMalloc(&dCamera, sizeof(PBRCamera)); + cudaMemcpy(dCamera, &camera, sizeof(PBRCamera), cudaMemcpyHostToDevice); render_kernel << > > ( device_buff, - device_Tlist, + device_hTList, + devive_cTList, device_bvhNode, device_lights, device_emissiveTris, m_textureObj, m_numTextures, - int(triangleList.size()), + int(coldTriangles.size()), int(nodes.size()), int(lightsList.size()), numEmissiveTris, width, height, - camera, + dCamera, samplesPerPixel, seed ); @@ -287,10 +295,11 @@ std::vector CUDA_Renderer::RenderScene( CUDA_CHECK(cudaFree(device_emissiveTris)); } CUDA_CHECK(cudaFree(device_buff)); - CUDA_CHECK(cudaFree(device_Tlist)); + CUDA_CHECK(cudaFree(device_hTList)); + CUDA_CHECK(cudaFree(devive_cTList)); CUDA_CHECK(cudaFree(device_bvhNode)); CUDA_CHECK(cudaFree(device_lights)); - + CUDA_CHECK(cudaFree(dCamera)); return framebuffer; } \ No newline at end of file diff --git a/PBR/Render/src/sycl_renderer.cpp b/PBR/Render/src/sycl_renderer.cpp index c5c63da..9fbf671 100644 --- a/PBR/Render/src/sycl_renderer.cpp +++ b/PBR/Render/src/sycl_renderer.cpp @@ -1,137 +1,131 @@ #include "sycl_renderer.hpp" -// fgt_device inline fungt::Vec3 skyColor(const fungt::Ray& ray) { -// return fungt::Vec3(0.4, 0.4f, 0.4); -// } - - -fgt_device_gpu fungt::Vec3 pathTracer_CookTorranceSYCL( - const fungt::Ray& initialRay, - const Triangle* tris, - const BVHNode* nodes, - const Light* lights, - const int* emissiveTris, - const syclexp::sampled_image_handle* textures, - int numTextures, - int numOfTriangles, - int numOfNodes, - int numOfLights, - int numEmissiveTris, - fungt::RNG& rng) -{ - fungt::Vec3 throughput(1.0f, 1.0f, 1.0f); - fungt::Vec3 radiance(0.0f, 0.0f, 0.0f); - fungt::Ray currRay = initialRay; - - for (int bounce = 0; bounce < 6; ++bounce) { - HitData hit; - bool hitAny = traceRayBVH(currRay, tris, nodes, numOfNodes, textures, hit); - - if (!hitAny) { - radiance += throughput * skyColor(currRay); - break; - } - - fungt::Vec3 N = hit.normal.normalize(); - fungt::Vec3 V = (currRay.m_dir * (-1.0f)).normalize(); - - fungt::Vec3 baseColor = fungt::Vec3( - hit.material.baseColor[0], - hit.material.baseColor[1], - hit.material.baseColor[2]); - - float metallic = fmaxf(0.0f, fminf(hit.material.metallic, 1.0f)); - float roughness = fmaxf(0.05f, fminf(hit.material.roughness, 1.0f)); - - fungt::Vec3 dielectricF0 = fungt::Vec3( - hit.material.reflectance, - hit.material.reflectance, - hit.material.reflectance); - fungt::Vec3 F0 = lerp(dielectricF0, baseColor, metallic); - - if (hit.material.emission > 0.0f) { - radiance += throughput * baseColor * hit.material.emission; - } - - fungt::Vec3 directLight(0.0f); - for (int l = 0; l < numOfLights; ++l) { - fungt::Vec3 toLight = lights[l].m_pos - hit.point; - float dist = toLight.length(); - fungt::Vec3 L = toLight / dist; - - fungt::Ray shadowRay(hit.point + hit.geometricNormal * 0.001f, L); - HitData temp; - bool occluded = traceRayBVH(shadowRay, tris, nodes, numOfNodes, textures, temp) && temp.dis < dist; - - if (occluded) continue; - - fungt::Vec3 lightRadiance = lights[l].m_intensity / (dist * dist + 1e-6f); - directLight += evaluateCookTorrance(N, V, L, hit.material, lightRadiance); - } - - radiance += throughput * directLight; - - //Emissive Triangles +// fgt_device_gpu fungt::Vec3 pathTracer_CookTorranceSYCL( +// const fungt::Ray& initialRay, +// const Triangle* tris, +// const BVHNode* nodes, +// const Light* lights, +// const int* emissiveTris, +// const syclexp::sampled_image_handle* textures, +// int numTextures, +// int numOfTriangles, +// int numOfNodes, +// int numOfLights, +// int numEmissiveTris, +// fungt::RNG& rng) +// { +// fungt::Vec3 throughput(1.0f, 1.0f, 1.0f); +// fungt::Vec3 radiance(0.0f, 0.0f, 0.0f); +// fungt::Ray currRay = initialRay; + +// for (int bounce = 0; bounce < 6; ++bounce) { +// HitData hit; +// bool hitAny = traceRayBVH(currRay, tris, nodes, numOfNodes, textures, hit); + +// if (!hitAny) { +// radiance += throughput * skyColor(currRay); +// break; +// } + +// fungt::Vec3 N = hit.normal.normalize(); +// fungt::Vec3 V = (currRay.m_dir * (-1.0f)).normalize(); + +// fungt::Vec3 baseColor = fungt::Vec3( +// hit.material.baseColor[0], +// hit.material.baseColor[1], +// hit.material.baseColor[2]); + +// float metallic = fmaxf(0.0f, fminf(hit.material.metallic, 1.0f)); +// float roughness = fmaxf(0.05f, fminf(hit.material.roughness, 1.0f)); + +// fungt::Vec3 dielectricF0 = fungt::Vec3( +// hit.material.reflectance, +// hit.material.reflectance, +// hit.material.reflectance); +// fungt::Vec3 F0 = lerp(dielectricF0, baseColor, metallic); + +// if (hit.material.emission > 0.0f) { +// radiance += throughput * baseColor * hit.material.emission; +// } + +// fungt::Vec3 directLight(0.0f); +// for (int l = 0; l < numOfLights; ++l) { +// fungt::Vec3 toLight = lights[l].m_pos - hit.point; +// float dist = toLight.length(); +// fungt::Vec3 L = toLight / dist; + +// fungt::Ray shadowRay(hit.point + hit.geometricNormal * 0.001f, L); +// HitData temp; +// bool occluded = traceRayBVH(shadowRay, tris, nodes, numOfNodes, textures, temp) && temp.dis < dist; + +// if (occluded) continue; + +// fungt::Vec3 lightRadiance = lights[l].m_intensity / (dist * dist + 1e-6f); +// directLight += evaluateCookTorrance(N, V, L, hit.material, lightRadiance); +// } + +// radiance += throughput * directLight; + +// //Emissive Triangles + + +// if (numEmissiveTris > 0) { +// fungt::Vec3 lightPos, lightNormal, lightEmission; +// float lightPdf; + +// sampleEmissiveLight(tris, emissiveTris, numEmissiveTris, rng, +// lightPos, lightNormal, lightEmission, lightPdf); + +// if (lightPdf > 0.0f) { +// fungt::Vec3 toLight = lightPos - hit.point; +// float distToLight = toLight.length(); +// fungt::Vec3 L = toLight / distToLight; + +// // Shadow ray to check visibility +// fungt::Ray shadowRay(hit.point + hit.geometricNormal * 0.001f, L); +// HitData shadowHit; +// bool visible = !traceRayBVH(shadowRay, tris, nodes, numOfNodes, textures, shadowHit) || +// shadowHit.dis > (distToLight - 0.001f); + +// if (visible) { +// float cosTheta = fmaxf(0.0f, N.dot(L)); +// float cosLight = fmaxf(0.0f, lightNormal.dot(L * -1.0f)); + +// if (cosTheta > 0.0f && cosLight > 0.0f) { +// // Evaluate Cook-Torrance BRDF for this light direction +// fungt::Vec3 emissiveLight = lightEmission / (distToLight * distToLight + 1e-6f); +// fungt::Vec3 neeContribution = evaluateCookTorrance(N, V, L, hit.material, emissiveLight); + +// // Geometric term for area light +// float geometryTerm = cosLight / lightPdf; + +// radiance += throughput * neeContribution * geometryTerm; +// } +// } +// } +// } + +// fungt::Vec3 newDir = sampleHemisphere(N, rng); + +// fungt::Vec3 avgF = F_Schlick(F0, fmaxf(V.dot(N), 0.0f)); +// fungt::Vec3 kD = (fungt::Vec3(1.0f, 1.0f, 1.0f) - avgF) * (1.0f - metallic); +// throughput = throughput * (kD * baseColor); + +// currRay = fungt::Ray(hit.point + N * 0.001f, newDir); +// if (bounce > 2) { +// float maxComponent = fmaxf(throughput.x, fmaxf(throughput.y, throughput.z)); +// float p = fminf(0.95f, maxComponent); +// if (rng.nextFloat() > p) break; +// throughput = throughput / p; +// } +// } - if (numEmissiveTris > 0) { - fungt::Vec3 lightPos, lightNormal, lightEmission; - float lightPdf; - - sampleEmissiveLight(tris, emissiveTris, numEmissiveTris, rng, - lightPos, lightNormal, lightEmission, lightPdf); - - if (lightPdf > 0.0f) { - fungt::Vec3 toLight = lightPos - hit.point; - float distToLight = toLight.length(); - fungt::Vec3 L = toLight / distToLight; - - // Shadow ray to check visibility - fungt::Ray shadowRay(hit.point + hit.geometricNormal * 0.001f, L); - HitData shadowHit; - bool visible = !traceRayBVH(shadowRay, tris, nodes, numOfNodes, textures, shadowHit) || - shadowHit.dis > (distToLight - 0.001f); - - if (visible) { - float cosTheta = fmaxf(0.0f, N.dot(L)); - float cosLight = fmaxf(0.0f, lightNormal.dot(L * -1.0f)); - - if (cosTheta > 0.0f && cosLight > 0.0f) { - // Evaluate Cook-Torrance BRDF for this light direction - fungt::Vec3 emissiveLight = lightEmission / (distToLight * distToLight + 1e-6f); - fungt::Vec3 neeContribution = evaluateCookTorrance(N, V, L, hit.material, emissiveLight); - - // Geometric term for area light - float geometryTerm = cosLight / lightPdf; - - radiance += throughput * neeContribution * geometryTerm; - } - } - } - } - - fungt::Vec3 newDir = sampleHemisphere(N, rng); - - fungt::Vec3 avgF = F_Schlick(F0, fmaxf(V.dot(N), 0.0f)); - fungt::Vec3 kD = (fungt::Vec3(1.0f, 1.0f, 1.0f) - avgF) * (1.0f - metallic); - throughput = throughput * (kD * baseColor); - - currRay = fungt::Ray(hit.point + N * 0.001f, newDir); - - if (bounce > 2) { - float maxComponent = fmaxf(throughput.x, fmaxf(throughput.y, throughput.z)); - float p = fminf(0.95f, maxComponent); - if (rng.nextFloat() > p) break; - throughput = throughput / p; - } - } - - return radiance; -} +// return radiance; +// } std::vector SYCL_Renderer::RenderScene( int width, int height, - const std::vector& triangleList, const std::vector &hotTriangles, const std::vector &coldTriangles, const std::vector& nodes, @@ -156,17 +150,19 @@ std::vector SYCL_Renderer::RenderScene( m_queue.memcpy(dev_emissiveTris, emissiveTriIndices.data(), numEmissiveTris * sizeof(int)); } - Triangle* dev_triList = sycl::malloc_device(triangleList.size(), m_queue); + gpu::TriangleGeometry* dev_hotTris = sycl::malloc_device(hotTriangles.size(), m_queue); + gpu::TriangleShadingData* dev_coldTris = sycl::malloc_device(coldTriangles.size(), m_queue); BVHNode* dev_bvhNode = sycl::malloc_device(nodes.size(), m_queue); Light* dev_lights = sycl::malloc_device(lightsList.size(), m_queue); fungt::Vec3* dev_buff = sycl::malloc_device(imageSize, m_queue); - m_queue.memcpy(dev_triList, triangleList.data(), triangleList.size() * sizeof(Triangle)); + m_queue.memcpy(dev_hotTris, hotTriangles.data(), hotTriangles.size() * sizeof(gpu::TriangleGeometry)); + m_queue.memcpy(dev_coldTris, coldTriangles.data(), coldTriangles.size() * sizeof(gpu::TriangleShadingData)); m_queue.memcpy(dev_bvhNode, nodes.data(), nodes.size() * sizeof(BVHNode)); m_queue.memcpy(dev_lights, lightsList.data(), lightsList.size() * sizeof(Light)); m_queue.wait(); - int numTriangles = triangleList.size(); + int numTriangles = hotTriangles.size(); int numNodes = nodes.size(); int numLights = lightsList.size(); int numTextures = m_numTextures; @@ -211,7 +207,7 @@ std::vector SYCL_Renderer::RenderScene( fungt::Ray ray = camera.getRay(u, v); fungt::Vec3 contribution = pathTracer_CookTorrance( - ray, dev_triList, dev_bvhNode, dev_lights, + ray, dev_hotTris, dev_coldTris, dev_bvhNode, dev_lights, dev_emissiveTris, textureHandles, numTextures, numTriangles, numNodes, numLights, numEmissiveTris, rng @@ -253,7 +249,8 @@ std::vector SYCL_Renderer::RenderScene( if (dev_emissiveTris) { sycl::free(dev_emissiveTris, m_queue); } - sycl::free(dev_triList, m_queue); + sycl::free(dev_hotTris, m_queue); + sycl::free(dev_coldTris, m_queue); sycl::free(dev_bvhNode, m_queue); sycl::free(dev_lights, m_queue); sycl::free(dev_buff, m_queue); diff --git a/PBR/Space/space.cpp b/PBR/Space/space.cpp index 40cfcc8..d69e8df 100644 --- a/PBR/Space/space.cpp +++ b/PBR/Space/space.cpp @@ -81,7 +81,7 @@ std::vector Space::Render(const int width, const int height,int sam << " Framebuffer: " << frameMem / (1024.0 * 1024.0) << " MB\n" << " Total: " << totalMem / (1024.0 * 1024.0) << " MB\n"; std::vector frameBuffer = m_computeRenderer->RenderScene( - width, height, m_triangles, m_hotTriangles, m_coldTriangles, + width, height, m_hotTriangles, m_coldTriangles, m_bvh_nodes, m_lights, m_emissiveTriIndices, m_camera, m_samplesPerPixel, sampleOffset); diff --git a/PBR/main/CMakeLists.txt b/PBR/main/CMakeLists.txt index 21a1b6a..f3c7cb2 100644 --- a/PBR/main/CMakeLists.txt +++ b/PBR/main/CMakeLists.txt @@ -196,7 +196,9 @@ if(FUNGT_USE_CUDA) ) add_library(pbr_cuda STATIC ${CUDA_FILES}) - + target_compile_options(pbr_cuda PRIVATE + $<$:--ptxas-options=-v> + ) target_include_directories(pbr_cuda PUBLIC ${FUNGT_INCLUDES} ${FUNLIB_DIR}/include diff --git a/Vector/vector4.hpp b/Vector/vector4.hpp index a4764b2..72c601f 100644 --- a/Vector/vector4.hpp +++ b/Vector/vector4.hpp @@ -17,16 +17,10 @@ namespace fungt{ // Arithmetic with Vec4 fgt_device Vec4 operator+(const Vec4& o) const { return Vec4(x + o.x, y + o.y, z + o.z, w + o.w); } fgt_device Vec4 operator-(const Vec4& o) const { return Vec4(x - o.x, y - o.y, z - o.z, w - o.w); } - fgt_device Vec3 operator-(const Vec4& o) const { - return Vec3(x - o.x, y - o.y, z - o.z); - } fgt_device Vec4 operator*(float s) const { return Vec4(x * s, y * s, z * s, w * s); } fgt_device Vec4 operator/(float s) const { return Vec4(x / s, y / s, z / s, w / s); } fgt_device Vec4& operator+=(const Vec4& o) { x += o.x; y += o.y; z += o.z; w += o.w; return *this; } fgt_device Vec4& operator-=(const Vec4& o) { x -= o.x; y -= o.y; z -= o.z; w -= o.w; return *this; } - fgt_device Vec3 operator*(float s) const { - return Vec3(x * s, y * s, z * s); - } // Scalar multiply from left fgt_device friend Vec4 operator*(float s, const Vec4& v) { return Vec4(v.x * s, v.y * s, v.z * s, v.w * s); } @@ -53,7 +47,13 @@ namespace fungt{ fgt_device float operator[](int i) const { if (i == 0) return x; if (i == 1) return y; if (i == 2) return z; return w; } fgt_device float& operator[](int i) { if (i == 0) return x; if (i == 1) return y; if (i == 2) return z; return w; } }; - + fgt_device inline fungt::Vec3 sub(const Vec4& a, const Vec4& b) { + return Vec3(a.x - b.x, a.y - b.y, a.z - b.z); + } + //Mutiply a Vec4 times a scalar, returning a Vec3 (w ignored) + fgt_device inline fungt::Vec3 multiply(const Vec4& v, float s) { + return Vec3(v.x * s, v.y * s, v.z * s); + } } diff --git a/gpu/include/fgt_cpu_device.hpp b/gpu/include/fgt_cpu_device.hpp index 39350fe..8e5b764 100644 --- a/gpu/include/fgt_cpu_device.hpp +++ b/gpu/include/fgt_cpu_device.hpp @@ -28,6 +28,7 @@ #define fgt_device_constant __constant__ #define fgt_global __global__ #define fgt_shared __shared__ +#define fgt_device_gpu_noinline __device__ __noinline__ #elif defined(__KERNEL_SYCL__) #define fgt_device inline #define fgt_device_gpu inline @@ -36,10 +37,12 @@ #define fgt_device_constant const #define fgt_global #define fgt_shared /* use local_accessor in SYCL kernels */ +#define fgt_device_gpu_noinline [[intel::noinline]] inline #elif defined(__KERNEL_CPU__) #define fgt_device #define fgt_device_gpu #define fgt_device_forceinline inline #define fgt_global #define fgt_shared +#define fgt_device_gpu_noinline inline #endif \ No newline at end of file