From e133aca6d270ce9793bfc379b69f19551f90bccd Mon Sep 17 00:00:00 2001 From: Tobias Reiter Date: Sat, 6 Dec 2025 10:53:08 +0100 Subject: [PATCH 1/6] Implement GPU double precision --- gpu/include/raygLaunchParams.hpp | 4 ++-- gpu/include/raygTrace.hpp | 12 ++++++------ gpu/pipelines/Particle.cuh | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/gpu/include/raygLaunchParams.hpp b/gpu/include/raygLaunchParams.hpp index a51fd7c..c0ed8ac 100644 --- a/gpu/include/raygLaunchParams.hpp +++ b/gpu/include/raygLaunchParams.hpp @@ -13,8 +13,8 @@ __both__ __forceinline__ unsigned callableIndex(unsigned p, CallableSlot s) { static_cast(s); } -struct LaunchParams { - float *resultBuffer; +template struct LaunchParams { + T *resultBuffer; float rayWeightThreshold = 0.1f; float tThreshold = 0.5f; diff --git a/gpu/include/raygTrace.hpp b/gpu/include/raygTrace.hpp index f694238..8c9d21e 100644 --- a/gpu/include/raygTrace.hpp +++ b/gpu/include/raygTrace.hpp @@ -91,8 +91,8 @@ template class Trace { } // Resize our cuda result buffer - resultBuffer.allocInit(launchParams.numElements * numFluxes_, float(0)); - launchParams.resultBuffer = (float *)resultBuffer.dPointer(); + resultBuffer.allocInit(launchParams.numElements * numFluxes_, T(0)); + launchParams.resultBuffer = (T *)resultBuffer.dPointer(); if (materialIdsBuffer_.sizeInBytes != 0) { launchParams.materialIds = (int *)materialIdsBuffer_.dPointer(); @@ -304,8 +304,8 @@ template class Trace { size_t getNumberOfRays() const { return numRays; } - std::vector getFlux(int particleIdx, int dataIdx, - int smoothingNeighbors = 0) { + std::vector getFlux(int particleIdx, int dataIdx, + int smoothingNeighbors = 0) { if (!resultsDownloaded) { results.resize(launchParams.numElements * numFluxes_); resultBuffer.download(results.data(), @@ -749,12 +749,12 @@ template class Trace { OptixShaderBindingTable sbt = {}; // launch parameters, on the host, constant for all particles - LaunchParams launchParams; + LaunchParams launchParams; std::vector launchParamsBuffers; // results Buffer CudaBuffer resultBuffer; - std::vector results; + std::vector results; rayInternal::KernelConfig config_; bool ignoreBoundary = false; diff --git a/gpu/pipelines/Particle.cuh b/gpu/pipelines/Particle.cuh index 0c58da1..03c91fb 100644 --- a/gpu/pipelines/Particle.cuh +++ b/gpu/pipelines/Particle.cuh @@ -6,7 +6,7 @@ #include #include -extern "C" __constant__ viennaray::gpu::LaunchParams launchParams; +extern "C" __constant__ viennaray::gpu::LaunchParams launchParams; // // --- Generic particle From aa57a372bd03250d0089276b0d9b363c264a46d7 Mon Sep 17 00:00:00 2001 From: reiter Date: Wed, 10 Dec 2025 10:30:02 +0100 Subject: [PATCH 2/6] Use double for flux --- CMakeLists.txt | 1 + gpu/include/raygLaunchParams.hpp | 4 ++-- gpu/include/raygTrace.hpp | 24 ++++++++++++------------ gpu/include/raygTraceDisk.hpp | 8 ++++---- gpu/include/raygTraceLine.hpp | 4 ++-- gpu/include/raygTraceTriangle.hpp | 4 ++-- gpu/include/raygTriangleGeometry.hpp | 2 +- gpu/kernels/normKernels.cu | 24 ++++++++++++------------ gpu/pipelines/Particle.cuh | 4 ++-- include/viennaray/rayUtil.hpp | 8 ++++---- 10 files changed, 42 insertions(+), 41 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3effc8d..3b62c8d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,7 @@ include("cmake/cpm.cmake") CPMAddPackage( NAME ViennaCore VERSION 1.7.3 + GIT_TAG main GIT_REPOSITORY "https://github.com/ViennaTools/ViennaCore" OPTIONS "VIENNACORE_USE_GPU ${VIENNARAY_USE_GPU}") diff --git a/gpu/include/raygLaunchParams.hpp b/gpu/include/raygLaunchParams.hpp index c0ed8ac..7f028c0 100644 --- a/gpu/include/raygLaunchParams.hpp +++ b/gpu/include/raygLaunchParams.hpp @@ -13,8 +13,8 @@ __both__ __forceinline__ unsigned callableIndex(unsigned p, CallableSlot s) { static_cast(s); } -template struct LaunchParams { - T *resultBuffer; +struct LaunchParams { + double *resultBuffer; float rayWeightThreshold = 0.1f; float tThreshold = 0.5f; diff --git a/gpu/include/raygTrace.hpp b/gpu/include/raygTrace.hpp index 8c9d21e..200e654 100644 --- a/gpu/include/raygTrace.hpp +++ b/gpu/include/raygTrace.hpp @@ -91,8 +91,8 @@ template class Trace { } // Resize our cuda result buffer - resultBuffer.allocInit(launchParams.numElements * numFluxes_, T(0)); - launchParams.resultBuffer = (T *)resultBuffer.dPointer(); + resultBuffer.allocInit(launchParams.numElements * numFluxes_, double(0)); + launchParams.resultBuffer = (double *)resultBuffer.dPointer(); if (materialIdsBuffer_.sizeInBytes != 0) { launchParams.materialIds = (int *)materialIdsBuffer_.dPointer(); @@ -111,8 +111,8 @@ template class Trace { launchParams.maxReflections = config_.maxReflections; launchParams.maxBoundaryHits = config_.maxBoundaryHits; - int numPointsPerDim = - static_cast(std::sqrt(static_cast(launchParams.numElements))); + int numPointsPerDim = static_cast( + std::sqrt(static_cast(launchParams.numElements))); if (config_.numRaysFixed > 0) { numPointsPerDim = 1; @@ -304,8 +304,8 @@ template class Trace { size_t getNumberOfRays() const { return numRays; } - std::vector getFlux(int particleIdx, int dataIdx, - int smoothingNeighbors = 0) { + std::vector getFlux(int particleIdx, int dataIdx, + int smoothingNeighbors = 0) { if (!resultsDownloaded) { results.resize(launchParams.numElements * numFluxes_); resultBuffer.download(results.data(), @@ -313,7 +313,7 @@ template class Trace { resultsDownloaded = true; } - std::vector flux(launchParams.numElements); + std::vector flux(launchParams.numElements); unsigned int offset = 0; for (size_t i = 0; i < particles_.size(); i++) { if (particleIdx > i) @@ -321,7 +321,7 @@ template class Trace { } offset = (offset + dataIdx) * launchParams.numElements; std::memcpy(flux.data(), results.data() + offset, - launchParams.numElements * sizeof(float)); + launchParams.numElements * sizeof(double)); if (smoothingNeighbors > 0) smoothFlux(flux, smoothingNeighbors); return flux; @@ -437,7 +437,7 @@ template class Trace { } } - virtual void smoothFlux(std::vector &flux, int smoothingNeighbors) {} + virtual void smoothFlux(std::vector &flux, int smoothingNeighbors) {} // To be implemented by derived classes virtual void normalizeResults() = 0; @@ -746,15 +746,15 @@ template class Trace { CudaBuffer hitgroupRecordBuffer; std::vector directCallablePGs; CudaBuffer directCallableRecordBuffer; - OptixShaderBindingTable sbt = {}; + OptixShaderBindingTable sbt{}; // launch parameters, on the host, constant for all particles - LaunchParams launchParams; + LaunchParams launchParams; std::vector launchParamsBuffers; // results Buffer CudaBuffer resultBuffer; - std::vector results; + std::vector results; rayInternal::KernelConfig config_; bool ignoreBoundary = false; diff --git a/gpu/include/raygTraceDisk.hpp b/gpu/include/raygTraceDisk.hpp index ee2c5f1..58efb7f 100644 --- a/gpu/include/raygTraceDisk.hpp +++ b/gpu/include/raygTraceDisk.hpp @@ -42,7 +42,7 @@ template class TraceDisk final : public Trace { this->ignoreBoundary, sourceOffset); } - void smoothFlux(std::vector &flux, int smoothingNeighbors) override { + void smoothFlux(std::vector &flux, int smoothingNeighbors) override { auto oldFlux = flux; const T requiredDistance = smoothingNeighbors * 2.0 * diskMesh.radius; PointNeighborhood @@ -66,9 +66,9 @@ template class TraceDisk final : public Trace { #pragma omp parallel for for (int idx = 0; idx < launchParams.numElements; idx++) { - float vv = oldFlux[idx]; + double vv = oldFlux[idx]; auto const &neighborhood = pointNeighborhood->getNeighborIndices(idx); - float sum = 1.f; + double sum = 1.0; auto const normal = diskMesh.normals[idx]; for (auto const &nbi : neighborhood) { auto nnormal = diskMesh.normals[nbi]; @@ -85,7 +85,7 @@ template class TraceDisk final : public Trace { void normalizeResults() override { assert(resultBuffer.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); - float sourceArea = 0.f; + double sourceArea = 0.0; if constexpr (D == 2) { sourceArea = (launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]); diff --git a/gpu/include/raygTraceLine.hpp b/gpu/include/raygTraceLine.hpp index aa58be6..1af6631 100644 --- a/gpu/include/raygTraceLine.hpp +++ b/gpu/include/raygTraceLine.hpp @@ -23,7 +23,7 @@ template class TraceLine final : public Trace { this->ignoreBoundary, sourceOffset); } - void smoothFlux(std::vector &flux, int numNeighbors) override { + void smoothFlux(std::vector &flux, int numNeighbors) override { // not implemented for line geometry } @@ -31,7 +31,7 @@ template class TraceLine final : public Trace { assert(resultBuffer.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); - float sourceArea = + double sourceArea = launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]; // calculate areas on host and send to device for now diff --git a/gpu/include/raygTraceTriangle.hpp b/gpu/include/raygTraceTriangle.hpp index b07a9ef..dd0f4cd 100644 --- a/gpu/include/raygTraceTriangle.hpp +++ b/gpu/include/raygTraceTriangle.hpp @@ -40,10 +40,10 @@ template class TraceTriangle final : public Trace { } } - void smoothFlux(std::vector &flux, int smoothingNeighbors) override {} + void smoothFlux(std::vector &flux, int smoothingNeighbors) override {} void normalizeResults() override { - float sourceArea = 0.f; + double sourceArea = 0.0; if constexpr (D == 2) { sourceArea = (launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]); diff --git a/gpu/include/raygTriangleGeometry.hpp b/gpu/include/raygTriangleGeometry.hpp index 0a5bec5..5027cc4 100644 --- a/gpu/include/raygTriangleGeometry.hpp +++ b/gpu/include/raygTriangleGeometry.hpp @@ -264,7 +264,7 @@ struct TriangleGeometry { #ifndef NDEBUG rayInternal::writeVTP(boundaryMesh, "triangleMesh_boundary.vtp", - std::vector()); + std::vector()); #endif return boundaryMesh; diff --git a/gpu/kernels/normKernels.cu b/gpu/kernels/normKernels.cu index f97e66e..eb55e01 100644 --- a/gpu/kernels/normKernels.cu +++ b/gpu/kernels/normKernels.cu @@ -4,9 +4,9 @@ #include extern "C" __global__ void normalize_surface_Triangle_f( - float *data, const viennacore::Vec3Df *vertex, + double *data, const viennacore::Vec3Df *vertex, const viennacore::Vec3D *index, const unsigned int numTriangles, - const float sourceArea, const size_t numRays, const int numData) { + const double sourceArea, const size_t numRays, const int numData) { using namespace viennacore; unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -18,16 +18,16 @@ extern "C" __global__ void normalize_surface_Triangle_f( const auto &C = vertex[elIdx[2]]; const auto area = Norm(CrossProduct(B - A, C - A)) / 2.f; if (area > 1e-6f) - data[tidx] *= sourceArea / (area * (float)numRays); + data[tidx] *= sourceArea / (area * (double)numRays); else - data[tidx] = 0.f; + data[tidx] = 0.0; } } extern "C" __global__ void normalize_surface_Triangle_f_2D( - float *data, const viennacore::Vec3Df *vertex, + double *data, const viennacore::Vec3Df *vertex, const viennacore::Vec3D *index, const unsigned int numTriangles, - const float sourceArea, const size_t numRays, const int numData) { + const double sourceArea, const size_t numRays, const int numData) { using namespace viennacore; unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -43,7 +43,7 @@ extern "C" __global__ void normalize_surface_Triangle_f_2D( else area = 0.5 * Norm(C - A); if (area > 1e-6f) - data[tidx] *= sourceArea / (area * (float)numRays); + data[tidx] *= sourceArea / (area * (double)numRays); else data[tidx] = 0.f; } @@ -71,9 +71,9 @@ extern "C" __global__ void normalize_surface_Triangle_d( } // Areas precomputed on the CPU -extern "C" __global__ void normalize_surface_Disk_f(float *data, float *areas, +extern "C" __global__ void normalize_surface_Disk_f(double *data, float *areas, const unsigned int numDisks, - float sourceArea, + const double sourceArea, const size_t numRays, const int numData) { unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; @@ -90,9 +90,9 @@ extern "C" __global__ void normalize_surface_Disk_f(float *data, float *areas, } // Areas precomputed on the CPU -extern "C" __global__ void normalize_surface_Line_f(float *data, float *areas, +extern "C" __global__ void normalize_surface_Line_f(double *data, float *areas, const unsigned int numLines, - float sourceArea, + const double sourceArea, const size_t numRays, const int numData) { unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; @@ -103,7 +103,7 @@ extern "C" __global__ void normalize_surface_Line_f(float *data, float *areas, // data[tidx] = area; if (area > 1e-5f) - data[tidx] *= sourceArea / (area * (float)numRays); + data[tidx] *= sourceArea / (area * (double)numRays); else data[tidx] = 0.f; } diff --git a/gpu/pipelines/Particle.cuh b/gpu/pipelines/Particle.cuh index 03c91fb..d09c2fe 100644 --- a/gpu/pipelines/Particle.cuh +++ b/gpu/pipelines/Particle.cuh @@ -6,7 +6,7 @@ #include #include -extern "C" __constant__ viennaray::gpu::LaunchParams launchParams; +extern "C" __constant__ viennaray::gpu::LaunchParams launchParams; // // --- Generic particle @@ -18,7 +18,7 @@ particleCollision(viennaray::gpu::PerRayData *prd) { atomicAdd(&launchParams .resultBuffer[viennaray::gpu::getIdxOffset(0, launchParams) + prd->primIDs[i]], - prd->rayWeight); + static_cast(prd->rayWeight)); } } diff --git a/include/viennaray/rayUtil.hpp b/include/viennaray/rayUtil.hpp index b87c9b1..684f151 100644 --- a/include/viennaray/rayUtil.hpp +++ b/include/viennaray/rayUtil.hpp @@ -410,10 +410,10 @@ void readMeshFromFile(const std::string &fileName, NumericType &gridDelta, dataFile.close(); } -template +template void writeVTK(const std::string &filename, const std::vector> &points, - const std::vector &flux) { + const std::vector &flux) { std::ofstream f(filename.c_str()); f << "# vtk DataFile Version 2.0" << std::endl; @@ -452,7 +452,7 @@ template void writeVTP(const std::string &filename, const std::vector> &points, const std::vector> &elements, - const std::vector &flux) { + const std::vector &flux) { std::ofstream f(filename.c_str()); if (!f.is_open()) return; @@ -555,7 +555,7 @@ void writeVTP(const std::string &filename, } inline void writeVTP(TriangleMesh const &mesh, const std::string &filename, - const std::vector &flux) { + const std::vector &flux) { writeVTP(filename, mesh.nodes, mesh.triangles, flux); } From e7bd486ca5a2701ec883f651f15c9754a999db1f Mon Sep 17 00:00:00 2001 From: reiter Date: Wed, 10 Dec 2025 11:13:31 +0100 Subject: [PATCH 3/6] allow changing GPU result type in CMake --- CMakeLists.txt | 6 +++ gpu/examples/trenchTriangles.cpp | 6 +-- gpu/include/raygLaunchParams.hpp | 8 +++- gpu/include/raygTrace.hpp | 20 +++++----- gpu/include/raygTraceDisk.hpp | 9 +++-- gpu/include/raygTraceLine.hpp | 4 +- gpu/include/raygTraceTriangle.hpp | 5 ++- gpu/kernels/normKernels.cu | 65 ++++++++++++------------------- gpu/pipelines/Particle.cuh | 2 +- gpu/tests/rngSeed/rngSeed.cpp | 2 +- include/viennaray/rayUtil.hpp | 4 +- 11 files changed, 65 insertions(+), 66 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3b62c8d..402bfa9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ option(EMBREE_RAY_MASK "Enable Embree Ray Masking" OFF) option(VIENNARAY_USE_GPU "Enable GPU support" OFF) option(VIENNARAY_USE_WDIST "Enable weighted distribution of ray weights" OFF) +option(VIENNARAY_GPU_DOUBLE_PRECISION "Use double precision on GPU" ON) option(VIENNARAY_BUILD_EXAMPLES "Build examples" OFF) option(VIENNARAY_BUILD_TESTS "Build tests" OFF) @@ -67,6 +68,11 @@ if(VIENNARAY_USE_WDIST) target_compile_definitions(${PROJECT_NAME} INTERFACE VIENNARAY_USE_WDIST) endif() +if(VIENNARAY_GPU_DOUBLE_PRECISION) + message(STATUS "[ViennaRay] Using double precision on GPU") + target_compile_definitions(${PROJECT_NAME} INTERFACE VIENNARAY_GPU_DOUBLE_PRECISION) +endif() + if(VIENNARAY_PRINT_PROGRESS) target_compile_definitions(${PROJECT_NAME} INTERFACE VIENNARAY_PRINT_PROGRESS) endif() diff --git a/gpu/examples/trenchTriangles.cpp b/gpu/examples/trenchTriangles.cpp index 8443246..829f026 100644 --- a/gpu/examples/trenchTriangles.cpp +++ b/gpu/examples/trenchTriangles.cpp @@ -45,7 +45,7 @@ int main(int argc, char **argv) { tracer.setMaterialIds(materialIds); tracer.setCallables("CallableWrapper", context->modulePath); tracer.setParticleCallableMap({pMap, cMap}); - tracer.setNumberOfRaysPerPoint(2000); + tracer.setNumberOfRaysPerPoint(5000); tracer.insertNextParticle(particle); tracer.prepareParticlePrograms(); @@ -58,9 +58,9 @@ int main(int argc, char **argv) { #endif tracer.apply(); - tracer.normalizeResults(); auto flux = tracer.getFlux(0, 0); + rayInternal::writeVTP("trenchTriangles_triMesh.vtp", mesh.nodes, mesh.triangles, flux); @@ -68,6 +68,4 @@ int main(int argc, char **argv) { rayCountBuffer.download(&rayCount, 1); std::cout << "Trace count: " << rayCount << std::endl; #endif - - context->destroy(); } diff --git a/gpu/include/raygLaunchParams.hpp b/gpu/include/raygLaunchParams.hpp index 7f028c0..0f0f4d8 100644 --- a/gpu/include/raygLaunchParams.hpp +++ b/gpu/include/raygLaunchParams.hpp @@ -8,13 +8,19 @@ namespace viennaray::gpu { +#ifdef VIENNARAY_GPU_DOUBLE_PRECISION +using ResultType = double; +#else +using ResultType = float; +#endif + __both__ __forceinline__ unsigned callableIndex(unsigned p, CallableSlot s) { return p * static_cast(CallableSlot::COUNT) + static_cast(s); } struct LaunchParams { - double *resultBuffer; + ResultType *resultBuffer; float rayWeightThreshold = 0.1f; float tThreshold = 0.5f; diff --git a/gpu/include/raygTrace.hpp b/gpu/include/raygTrace.hpp index 200e654..c2f938d 100644 --- a/gpu/include/raygTrace.hpp +++ b/gpu/include/raygTrace.hpp @@ -91,8 +91,9 @@ template class Trace { } // Resize our cuda result buffer - resultBuffer.allocInit(launchParams.numElements * numFluxes_, double(0)); - launchParams.resultBuffer = (double *)resultBuffer.dPointer(); + resultBuffer.allocInit(launchParams.numElements * numFluxes_, + ResultType(0)); + launchParams.resultBuffer = (ResultType *)resultBuffer.dPointer(); if (materialIdsBuffer_.sizeInBytes != 0) { launchParams.materialIds = (int *)materialIdsBuffer_.dPointer(); @@ -304,8 +305,8 @@ template class Trace { size_t getNumberOfRays() const { return numRays; } - std::vector getFlux(int particleIdx, int dataIdx, - int smoothingNeighbors = 0) { + std::vector getFlux(int particleIdx, int dataIdx, + int smoothingNeighbors = 0) { if (!resultsDownloaded) { results.resize(launchParams.numElements * numFluxes_); resultBuffer.download(results.data(), @@ -313,7 +314,7 @@ template class Trace { resultsDownloaded = true; } - std::vector flux(launchParams.numElements); + std::vector flux(launchParams.numElements); unsigned int offset = 0; for (size_t i = 0; i < particles_.size(); i++) { if (particleIdx > i) @@ -321,7 +322,7 @@ template class Trace { } offset = (offset + dataIdx) * launchParams.numElements; std::memcpy(flux.data(), results.data() + offset, - launchParams.numElements * sizeof(double)); + launchParams.numElements * sizeof(ResultType)); if (smoothingNeighbors > 0) smoothFlux(flux, smoothingNeighbors); return flux; @@ -437,7 +438,8 @@ template class Trace { } } - virtual void smoothFlux(std::vector &flux, int smoothingNeighbors) {} + virtual void smoothFlux(std::vector &flux, + int smoothingNeighbors) {} // To be implemented by derived classes virtual void normalizeResults() = 0; @@ -449,7 +451,7 @@ template class Trace { void initRayTracer() { launchParams.D = D; context_->addModule(normModuleName); - normKernelName.append(geometryType_ + "_f"); + normKernelName.append(geometryType_); // launchParamsBuffer.alloc(sizeof(launchParams)); // normKernelName.push_back(NumericType); } @@ -754,7 +756,7 @@ template class Trace { // results Buffer CudaBuffer resultBuffer; - std::vector results; + std::vector results; rayInternal::KernelConfig config_; bool ignoreBoundary = false; diff --git a/gpu/include/raygTraceDisk.hpp b/gpu/include/raygTraceDisk.hpp index 58efb7f..ed0044f 100644 --- a/gpu/include/raygTraceDisk.hpp +++ b/gpu/include/raygTraceDisk.hpp @@ -42,7 +42,8 @@ template class TraceDisk final : public Trace { this->ignoreBoundary, sourceOffset); } - void smoothFlux(std::vector &flux, int smoothingNeighbors) override { + void smoothFlux(std::vector &flux, + int smoothingNeighbors) override { auto oldFlux = flux; const T requiredDistance = smoothingNeighbors * 2.0 * diskMesh.radius; PointNeighborhood @@ -66,9 +67,9 @@ template class TraceDisk final : public Trace { #pragma omp parallel for for (int idx = 0; idx < launchParams.numElements; idx++) { - double vv = oldFlux[idx]; + ResultType vv = oldFlux[idx]; auto const &neighborhood = pointNeighborhood->getNeighborIndices(idx); - double sum = 1.0; + ResultType sum = 1.0; auto const normal = diskMesh.normals[idx]; for (auto const &nbi : neighborhood) { auto nnormal = diskMesh.normals[nbi]; @@ -85,7 +86,7 @@ template class TraceDisk final : public Trace { void normalizeResults() override { assert(resultBuffer.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); - double sourceArea = 0.0; + ResultType sourceArea = 0.0; if constexpr (D == 2) { sourceArea = (launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]); diff --git a/gpu/include/raygTraceLine.hpp b/gpu/include/raygTraceLine.hpp index 1af6631..8a77987 100644 --- a/gpu/include/raygTraceLine.hpp +++ b/gpu/include/raygTraceLine.hpp @@ -23,7 +23,7 @@ template class TraceLine final : public Trace { this->ignoreBoundary, sourceOffset); } - void smoothFlux(std::vector &flux, int numNeighbors) override { + void smoothFlux(std::vector &flux, int numNeighbors) override { // not implemented for line geometry } @@ -31,7 +31,7 @@ template class TraceLine final : public Trace { assert(resultBuffer.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); - double sourceArea = + ResultType sourceArea = launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]; // calculate areas on host and send to device for now diff --git a/gpu/include/raygTraceTriangle.hpp b/gpu/include/raygTraceTriangle.hpp index dd0f4cd..9710d51 100644 --- a/gpu/include/raygTraceTriangle.hpp +++ b/gpu/include/raygTraceTriangle.hpp @@ -40,10 +40,11 @@ template class TraceTriangle final : public Trace { } } - void smoothFlux(std::vector &flux, int smoothingNeighbors) override {} + void smoothFlux(std::vector &flux, + int smoothingNeighbors) override {} void normalizeResults() override { - double sourceArea = 0.0; + ResultType sourceArea = 0.0; if constexpr (D == 2) { sourceArea = (launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]); diff --git a/gpu/kernels/normKernels.cu b/gpu/kernels/normKernels.cu index eb55e01..63465a3 100644 --- a/gpu/kernels/normKernels.cu +++ b/gpu/kernels/normKernels.cu @@ -3,10 +3,16 @@ #include -extern "C" __global__ void normalize_surface_Triangle_f( - double *data, const viennacore::Vec3Df *vertex, +#ifdef VIENNARAY_GPU_DOUBLE_PRECISION +typedef double Real; +#else +typedef float Real; +#endif + +extern "C" __global__ void normalize_surface_Triangle( + Real *data, const viennacore::Vec3Df *vertex, const viennacore::Vec3D *index, const unsigned int numTriangles, - const double sourceArea, const size_t numRays, const int numData) { + const Real sourceArea, const size_t numRays, const int numData) { using namespace viennacore; unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -24,10 +30,10 @@ extern "C" __global__ void normalize_surface_Triangle_f( } } -extern "C" __global__ void normalize_surface_Triangle_f_2D( - double *data, const viennacore::Vec3Df *vertex, +extern "C" __global__ void normalize_surface_Triangle_2D( + Real *data, const viennacore::Vec3Df *vertex, const viennacore::Vec3D *index, const unsigned int numTriangles, - const double sourceArea, const size_t numRays, const int numData) { + const Real sourceArea, const size_t numRays, const int numData) { using namespace viennacore; unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -43,39 +49,18 @@ extern "C" __global__ void normalize_surface_Triangle_f_2D( else area = 0.5 * Norm(C - A); if (area > 1e-6f) - data[tidx] *= sourceArea / (area * (double)numRays); + data[tidx] *= sourceArea / (area * (Real)numRays); else data[tidx] = 0.f; } } -extern "C" __global__ void normalize_surface_Triangle_d( - double *data, const viennacore::Vec3Df *vertex, - const viennacore::Vec3D *index, const unsigned int numTriangles, - const double sourceArea, const size_t numRays, const int numData) { - using namespace viennacore; - unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int stride = blockDim.x * gridDim.x; - - for (; tidx < numTriangles * numData; tidx += stride) { - auto elIdx = index[tidx % numTriangles]; - const auto &A = vertex[elIdx[0]]; - const auto &B = vertex[elIdx[1]]; - const auto &C = vertex[elIdx[2]]; - const double area = Norm(CrossProduct(B - A, C - A)) / 2.; - if (area > 1e-8) - data[tidx] *= sourceArea / (area * (double)numRays); - else - data[tidx] = 0.; - } -} - // Areas precomputed on the CPU -extern "C" __global__ void normalize_surface_Disk_f(double *data, float *areas, - const unsigned int numDisks, - const double sourceArea, - const size_t numRays, - const int numData) { +extern "C" __global__ void normalize_surface_Disk(Real *data, float *areas, + const unsigned int numDisks, + const Real sourceArea, + const size_t numRays, + const int numData) { unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -83,18 +68,18 @@ extern "C" __global__ void normalize_surface_Disk_f(double *data, float *areas, float area = areas[tidx % numDisks]; if (area > 1e-5f) - data[tidx] *= sourceArea / (area * (float)numRays); + data[tidx] *= sourceArea / (area * (Real)numRays); else data[tidx] = 0.f; } } // Areas precomputed on the CPU -extern "C" __global__ void normalize_surface_Line_f(double *data, float *areas, - const unsigned int numLines, - const double sourceArea, - const size_t numRays, - const int numData) { +extern "C" __global__ void normalize_surface_Line(Real *data, float *areas, + const unsigned int numLines, + const Real sourceArea, + const size_t numRays, + const int numData) { unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -103,7 +88,7 @@ extern "C" __global__ void normalize_surface_Line_f(double *data, float *areas, // data[tidx] = area; if (area > 1e-5f) - data[tidx] *= sourceArea / (area * (double)numRays); + data[tidx] *= sourceArea / (area * (Real)numRays); else data[tidx] = 0.f; } diff --git a/gpu/pipelines/Particle.cuh b/gpu/pipelines/Particle.cuh index d09c2fe..ab9705f 100644 --- a/gpu/pipelines/Particle.cuh +++ b/gpu/pipelines/Particle.cuh @@ -18,7 +18,7 @@ particleCollision(viennaray::gpu::PerRayData *prd) { atomicAdd(&launchParams .resultBuffer[viennaray::gpu::getIdxOffset(0, launchParams) + prd->primIDs[i]], - static_cast(prd->rayWeight)); + static_cast(prd->rayWeight)); } } diff --git a/gpu/tests/rngSeed/rngSeed.cpp b/gpu/tests/rngSeed/rngSeed.cpp index 31952be..7c9ce98 100644 --- a/gpu/tests/rngSeed/rngSeed.cpp +++ b/gpu/tests/rngSeed/rngSeed.cpp @@ -22,7 +22,7 @@ int main() { {0, gpu::CallableSlot::REFLECTION, "__direct_callable__particleReflectionConstSticking"}}; - std::vector flux1, flux2; + std::vector flux1, flux2; { NumericType extent = 5; diff --git a/include/viennaray/rayUtil.hpp b/include/viennaray/rayUtil.hpp index 684f151..ea38e98 100644 --- a/include/viennaray/rayUtil.hpp +++ b/include/viennaray/rayUtil.hpp @@ -448,11 +448,11 @@ void writeVTK(const std::string &filename, f.close(); } -template +template void writeVTP(const std::string &filename, const std::vector> &points, const std::vector> &elements, - const std::vector &flux) { + const std::vector &flux) { std::ofstream f(filename.c_str()); if (!f.is_open()) return; From 27eb6dce06ae1345236766c402fb7f39ef1be537 Mon Sep 17 00:00:00 2001 From: reiter Date: Wed, 10 Dec 2025 13:43:59 +0100 Subject: [PATCH 4/6] Fix compile definition --- cmake/generate_ptx.cmake | 9 +++++++++ gpu/examples/trenchTriangles.cpp | 9 +++++---- gpu/include/raygTrace.hpp | 3 +-- gpu/include/raygTraceDisk.hpp | 2 +- gpu/include/raygTraceLine.hpp | 2 +- gpu/include/raygTraceTriangle.hpp | 2 +- gpu/kernels/normKernels.cu | 6 +++--- 7 files changed, 21 insertions(+), 12 deletions(-) diff --git a/cmake/generate_ptx.cmake b/cmake/generate_ptx.cmake index c28aa17..b79a790 100644 --- a/cmake/generate_ptx.cmake +++ b/cmake/generate_ptx.cmake @@ -11,6 +11,9 @@ function(generate_pipeline target_name generated_files) cuda_include_directories(${VIENNARAY_GPU_INCLUDE}) cuda_include_directories(${ViennaCore_SOURCE_DIR}/include/viennacore) add_compile_definitions(VIENNACORE_COMPILE_GPU) + if (VIENNARAY_GPU_DOUBLE_PRECISION) + add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION) + endif() # Generate OptiX IR files if enabled if(VIENNARAY_GENERATE_OPTIXIR) @@ -53,6 +56,9 @@ function(generate_kernel generated_files) cuda_include_directories(${VIENNARAY_GPU_INCLUDE}) cuda_include_directories(${OptiX_INCLUDE_DIR}) add_compile_definitions(VIENNACORE_COMPILE_GPU) + if (VIENNARAY_GPU_DOUBLE_PRECISION) + add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION) + endif() cuda_compile_ptx(generated_ptx_files ${cu_source_files} ${cmake_options} ${options}) @@ -98,6 +104,9 @@ function(add_gpu_executable target_name_base target_name_var) cuda_include_directories(${VIENNARAY_GPU_INCLUDE}) cuda_include_directories(${ViennaCore_SOURCE_DIR}/include/viennacore) add_compile_definitions(VIENNACORE_COMPILE_GPU) + if (VIENNARAY_GPU_DOUBLE_PRECISION) + add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION) + endif() # Create CUDA kernels cuda_wrap_srcs( diff --git a/gpu/examples/trenchTriangles.cpp b/gpu/examples/trenchTriangles.cpp index 829f026..aee4afb 100644 --- a/gpu/examples/trenchTriangles.cpp +++ b/gpu/examples/trenchTriangles.cpp @@ -1,5 +1,6 @@ #include +#include #include // #define COUNT_RAYS @@ -29,7 +30,7 @@ int main(int argc, char **argv) { gpu::Particle particle; particle.name = "Particle"; - particle.sticking = 0.1f; + particle.sticking = 1.f; particle.dataLabels = {"particleFlux"}; particle.materialSticking[7] = 0.1f; particle.materialSticking[1] = 1.0f; @@ -38,7 +39,7 @@ int main(int argc, char **argv) { std::vector cMap = { {0, gpu::CallableSlot::COLLISION, "__direct_callable__particleCollision"}, {0, gpu::CallableSlot::REFLECTION, - "__direct_callable__particleReflection"}}; + "__direct_callable__particleReflectionConstSticking"}}; gpu::TraceTriangle tracer(context); tracer.setGeometry(mesh); @@ -61,8 +62,8 @@ int main(int argc, char **argv) { tracer.normalizeResults(); auto flux = tracer.getFlux(0, 0); - rayInternal::writeVTP("trenchTriangles_triMesh.vtp", mesh.nodes, - mesh.triangles, flux); + rayInternal::writeVTP( + "trenchTriangles_triMesh.vtp", mesh.nodes, mesh.triangles, flux); #ifdef COUNT_RAYS rayCountBuffer.download(&rayCount, 1); diff --git a/gpu/include/raygTrace.hpp b/gpu/include/raygTrace.hpp index c2f938d..2425011 100644 --- a/gpu/include/raygTrace.hpp +++ b/gpu/include/raygTrace.hpp @@ -187,8 +187,7 @@ template class Trace { launchParams.source.customDirectionBasis = true; } - launchParamsBuffers[i].alloc(sizeof(launchParams)); - launchParamsBuffers[i].upload(&launchParams, 1); + launchParamsBuffers[i].allocUploadSingle(launchParams); CUDA_CHECK(StreamCreate(&streams[i])); } diff --git a/gpu/include/raygTraceDisk.hpp b/gpu/include/raygTraceDisk.hpp index ed0044f..e2c6183 100644 --- a/gpu/include/raygTraceDisk.hpp +++ b/gpu/include/raygTraceDisk.hpp @@ -86,7 +86,7 @@ template class TraceDisk final : public Trace { void normalizeResults() override { assert(resultBuffer.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); - ResultType sourceArea = 0.0; + double sourceArea = 0.0; if constexpr (D == 2) { sourceArea = (launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]); diff --git a/gpu/include/raygTraceLine.hpp b/gpu/include/raygTraceLine.hpp index 8a77987..acc9d3a 100644 --- a/gpu/include/raygTraceLine.hpp +++ b/gpu/include/raygTraceLine.hpp @@ -31,7 +31,7 @@ template class TraceLine final : public Trace { assert(resultBuffer.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); - ResultType sourceArea = + double sourceArea = launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]; // calculate areas on host and send to device for now diff --git a/gpu/include/raygTraceTriangle.hpp b/gpu/include/raygTraceTriangle.hpp index 9710d51..59441a5 100644 --- a/gpu/include/raygTraceTriangle.hpp +++ b/gpu/include/raygTraceTriangle.hpp @@ -44,7 +44,7 @@ template class TraceTriangle final : public Trace { int smoothingNeighbors) override {} void normalizeResults() override { - ResultType sourceArea = 0.0; + double sourceArea = 0.0; if constexpr (D == 2) { sourceArea = (launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]); diff --git a/gpu/kernels/normKernels.cu b/gpu/kernels/normKernels.cu index 63465a3..e5883e0 100644 --- a/gpu/kernels/normKernels.cu +++ b/gpu/kernels/normKernels.cu @@ -12,7 +12,7 @@ typedef float Real; extern "C" __global__ void normalize_surface_Triangle( Real *data, const viennacore::Vec3Df *vertex, const viennacore::Vec3D *index, const unsigned int numTriangles, - const Real sourceArea, const size_t numRays, const int numData) { + const double sourceArea, const size_t numRays, const int numData) { using namespace viennacore; unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -33,7 +33,7 @@ extern "C" __global__ void normalize_surface_Triangle( extern "C" __global__ void normalize_surface_Triangle_2D( Real *data, const viennacore::Vec3Df *vertex, const viennacore::Vec3D *index, const unsigned int numTriangles, - const Real sourceArea, const size_t numRays, const int numData) { + const double sourceArea, const size_t numRays, const int numData) { using namespace viennacore; unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -58,7 +58,7 @@ extern "C" __global__ void normalize_surface_Triangle_2D( // Areas precomputed on the CPU extern "C" __global__ void normalize_surface_Disk(Real *data, float *areas, const unsigned int numDisks, - const Real sourceArea, + const double sourceArea, const size_t numRays, const int numData) { unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; From 9baa7045e02911c0339645f3fa400bb656bb473f Mon Sep 17 00:00:00 2001 From: reiter Date: Wed, 10 Dec 2025 14:25:06 +0100 Subject: [PATCH 5/6] Bump viennacore --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 402bfa9..81c5904 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,8 +98,7 @@ include("cmake/cpm.cmake") CPMAddPackage( NAME ViennaCore - VERSION 1.7.3 - GIT_TAG main + VERSION 1.7.4 GIT_REPOSITORY "https://github.com/ViennaTools/ViennaCore" OPTIONS "VIENNACORE_USE_GPU ${VIENNARAY_USE_GPU}") From 42e95b594334e864f9656dab272aa74ed2cad705 Mon Sep 17 00:00:00 2001 From: reiter Date: Wed, 10 Dec 2025 14:27:19 +0100 Subject: [PATCH 6/6] Format --- cmake/generate_ptx.cmake | 6 +++--- gpu/tests/rngSeed/CMakeLists.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/generate_ptx.cmake b/cmake/generate_ptx.cmake index b79a790..3abd48b 100644 --- a/cmake/generate_ptx.cmake +++ b/cmake/generate_ptx.cmake @@ -11,7 +11,7 @@ function(generate_pipeline target_name generated_files) cuda_include_directories(${VIENNARAY_GPU_INCLUDE}) cuda_include_directories(${ViennaCore_SOURCE_DIR}/include/viennacore) add_compile_definitions(VIENNACORE_COMPILE_GPU) - if (VIENNARAY_GPU_DOUBLE_PRECISION) + if(VIENNARAY_GPU_DOUBLE_PRECISION) add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION) endif() @@ -56,7 +56,7 @@ function(generate_kernel generated_files) cuda_include_directories(${VIENNARAY_GPU_INCLUDE}) cuda_include_directories(${OptiX_INCLUDE_DIR}) add_compile_definitions(VIENNACORE_COMPILE_GPU) - if (VIENNARAY_GPU_DOUBLE_PRECISION) + if(VIENNARAY_GPU_DOUBLE_PRECISION) add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION) endif() @@ -104,7 +104,7 @@ function(add_gpu_executable target_name_base target_name_var) cuda_include_directories(${VIENNARAY_GPU_INCLUDE}) cuda_include_directories(${ViennaCore_SOURCE_DIR}/include/viennacore) add_compile_definitions(VIENNACORE_COMPILE_GPU) - if (VIENNARAY_GPU_DOUBLE_PRECISION) + if(VIENNARAY_GPU_DOUBLE_PRECISION) add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION) endif() diff --git a/gpu/tests/rngSeed/CMakeLists.txt b/gpu/tests/rngSeed/CMakeLists.txt index 05b00f2..d47d5d5 100644 --- a/gpu/tests/rngSeed/CMakeLists.txt +++ b/gpu/tests/rngSeed/CMakeLists.txt @@ -6,4 +6,4 @@ add_dependencies(ViennaRay-GPU_Tests ${PROJECT_NAME}) add_test(NAME ${PROJECT_NAME} COMMAND $) configure_file(${CMAKE_SOURCE_DIR}/examples/triangle3D/trenchMesh.dat - ${CMAKE_CURRENT_BINARY_DIR}/trenchMesh.dat COPYONLY) \ No newline at end of file + ${CMAKE_CURRENT_BINARY_DIR}/trenchMesh.dat COPYONLY)