ViennaTools · tobre1 · Dec 10, 2025 · Dec 6, 2025 · Dec 10, 2025 · Dec 10, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -13,6 +13,7 @@ option(EMBREE_RAY_MASK "Enable Embree Ray Masking" OFF)
 
 option(VIENNARAY_USE_GPU "Enable GPU support" OFF)
 option(VIENNARAY_USE_WDIST "Enable weighted distribution of ray weights" OFF)
+option(VIENNARAY_GPU_DOUBLE_PRECISION "Use double precision on GPU" ON)
 
 option(VIENNARAY_BUILD_EXAMPLES "Build examples" OFF)
 option(VIENNARAY_BUILD_TESTS "Build tests" OFF)
@@ -67,6 +68,11 @@ if(VIENNARAY_USE_WDIST)
   target_compile_definitions(${PROJECT_NAME} INTERFACE VIENNARAY_USE_WDIST)
 endif()
 
+if(VIENNARAY_GPU_DOUBLE_PRECISION)
+  message(STATUS "[ViennaRay] Using double precision on GPU")
+  target_compile_definitions(${PROJECT_NAME} INTERFACE VIENNARAY_GPU_DOUBLE_PRECISION)
+endif()
+
 if(VIENNARAY_PRINT_PROGRESS)
   target_compile_definitions(${PROJECT_NAME} INTERFACE VIENNARAY_PRINT_PROGRESS)
 endif()
@@ -92,7 +98,7 @@ include("cmake/cpm.cmake")
 
 CPMAddPackage(
   NAME ViennaCore
-  VERSION 1.7.3
+  VERSION 1.7.4
   GIT_REPOSITORY "https://github.com/ViennaTools/ViennaCore"
   OPTIONS "VIENNACORE_USE_GPU ${VIENNARAY_USE_GPU}")
 

diff --git a/cmake/generate_ptx.cmake b/cmake/generate_ptx.cmake
@@ -11,6 +11,9 @@ function(generate_pipeline target_name generated_files)
   cuda_include_directories(${VIENNARAY_GPU_INCLUDE})
   cuda_include_directories(${ViennaCore_SOURCE_DIR}/include/viennacore)
   add_compile_definitions(VIENNACORE_COMPILE_GPU)
+  if(VIENNARAY_GPU_DOUBLE_PRECISION)
+    add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION)
+  endif()
 
   # Generate OptiX IR files if enabled
   if(VIENNARAY_GENERATE_OPTIXIR)
@@ -53,6 +56,9 @@ function(generate_kernel generated_files)
   cuda_include_directories(${VIENNARAY_GPU_INCLUDE})
   cuda_include_directories(${OptiX_INCLUDE_DIR})
   add_compile_definitions(VIENNACORE_COMPILE_GPU)
+  if(VIENNARAY_GPU_DOUBLE_PRECISION)
+    add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION)
+  endif()
 
   cuda_compile_ptx(generated_ptx_files ${cu_source_files} ${cmake_options} ${options})
 
@@ -98,6 +104,9 @@ function(add_gpu_executable target_name_base target_name_var)
   cuda_include_directories(${VIENNARAY_GPU_INCLUDE})
   cuda_include_directories(${ViennaCore_SOURCE_DIR}/include/viennacore)
   add_compile_definitions(VIENNACORE_COMPILE_GPU)
+  if(VIENNARAY_GPU_DOUBLE_PRECISION)
+    add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION)
+  endif()
 
   # Create CUDA kernels
   cuda_wrap_srcs(

diff --git a/gpu/examples/trenchTriangles.cpp b/gpu/examples/trenchTriangles.cpp
@@ -1,5 +1,6 @@
 #include <raygTraceTriangle.hpp>
 
+#include <cuda_runtime.h>
 #include <omp.h>
 
 // #define COUNT_RAYS
@@ -29,7 +30,7 @@ int main(int argc, char **argv) {
 
   gpu::Particle<NumericType> particle;
   particle.name = "Particle";
-  particle.sticking = 0.1f;
+  particle.sticking = 1.f;
   particle.dataLabels = {"particleFlux"};
   particle.materialSticking[7] = 0.1f;
   particle.materialSticking[1] = 1.0f;
@@ -38,14 +39,14 @@ int main(int argc, char **argv) {
   std::vector<gpu::CallableConfig> cMap = {
       {0, gpu::CallableSlot::COLLISION, "__direct_callable__particleCollision"},
       {0, gpu::CallableSlot::REFLECTION,
-       "__direct_callable__particleReflection"}};
+       "__direct_callable__particleReflectionConstSticking"}};
 
   gpu::TraceTriangle<NumericType, D> tracer(context);
   tracer.setGeometry(mesh);
   tracer.setMaterialIds(materialIds);
   tracer.setCallables("CallableWrapper", context->modulePath);
   tracer.setParticleCallableMap({pMap, cMap});
-  tracer.setNumberOfRaysPerPoint(2000);
+  tracer.setNumberOfRaysPerPoint(5000);
   tracer.insertNextParticle(particle);
   tracer.prepareParticlePrograms();
 
@@ -58,16 +59,14 @@ int main(int argc, char **argv) {
 #endif
 
   tracer.apply();
-
   tracer.normalizeResults();
   auto flux = tracer.getFlux(0, 0);
-  rayInternal::writeVTP<float, D>("trenchTriangles_triMesh.vtp", mesh.nodes,
-                                  mesh.triangles, flux);
+
+  rayInternal::writeVTP<float, D, gpu::ResultType>(
+      "trenchTriangles_triMesh.vtp", mesh.nodes, mesh.triangles, flux);
 
 #ifdef COUNT_RAYS
   rayCountBuffer.download(&rayCount, 1);
   std::cout << "Trace count: " << rayCount << std::endl;
 #endif
-
-  context->destroy();
 }
diff --git a/gpu/include/raygLaunchParams.hpp b/gpu/include/raygLaunchParams.hpp
@@ -8,13 +8,19 @@
 
 namespace viennaray::gpu {
 
+#ifdef VIENNARAY_GPU_DOUBLE_PRECISION
+using ResultType = double;
+#else
+using ResultType = float;
+#endif
+
 __both__ __forceinline__ unsigned callableIndex(unsigned p, CallableSlot s) {
   return p * static_cast<unsigned>(CallableSlot::COUNT) +
          static_cast<unsigned>(s);
 }
 
 struct LaunchParams {
-  float *resultBuffer;
+  ResultType *resultBuffer;
 
   float rayWeightThreshold = 0.1f;
   float tThreshold = 0.5f;

diff --git a/gpu/include/raygTrace.hpp b/gpu/include/raygTrace.hpp
@@ -91,8 +91,9 @@ template <class T, int D> class Trace {
     }
 
     // Resize our cuda result buffer
-    resultBuffer.allocInit(launchParams.numElements * numFluxes_, float(0));
-    launchParams.resultBuffer = (float *)resultBuffer.dPointer();
+    resultBuffer.allocInit(launchParams.numElements * numFluxes_,
+                           ResultType(0));
+    launchParams.resultBuffer = (ResultType *)resultBuffer.dPointer();
 
     if (materialIdsBuffer_.sizeInBytes != 0) {
       launchParams.materialIds = (int *)materialIdsBuffer_.dPointer();
@@ -111,8 +112,8 @@ template <class T, int D> class Trace {
     launchParams.maxReflections = config_.maxReflections;
     launchParams.maxBoundaryHits = config_.maxBoundaryHits;
 
-    int numPointsPerDim =
-        static_cast<int>(std::sqrt(static_cast<T>(launchParams.numElements)));
+    int numPointsPerDim = static_cast<int>(
+        std::sqrt(static_cast<double>(launchParams.numElements)));
 
     if (config_.numRaysFixed > 0) {
       numPointsPerDim = 1;
@@ -186,8 +187,7 @@ template <class T, int D> class Trace {
         launchParams.source.customDirectionBasis = true;
       }
 
-      launchParamsBuffers[i].alloc(sizeof(launchParams));
-      launchParamsBuffers[i].upload(&launchParams, 1);
+      launchParamsBuffers[i].allocUploadSingle(launchParams);
 
       CUDA_CHECK(StreamCreate(&streams[i]));
     }
@@ -304,24 +304,24 @@ template <class T, int D> class Trace {
 
   size_t getNumberOfRays() const { return numRays; }
 
-  std::vector<float> getFlux(int particleIdx, int dataIdx,
-                             int smoothingNeighbors = 0) {
+  std::vector<ResultType> getFlux(int particleIdx, int dataIdx,
+                                  int smoothingNeighbors = 0) {
     if (!resultsDownloaded) {
       results.resize(launchParams.numElements * numFluxes_);
       resultBuffer.download(results.data(),
                             launchParams.numElements * numFluxes_);
       resultsDownloaded = true;
     }
 
-    std::vector<float> flux(launchParams.numElements);
+    std::vector<ResultType> flux(launchParams.numElements);
     unsigned int offset = 0;
     for (size_t i = 0; i < particles_.size(); i++) {
       if (particleIdx > i)
         offset += particles_[i].dataLabels.size();
     }
     offset = (offset + dataIdx) * launchParams.numElements;
     std::memcpy(flux.data(), results.data() + offset,
-                launchParams.numElements * sizeof(float));
+                launchParams.numElements * sizeof(ResultType));
     if (smoothingNeighbors > 0)
       smoothFlux(flux, smoothingNeighbors);
     return flux;
@@ -437,7 +437,8 @@ template <class T, int D> class Trace {
     }
   }
 
-  virtual void smoothFlux(std::vector<float> &flux, int smoothingNeighbors) {}
+  virtual void smoothFlux(std::vector<ResultType> &flux,
+                          int smoothingNeighbors) {}
 
   // To be implemented by derived classes
   virtual void normalizeResults() = 0;
@@ -449,7 +450,7 @@ template <class T, int D> class Trace {
   void initRayTracer() {
     launchParams.D = D;
     context_->addModule(normModuleName);
-    normKernelName.append(geometryType_ + "_f");
+    normKernelName.append(geometryType_);
     // launchParamsBuffer.alloc(sizeof(launchParams));
     // normKernelName.push_back(NumericType);
   }
@@ -746,15 +747,15 @@ template <class T, int D> class Trace {
   CudaBuffer hitgroupRecordBuffer;
   std::vector<OptixProgramGroup> directCallablePGs;
   CudaBuffer directCallableRecordBuffer;
-  OptixShaderBindingTable sbt = {};
+  OptixShaderBindingTable sbt{};
 
   // launch parameters, on the host, constant for all particles
   LaunchParams launchParams;
   std::vector<CudaBuffer> launchParamsBuffers;
 
   // results Buffer
   CudaBuffer resultBuffer;
-  std::vector<float> results;
+  std::vector<ResultType> results;
 
   rayInternal::KernelConfig config_;
   bool ignoreBoundary = false;

diff --git a/gpu/include/raygTraceDisk.hpp b/gpu/include/raygTraceDisk.hpp
@@ -42,7 +42,8 @@ template <class T, int D> class TraceDisk final : public Trace<T, D> {
                             this->ignoreBoundary, sourceOffset);
   }
 
-  void smoothFlux(std::vector<float> &flux, int smoothingNeighbors) override {
+  void smoothFlux(std::vector<ResultType> &flux,
+                  int smoothingNeighbors) override {
     auto oldFlux = flux;
     const T requiredDistance = smoothingNeighbors * 2.0 * diskMesh.radius;
     PointNeighborhood<float, D>
@@ -66,9 +67,9 @@ template <class T, int D> class TraceDisk final : public Trace<T, D> {
 
 #pragma omp parallel for
     for (int idx = 0; idx < launchParams.numElements; idx++) {
-      float vv = oldFlux[idx];
+      ResultType vv = oldFlux[idx];
       auto const &neighborhood = pointNeighborhood->getNeighborIndices(idx);
-      float sum = 1.f;
+      ResultType sum = 1.0;
       auto const normal = diskMesh.normals[idx];
       for (auto const &nbi : neighborhood) {
         auto nnormal = diskMesh.normals[nbi];
@@ -85,7 +86,7 @@ template <class T, int D> class TraceDisk final : public Trace<T, D> {
   void normalizeResults() override {
     assert(resultBuffer.sizeInBytes != 0 &&
            "Normalization: Result buffer not initialized.");
-    float sourceArea = 0.f;
+    double sourceArea = 0.0;
     if constexpr (D == 2) {
       sourceArea =
           (launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]);

diff --git a/gpu/include/raygTraceLine.hpp b/gpu/include/raygTraceLine.hpp
@@ -23,15 +23,15 @@ template <class T, int D> class TraceLine final : public Trace<T, D> {
                             this->ignoreBoundary, sourceOffset);
   }
 
-  void smoothFlux(std::vector<float> &flux, int numNeighbors) override {
+  void smoothFlux(std::vector<ResultType> &flux, int numNeighbors) override {
     // not implemented for line geometry
   }
 
   void normalizeResults() override {
     assert(resultBuffer.sizeInBytes != 0 &&
            "Normalization: Result buffer not initialized.");
 
-    float sourceArea =
+    double sourceArea =
         launchParams.source.maxPoint[0] - launchParams.source.minPoint[0];
 
     // calculate areas on host and send to device for now

diff --git a/gpu/include/raygTraceTriangle.hpp b/gpu/include/raygTraceTriangle.hpp
@@ -40,10 +40,11 @@ template <class T, int D> class TraceTriangle final : public Trace<T, D> {
     }
   }
 
-  void smoothFlux(std::vector<float> &flux, int smoothingNeighbors) override {}
+  void smoothFlux(std::vector<ResultType> &flux,
+                  int smoothingNeighbors) override {}
 
   void normalizeResults() override {
-    float sourceArea = 0.f;
+    double sourceArea = 0.0;
     if constexpr (D == 2) {
       sourceArea =
           (launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]);

diff --git a/gpu/include/raygTriangleGeometry.hpp b/gpu/include/raygTriangleGeometry.hpp
@@ -264,7 +264,7 @@ struct TriangleGeometry {
 
 #ifndef NDEBUG
     rayInternal::writeVTP(boundaryMesh, "triangleMesh_boundary.vtp",
-                          std::vector<float>());
+                          std::vector<double>());
 #endif
 
     return boundaryMesh;