diff --git a/CMakeLists.txt b/CMakeLists.txt index 3effc8d..81c5904 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ option(EMBREE_RAY_MASK "Enable Embree Ray Masking" OFF) option(VIENNARAY_USE_GPU "Enable GPU support" OFF) option(VIENNARAY_USE_WDIST "Enable weighted distribution of ray weights" OFF) +option(VIENNARAY_GPU_DOUBLE_PRECISION "Use double precision on GPU" ON) option(VIENNARAY_BUILD_EXAMPLES "Build examples" OFF) option(VIENNARAY_BUILD_TESTS "Build tests" OFF) @@ -67,6 +68,11 @@ if(VIENNARAY_USE_WDIST) target_compile_definitions(${PROJECT_NAME} INTERFACE VIENNARAY_USE_WDIST) endif() +if(VIENNARAY_GPU_DOUBLE_PRECISION) + message(STATUS "[ViennaRay] Using double precision on GPU") + target_compile_definitions(${PROJECT_NAME} INTERFACE VIENNARAY_GPU_DOUBLE_PRECISION) +endif() + if(VIENNARAY_PRINT_PROGRESS) target_compile_definitions(${PROJECT_NAME} INTERFACE VIENNARAY_PRINT_PROGRESS) endif() @@ -92,7 +98,7 @@ include("cmake/cpm.cmake") CPMAddPackage( NAME ViennaCore - VERSION 1.7.3 + VERSION 1.7.4 GIT_REPOSITORY "https://github.com/ViennaTools/ViennaCore" OPTIONS "VIENNACORE_USE_GPU ${VIENNARAY_USE_GPU}") diff --git a/cmake/generate_ptx.cmake b/cmake/generate_ptx.cmake index c28aa17..3abd48b 100644 --- a/cmake/generate_ptx.cmake +++ b/cmake/generate_ptx.cmake @@ -11,6 +11,9 @@ function(generate_pipeline target_name generated_files) cuda_include_directories(${VIENNARAY_GPU_INCLUDE}) cuda_include_directories(${ViennaCore_SOURCE_DIR}/include/viennacore) add_compile_definitions(VIENNACORE_COMPILE_GPU) + if(VIENNARAY_GPU_DOUBLE_PRECISION) + add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION) + endif() # Generate OptiX IR files if enabled if(VIENNARAY_GENERATE_OPTIXIR) @@ -53,6 +56,9 @@ function(generate_kernel generated_files) cuda_include_directories(${VIENNARAY_GPU_INCLUDE}) cuda_include_directories(${OptiX_INCLUDE_DIR}) add_compile_definitions(VIENNACORE_COMPILE_GPU) + if(VIENNARAY_GPU_DOUBLE_PRECISION) + add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION) + endif() cuda_compile_ptx(generated_ptx_files ${cu_source_files} ${cmake_options} ${options}) @@ -98,6 +104,9 @@ function(add_gpu_executable target_name_base target_name_var) cuda_include_directories(${VIENNARAY_GPU_INCLUDE}) cuda_include_directories(${ViennaCore_SOURCE_DIR}/include/viennacore) add_compile_definitions(VIENNACORE_COMPILE_GPU) + if(VIENNARAY_GPU_DOUBLE_PRECISION) + add_compile_definitions(VIENNARAY_GPU_DOUBLE_PRECISION) + endif() # Create CUDA kernels cuda_wrap_srcs( diff --git a/gpu/examples/trenchTriangles.cpp b/gpu/examples/trenchTriangles.cpp index 8443246..aee4afb 100644 --- a/gpu/examples/trenchTriangles.cpp +++ b/gpu/examples/trenchTriangles.cpp @@ -1,5 +1,6 @@ #include +#include #include // #define COUNT_RAYS @@ -29,7 +30,7 @@ int main(int argc, char **argv) { gpu::Particle particle; particle.name = "Particle"; - particle.sticking = 0.1f; + particle.sticking = 1.f; particle.dataLabels = {"particleFlux"}; particle.materialSticking[7] = 0.1f; particle.materialSticking[1] = 1.0f; @@ -38,14 +39,14 @@ int main(int argc, char **argv) { std::vector cMap = { {0, gpu::CallableSlot::COLLISION, "__direct_callable__particleCollision"}, {0, gpu::CallableSlot::REFLECTION, - "__direct_callable__particleReflection"}}; + "__direct_callable__particleReflectionConstSticking"}}; gpu::TraceTriangle tracer(context); tracer.setGeometry(mesh); tracer.setMaterialIds(materialIds); tracer.setCallables("CallableWrapper", context->modulePath); tracer.setParticleCallableMap({pMap, cMap}); - tracer.setNumberOfRaysPerPoint(2000); + tracer.setNumberOfRaysPerPoint(5000); tracer.insertNextParticle(particle); tracer.prepareParticlePrograms(); @@ -58,16 +59,14 @@ int main(int argc, char **argv) { #endif tracer.apply(); - tracer.normalizeResults(); auto flux = tracer.getFlux(0, 0); - rayInternal::writeVTP("trenchTriangles_triMesh.vtp", mesh.nodes, - mesh.triangles, flux); + + rayInternal::writeVTP( + "trenchTriangles_triMesh.vtp", mesh.nodes, mesh.triangles, flux); #ifdef COUNT_RAYS rayCountBuffer.download(&rayCount, 1); std::cout << "Trace count: " << rayCount << std::endl; #endif - - context->destroy(); } diff --git a/gpu/include/raygLaunchParams.hpp b/gpu/include/raygLaunchParams.hpp index a51fd7c..0f0f4d8 100644 --- a/gpu/include/raygLaunchParams.hpp +++ b/gpu/include/raygLaunchParams.hpp @@ -8,13 +8,19 @@ namespace viennaray::gpu { +#ifdef VIENNARAY_GPU_DOUBLE_PRECISION +using ResultType = double; +#else +using ResultType = float; +#endif + __both__ __forceinline__ unsigned callableIndex(unsigned p, CallableSlot s) { return p * static_cast(CallableSlot::COUNT) + static_cast(s); } struct LaunchParams { - float *resultBuffer; + ResultType *resultBuffer; float rayWeightThreshold = 0.1f; float tThreshold = 0.5f; diff --git a/gpu/include/raygTrace.hpp b/gpu/include/raygTrace.hpp index f694238..2425011 100644 --- a/gpu/include/raygTrace.hpp +++ b/gpu/include/raygTrace.hpp @@ -91,8 +91,9 @@ template class Trace { } // Resize our cuda result buffer - resultBuffer.allocInit(launchParams.numElements * numFluxes_, float(0)); - launchParams.resultBuffer = (float *)resultBuffer.dPointer(); + resultBuffer.allocInit(launchParams.numElements * numFluxes_, + ResultType(0)); + launchParams.resultBuffer = (ResultType *)resultBuffer.dPointer(); if (materialIdsBuffer_.sizeInBytes != 0) { launchParams.materialIds = (int *)materialIdsBuffer_.dPointer(); @@ -111,8 +112,8 @@ template class Trace { launchParams.maxReflections = config_.maxReflections; launchParams.maxBoundaryHits = config_.maxBoundaryHits; - int numPointsPerDim = - static_cast(std::sqrt(static_cast(launchParams.numElements))); + int numPointsPerDim = static_cast( + std::sqrt(static_cast(launchParams.numElements))); if (config_.numRaysFixed > 0) { numPointsPerDim = 1; @@ -186,8 +187,7 @@ template class Trace { launchParams.source.customDirectionBasis = true; } - launchParamsBuffers[i].alloc(sizeof(launchParams)); - launchParamsBuffers[i].upload(&launchParams, 1); + launchParamsBuffers[i].allocUploadSingle(launchParams); CUDA_CHECK(StreamCreate(&streams[i])); } @@ -304,8 +304,8 @@ template class Trace { size_t getNumberOfRays() const { return numRays; } - std::vector getFlux(int particleIdx, int dataIdx, - int smoothingNeighbors = 0) { + std::vector getFlux(int particleIdx, int dataIdx, + int smoothingNeighbors = 0) { if (!resultsDownloaded) { results.resize(launchParams.numElements * numFluxes_); resultBuffer.download(results.data(), @@ -313,7 +313,7 @@ template class Trace { resultsDownloaded = true; } - std::vector flux(launchParams.numElements); + std::vector flux(launchParams.numElements); unsigned int offset = 0; for (size_t i = 0; i < particles_.size(); i++) { if (particleIdx > i) @@ -321,7 +321,7 @@ template class Trace { } offset = (offset + dataIdx) * launchParams.numElements; std::memcpy(flux.data(), results.data() + offset, - launchParams.numElements * sizeof(float)); + launchParams.numElements * sizeof(ResultType)); if (smoothingNeighbors > 0) smoothFlux(flux, smoothingNeighbors); return flux; @@ -437,7 +437,8 @@ template class Trace { } } - virtual void smoothFlux(std::vector &flux, int smoothingNeighbors) {} + virtual void smoothFlux(std::vector &flux, + int smoothingNeighbors) {} // To be implemented by derived classes virtual void normalizeResults() = 0; @@ -449,7 +450,7 @@ template class Trace { void initRayTracer() { launchParams.D = D; context_->addModule(normModuleName); - normKernelName.append(geometryType_ + "_f"); + normKernelName.append(geometryType_); // launchParamsBuffer.alloc(sizeof(launchParams)); // normKernelName.push_back(NumericType); } @@ -746,7 +747,7 @@ template class Trace { CudaBuffer hitgroupRecordBuffer; std::vector directCallablePGs; CudaBuffer directCallableRecordBuffer; - OptixShaderBindingTable sbt = {}; + OptixShaderBindingTable sbt{}; // launch parameters, on the host, constant for all particles LaunchParams launchParams; @@ -754,7 +755,7 @@ template class Trace { // results Buffer CudaBuffer resultBuffer; - std::vector results; + std::vector results; rayInternal::KernelConfig config_; bool ignoreBoundary = false; diff --git a/gpu/include/raygTraceDisk.hpp b/gpu/include/raygTraceDisk.hpp index ee2c5f1..e2c6183 100644 --- a/gpu/include/raygTraceDisk.hpp +++ b/gpu/include/raygTraceDisk.hpp @@ -42,7 +42,8 @@ template class TraceDisk final : public Trace { this->ignoreBoundary, sourceOffset); } - void smoothFlux(std::vector &flux, int smoothingNeighbors) override { + void smoothFlux(std::vector &flux, + int smoothingNeighbors) override { auto oldFlux = flux; const T requiredDistance = smoothingNeighbors * 2.0 * diskMesh.radius; PointNeighborhood @@ -66,9 +67,9 @@ template class TraceDisk final : public Trace { #pragma omp parallel for for (int idx = 0; idx < launchParams.numElements; idx++) { - float vv = oldFlux[idx]; + ResultType vv = oldFlux[idx]; auto const &neighborhood = pointNeighborhood->getNeighborIndices(idx); - float sum = 1.f; + ResultType sum = 1.0; auto const normal = diskMesh.normals[idx]; for (auto const &nbi : neighborhood) { auto nnormal = diskMesh.normals[nbi]; @@ -85,7 +86,7 @@ template class TraceDisk final : public Trace { void normalizeResults() override { assert(resultBuffer.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); - float sourceArea = 0.f; + double sourceArea = 0.0; if constexpr (D == 2) { sourceArea = (launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]); diff --git a/gpu/include/raygTraceLine.hpp b/gpu/include/raygTraceLine.hpp index aa58be6..acc9d3a 100644 --- a/gpu/include/raygTraceLine.hpp +++ b/gpu/include/raygTraceLine.hpp @@ -23,7 +23,7 @@ template class TraceLine final : public Trace { this->ignoreBoundary, sourceOffset); } - void smoothFlux(std::vector &flux, int numNeighbors) override { + void smoothFlux(std::vector &flux, int numNeighbors) override { // not implemented for line geometry } @@ -31,7 +31,7 @@ template class TraceLine final : public Trace { assert(resultBuffer.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); - float sourceArea = + double sourceArea = launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]; // calculate areas on host and send to device for now diff --git a/gpu/include/raygTraceTriangle.hpp b/gpu/include/raygTraceTriangle.hpp index b07a9ef..59441a5 100644 --- a/gpu/include/raygTraceTriangle.hpp +++ b/gpu/include/raygTraceTriangle.hpp @@ -40,10 +40,11 @@ template class TraceTriangle final : public Trace { } } - void smoothFlux(std::vector &flux, int smoothingNeighbors) override {} + void smoothFlux(std::vector &flux, + int smoothingNeighbors) override {} void normalizeResults() override { - float sourceArea = 0.f; + double sourceArea = 0.0; if constexpr (D == 2) { sourceArea = (launchParams.source.maxPoint[0] - launchParams.source.minPoint[0]); diff --git a/gpu/include/raygTriangleGeometry.hpp b/gpu/include/raygTriangleGeometry.hpp index 0a5bec5..5027cc4 100644 --- a/gpu/include/raygTriangleGeometry.hpp +++ b/gpu/include/raygTriangleGeometry.hpp @@ -264,7 +264,7 @@ struct TriangleGeometry { #ifndef NDEBUG rayInternal::writeVTP(boundaryMesh, "triangleMesh_boundary.vtp", - std::vector()); + std::vector()); #endif return boundaryMesh; diff --git a/gpu/kernels/normKernels.cu b/gpu/kernels/normKernels.cu index f97e66e..e5883e0 100644 --- a/gpu/kernels/normKernels.cu +++ b/gpu/kernels/normKernels.cu @@ -3,10 +3,16 @@ #include -extern "C" __global__ void normalize_surface_Triangle_f( - float *data, const viennacore::Vec3Df *vertex, +#ifdef VIENNARAY_GPU_DOUBLE_PRECISION +typedef double Real; +#else +typedef float Real; +#endif + +extern "C" __global__ void normalize_surface_Triangle( + Real *data, const viennacore::Vec3Df *vertex, const viennacore::Vec3D *index, const unsigned int numTriangles, - const float sourceArea, const size_t numRays, const int numData) { + const double sourceArea, const size_t numRays, const int numData) { using namespace viennacore; unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -18,16 +24,16 @@ extern "C" __global__ void normalize_surface_Triangle_f( const auto &C = vertex[elIdx[2]]; const auto area = Norm(CrossProduct(B - A, C - A)) / 2.f; if (area > 1e-6f) - data[tidx] *= sourceArea / (area * (float)numRays); + data[tidx] *= sourceArea / (area * (double)numRays); else - data[tidx] = 0.f; + data[tidx] = 0.0; } } -extern "C" __global__ void normalize_surface_Triangle_f_2D( - float *data, const viennacore::Vec3Df *vertex, +extern "C" __global__ void normalize_surface_Triangle_2D( + Real *data, const viennacore::Vec3Df *vertex, const viennacore::Vec3D *index, const unsigned int numTriangles, - const float sourceArea, const size_t numRays, const int numData) { + const double sourceArea, const size_t numRays, const int numData) { using namespace viennacore; unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -43,39 +49,18 @@ extern "C" __global__ void normalize_surface_Triangle_f_2D( else area = 0.5 * Norm(C - A); if (area > 1e-6f) - data[tidx] *= sourceArea / (area * (float)numRays); + data[tidx] *= sourceArea / (area * (Real)numRays); else data[tidx] = 0.f; } } -extern "C" __global__ void normalize_surface_Triangle_d( - double *data, const viennacore::Vec3Df *vertex, - const viennacore::Vec3D *index, const unsigned int numTriangles, - const double sourceArea, const size_t numRays, const int numData) { - using namespace viennacore; - unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int stride = blockDim.x * gridDim.x; - - for (; tidx < numTriangles * numData; tidx += stride) { - auto elIdx = index[tidx % numTriangles]; - const auto &A = vertex[elIdx[0]]; - const auto &B = vertex[elIdx[1]]; - const auto &C = vertex[elIdx[2]]; - const double area = Norm(CrossProduct(B - A, C - A)) / 2.; - if (area > 1e-8) - data[tidx] *= sourceArea / (area * (double)numRays); - else - data[tidx] = 0.; - } -} - // Areas precomputed on the CPU -extern "C" __global__ void normalize_surface_Disk_f(float *data, float *areas, - const unsigned int numDisks, - float sourceArea, - const size_t numRays, - const int numData) { +extern "C" __global__ void normalize_surface_Disk(Real *data, float *areas, + const unsigned int numDisks, + const double sourceArea, + const size_t numRays, + const int numData) { unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -83,18 +68,18 @@ extern "C" __global__ void normalize_surface_Disk_f(float *data, float *areas, float area = areas[tidx % numDisks]; if (area > 1e-5f) - data[tidx] *= sourceArea / (area * (float)numRays); + data[tidx] *= sourceArea / (area * (Real)numRays); else data[tidx] = 0.f; } } // Areas precomputed on the CPU -extern "C" __global__ void normalize_surface_Line_f(float *data, float *areas, - const unsigned int numLines, - float sourceArea, - const size_t numRays, - const int numData) { +extern "C" __global__ void normalize_surface_Line(Real *data, float *areas, + const unsigned int numLines, + const Real sourceArea, + const size_t numRays, + const int numData) { unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int stride = blockDim.x * gridDim.x; @@ -103,7 +88,7 @@ extern "C" __global__ void normalize_surface_Line_f(float *data, float *areas, // data[tidx] = area; if (area > 1e-5f) - data[tidx] *= sourceArea / (area * (float)numRays); + data[tidx] *= sourceArea / (area * (Real)numRays); else data[tidx] = 0.f; } diff --git a/gpu/pipelines/Particle.cuh b/gpu/pipelines/Particle.cuh index 0c58da1..ab9705f 100644 --- a/gpu/pipelines/Particle.cuh +++ b/gpu/pipelines/Particle.cuh @@ -18,7 +18,7 @@ particleCollision(viennaray::gpu::PerRayData *prd) { atomicAdd(&launchParams .resultBuffer[viennaray::gpu::getIdxOffset(0, launchParams) + prd->primIDs[i]], - prd->rayWeight); + static_cast(prd->rayWeight)); } } diff --git a/gpu/tests/rngSeed/CMakeLists.txt b/gpu/tests/rngSeed/CMakeLists.txt index 05b00f2..d47d5d5 100644 --- a/gpu/tests/rngSeed/CMakeLists.txt +++ b/gpu/tests/rngSeed/CMakeLists.txt @@ -6,4 +6,4 @@ add_dependencies(ViennaRay-GPU_Tests ${PROJECT_NAME}) add_test(NAME ${PROJECT_NAME} COMMAND $) configure_file(${CMAKE_SOURCE_DIR}/examples/triangle3D/trenchMesh.dat - ${CMAKE_CURRENT_BINARY_DIR}/trenchMesh.dat COPYONLY) \ No newline at end of file + ${CMAKE_CURRENT_BINARY_DIR}/trenchMesh.dat COPYONLY) diff --git a/gpu/tests/rngSeed/rngSeed.cpp b/gpu/tests/rngSeed/rngSeed.cpp index 31952be..7c9ce98 100644 --- a/gpu/tests/rngSeed/rngSeed.cpp +++ b/gpu/tests/rngSeed/rngSeed.cpp @@ -22,7 +22,7 @@ int main() { {0, gpu::CallableSlot::REFLECTION, "__direct_callable__particleReflectionConstSticking"}}; - std::vector flux1, flux2; + std::vector flux1, flux2; { NumericType extent = 5; diff --git a/include/viennaray/rayUtil.hpp b/include/viennaray/rayUtil.hpp index b87c9b1..ea38e98 100644 --- a/include/viennaray/rayUtil.hpp +++ b/include/viennaray/rayUtil.hpp @@ -410,10 +410,10 @@ void readMeshFromFile(const std::string &fileName, NumericType &gridDelta, dataFile.close(); } -template +template void writeVTK(const std::string &filename, const std::vector> &points, - const std::vector &flux) { + const std::vector &flux) { std::ofstream f(filename.c_str()); f << "# vtk DataFile Version 2.0" << std::endl; @@ -448,11 +448,11 @@ void writeVTK(const std::string &filename, f.close(); } -template +template void writeVTP(const std::string &filename, const std::vector> &points, const std::vector> &elements, - const std::vector &flux) { + const std::vector &flux) { std::ofstream f(filename.c_str()); if (!f.is_open()) return; @@ -555,7 +555,7 @@ void writeVTP(const std::string &filename, } inline void writeVTP(TriangleMesh const &mesh, const std::string &filename, - const std::vector &flux) { + const std::vector &flux) { writeVTP(filename, mesh.nodes, mesh.triangles, flux); }