From 20a89e66be009c6a5132626178873182f5ad1034 Mon Sep 17 00:00:00 2001 From: reiter Date: Tue, 13 Jan 2026 10:30:39 +0100 Subject: [PATCH 1/4] Improve material mapping to consecutive for GPU tracing --- CMakeLists.txt | 2 +- gpu/CMakeLists.txt | 3 +- gpu/examples/trenchTriangles.cpp | 2 +- include/viennaray/gpu/raygTrace.hpp | 76 ++++++++++++++++------------- 4 files changed, 45 insertions(+), 38 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f5330b..aee7e15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,7 +99,7 @@ include("cmake/cpm.cmake") CPMAddPackage( NAME ViennaCore - VERSION 1.9.0 + VERSION 1.9.2 GIT_REPOSITORY "https://github.com/ViennaTools/ViennaCore" OPTIONS "VIENNACORE_USE_GPU ${VIENNARAY_USE_GPU}") diff --git a/gpu/CMakeLists.txt b/gpu/CMakeLists.txt index 880d9f4..9503f1b 100644 --- a/gpu/CMakeLists.txt +++ b/gpu/CMakeLists.txt @@ -22,7 +22,8 @@ endif() viennacore_add_optixir(GeneralPipelineDisk ${VIENNARAY_PIPELINE_DIR}/GeneralPipelineDisk.cu) viennacore_add_optixir(GeneralPipelineTriangle ${VIENNARAY_PIPELINE_DIR}/GeneralPipelineTriangle.cu) viennacore_add_optixir(GeneralPipelineLine ${VIENNARAY_PIPELINE_DIR}/GeneralPipelineLine.cu) -viennacore_add_optixir(ViennaRayCallableWrapper ${VIENNARAY_PIPELINE_DIR}/CallableWrapper.cu) +viennacore_add_optixir(ViennaRayCallableWrapper ${VIENNARAY_PIPELINE_DIR}/CallableWrapper.cu DEPENDS + "${VIENNARAY_PIPELINE_DIR}/Particle.cuh") # Add the norm kernels viennacore_add_ptx(normKernels ${VIENNARAY_CUDA_KERNELS}) diff --git a/gpu/examples/trenchTriangles.cpp b/gpu/examples/trenchTriangles.cpp index 63b25a5..740f150 100644 --- a/gpu/examples/trenchTriangles.cpp +++ b/gpu/examples/trenchTriangles.cpp @@ -38,7 +38,7 @@ int main(int argc, char **argv) { std::vector cMap = { {0, gpu::CallableSlot::COLLISION, "__direct_callable__particleCollision"}, {0, gpu::CallableSlot::REFLECTION, - "__direct_callable__particleReflectionConstSticking"}}; + "__direct_callable__particleReflection"}}; gpu::TraceTriangle tracer(context); tracer.setGeometry(mesh); diff --git a/include/viennaray/gpu/raygTrace.hpp b/include/viennaray/gpu/raygTrace.hpp index 311274c..bccb8f8 100644 --- a/include/viennaray/gpu/raygTrace.hpp +++ b/include/viennaray/gpu/raygTrace.hpp @@ -134,25 +134,26 @@ template class Trace { // set up material specific sticking probabilities materialStickingBuffer_.resize(particles_.size()); for (size_t i = 0; i < particles_.size(); i++) { - if (!particles_[i].materialSticking.empty()) { + auto &materialStickingMap = particles_[i].materialSticking; + + if (!materialStickingMap.empty()) { if (uniqueMaterialIds_.empty() || materialIdsBuffer_.sizeInBytes == 0) { VIENNACORE_LOG_ERROR( "Material IDs not set, when using material dependent " "sticking."); } - std::vector materialSticking(uniqueMaterialIds_.size()); - unsigned currentId = 0; - for (auto &matId : uniqueMaterialIds_) { - if (particles_[i].materialSticking.find(matId) == - particles_[i].materialSticking.end()) { - materialSticking[currentId++] = - static_cast(particles_[i].sticking); + std::vector materialStickingArray(uniqueMaterialIds_.size()); + for (size_t idx = 0; idx < uniqueMaterialIds_.size(); ++idx) { + if (auto it = materialStickingMap.find(uniqueMaterialIds_[idx]); + it != materialStickingMap.end()) { + materialStickingArray[idx] = static_cast(it->second); } else { - materialSticking[currentId++] = - static_cast(particles_[i].materialSticking[matId]); + // not in map, use default sticking + materialStickingArray[idx] = + static_cast(particles_[i].sticking); } } - materialStickingBuffer_[i].allocUpload(materialSticking); + materialStickingBuffer_[i].allocUpload(materialStickingArray); } } @@ -244,45 +245,50 @@ template class Trace { template void setMaterialIds(const std::vector &materialIds, - const bool mapToConsecutive = true, - const std::set &pUniqueMaterialIds = {}) { + const bool mapToConsecutive = true) { assert(materialIds.size() == launchParams_.numElements); - uniqueMaterialIds_.clear(); - if (!pUniqueMaterialIds.empty()) { - uniqueMaterialIds_ = pUniqueMaterialIds; + // copy material IDs + if constexpr (std::is_same_v) { + uniqueMaterialIds_ = materialIds; } else { - for (auto &matId : materialIds) { - uniqueMaterialIds_.insert(static_cast(matId)); - } + // cast to int + uniqueMaterialIds_.resize(materialIds.size()); + std::transform(materialIds.begin(), materialIds.end(), + uniqueMaterialIds_.begin(), + [](auto x) { return static_cast(x); }); } - std::vector materialMap(uniqueMaterialIds_.begin(), - uniqueMaterialIds_.end()); - materialMapBuffer_.allocUpload(materialMap); + // reduce to sorted unique IDs + std::sort(uniqueMaterialIds_.begin(), uniqueMaterialIds_.end()); + uniqueMaterialIds_.erase( + std::unique(uniqueMaterialIds_.begin(), uniqueMaterialIds_.end()), + uniqueMaterialIds_.end()); - if (mapToConsecutive) { - std::unordered_map materialIdMap; - int currentId = 0; - for (auto &uniqueMaterialId : uniqueMaterialIds_) { - materialIdMap[uniqueMaterialId] = currentId++; - } - assert(currentId == materialIdMap.size()); + std::vector materialIdsMapped(launchParams_.numElements); - std::vector materialIdsMapped(launchParams_.numElements); + if (mapToConsecutive) { + // mapping to consecutive IDs. Use binary search #pragma omp parallel for for (int i = 0; i < launchParams_.numElements; i++) { - materialIdsMapped[i] = materialIdMap[materialIds[i]]; + int idx = 0, matId = static_cast(materialIds[i]); + for (; idx < uniqueMaterialIds_.size(); ++idx) { + if (uniqueMaterialIds_[idx] == matId) + break; + } + materialIdsMapped[i] = idx; } - materialIdsBuffer_.allocUpload(materialIdsMapped); } else { - std::vector materialIdsMapped(launchParams_.numElements); + // no mapping, just copy #pragma omp parallel for for (int i = 0; i < launchParams_.numElements; i++) { materialIdsMapped[i] = static_cast(materialIds[i]); } - materialIdsBuffer_.allocUpload(materialIdsMapped); } + + // upload to device + materialMapBuffer_.allocUpload(uniqueMaterialIds_); + materialIdsBuffer_.allocUpload(materialIdsMapped); } void setNumberOfRaysPerPoint(const size_t pNumRays) { @@ -752,7 +758,7 @@ template class Trace { std::unordered_map particleMap_; std::vector callableMap_; - std::set uniqueMaterialIds_; + std::vector uniqueMaterialIds_; CudaBuffer materialIdsBuffer_; float gridDelta_ = 0.0f; From acae28c3420569147c3bbbf5dc65d561daeaf6c1 Mon Sep 17 00:00:00 2001 From: reiter Date: Tue, 13 Jan 2026 10:48:27 +0100 Subject: [PATCH 2/4] Enable manual synchronization --- gpu/CMakeLists.txt | 4 +-- include/viennaray/gpu/raygTrace.hpp | 40 ++++++++++++++------- include/viennaray/gpu/raygTraceDisk.hpp | 1 + include/viennaray/gpu/raygTraceLine.hpp | 1 + include/viennaray/gpu/raygTraceTriangle.hpp | 1 + 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/gpu/CMakeLists.txt b/gpu/CMakeLists.txt index 9503f1b..f01f60e 100644 --- a/gpu/CMakeLists.txt +++ b/gpu/CMakeLists.txt @@ -22,8 +22,8 @@ endif() viennacore_add_optixir(GeneralPipelineDisk ${VIENNARAY_PIPELINE_DIR}/GeneralPipelineDisk.cu) viennacore_add_optixir(GeneralPipelineTriangle ${VIENNARAY_PIPELINE_DIR}/GeneralPipelineTriangle.cu) viennacore_add_optixir(GeneralPipelineLine ${VIENNARAY_PIPELINE_DIR}/GeneralPipelineLine.cu) -viennacore_add_optixir(ViennaRayCallableWrapper ${VIENNARAY_PIPELINE_DIR}/CallableWrapper.cu DEPENDS - "${VIENNARAY_PIPELINE_DIR}/Particle.cuh") +viennacore_add_optixir(ViennaRayCallableWrapper ${VIENNARAY_PIPELINE_DIR}/CallableWrapper.cu + DEPENDS "${VIENNARAY_PIPELINE_DIR}/Particle.cuh") # Add the norm kernels viennacore_add_ptx(normKernels ${VIENNARAY_CUDA_KERNELS}) diff --git a/include/viennaray/gpu/raygTrace.hpp b/include/viennaray/gpu/raygTrace.hpp index bccb8f8..c75b49a 100644 --- a/include/viennaray/gpu/raygTrace.hpp +++ b/include/viennaray/gpu/raygTrace.hpp @@ -157,9 +157,10 @@ template class Trace { } } - // Every particle gets its own stream and launch parameters - std::vector streams(particles_.size()); + // Every particle gets its launch parameters launchParamsBuffers_.resize(particles_.size()); + assert(launchParamsBuffers_.size() == streams_.size() && + "Number of streams not initialized correctly."); if (particleMap_.empty()) { VIENNACORE_LOG_ERROR("No particle name->particleType mapping provided."); @@ -191,8 +192,6 @@ template class Trace { } launchParamsBuffers_[i].allocUploadSingle(launchParams_); - - CUDA_CHECK(StreamCreate(&streams[i])); } generateSBT(); @@ -200,7 +199,7 @@ template class Trace { #ifndef NDEBUG // Launch on single stream in debug mode for (size_t i = 0; i < particles_.size(); i++) { OPTIX_CHECK(optixLaunch( - pipeline_, streams[0], + pipeline_, streams_[0], /*! parameters and SBT */ launchParamsBuffers_[i].dPointer(), launchParamsBuffers_[i].sizeInBytes, &shaderBindingTable_, @@ -210,7 +209,7 @@ template class Trace { #else // Launch on multiple streams in release mode for (size_t i = 0; i < particles_.size(); i++) { OPTIX_CHECK(optixLaunch( - pipeline_, streams[i], + pipeline_, streams_[i], /*! parameters and SBT */ launchParamsBuffers_[i].dPointer(), launchParamsBuffers_[i].sizeInBytes, &shaderBindingTable_, @@ -219,12 +218,7 @@ template class Trace { } #endif - // sync - for (auto &s : streams) { - CUDA_CHECK(StreamSynchronize(s)); - CUDA_CHECK(StreamDestroy(s)); - } - + isSynced_ = false; resultsDownloaded_ = false; } @@ -402,6 +396,9 @@ template class Trace { } } directCallablePGs_.clear(); + for (auto &s : streams_) { + CUDA_CHECK(StreamDestroy(s)); + } } unsigned int prepareParticlePrograms() { @@ -429,6 +426,12 @@ template class Trace { VIENNACORE_LOG_DEBUG("Number of flux arrays: " + std::to_string(numFluxes_)); + // each particle gets its own stream + streams_.resize(particles_.size()); + for (size_t i = 0; i < particles_.size(); i++) { + CUDA_CHECK(StreamCreate(&streams_[i])); + } + return numFluxes_; } @@ -461,8 +464,19 @@ template class Trace { launchParams_.customData = (void *)d_params; } + void syncStreams() { + if (isSynced_) + return; + + for (auto &s : streams_) { + CUDA_CHECK(StreamSynchronize(s)); + } + isSynced_ = true; + } + void downloadResults() { if (!resultsDownloaded_) { + syncStreams(); results_.resize(launchParams_.numElements * numFluxes_); resultBuffer_.download(results_.data(), launchParams_.numElements * numFluxes_); @@ -794,6 +808,7 @@ template class Trace { // launch parameters LaunchParams launchParams_; std::vector launchParamsBuffers_; // one per particle + std::vector streams_; // results Buffer CudaBuffer resultBuffer_; @@ -802,6 +817,7 @@ template class Trace { rayInternal::KernelConfig config_; bool ignoreBoundary_ = false; bool resultsDownloaded_ = false; + bool isSynced_ = false; size_t numRays_ = 0; unsigned numCellData_ = 0; diff --git a/include/viennaray/gpu/raygTraceDisk.hpp b/include/viennaray/gpu/raygTraceDisk.hpp index 5a2a572..4f52828 100644 --- a/include/viennaray/gpu/raygTraceDisk.hpp +++ b/include/viennaray/gpu/raygTraceDisk.hpp @@ -84,6 +84,7 @@ template class TraceDisk final : public Trace { } void normalizeResults() override { + this->syncStreams(); assert(this->resultBuffer_.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); double sourceArea = 0.0; diff --git a/include/viennaray/gpu/raygTraceLine.hpp b/include/viennaray/gpu/raygTraceLine.hpp index e738a08..c6220e6 100644 --- a/include/viennaray/gpu/raygTraceLine.hpp +++ b/include/viennaray/gpu/raygTraceLine.hpp @@ -28,6 +28,7 @@ template class TraceLine final : public Trace { } void normalizeResults() override { + this->syncStreams(); assert(this->resultBuffer_.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); diff --git a/include/viennaray/gpu/raygTraceTriangle.hpp b/include/viennaray/gpu/raygTraceTriangle.hpp index 2e9cf94..13cabcb 100644 --- a/include/viennaray/gpu/raygTraceTriangle.hpp +++ b/include/viennaray/gpu/raygTraceTriangle.hpp @@ -41,6 +41,7 @@ template class TraceTriangle final : public Trace { } void normalizeResults() override { + this->syncStreams(); double sourceArea = 0.0; if constexpr (D == 2) { sourceArea = From 56cae209c64d9a6ddb80fe5196db2d871f57caed Mon Sep 17 00:00:00 2001 From: reiter Date: Tue, 13 Jan 2026 14:38:57 +0100 Subject: [PATCH 3/4] Move sync right before normlize kernel --- include/viennaray/gpu/raygTraceDisk.hpp | 2 +- include/viennaray/gpu/raygTraceLine.hpp | 2 +- include/viennaray/gpu/raygTraceTriangle.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/viennaray/gpu/raygTraceDisk.hpp b/include/viennaray/gpu/raygTraceDisk.hpp index 4f52828..db442fb 100644 --- a/include/viennaray/gpu/raygTraceDisk.hpp +++ b/include/viennaray/gpu/raygTraceDisk.hpp @@ -84,7 +84,6 @@ template class TraceDisk final : public Trace { } void normalizeResults() override { - this->syncStreams(); assert(this->resultBuffer_.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); double sourceArea = 0.0; @@ -170,6 +169,7 @@ template class TraceDisk final : public Trace { areaBuffer.allocUpload(areas); CUdeviceptr d_areas = areaBuffer.dPointer(); CUdeviceptr d_data = this->resultBuffer_.dPointer(); + this->syncStreams(); void *kernel_args[] = { &d_data, &d_areas, &launchParams_.numElements, diff --git a/include/viennaray/gpu/raygTraceLine.hpp b/include/viennaray/gpu/raygTraceLine.hpp index c6220e6..89f0444 100644 --- a/include/viennaray/gpu/raygTraceLine.hpp +++ b/include/viennaray/gpu/raygTraceLine.hpp @@ -28,7 +28,6 @@ template class TraceLine final : public Trace { } void normalizeResults() override { - this->syncStreams(); assert(this->resultBuffer_.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); @@ -48,6 +47,7 @@ template class TraceLine final : public Trace { areaBuffer.allocUpload(areas); CUdeviceptr d_areas = areaBuffer.dPointer(); CUdeviceptr d_data = this->resultBuffer_.dPointer(); + this->syncStreams(); void *kernel_args[] = { &d_data, &d_areas, &launchParams_.numElements, diff --git a/include/viennaray/gpu/raygTraceTriangle.hpp b/include/viennaray/gpu/raygTraceTriangle.hpp index 13cabcb..43219f2 100644 --- a/include/viennaray/gpu/raygTraceTriangle.hpp +++ b/include/viennaray/gpu/raygTraceTriangle.hpp @@ -41,7 +41,6 @@ template class TraceTriangle final : public Trace { } void normalizeResults() override { - this->syncStreams(); double sourceArea = 0.0; if constexpr (D == 2) { sourceArea = @@ -52,6 +51,7 @@ template class TraceTriangle final : public Trace { launchParams_.source.minPoint[0]) * (launchParams_.source.maxPoint[1] - launchParams_.source.minPoint[1]); } + this->syncStreams(); assert(this->resultBuffer_.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); CUdeviceptr d_data = this->resultBuffer_.dPointer(); From 52f8d3b7b4a12d4cea384657047520195cf0d2b4 Mon Sep 17 00:00:00 2001 From: reiter Date: Tue, 13 Jan 2026 14:39:37 +0100 Subject: [PATCH 4/4] chore: bump version --- CMakeLists.txt | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aee7e15..441a642 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.20 FATAL_ERROR) project( ViennaRay LANGUAGES CXX - VERSION 3.10.1) + VERSION 3.10.2) # -------------------------------------------------------------------------------------------------------- # Library switches diff --git a/README.md b/README.md index 9d80a3b..08967b9 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ We recommend using [CPM.cmake](https://github.com/cpm-cmake/CPM.cmake) to consum * Installation with CPM ```cmake - CPMAddPackage("gh:viennatools/viennaray@3.10.1") # Use the latest release version + CPMAddPackage("gh:viennatools/viennaray@3.10.2") # Use the latest release version ``` * With a local installation