diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f5330b..441a642 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.20 FATAL_ERROR) project( ViennaRay LANGUAGES CXX - VERSION 3.10.1) + VERSION 3.10.2) # -------------------------------------------------------------------------------------------------------- # Library switches @@ -99,7 +99,7 @@ include("cmake/cpm.cmake") CPMAddPackage( NAME ViennaCore - VERSION 1.9.0 + VERSION 1.9.2 GIT_REPOSITORY "https://github.com/ViennaTools/ViennaCore" OPTIONS "VIENNACORE_USE_GPU ${VIENNARAY_USE_GPU}") diff --git a/README.md b/README.md index 9d80a3b..08967b9 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ We recommend using [CPM.cmake](https://github.com/cpm-cmake/CPM.cmake) to consum * Installation with CPM ```cmake - CPMAddPackage("gh:viennatools/viennaray@3.10.1") # Use the latest release version + CPMAddPackage("gh:viennatools/viennaray@3.10.2") # Use the latest release version ``` * With a local installation diff --git a/gpu/CMakeLists.txt b/gpu/CMakeLists.txt index 880d9f4..f01f60e 100644 --- a/gpu/CMakeLists.txt +++ b/gpu/CMakeLists.txt @@ -22,7 +22,8 @@ endif() viennacore_add_optixir(GeneralPipelineDisk ${VIENNARAY_PIPELINE_DIR}/GeneralPipelineDisk.cu) viennacore_add_optixir(GeneralPipelineTriangle ${VIENNARAY_PIPELINE_DIR}/GeneralPipelineTriangle.cu) viennacore_add_optixir(GeneralPipelineLine ${VIENNARAY_PIPELINE_DIR}/GeneralPipelineLine.cu) -viennacore_add_optixir(ViennaRayCallableWrapper ${VIENNARAY_PIPELINE_DIR}/CallableWrapper.cu) +viennacore_add_optixir(ViennaRayCallableWrapper ${VIENNARAY_PIPELINE_DIR}/CallableWrapper.cu + DEPENDS "${VIENNARAY_PIPELINE_DIR}/Particle.cuh") # Add the norm kernels viennacore_add_ptx(normKernels ${VIENNARAY_CUDA_KERNELS}) diff --git a/gpu/examples/trenchTriangles.cpp b/gpu/examples/trenchTriangles.cpp index 63b25a5..740f150 100644 --- a/gpu/examples/trenchTriangles.cpp +++ b/gpu/examples/trenchTriangles.cpp @@ -38,7 +38,7 @@ int main(int argc, char **argv) { std::vector cMap = { {0, gpu::CallableSlot::COLLISION, "__direct_callable__particleCollision"}, {0, gpu::CallableSlot::REFLECTION, - "__direct_callable__particleReflectionConstSticking"}}; + "__direct_callable__particleReflection"}}; gpu::TraceTriangle tracer(context); tracer.setGeometry(mesh); diff --git a/include/viennaray/gpu/raygTrace.hpp b/include/viennaray/gpu/raygTrace.hpp index 311274c..c75b49a 100644 --- a/include/viennaray/gpu/raygTrace.hpp +++ b/include/viennaray/gpu/raygTrace.hpp @@ -134,31 +134,33 @@ template class Trace { // set up material specific sticking probabilities materialStickingBuffer_.resize(particles_.size()); for (size_t i = 0; i < particles_.size(); i++) { - if (!particles_[i].materialSticking.empty()) { + auto &materialStickingMap = particles_[i].materialSticking; + + if (!materialStickingMap.empty()) { if (uniqueMaterialIds_.empty() || materialIdsBuffer_.sizeInBytes == 0) { VIENNACORE_LOG_ERROR( "Material IDs not set, when using material dependent " "sticking."); } - std::vector materialSticking(uniqueMaterialIds_.size()); - unsigned currentId = 0; - for (auto &matId : uniqueMaterialIds_) { - if (particles_[i].materialSticking.find(matId) == - particles_[i].materialSticking.end()) { - materialSticking[currentId++] = - static_cast(particles_[i].sticking); + std::vector materialStickingArray(uniqueMaterialIds_.size()); + for (size_t idx = 0; idx < uniqueMaterialIds_.size(); ++idx) { + if (auto it = materialStickingMap.find(uniqueMaterialIds_[idx]); + it != materialStickingMap.end()) { + materialStickingArray[idx] = static_cast(it->second); } else { - materialSticking[currentId++] = - static_cast(particles_[i].materialSticking[matId]); + // not in map, use default sticking + materialStickingArray[idx] = + static_cast(particles_[i].sticking); } } - materialStickingBuffer_[i].allocUpload(materialSticking); + materialStickingBuffer_[i].allocUpload(materialStickingArray); } } - // Every particle gets its own stream and launch parameters - std::vector streams(particles_.size()); + // Every particle gets its launch parameters launchParamsBuffers_.resize(particles_.size()); + assert(launchParamsBuffers_.size() == streams_.size() && + "Number of streams not initialized correctly."); if (particleMap_.empty()) { VIENNACORE_LOG_ERROR("No particle name->particleType mapping provided."); @@ -190,8 +192,6 @@ template class Trace { } launchParamsBuffers_[i].allocUploadSingle(launchParams_); - - CUDA_CHECK(StreamCreate(&streams[i])); } generateSBT(); @@ -199,7 +199,7 @@ template class Trace { #ifndef NDEBUG // Launch on single stream in debug mode for (size_t i = 0; i < particles_.size(); i++) { OPTIX_CHECK(optixLaunch( - pipeline_, streams[0], + pipeline_, streams_[0], /*! parameters and SBT */ launchParamsBuffers_[i].dPointer(), launchParamsBuffers_[i].sizeInBytes, &shaderBindingTable_, @@ -209,7 +209,7 @@ template class Trace { #else // Launch on multiple streams in release mode for (size_t i = 0; i < particles_.size(); i++) { OPTIX_CHECK(optixLaunch( - pipeline_, streams[i], + pipeline_, streams_[i], /*! parameters and SBT */ launchParamsBuffers_[i].dPointer(), launchParamsBuffers_[i].sizeInBytes, &shaderBindingTable_, @@ -218,12 +218,7 @@ template class Trace { } #endif - // sync - for (auto &s : streams) { - CUDA_CHECK(StreamSynchronize(s)); - CUDA_CHECK(StreamDestroy(s)); - } - + isSynced_ = false; resultsDownloaded_ = false; } @@ -244,45 +239,50 @@ template class Trace { template void setMaterialIds(const std::vector &materialIds, - const bool mapToConsecutive = true, - const std::set &pUniqueMaterialIds = {}) { + const bool mapToConsecutive = true) { assert(materialIds.size() == launchParams_.numElements); - uniqueMaterialIds_.clear(); - if (!pUniqueMaterialIds.empty()) { - uniqueMaterialIds_ = pUniqueMaterialIds; + // copy material IDs + if constexpr (std::is_same_v) { + uniqueMaterialIds_ = materialIds; } else { - for (auto &matId : materialIds) { - uniqueMaterialIds_.insert(static_cast(matId)); - } + // cast to int + uniqueMaterialIds_.resize(materialIds.size()); + std::transform(materialIds.begin(), materialIds.end(), + uniqueMaterialIds_.begin(), + [](auto x) { return static_cast(x); }); } - std::vector materialMap(uniqueMaterialIds_.begin(), - uniqueMaterialIds_.end()); - materialMapBuffer_.allocUpload(materialMap); + // reduce to sorted unique IDs + std::sort(uniqueMaterialIds_.begin(), uniqueMaterialIds_.end()); + uniqueMaterialIds_.erase( + std::unique(uniqueMaterialIds_.begin(), uniqueMaterialIds_.end()), + uniqueMaterialIds_.end()); - if (mapToConsecutive) { - std::unordered_map materialIdMap; - int currentId = 0; - for (auto &uniqueMaterialId : uniqueMaterialIds_) { - materialIdMap[uniqueMaterialId] = currentId++; - } - assert(currentId == materialIdMap.size()); + std::vector materialIdsMapped(launchParams_.numElements); - std::vector materialIdsMapped(launchParams_.numElements); + if (mapToConsecutive) { + // mapping to consecutive IDs. Use binary search #pragma omp parallel for for (int i = 0; i < launchParams_.numElements; i++) { - materialIdsMapped[i] = materialIdMap[materialIds[i]]; + int idx = 0, matId = static_cast(materialIds[i]); + for (; idx < uniqueMaterialIds_.size(); ++idx) { + if (uniqueMaterialIds_[idx] == matId) + break; + } + materialIdsMapped[i] = idx; } - materialIdsBuffer_.allocUpload(materialIdsMapped); } else { - std::vector materialIdsMapped(launchParams_.numElements); + // no mapping, just copy #pragma omp parallel for for (int i = 0; i < launchParams_.numElements; i++) { materialIdsMapped[i] = static_cast(materialIds[i]); } - materialIdsBuffer_.allocUpload(materialIdsMapped); } + + // upload to device + materialMapBuffer_.allocUpload(uniqueMaterialIds_); + materialIdsBuffer_.allocUpload(materialIdsMapped); } void setNumberOfRaysPerPoint(const size_t pNumRays) { @@ -396,6 +396,9 @@ template class Trace { } } directCallablePGs_.clear(); + for (auto &s : streams_) { + CUDA_CHECK(StreamDestroy(s)); + } } unsigned int prepareParticlePrograms() { @@ -423,6 +426,12 @@ template class Trace { VIENNACORE_LOG_DEBUG("Number of flux arrays: " + std::to_string(numFluxes_)); + // each particle gets its own stream + streams_.resize(particles_.size()); + for (size_t i = 0; i < particles_.size(); i++) { + CUDA_CHECK(StreamCreate(&streams_[i])); + } + return numFluxes_; } @@ -455,8 +464,19 @@ template class Trace { launchParams_.customData = (void *)d_params; } + void syncStreams() { + if (isSynced_) + return; + + for (auto &s : streams_) { + CUDA_CHECK(StreamSynchronize(s)); + } + isSynced_ = true; + } + void downloadResults() { if (!resultsDownloaded_) { + syncStreams(); results_.resize(launchParams_.numElements * numFluxes_); resultBuffer_.download(results_.data(), launchParams_.numElements * numFluxes_); @@ -752,7 +772,7 @@ template class Trace { std::unordered_map particleMap_; std::vector callableMap_; - std::set uniqueMaterialIds_; + std::vector uniqueMaterialIds_; CudaBuffer materialIdsBuffer_; float gridDelta_ = 0.0f; @@ -788,6 +808,7 @@ template class Trace { // launch parameters LaunchParams launchParams_; std::vector launchParamsBuffers_; // one per particle + std::vector streams_; // results Buffer CudaBuffer resultBuffer_; @@ -796,6 +817,7 @@ template class Trace { rayInternal::KernelConfig config_; bool ignoreBoundary_ = false; bool resultsDownloaded_ = false; + bool isSynced_ = false; size_t numRays_ = 0; unsigned numCellData_ = 0; diff --git a/include/viennaray/gpu/raygTraceDisk.hpp b/include/viennaray/gpu/raygTraceDisk.hpp index 5a2a572..db442fb 100644 --- a/include/viennaray/gpu/raygTraceDisk.hpp +++ b/include/viennaray/gpu/raygTraceDisk.hpp @@ -169,6 +169,7 @@ template class TraceDisk final : public Trace { areaBuffer.allocUpload(areas); CUdeviceptr d_areas = areaBuffer.dPointer(); CUdeviceptr d_data = this->resultBuffer_.dPointer(); + this->syncStreams(); void *kernel_args[] = { &d_data, &d_areas, &launchParams_.numElements, diff --git a/include/viennaray/gpu/raygTraceLine.hpp b/include/viennaray/gpu/raygTraceLine.hpp index e738a08..89f0444 100644 --- a/include/viennaray/gpu/raygTraceLine.hpp +++ b/include/viennaray/gpu/raygTraceLine.hpp @@ -47,6 +47,7 @@ template class TraceLine final : public Trace { areaBuffer.allocUpload(areas); CUdeviceptr d_areas = areaBuffer.dPointer(); CUdeviceptr d_data = this->resultBuffer_.dPointer(); + this->syncStreams(); void *kernel_args[] = { &d_data, &d_areas, &launchParams_.numElements, diff --git a/include/viennaray/gpu/raygTraceTriangle.hpp b/include/viennaray/gpu/raygTraceTriangle.hpp index 2e9cf94..43219f2 100644 --- a/include/viennaray/gpu/raygTraceTriangle.hpp +++ b/include/viennaray/gpu/raygTraceTriangle.hpp @@ -51,6 +51,7 @@ template class TraceTriangle final : public Trace { launchParams_.source.minPoint[0]) * (launchParams_.source.maxPoint[1] - launchParams_.source.minPoint[1]); } + this->syncStreams(); assert(this->resultBuffer_.sizeInBytes != 0 && "Normalization: Result buffer not initialized."); CUdeviceptr d_data = this->resultBuffer_.dPointer();