diff --git a/src/AtomicMacro.hh b/src/AtomicMacro.hh index 4c31c853..3e01a15f 100644 --- a/src/AtomicMacro.hh +++ b/src/AtomicMacro.hh @@ -1,6 +1,7 @@ -//Determine which atomics to use based on platform being compiled for -// -//If compiling with CUDA +#ifndef AtomicMacro_HH_ +#define AtomicMacro_HH_ + +#define USE_MACRO_FUNCTIONS 1 #ifdef HAVE_OPENMP #define USE_OPENMP_ATOMICS @@ -8,6 +9,228 @@ #define USE_OPENMP_ATOMICS #endif +#ifdef HAVE_STDPAR + #define USE_CXX20_ATOMICS +#endif + +// -------------------------------------------------- +// Original Names -> Inline function names +// -------------------------------------------------- +// ATOMIC_WRITE( x, v ) -> ATOMIC_WRITE +// ATOMIC_UPDATE( x ) -> ATOMIC_INCREMENT +// ATOMIC_ADD( x, v ) -> ATOMIC_ADD +// ATOMIC_CAPTURE( x, v, p ) -> ATOMIC_FETCH_ADD +// -------------------------------------------------- + +#if defined (USE_MACRO_FUNCTIONS) + +#define ATOMIC_CAPTURE( x, v, p ) ATOMIC_FETCH_ADD((x),(v),(p)) +#define ATOMIC_UPDATE( x ) ATOMIC_INCREMENT((x)) + +#if defined(USE_CXX20_ATOMICS) + + #if (__cplusplus > 201703L) + + #include + + #if defined(__cpp_lib_atomic_float) && defined(__cpp_lib_atomic_ref) + + template + inline void ATOMIC_WRITE(T & x, T v) { + //x = v; + std::atomic_ref r{x}; + r = v; + } + + template + inline void ATOMIC_INCREMENT(T& x) { + //atomicAdd( &x, 1 ); + std::atomic_ref r{x}; + r++; + } + + template + inline void ATOMIC_ADD(T& x, T v) { + //atomicAdd( &x, v ); + std::atomic_ref r{x}; + r+=v; + } + + template + inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + //atomicAdd( &x, v ); + std::atomic_ref r{x}; + r+=v; + } + + template + inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + //p = atomicAdd( &x, v ); + std::atomic_ref r{x}; + p = r.fetch_add(v); + } + + template + inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + //p = atomicAdd( &x, v ); + std::atomic_ref r{x}; + p = r.fetch_add(v); + } + + template + inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + //p = atomicAdd( &x, v ); + std::atomic_ref r{x}; + p = r.fetch_add(v); + } + + #else + #error Your supposedly C++20 compiler does not support atomic_ref. + #endif + + #else + #error Sorry, you need C++20. + #endif + +#elif defined(HAVE_CUDA) && defined(__CUDA_ARCH__) + +template +inline void ATOMIC_WRITE(T & x, T v) { + x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + atomicAdd( &x, 1 ); +} + +template +inline void ATOMIC_ADD(T& x, T v) { + atomicAdd( &x, v ); +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + atomicAdd( &x, v ); +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + p = atomicAdd( &x, v ); +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + p = atomicAdd( &x, v ); +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + p = atomicAdd( &x, v ); +} + +#elif defined(USE_OPENMP_ATOMICS) + +#warning Should not be here + +template +inline void ATOMIC_WRITE(T & x, T v) { + _Pragma("omp atomic write") + x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + _Pragma("omp atomic update") + x++; +} + +template +inline void ATOMIC_ADD(T& x, T v) { + _Pragma("omp atomic") + x += v; +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + _Pragma("omp atomic") + x += v; +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + _Pragma("omp atomic capture") + {p = x; x = x + v;} +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + _Pragma("omp atomic capture") + {p = x; x = x + v;} +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + _Pragma("omp atomic capture") + {p = x; x = x + v;} +} + +#else // SEQUENTIAL + +template +inline void ATOMIC_WRITE(T & x, T v) { + x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + x++; +} + +template +inline void ATOMIC_ADD(T& x, T v) { + x += v; +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + x += v; +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + {p = x; x = x + v;} +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + {p = x; x = x + v;} +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + {p = x; x = x + v;} +} + +#endif // BACKENDS + +#else // ! USE_MACRO_FUNCTIONS #if defined (HAVE_CUDA) @@ -16,18 +239,20 @@ //Currently not atomic here. But its only used when it does not necissarially need to be atomic. #define ATOMIC_WRITE( x, v ) \ - x = v; + x = v; #define ATOMIC_ADD( x, v ) \ atomicAdd( &x, v ); - + #define ATOMIC_UPDATE( x ) \ atomicAdd( &x, 1 ); #define ATOMIC_CAPTURE( x, v, p ) \ p = atomicAdd( &x, v ); + //If in a CPU OpenMP section use the OpenMP atomics #elif defined (USE_OPENMP_ATOMICS) + #define ATOMIC_WRITE( x, v ) \ _Pragma("omp atomic write") \ x = v; @@ -46,6 +271,7 @@ //If in a serial section, no need to use atomics #else + #define ATOMIC_WRITE( x, v ) \ x = v; @@ -62,6 +288,7 @@ //If in a OpenMP section use the OpenMP atomics #elif defined (USE_OPENMP_ATOMICS) + #define ATOMIC_WRITE( x, v ) \ _Pragma("omp atomic write") \ x = v; @@ -74,12 +301,13 @@ _Pragma("omp atomic update") \ x++; - #define ATOMIC_CAPTURE( x, v, p ) \ - _Pragma("omp atomic capture") \ - {p = x; x = x + v;} + #define ATOMIC_CAPTURE( x, v, p ) \ + _Pragma("omp atomic capture") \ + {p = x; x = x + v;} //If in a serial section, no need to use atomics #else + #define ATOMIC_WRITE( x, v ) \ x = v; @@ -91,4 +319,9 @@ #define ATOMIC_CAPTURE( x, v, p ) \ {p = x; x = x + v;} -#endif + +#endif // BACKENDS + +#endif // USE_MACRO_FUNCTIONS + +#endif // AtomicMacro_HH_ diff --git a/src/Makefile b/src/Makefile index 5f42dc12..a9d04510 100644 --- a/src/Makefile +++ b/src/Makefile @@ -114,6 +114,18 @@ LDFLAGS = #LDFLAGS = $(OPENMP_LDFLAGS) +############################################################################### +### C++ parallelism +############################################################################### + +#OPTFLAGS = -g -O3 + +#CXX=clang++-12 +#CXXFLAGS = -std=c++20 $(OPTFLAGS) -Wpedantic +#CPPFLAGS = -DHAVE_STDPAR +## Sometimes GCC and Clang do not link this automatically. +#LDFLAGS = -ltbb + ############################################################################### ### GCC -- with MPI and OpenMP ############################################################################### diff --git a/src/StdParUtils.hpp b/src/StdParUtils.hpp new file mode 100644 index 00000000..6b946016 --- /dev/null +++ b/src/StdParUtils.hpp @@ -0,0 +1,117 @@ +/* +Copyright (c) 2021, NVIDIA +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef StdParUtils_HH_ +#define StdParUtils_HH_ + +#include + +#include +#include + +// This implementation was authored by David Olsen + +#include + +template +struct counting_iterator { + +private: + typedef counting_iterator self; + +public: + typedef T value_type; + typedef typename std::make_signed::type difference_type; + typedef T const* pointer; + typedef T const& reference; + typedef std::random_access_iterator_tag iterator_category; + + explicit counting_iterator(value_type v) : value(v) { } + + value_type operator*() const { return value; } + value_type operator[](difference_type n) const { return value + n; } + + self& operator++() { ++value; return *this; } + self operator++(int) { + self result{value}; + ++value; + return result; + } + self& operator--() { --value; return *this; } + self operator--(int) { + self result{value}; + --value; + return result; + } + self& operator+=(difference_type n) { value += n; return *this; } + self& operator-=(difference_type n) { value -= n; return *this; } + + friend self operator+(self const& i, difference_type n) { + return self(i.value + n); + } + friend self operator+(difference_type n, self const& i) { + return self(i.value + n); + } + friend difference_type operator-(self const& x, self const& y) { + return x.value - y.value; + } + friend self operator-(self const& i, difference_type n) { + return self(i.value - n); + } + + friend bool operator==(self const& x, self const& y) { + return x.value == y.value; + } + friend bool operator!=(self const& x, self const& y) { + return x.value != y.value; + } + friend bool operator<(self const& x, self const& y) { + return x.value < y.value; + } + friend bool operator<=(self const& x, self const& y) { + return x.value <= y.value; + } + friend bool operator>(self const& x, self const& y) { + return x.value > y.value; + } + friend bool operator>=(self const& x, self const& y) { + return x.value >= y.value; + } +private: + value_type value; +}; + +template ::value>::type> +inline counting_iterator make_counter(T value) { + return counting_iterator{value}; +} + +#endif // StdParUtils_HH_ diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh index 868c5477..5982e4cf 100644 --- a/src/cudaUtils.hh +++ b/src/cudaUtils.hh @@ -21,7 +21,7 @@ #define VAR_MEM MemoryControl::AllocationPolicy::HOST_MEM #endif -enum ExecutionPolicy{ cpu, gpuWithCUDA, gpuWithOpenMP }; +enum ExecutionPolicy{ cpu, cpuWithStdPar, gpuWithCUDA, gpuWithOpenMP }; inline ExecutionPolicy getExecutionPolicy( int useGPU ) { @@ -34,6 +34,10 @@ inline ExecutionPolicy getExecutionPolicy( int useGPU ) #elif defined (HAVE_OPENMP_TARGET) execPolicy = ExecutionPolicy::gpuWithOpenMP; #endif + } else { + #if defined (HAVE_STDPAR) + execPolicy = ExecutionPolicy::cpuWithStdPar; + #endif } return execPolicy; } diff --git a/src/main.cc b/src/main.cc index bb9517b2..c0e35544 100644 --- a/src/main.cc +++ b/src/main.cc @@ -2,6 +2,7 @@ #include "utils.hh" #include "Parameters.hh" #include "utilsMpi.hh" +#include "StdParUtils.hpp" #include "MonteCarlo.hh" #include "initMC.hh" #include "Tallies.hh" @@ -198,7 +199,7 @@ void cycleTracking(MonteCarlo *monteCarlo) #endif } break; - + case gpuWithOpenMP: { int nthreads=128; @@ -224,6 +225,23 @@ void cycleTracking(MonteCarlo *monteCarlo) } break; + case cpuWithStdPar: + { + static int printed{0}; + if (!printed) { + std::cout << "cpuWithStdPar" << std::endl; + printed++; + } + + auto begin = counting_iterator(0); + auto end = counting_iterator(numParticles); + std::for_each( std::execution::par_unseq, begin, end, [=](int particle_index) { + CycleTrackingGuts( monteCarlo, particle_index, processingVault, processedVault ); + }); + + } + break; + case cpu: #include "mc_omp_parallel_for_schedule_static.hh" for ( int particle_index = 0; particle_index < numParticles; particle_index++ ) @@ -231,6 +249,7 @@ void cycleTracking(MonteCarlo *monteCarlo) CycleTrackingGuts( monteCarlo, particle_index, processingVault, processedVault ); } break; + default: qs_assert(false); } // end switch diff --git a/src/utils.cc b/src/utils.cc index d6bf879d..685fb3f8 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -2,6 +2,7 @@ #include #include "qs_assert.hh" #include "utilsMpi.hh" +#include "StdParUtils.hpp" #include "macros.hh" #include #include @@ -71,8 +72,12 @@ void printBanner(const char *git_version, const char *git_hash) printf("Quicksilver Git Hash : %s\n",git_hash); printf("MPI Version : %d.%d\n",mpi_major,mpi_minor); printf("Number of MPI ranks : %d\n",size); +#ifdef HAVE_STDPAR + printf("Number of C++ HW Threads: %d\n\n",(int)std::thread::hardware_concurrency()); +#else printf("Number of OpenMP Threads: %d\n",(int)omp_get_max_threads()); printf("Number of OpenMP CPUs : %d\n\n",(int)omp_get_num_procs()); +#endif } }