diff --git a/src/AtomicMacro.hh b/src/AtomicMacro.hh index 4c31c853..3a67f605 100644 --- a/src/AtomicMacro.hh +++ b/src/AtomicMacro.hh @@ -1,3 +1,8 @@ +#ifndef AtomicMacro_HH_ +#define AtomicMacro_HH_ + +#define USE_MACRO_FUNCTIONS 1 + //Determine which atomics to use based on platform being compiled for // //If compiling with CUDA @@ -8,26 +13,260 @@ #define USE_OPENMP_ATOMICS #endif +#ifdef HAVE_SYCL +#include +#include +#endif + +// -------------------------------------------------- +// Original Names -> Inline function names +// -------------------------------------------------- +// ATOMIC_WRITE( x, v ) -> ATOMIC_WRITE +// ATOMIC_UPDATE( x ) -> ATOMIC_INCREMENT +// ATOMIC_ADD( x, v ) -> ATOMIC_ADD +// ATOMIC_CAPTURE( x, v, p ) -> ATOMIC_FETCH_ADD +// -------------------------------------------------- + +#if defined (USE_MACRO_FUNCTIONS) + +#define ATOMIC_CAPTURE( x, v, p ) ATOMIC_FETCH_ADD((x),(v),(p)) +#define ATOMIC_UPDATE( x ) ATOMIC_INCREMENT((x)) + +#if defined(HAVE_SYCL) + +static const cl::sycl::memory_order order = cl::sycl::memory_order::relaxed; +static const cl::sycl::access::address_space space = cl::sycl::access::address_space::global_space; + +template +inline void ATOMIC_WRITE(T & x, T v) { + //x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + //atomicAdd( &x, 1 ); + T one{1}; + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + cl::sycl::atomic_fetch_add(y, one, order); +} + +template +inline void ATOMIC_ADD(T& x, T v) { + //atomicAdd( &x, v ); + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + cl::sycl::atomic_fetch_add(y, v, order); +} + +template <> +inline void ATOMIC_ADD(double& x, double v) { + static_assert(sizeof(double) == sizeof(uint64_t), "Unsafe: double is not 64-bits"); + //atomicAdd( &x, v ); + cl::sycl::atomic t( (cl::sycl::multi_ptr( reinterpret_cast(&x)) )); + uint64_t old_i = t.load(order); + double old_d; + do { + old_d = *reinterpret_cast(&old_i); + const double new_d = old_d + v; + const uint64_t new_i = *reinterpret_cast(&new_d); + if (t.compare_exchange_strong(old_i, new_i, order)) break; + } while (true); + // p = old_d; +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + //atomicAdd( &x, v ); + T1 val = static_cast(v); + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + cl::sycl::atomic_fetch_add(y, val, order); +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + //p = atomicAdd( &x, v ); + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + p = cl::sycl::atomic_fetch_add(y, v, order); +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + //p = atomicAdd( &x, v ); + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + T1 val = static_cast(v); + p = cl::sycl::atomic_fetch_add(y, val, order); +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + //p = atomicAdd( &x, v ); + T1 val = static_cast(v); + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + p = cl::sycl::atomic_fetch_add(y, val, order); +} + +#elif defined(HAVE_CUDA) && defined(__CUDA_ARCH__) + +template +inline void ATOMIC_WRITE(T & x, T v) { + x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + atomicAdd( &x, 1 ); +} + +template +inline void ATOMIC_ADD(T& x, T v) { + atomicAdd( &x, v ); +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + atomicAdd( &x, v ); +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + p = atomicAdd( &x, v ); +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + p = atomicAdd( &x, v ); +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + p = atomicAdd( &x, v ); +} + +#elif defined(USE_OPENMP_ATOMICS) + +template +inline void ATOMIC_WRITE(T & x, T v) { + _Pragma("omp atomic write") + x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + _Pragma("omp atomic update") + x++; +} + +template +inline void ATOMIC_ADD(T& x, T v) { + _Pragma("omp atomic") + x += v; +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + _Pragma("omp atomic") + x += v; +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + _Pragma("omp atomic capture") + {p = x; x = x + v;} +} -#if defined (HAVE_CUDA) +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + _Pragma("omp atomic capture") + {p = x; x = x + v;} +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + _Pragma("omp atomic capture") + {p = x; x = x + v;} +} + +#else // SEQUENTIAL + +template +inline void ATOMIC_WRITE(T & x, T v) { + x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + x++; +} + +template +inline void ATOMIC_ADD(T& x, T v) { + x += v; +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + x += v; +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + {p = x; x = x + v;} +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + {p = x; x = x + v;} +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + {p = x; x = x + v;} +} + +#endif // BACKENDS + +#else // ! USE_MACRO_FUNCTIONS + +#if defined (HAVE_SYCL) + +#error You must define USE_MACRO_FUNCTIONS with SYCL! + +#elif defined (HAVE_CUDA) //If in a CUDA GPU section use the CUDA atomics #ifdef __CUDA_ARCH__ //Currently not atomic here. But its only used when it does not necissarially need to be atomic. #define ATOMIC_WRITE( x, v ) \ - x = v; + x = v; #define ATOMIC_ADD( x, v ) \ atomicAdd( &x, v ); - + #define ATOMIC_UPDATE( x ) \ atomicAdd( &x, 1 ); #define ATOMIC_CAPTURE( x, v, p ) \ p = atomicAdd( &x, v ); + //If in a CPU OpenMP section use the OpenMP atomics #elif defined (USE_OPENMP_ATOMICS) + #define ATOMIC_WRITE( x, v ) \ _Pragma("omp atomic write") \ x = v; @@ -46,6 +285,7 @@ //If in a serial section, no need to use atomics #else + #define ATOMIC_WRITE( x, v ) \ x = v; @@ -62,6 +302,7 @@ //If in a OpenMP section use the OpenMP atomics #elif defined (USE_OPENMP_ATOMICS) + #define ATOMIC_WRITE( x, v ) \ _Pragma("omp atomic write") \ x = v; @@ -74,12 +315,13 @@ _Pragma("omp atomic update") \ x++; - #define ATOMIC_CAPTURE( x, v, p ) \ - _Pragma("omp atomic capture") \ - {p = x; x = x + v;} + #define ATOMIC_CAPTURE( x, v, p ) \ + _Pragma("omp atomic capture") \ + {p = x; x = x + v;} //If in a serial section, no need to use atomics #else + #define ATOMIC_WRITE( x, v ) \ x = v; @@ -91,4 +333,9 @@ #define ATOMIC_CAPTURE( x, v, p ) \ {p = x; x = x + v;} -#endif + +#endif // BACKENDS + +#endif // USE_MACRO_FUNCTIONS + +#endif // AtomicMacro_HH_ diff --git a/src/CollisionEvent.cc b/src/CollisionEvent.cc index 20b667b4..b0522016 100644 --- a/src/CollisionEvent.cc +++ b/src/CollisionEvent.cc @@ -11,6 +11,8 @@ #include "PhysicalConstants.hh" #include "DeclareMacro.hh" #include "AtomicMacro.hh" +#include "mathHelp.hh" +#include "cudaUtils.hh" #define MAX_PRODUCTION_SIZE 4 @@ -21,19 +23,19 @@ // Return true if the particle will continue. //---------------------------------------------------------------------------------------------------------------------- -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void updateTrajectory( double energy, double angle, MC_Particle& particle ) { particle.kinetic_energy = energy; double cosTheta = angle; double randomNumber = rngSample(&particle.random_number_seed); double phi = 2 * 3.14159265 * randomNumber; - double sinPhi = sin(phi); - double cosPhi = cos(phi); - double sinTheta = sqrt((1.0 - (cosTheta*cosTheta))); + double sinPhi = SIN(phi); + double cosPhi = COS(phi); + double sinTheta = SQRT((1.0 - (cosTheta*cosTheta))); particle.direction_cosine.Rotate3DVector(sinTheta, cosTheta, sinPhi, cosPhi); double speed = (PhysicalConstants::_speedOfLight * - sqrt((1.0 - ((PhysicalConstants::_neutronRestMassEnergy * + SQRT((1.0 - ((PhysicalConstants::_neutronRestMassEnergy * PhysicalConstants::_neutronRestMassEnergy) / ((energy + PhysicalConstants::_neutronRestMassEnergy) * (energy + PhysicalConstants::_neutronRestMassEnergy)))))); @@ -41,12 +43,11 @@ void updateTrajectory( double energy, double angle, MC_Particle& particle ) particle.velocity.y = speed * particle.direction_cosine.beta; particle.velocity.z = speed * particle.direction_cosine.gamma; randomNumber = rngSample(&particle.random_number_seed); - particle.num_mean_free_paths = -1.0*log(randomNumber); + particle.num_mean_free_paths = -1.0*LOG(randomNumber); } HOST_DEVICE_END -HOST_DEVICE - +HOST_DEVICE SYCL_EXTERNAL bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned int tally_index) { const MC_Cell_State &cell = monteCarlo->domain[mc_particle.domain].cell_state[mc_particle.cell]; @@ -116,8 +117,18 @@ bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned i ATOMIC_ADD( monteCarlo->_tallies->_balanceTask[tally_index]._produce, nOut); break; case NuclearDataReaction::Undefined: + { +#ifdef HAVE_SYCL +#if HAVE_SYCL_PRINTF + static const OPENCL_CONSTANT char format[] = "reactionType invalid\n"; + syclx::printf(format); +#endif +#else printf("reactionType invalid\n"); +#endif qs_assert(false); + break; + } } if( nOut == 0 ) return false; diff --git a/src/CollisionEvent.hh b/src/CollisionEvent.hh index 270ae382..d1d42162 100644 --- a/src/CollisionEvent.hh +++ b/src/CollisionEvent.hh @@ -5,7 +5,7 @@ class MonteCarlo; class MC_Particle; #include "DeclareMacro.hh" -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned int tally_index ); HOST_DEVICE_END diff --git a/src/CycleTracking.cc b/src/CycleTracking.cc index e6238a83..1286bc4a 100644 --- a/src/CycleTracking.cc +++ b/src/CycleTracking.cc @@ -11,7 +11,7 @@ #include "macros.hh" #include "qs_assert.hh" -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void CycleTrackingGuts( MonteCarlo *monteCarlo, int particle_index, ParticleVault *processingVault, ParticleVault *processedVault ) { MC_Particle mc_particle; @@ -30,7 +30,7 @@ void CycleTrackingGuts( MonteCarlo *monteCarlo, int particle_index, ParticleVaul } HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void CycleTrackingFunction( MonteCarlo *monteCarlo, MC_Particle &mc_particle, int particle_index, ParticleVault* processingVault, ParticleVault* processedVault) { bool keepTrackingThisParticle = false; diff --git a/src/CycleTracking.hh b/src/CycleTracking.hh index 4dea37a0..f1e04426 100644 --- a/src/CycleTracking.hh +++ b/src/CycleTracking.hh @@ -5,10 +5,10 @@ class ParticleVault; class MonteCarlo; class MC_Particle; -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void CycleTrackingGuts( MonteCarlo *monteCarlo, int particle_index, ParticleVault *processingVault, ParticleVault *processedVault ); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void CycleTrackingFunction( MonteCarlo *monteCarlo, MC_Particle &mc_particle, int particle_index, ParticleVault* processingVault, ParticleVault* processedVault); HOST_DEVICE_END diff --git a/src/DeclareMacro.hh b/src/DeclareMacro.hh index 3599190f..9efa61e1 100644 --- a/src/DeclareMacro.hh +++ b/src/DeclareMacro.hh @@ -1,6 +1,10 @@ #ifndef DECLAREMACRO_HH #define DECLAREMACRO_HH +#ifdef HAVE_SYCL +#include +#endif + #ifdef HAVE_CUDA #define HOST_DEVICE __host__ __device__ #define HOST_DEVICE_CUDA __host__ __device__ diff --git a/src/DirectionCosine.cc b/src/DirectionCosine.cc index ded7a30b..fc3f5571 100644 --- a/src/DirectionCosine.cc +++ b/src/DirectionCosine.cc @@ -1,13 +1,14 @@ #include "DirectionCosine.hh" #include "MC_RNG_State.hh" #include "PhysicalConstants.hh" +#include "mathHelp.hh" void DirectionCosine::Sample_Isotropic(uint64_t *seed) { this->gamma = 1.0 - 2.0*rngSample(seed); - double sine_gamma = sqrt((1.0 - (gamma*gamma))); + double sine_gamma = SQRT((1.0 - (gamma*gamma))); double phi = PhysicalConstants::_pi*(2.0*rngSample(seed) - 1.0); - this->alpha = sine_gamma * cos(phi); - this->beta = sine_gamma * sin(phi); + this->alpha = sine_gamma * COS(phi); + this->beta = sine_gamma * SIN(phi); } diff --git a/src/DirectionCosine.hh b/src/DirectionCosine.hh index e7dd7363..325721dc 100644 --- a/src/DirectionCosine.hh +++ b/src/DirectionCosine.hh @@ -1,9 +1,9 @@ #ifndef DIRECTION_COSINE_INCLUDE #define DIRECTION_COSINE_INCLUDE -#include #include "portability.hh" #include "DeclareMacro.hh" +#include "mathHelp.hh" HOST_DEVICE_CLASS class DirectionCosine @@ -31,7 +31,7 @@ public: void Sample_Isotropic(uint64_t *seed); // rotate a direction cosine given the sine/cosine of theta and phi - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL inline void Rotate3DVector( double sine_Theta, double cosine_Theta, double sine_Phi, @@ -119,12 +119,12 @@ HOST_DEVICE_END // direction_cosine.beta = cos_theta*sin_phi*Alpha + cos_phi*Beta + sin_theta*sin_phi*Gamma; // direction_cosine.gamma = -sin_theta *Alpha + cos_theta *Gamma; //---------------------------------------------------------------------------------------------------------------------- -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL inline void DirectionCosine::Rotate3DVector(double sin_Theta, double cos_Theta, double sin_Phi, double cos_Phi) { // Calculate additional variables in the rotation matrix. double cos_theta = this->gamma; - double sin_theta = sqrt((1.0 - (cos_theta*cos_theta))); + double sin_theta = SQRT((1.0 - (cos_theta*cos_theta))); double cos_phi; double sin_phi; diff --git a/src/MCT.cc b/src/MCT.cc index 3faf3592..200a57bc 100644 --- a/src/MCT.cc +++ b/src/MCT.cc @@ -397,7 +397,7 @@ namespace /// Reflects the particle off of a reflection boundary. -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void MCT_Reflect_Particle(MonteCarlo *monteCarlo, MC_Particle &particle) { DirectionCosine *direction_cosine = particle.Get_Direction_Cosine(); diff --git a/src/MCT.hh b/src/MCT.hh index e6852acd..1e155cb0 100644 --- a/src/MCT.hh +++ b/src/MCT.hh @@ -14,7 +14,7 @@ class Subfacet_Adjacency; class MonteCarlo; -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL MC_Nearest_Facet MCT_Nearest_Facet( MC_Particle *mc_particle, MC_Location &location, @@ -22,12 +22,12 @@ MC_Nearest_Facet MCT_Nearest_Facet( const DirectionCosine *direction_cosine, double distance_threshold, double current_best_distance, - bool new_segment, + bool new_segment, MonteCarlo* monteCarlo); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void MCT_Generate_Coordinate_3D_G( uint64_t *random_number_seed, int domain_num, @@ -36,17 +36,17 @@ void MCT_Generate_Coordinate_3D_G( MonteCarlo* monteCarlo); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL MC_Vector MCT_Cell_Position_3D_G( const MC_Domain &domain, int cell_index); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL Subfacet_Adjacency &MCT_Adjacent_Facet(const MC_Location &location, MC_Particle &mc_particle, MonteCarlo* monteCarlo); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void MCT_Reflect_Particle(MonteCarlo *mcco, MC_Particle &particle); HOST_DEVICE_END diff --git a/src/MC_Facet_Crossing_Event.hh b/src/MC_Facet_Crossing_Event.hh index 98170036..8b0743e7 100644 --- a/src/MC_Facet_Crossing_Event.hh +++ b/src/MC_Facet_Crossing_Event.hh @@ -7,7 +7,7 @@ class ParticleVault; class MC_Particle; -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL MC_Tally_Event::Enum MC_Facet_Crossing_Event(MC_Particle &mc_particle, MonteCarlo* monteCarlo, int particle_index, ParticleVault* processingVault); HOST_DEVICE_END diff --git a/src/MC_Facet_Geometry.hh b/src/MC_Facet_Geometry.hh index 943e86da..12b9e809 100644 --- a/src/MC_Facet_Geometry.hh +++ b/src/MC_Facet_Geometry.hh @@ -22,7 +22,7 @@ public: C = ((r1.x - r0.x)*(r2.y - r0.y)) - ((r1.y - r0.y)*(r2.x - r0.x)); D = -1.0*(A*r0.x + B*r0.y + C*r0.z); - double magnitude = sqrt(A * A + B * B + C * C); + double magnitude = std::sqrt(A * A + B * B + C * C); if ( magnitude == 0.0 ) { diff --git a/src/MC_Fast_Timer.cc b/src/MC_Fast_Timer.cc index 70aa7bf9..0ea8366e 100644 --- a/src/MC_Fast_Timer.cc +++ b/src/MC_Fast_Timer.cc @@ -25,7 +25,7 @@ static double mc_std_dev(uint64_t const data[], int const nelm) for(int ndx=0; ndx -900.0 ) { -#if 1 +#ifdef HAVE_SYCL +#ifdef HAVE_SYCL_PRINTF + static const OPENCL_CONSTANT char format[] = " MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n"; + syclx::printf(format); +#endif +#else printf(" MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n"); - #else - std::string output_string; - MC_Warning( "Forced Collision: num_mean_free_paths < 0 \n" - "Particle record:\n%s", output_string.c_str()); #endif } @@ -80,7 +82,7 @@ MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Part // the next collision from an exponential distribution. double random_number = rngSample(&mc_particle.random_number_seed); - mc_particle.num_mean_free_paths = -1.0*log(random_number); + mc_particle.num_mean_free_paths = -1.0*LOG(random_number); } // Calculate the distances to collision, nearest facet, and census. diff --git a/src/MC_Segment_Outcome.hh b/src/MC_Segment_Outcome.hh index f8d6a25c..65a60d81 100644 --- a/src/MC_Segment_Outcome.hh +++ b/src/MC_Segment_Outcome.hh @@ -32,7 +32,7 @@ struct MC_Collision_Event_Return }; #include "DeclareMacro.hh" -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned int &flux_tally_index); HOST_DEVICE_END diff --git a/src/MC_SourceNow.cc b/src/MC_SourceNow.cc index 08e23138..2c1f7e96 100644 --- a/src/MC_SourceNow.cc +++ b/src/MC_SourceNow.cc @@ -18,6 +18,7 @@ #include "AtomicMacro.hh" #include "NVTX_Range.hh" #include +#include "mathHelp.hh" namespace { @@ -116,7 +117,7 @@ void MC_SourceNow(MonteCarlo *monteCarlo) particle.weight = source_particle_weight; double randomNumber = rngSample(&particle.random_number_seed); - particle.num_mean_free_paths = -1.0*log(randomNumber); + particle.num_mean_free_paths = -1.0*LOG(randomNumber); randomNumber = rngSample(&particle.random_number_seed); particle.time_to_census = monteCarlo->time_info->time_step * randomNumber; @@ -172,7 +173,7 @@ namespace static const double speed_of_light = PhysicalConstants::_speedOfLight; - return speed_of_light * sqrt(energy * (energy + 2.0*(rest_mass_energy)) / + return speed_of_light * std::sqrt(energy * (energy + 2.0*(rest_mass_energy)) / ((energy + rest_mass_energy) * (energy + rest_mass_energy))); } } diff --git a/src/MC_Vector.hh b/src/MC_Vector.hh index 91b5fdf6..34368d07 100644 --- a/src/MC_Vector.hh +++ b/src/MC_Vector.hh @@ -1,8 +1,8 @@ #ifndef MC_VECTOR_INCLUDE #define MC_VECTOR_INCLUDE -#include #include "DeclareMacro.hh" +#include "mathHelp.hh" HOST_DEVICE_CLASS class MC_Vector @@ -17,7 +17,7 @@ class MC_Vector HOST_DEVICE_CUDA MC_Vector(double a, double b, double c) : x(a), y(b), z(c) {} - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL MC_Vector& operator=( const MC_Vector&tmp ) { if ( this == &tmp ) { return *this; } @@ -29,13 +29,13 @@ class MC_Vector return *this; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL bool operator==( const MC_Vector& tmp ) { return tmp.x == x && tmp.y == y && tmp.z == z; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL MC_Vector& operator+=( const MC_Vector &tmp ) { x += tmp.x; @@ -44,7 +44,7 @@ class MC_Vector return *this; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL MC_Vector& operator-=( const MC_Vector &tmp ) { x -= tmp.x; @@ -53,7 +53,7 @@ class MC_Vector return *this; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL MC_Vector& operator*=(const double scalar) { x *= scalar; @@ -62,7 +62,7 @@ class MC_Vector return *this; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL MC_Vector& operator/=(const double scalar) { x /= scalar; @@ -71,39 +71,39 @@ class MC_Vector return *this; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL const MC_Vector operator+( const MC_Vector &tmp ) const { return MC_Vector(x + tmp.x, y + tmp.y, z + tmp.z); } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL const MC_Vector operator-( const MC_Vector &tmp ) const { return MC_Vector(x - tmp.x, y - tmp.y, z - tmp.z); } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL const MC_Vector operator*(const double scalar) const { return MC_Vector(scalar*x, scalar*y, scalar*z); } - HOST_DEVICE_CUDA - inline double Length() const { return std::sqrt(x*x + y*y + z*z); } + HOST_DEVICE_CUDA SYCL_EXTERNAL + inline double Length() const { return SQRT(x*x + y*y + z*z); } // Distance from this vector to another point. - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL inline double Distance(const MC_Vector& vv) const - { return std::sqrt((x - vv.x)*(x - vv.x) + (y - vv.y)*(y - vv.y)+ (z - vv.z)*(z - vv.z)); } + { return SQRT((x - vv.x)*(x - vv.x) + (y - vv.y)*(y - vv.y)+ (z - vv.z)*(z - vv.z)); } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL inline double Dot(const MC_Vector &tmp) const { return this->x*tmp.x + this->y*tmp.y + this->z*tmp.z; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL inline MC_Vector Cross(const MC_Vector &v) const { return MC_Vector(y * v.z - z * v.y, diff --git a/src/MacroscopicCrossSection.hh b/src/MacroscopicCrossSection.hh index 68ab7e1b..b059556e 100644 --- a/src/MacroscopicCrossSection.hh +++ b/src/MacroscopicCrossSection.hh @@ -5,12 +5,12 @@ class MonteCarlo; -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL double macroscopicCrossSection(MonteCarlo* monteCarlo, int reactionIndex, int domainIndex, int cellIndex, int isoIndex, int energyGroup); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL double weightedMacroscopicCrossSection(MonteCarlo* monteCarlo, int taskIndex, int domainIndex, int cellIndex, int energyGroup); HOST_DEVICE_END diff --git a/src/Makefile b/src/Makefile index 5f42dc12..54b73d54 100644 --- a/src/Makefile +++ b/src/Makefile @@ -99,6 +99,33 @@ CXXFLAGS = CPPFLAGS = LDFLAGS = +############################################################################### +# Cuda on LLNL CORAL EA nodes +############################################################################### +## Choose one Cuda path +#CUDA_PATH = /usr +#CUDA_PATH = /usr/tcetmp/packages/cuda-9.0.176 + +#HOST_COMPILER = g++ + +#OPTFLAGS = -O2 +## Version below for debugging +##OPTFLAGS = -DUSE_NVTX -g -G -lineinfo -O0 + +#CUDA_FLAGS = -I${CUDA_PATH}/include/ +#CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart +# +#CXX=$(CUDA_PATH)/bin/nvcc +#CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) #-Xptxas -v +#CXXFLAGS += -ccbin clang-8 +#CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\" +#CXXFLAGS += --compiler-bindir=$(HOST_COMPILER) +#CPPFLAGS = -x cu -dc +#CPPFLAGS += -DHAVE_MPI -DHAVE_ASYNC_MPI +#LDFLAGS = $(CUDA_LDFLAGS) +#LDFLAGS += -lstdc++ +##LDFLAGS += ${CUDA_PATH}/lib64/libnvToolsExt.so + ############################################################################### # Very simple GCC build with OpenMP but without MPI. # This works on a Macbook (if gcc is installed) @@ -128,18 +155,6 @@ LDFLAGS = #LDFLAGS = $(OPENMP_LDFLAGS) -############################################################################### -# LLNL LC BG/Q Comilers # -############################################################################### -### BGQ GNU -#OPTFLAGS = -g -O2 -## -#CXX=/usr/local/tools/compilers/ibm/mpicxx-4.8.4 -#CXXFLAGS = -std=c++11 $(OPTFLAGS) -#CPPFLAGS = -DCHRONO_MISSING -DHAVE_MPI -DHAVE_OPENMP -fopenmp -#LDFLAGS = -fopenmp - - ############################################################################### # OpenMP 4.5 on LLNL CORAL EA nodes ############################################################################### @@ -167,86 +182,45 @@ LDFLAGS = ############################################################################### -# Cuda on LLNL CORAL EA nodes +# SYCL/DPC++ ############################################################################### -## Choose one Cuda path -##CUDA_PATH = /usr/local/cuda-8.0 -#CUDA_PATH = /usr/tcetmp/packages/cuda-9.0.176 - -#HOST_COMPILER = /usr/tce/packages/spectrum-mpi/spectrum-mpi-2017.04.03-xl-beta-2017.09.13/bin/mpixlC - -#OPTFLAGS = -O2 -## Version below for debugging -##OPTFLAGS = -DUSE_NVTX -g -G -lineinfo -O0 - -#CUDA_FLAGS = -I${CUDA_PATH}/include/ -#CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart -# -#CXX=$(CUDA_PATH)/bin/nvcc -#CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) -Xptxas -v -#CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\" -#CXXFLAGS += --compiler-bindir=$(HOST_COMPILER) -#CPPFLAGS = -x cu -dc -DHAVE_MPI -DHAVE_ASYNC_MPI -#LDFLAGS = $(CUDA_LDFLAGS) -##LDFLAGS += ${CUDA_PATH}/lib64/libnvToolsExt.so - - - - -############################################################################### -# LLNL TOSS GCC + OpenMP (mvapich 2 - version 1.7) [cab] -############################################################################### -#OPTFLAGS = -g -O2 -#OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp -#OPENMP_LDFLAGS = -fopenmp -# -#CXX = /usr/apps/gnu/4.9.3/bin/mpig++ -#CXXFLAGS = -std=c++0x $(OPTFLAGS) -mpi=mvapich2-gnu-1.7 -#CPPFLAGS = -DHAVE_MPI $(OPENMP_FLAGS) -#LDFLAGS = $(OPENMP_LDFLAGS) - -############################################################################### -# LLNL TOSS Intel + OpenMP (mvapich 2 - version 2.1) [quartz] -############################################################################### -#OPENMP_FLAGS = -DHAVE_OPENMP -qopenmp -#OPENMP_LDFLAGS = -qopenmp -#OPTFLAGS = -g -O2 -# -#CXX=/usr/local/bin/mpiicpc-17.0.174 -#CXXFLAGS = -std=c++11 -mpi=mvapich2-intel-2.1 -DHAVE_MPI $(OPENMP_FLAGS) -#CXXFLAGS += -wd1128 -wd64 -wd21 -#LDFLAGS = $(OPENMP_LDFLAGS) - - -############################################################################### -# LLNL TOSS Clang (cab) -############################################################################### -#CLANGPATH = /usr/global/tools/clang/chaos_5_x86_64_ib/clang-omp-3.5.0 -#OPTFLAGS = -g -O2 -# -#CXX=${CLANGPATH}/bin/mpiclang++ -#CXXFLAGS = -std=c++11 $(OPTFLAGS) -#CPPFLAGS = -DHAVE_MPI -#LDFLAGS = -Wl,-rpath,${CLANGPATH}/lib - - -############################################################################### -# Trinity Compilers # -# # -# One must 'swap' modules on this machine to access different compilers. # -############################################################################### - -### Defaults to Intel. -#OPTFLAGS = -g -O2 -xmic-avx512 -ipo -#OPENMP_FLAGS = -DHAVE_OPENMP -qopenmp -pthread -DUSE_OPENMP_NO_GPU -#OPENMP_LDFLAGS = -qopenmp -pthread -# -#CXX=CC -#CXXFLAGS = -std=c++11 $(OPTFLAGS) -#CPPFLAGS = -DHAVE_MPI -DCHRONO_MISSING $(OPENMP_FLAGS) -#LDFLAGS = $(OPENMP_LDFLAGS) - +CPPFLAGS = -DHAVE_SYCL + +CXXFLAGS = +CXXFLAGS += -std=c++17 +CXXFLAGS += -O3 +CXXFLAGS += -ffast-math +CXXFLAGS += -ferror-limit=3 + +CXXFLAGS += -fsycl +CXXFLAGS += -fsycl-unnamed-lambda + +#CXXFLAGS += -g # SPIR-V backend generates debug output when this is used + +# ------------------------------------------------------- +# Intel oneAPI DPC++ compiler that supports Intel devices +# ------------------------------------------------------- +CXX = dpcpp +# NVIDIA back-end does not support printf yet... +CPPFLAGS += -DHAVE_SYCL_PRINTF + +# works around https://github.com/intel/llvm/issues/2629 +CXXFLAGS += -fno-sycl-early-optimizations + +# GPU offline compile +# https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/gpu-flow/offline-compilation-for-gpu.html +#CXXFLAGS += -fsycl-targets=spir64_gen-unknown-linux-sycldevice +#LDFLAGS += -Xsycl-target-backend=spir64_gen-unknown-linux-sycldevice "-device skl" + +# ------------------------------------------------------- +# Intel DPC++ open-source compiler built for NVIDIA +# ------------------------------------------------------- +#DPCPP_DIR = /home/jrhammon/ISYCL/llvm/build/install +#CXX = $(DPCPP_DIR)/bin/clang++ +#CXXFLAGS += -Wno-unknown-cuda-version +#CXXFLAGS += -fsycl-targets=nvptx64-nvidia-cuda-sycldevice +#LDFLAGS += -L$(DPCPP_DIR)/lib -Wl,-rpath=$(DPCPP_DIR)/lib ################################################################################ ### Below here, it is pitch black. ### diff --git a/src/MemoryControl.hh b/src/MemoryControl.hh index 6b13aaf5..6ffb92ce 100644 --- a/src/MemoryControl.hh +++ b/src/MemoryControl.hh @@ -14,7 +14,7 @@ namespace MemoryControl { if (size == 0) { return NULL;} T* tmp = NULL; - + switch (policy) { case AllocationPolicy::HOST_MEM: @@ -22,9 +22,13 @@ namespace MemoryControl break; #ifdef HAVE_UVM case AllocationPolicy::UVM_MEM: - void *ptr; + void * ptr; +#ifdef HAVE_SYCL + ptr = (void *)sycl::malloc_shared(size * sizeof(T), sycl_device_queue); +#else cudaMallocManaged(&ptr, size*sizeof(T), cudaMemAttachGlobal); - tmp = new(ptr) T[size]; +#endif + tmp = new(ptr) T[size]; break; #endif default: @@ -40,15 +44,18 @@ namespace MemoryControl switch (policy) { case MemoryControl::AllocationPolicy::HOST_MEM: - delete[] data; + delete[] data; break; #ifdef HAVE_UVM case UVM_MEM: - for (int i=0; i < size; ++i) - data[i].~T(); + for (int i=0; i < size; ++i) data[i].~T(); +#ifdef HAVE_SYCL + sycl::free(data, sycl_device_queue); +#else cudaFree(data); +#endif break; -#endif +#endif default: qs_assert(false); break; diff --git a/src/MonteCarlo.cc b/src/MonteCarlo.cc index 28c9c47a..aca40e45 100644 --- a/src/MonteCarlo.cc +++ b/src/MonteCarlo.cc @@ -25,13 +25,19 @@ MonteCarlo::MonteCarlo(const Parameters& params) _nuclearData = 0; _materialDatabase = 0; - #if defined (HAVE_UVM) +#if defined(HAVE_UVM) void *ptr1, *ptr2, *ptr3, *ptr4; - +#ifdef HAVE_SYCL + ptr1 = (void *)sycl::malloc_shared(sizeof(Tallies), sycl_device_queue); + ptr2 = (void *)sycl::malloc_shared(sizeof(MC_Processor_Info), sycl_device_queue); + ptr3 = (void *)sycl::malloc_shared(sizeof(MC_Time_Info), sycl_device_queue); + ptr4 = (void *)sycl::malloc_shared(sizeof(MC_Fast_Timer_Container), sycl_device_queue); +#else cudaMallocManaged( &ptr1, sizeof(Tallies), cudaMemAttachHost ); cudaMallocManaged( &ptr2, sizeof(MC_Processor_Info), cudaMemAttachHost ); cudaMallocManaged( &ptr3, sizeof(MC_Time_Info), cudaMemAttachHost ); cudaMallocManaged( &ptr4, sizeof(MC_Fast_Timer_Container) ); +#endif _tallies = new(ptr1) Tallies( params.simulationParams.balanceTallyReplications, params.simulationParams.fluxTallyReplications, @@ -98,8 +104,13 @@ MonteCarlo::MonteCarlo(const Parameters& params) #if defined(HAVE_UVM) void *ptr5, *ptr6; +#ifdef HAVE_SYCL + ptr5 = (void *)sycl::malloc_shared(sizeof(MC_Particle_Buffer), sycl_device_queue); + ptr6 = (void *)sycl::malloc_shared(sizeof(ParticleVaultContainer), sycl_device_queue); +#else cudaMallocManaged( &ptr5, sizeof(MC_Particle_Buffer) ); cudaMallocManaged( &ptr6, sizeof(ParticleVaultContainer), cudaMemAttachHost ); +#endif particle_buffer = new(ptr5) MC_Particle_Buffer(this, batch_size); _particleVaultContainer = new(ptr6) ParticleVaultContainer(batch_size, num_batches, num_extra_vaults); #else @@ -125,6 +136,16 @@ MonteCarlo::~MonteCarlo() fast_timer->~MC_Fast_Timer_Container(); particle_buffer->~MC_Particle_Buffer(); +#ifdef HAVE_SYCL + sycl::free(_nuclearData, sycl_device_queue); + sycl::free(_particleVaultContainer, sycl_device_queue); + sycl::free(_materialDatabase, sycl_device_queue); + sycl::free(_tallies, sycl_device_queue); + sycl::free(processor_info, sycl_device_queue); + sycl::free(time_info, sycl_device_queue); + sycl::free(fast_timer, sycl_device_queue); + sycl::free(particle_buffer, sycl_device_queue); +#else cudaFree( _nuclearData ); cudaFree( _particleVaultContainer); cudaFree( _materialDatabase); @@ -133,8 +154,9 @@ MonteCarlo::~MonteCarlo() cudaFree( time_info); cudaFree( fast_timer); cudaFree( particle_buffer); +#endif - #else +#else delete _nuclearData; delete _particleVaultContainer; delete _materialDatabase; @@ -143,7 +165,7 @@ MonteCarlo::~MonteCarlo() delete time_info; delete fast_timer; delete particle_buffer; - #endif +#endif } void MonteCarlo::clearCrossSectionCache() diff --git a/src/NuclearData.cc b/src/NuclearData.cc index fe9ad072..f6f46326 100644 --- a/src/NuclearData.cc +++ b/src/NuclearData.cc @@ -3,9 +3,8 @@ #include "MC_RNG_State.hh" #include "DeclareMacro.hh" #include "qs_assert.hh" - -using std::log10; -using std::pow; +#include "mathHelp.hh" +#include "cudaUtils.hh" // Set the cross section values and reaction type // Cross sections are scaled to produce the supplied reactionCrossSection at 1MeV. @@ -21,7 +20,7 @@ NuclearDataReaction::NuclearDataReaction( for (int ii=0; ii #include #include #endif +#if defined(HAVE_SYCL) + +#include + +extern sycl::queue sycl_device_queue; // global variable for device queue + +#ifdef __SYCL_DEVICE_ONLY__ +#define OPENCL_CONSTANT __attribute__((opencl_constant)) +#else +#define OPENCL_CONSTANT +#endif + +// printf +//namespace syclx = sycl::ONEAPI::experimental; // latest OSS DPC++ +namespace syclx = sycl::intel::experimental; // dpcpp beta09 + +#else // not SYCL + +#define SYCL_EXTERNAL + +#endif + #ifdef HAVE_OPENMP_TARGET #ifdef USE_OPENMP_NO_GPU #define VAR_MEM MemoryControl::AllocationPolicy::HOST_MEM @@ -17,11 +39,14 @@ #elif HAVE_CUDA #define VAR_MEM MemoryControl::AllocationPolicy::UVM_MEM #define HAVE_UVM +#elif HAVE_SYCL + #define VAR_MEM MemoryControl::AllocationPolicy::UVM_MEM + #define HAVE_UVM #else #define VAR_MEM MemoryControl::AllocationPolicy::HOST_MEM #endif -enum ExecutionPolicy{ cpu, gpuWithCUDA, gpuWithOpenMP }; +enum ExecutionPolicy{ cpu, gpuWithCUDA, gpuWithOpenMP, SYCL }; inline ExecutionPolicy getExecutionPolicy( int useGPU ) { @@ -33,6 +58,8 @@ inline ExecutionPolicy getExecutionPolicy( int useGPU ) execPolicy = ExecutionPolicy::gpuWithCUDA; #elif defined (HAVE_OPENMP_TARGET) execPolicy = ExecutionPolicy::gpuWithOpenMP; + #elif defined (HAVE_SYCL) + execPolicy = ExecutionPolicy::SYCL; #endif } return execPolicy; diff --git a/src/initMC.cc b/src/initMC.cc index 1be428fd..5c41f6de 100644 --- a/src/initMC.cc +++ b/src/initMC.cc @@ -54,7 +54,11 @@ MonteCarlo* initMC(const Parameters& params) MonteCarlo* monteCarlo; #ifdef HAVE_UVM void* ptr; +#ifdef HAVE_SYCL + ptr = (void *)sycl::malloc_shared(sizeof(MonteCarlo), sycl_device_queue); +#else cudaMallocManaged( &ptr, sizeof(MonteCarlo), cudaMemAttachGlobal ); +#endif monteCarlo = new(ptr) MonteCarlo(params); #else monteCarlo = new MonteCarlo(params); @@ -77,28 +81,32 @@ namespace //Init GPU usage information void initGPUInfo( MonteCarlo* monteCarlo) { + int Ngpus = 0; #if defined(HAVE_OPENMP_TARGET) - int Ngpus = omp_get_num_devices(); + Ngpus = omp_get_num_devices(); #elif defined(HAVE_CUDA) - int Ngpus; cudaGetDeviceCount(&Ngpus); - #else - int Ngpus = 0; + #elif defined(HAVE_SYCL) + Ngpus = 1; #endif if( Ngpus != 0 ) { - #if defined(HAVE_OPENMP_TARGET) || defined(HAVE_CUDA) + #if defined(HAVE_OPENMP_TARGET) || defined(HAVE_CUDA) || defined(HAVE_SYCL) + monteCarlo->processor_info->use_gpu = 1; int GPUID = monteCarlo->processor_info->rank%Ngpus; monteCarlo->processor_info->gpu_id = GPUID; - + #if defined(HAVE_OPENMP_TARGET) omp_set_default_device(GPUID); #endif - cudaSetDevice(GPUID); - //cudaDeviceSetLimit( cudaLimitStackSize, 64*1024 ); + #ifndef HAVE_SYCL + cudaSetDevice(GPUID); + //cudaDeviceSetLimit( cudaLimitStackSize, 64*1024 ); + #endif + #endif } else @@ -132,8 +140,13 @@ namespace { #if defined HAVE_UVM void *ptr1, *ptr2; +#ifdef HAVE_SYCL + ptr1 = (void *)sycl::malloc_shared(sizeof(NuclearData), sycl_device_queue); + ptr2 = (void *)sycl::malloc_shared(sizeof(MaterialDatabase), sycl_device_queue); +#else cudaMallocManaged( &ptr1, sizeof(NuclearData), cudaMemAttachGlobal ); cudaMallocManaged( &ptr2, sizeof(MaterialDatabase), cudaMemAttachGlobal ); +#endif monteCarlo->_nuclearData = new(ptr1) NuclearData(params.simulationParams.nGroups, params.simulationParams.eMin, diff --git a/src/main.cc b/src/main.cc index bb9517b2..49972c61 100644 --- a/src/main.cc +++ b/src/main.cc @@ -26,6 +26,10 @@ #include "git_hash.hh" #include "git_vers.hh" +#ifdef HAVE_SYCL +sycl::queue sycl_device_queue; +#endif + void gameOver(); void cycleInit( bool loadBalance ); void cycleTracking(MonteCarlo* monteCarlo); @@ -43,8 +47,45 @@ int main(int argc, char** argv) Parameters params = getParameters(argc, argv); printParameters(params, cout); - // mcco stores just about everything. - mcco = initMC(params); +#ifdef HAVE_SYCL + // HOIST INTO SETUP FUNCTION EVENTUALLY + char * devchar = std::getenv("QS_DEVICE"); + std::string devname = (devchar==NULL ? "None" : devchar); + if (devname == "CPU") { + sycl_device_queue = cl::sycl::cpu_selector{}; + } + else + if (devname == "GPU") { + sycl_device_queue = cl::sycl::gpu_selector{}; + } + else + if (devname == "HOST") { + sycl_device_queue = cl::sycl::host_selector{}; + } + else + { + std::cout << "QS_DEVICE must be CPU, GPU or HOST" << std::endl; + std::abort(); + } + + // DEBUG - REMOVE LATER + if ( sycl_device_queue.get_device().is_cpu() ) std::cout << "is cpu" << std::endl; + if ( sycl_device_queue.get_device().is_gpu() ) std::cout << "is gpu" << std::endl; + if ( sycl_device_queue.get_device().is_host() ) std::cout << "is host" << std::endl; + if ( sycl_device_queue.get_device().is_accelerator() ) std::cout << "is accelerator" << std::endl; +#endif + + // mcco stores just about everything. + mcco = initMC(params); + +#ifdef HAVE_SYCL // DEBUG - REMOVE LATER + if (mcco->processor_info->use_gpu) { + printf("Using SYCL device\n"); + } else { + printf("Bad\n"); + std::abort(); + } +#endif int loadBalance = params.simulationParams.loadBalance; @@ -74,13 +115,17 @@ int main(int argc, char** argv) #ifdef HAVE_UVM mcco->~MonteCarlo(); +#ifdef HAVE_SYCL + sycl::free(mcco, sycl_device_queue); +#else cudaFree( mcco ); +#endif #else delete mcco; #endif mpiFinalize(); - + return 0; } @@ -112,7 +157,7 @@ void cycleInit( bool loadBalance ) mcco->particle_buffer->Initialize(); MC_SourceNow(mcco); - + PopulationControl(mcco, loadBalance); // controls particle population RouletteLowWeightParticles(mcco); // Delete particles with low statistical weight @@ -125,7 +170,7 @@ void cycleInit( bool loadBalance ) __global__ void CycleTrackingKernel( MonteCarlo* monteCarlo, int num_particles, ParticleVault* processingVault, ParticleVault* processedVault ) { - int global_index = getGlobalThreadID(); + int global_index = getGlobalThreadID(); if( global_index < num_particles ) { @@ -167,9 +212,9 @@ void cycleTracking(MonteCarlo *monteCarlo) ParticleVault *processingVault = my_particle_vault.getTaskProcessingVault(processing_vault); ParticleVault *processedVault = my_particle_vault.getTaskProcessedVault(processed_vault); - + int numParticles = processingVault->size(); - + if ( numParticles != 0 ) { NVTX_Range trackingKernel("cycleTracking_TrackingKernel"); // range ends at end of scope @@ -187,28 +232,28 @@ void cycleTracking(MonteCarlo *monteCarlo) dim3 grid(1,1,1); dim3 block(1,1,1); int runKernel = ThreadBlockLayout( grid, block, numParticles); - + //Call Cycle Tracking Kernel - if( runKernel ) + if ( runKernel ) CycleTrackingKernel<<>>( monteCarlo, numParticles, processingVault, processedVault ); - + //Synchronize the stream so that memory is copied back before we begin MPI section cudaPeekAtLastError(); cudaDeviceSynchronize(); #endif } break; - + case gpuWithOpenMP: { int nthreads=128; - if (numParticles < 64*56 ) + if (numParticles < 64*56 ) nthreads = 64; int nteams = (numParticles + nthreads - 1 ) / nthreads; nteams = nteams > 1 ? nteams : 1; #ifdef HAVE_OPENMP_TARGET - #pragma omp target enter data map(to:monteCarlo[0:1]) - #pragma omp target enter data map(to:processingVault[0:1]) + #pragma omp target enter data map(to:monteCarlo[0:1]) + #pragma omp target enter data map(to:processingVault[0:1]) #pragma omp target enter data map(to:processedVault[0:1]) #pragma omp target teams distribute parallel for num_teams(nteams) thread_limit(128) #endif @@ -224,6 +269,21 @@ void cycleTracking(MonteCarlo *monteCarlo) } break; + case SYCL: + { + const size_t N = numParticles; + #ifdef HAVE_SYCL + sycl_device_queue.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{N}, [=](sycl::item<1> it) { + int particle_index = it[0]; + CycleTrackingGuts( monteCarlo, particle_index, processingVault, processedVault ); + }); + }); + sycl_device_queue.wait(); + #endif + } + break; + case cpu: #include "mc_omp_parallel_for_schedule_static.hh" for ( int particle_index = 0; particle_index < numParticles; particle_index++ ) @@ -245,7 +305,7 @@ void cycleTracking(MonteCarlo *monteCarlo) // Next, communicate particles that have crossed onto // other MPI ranks. NVTX_Range cleanAndComm("cycleTracking_clean_and_comm"); - + SendQueue &sendQueue = *(my_particle_vault.getSendQueue()); monteCarlo->particle_buffer->Allocate_Send_Buffer( sendQueue ); @@ -314,7 +374,7 @@ void cycleFinalize() mcco->_tallies->_balanceTask[0]._end = mcco->_particleVaultContainer->sizeProcessed(); // Update the cumulative tally data. - mcco->_tallies->CycleFinalize(mcco); + mcco->_tallies->CycleFinalize(mcco); mcco->time_info->cycle++; diff --git a/src/mathHelp.hh b/src/mathHelp.hh new file mode 100644 index 00000000..a2672aae --- /dev/null +++ b/src/mathHelp.hh @@ -0,0 +1,18 @@ +#ifndef MATH_HELP_HH +#define MATH_HELP_HH + +#ifdef HAVE_SYCL +#include +#define SIN sycl::sin +#define COS sycl::cos +#define SQRT sycl::sqrt +#define LOG sycl::log +#else +#include +#define SIN sin +#define COS cos +#define SQRT sqrt +#define LOG log +#endif + +#endif // MATH_HELP_HH diff --git a/src/qs_assert.hh b/src/qs_assert.hh index 55e1f004..f617c904 100644 --- a/src/qs_assert.hh +++ b/src/qs_assert.hh @@ -1,6 +1,8 @@ #include +#include "cudaUtils.hh" #ifdef __CUDA_ARCH__ + #define qs_assert( cond) \ do \ { \ @@ -9,7 +11,37 @@ printf("ERROR\n"); \ } \ } while(0) + +#elif defined(HAVE_SYCL) + +#ifdef HAVE_SYCL_PRINTF + +#define qs_assert( cond) \ + do \ + { \ + if (!(cond)) \ + { \ + static const OPENCL_CONSTANT char format[] = "file=%s: line=%d ERROR\n"; \ + syclx::printf(format,__FILE__,__LINE__); \ + } \ + } while(0) + +#else + +#define qs_assert( cond) \ + do \ + { \ + if (!(cond)) \ + { \ + /* FIXME eventually... + * printf("file=%s: line=%d ERROR\n",__FILE__,__LINE__); */ \ + } \ + } while(0) + +#endif + #else + #define qs_assert( cond) \ do \ { \ @@ -18,4 +50,5 @@ printf("file=%s: line=%d ERROR\n",__FILE__,__LINE__); \ } \ } while(0) + #endif