From 451f9ba9cd39cd405d1d87f2316bd1d73d8e849e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 May 2020 17:32:17 -0700 Subject: [PATCH 01/12] port to SYCL/DPC++ look for HAVE_SYCL and SYCL_EXTERNAL for most of the changes. this works on GPU and HOST but not CPU (OpenCL issue). Signed-off-by: Jeff Hammond --- src/CollisionEvent.cc | 18 +++-- src/CollisionEvent.hh | 2 +- src/CycleTracking.cc | 4 +- src/CycleTracking.hh | 4 +- src/DeclareMacro.hh | 4 + src/DirectionCosine.cc | 7 +- src/DirectionCosine.hh | 8 +- src/MCT.cc | 2 +- src/MCT.hh | 12 +-- src/MC_Facet_Crossing_Event.hh | 2 +- src/MC_Facet_Geometry.hh | 2 +- src/MC_Fast_Timer.cc | 2 +- src/MC_Location.hh | 4 +- src/MC_Particle.hh | 2 + src/MC_RNG_State.cc | 2 +- src/MC_RNG_State.hh | 4 +- src/MC_Segment_Outcome.cc | 14 ++-- src/MC_Segment_Outcome.hh | 2 +- src/MC_SourceNow.cc | 5 +- src/MC_Vector.hh | 32 ++++---- src/MacroscopicCrossSection.hh | 4 +- src/Makefile | 144 +++++++++++---------------------- src/MemoryControl.hh | 21 +++-- src/MonteCarlo.cc | 30 ++++++- src/NuclearData.cc | 14 ++-- src/NuclearData.hh | 12 +-- src/ParticleVault.hh | 4 +- src/ParticleVaultContainer.cc | 2 +- src/ParticleVaultContainer.hh | 4 +- src/SendQueue.hh | 2 +- src/cudaUtils.hh | 16 +++- src/initMC.cc | 29 +++++-- src/main.cc | 92 +++++++++++++++++---- src/mathHelp.hh | 18 +++++ src/qs_assert.hh | 14 ++++ 35 files changed, 323 insertions(+), 215 deletions(-) create mode 100644 src/mathHelp.hh diff --git a/src/CollisionEvent.cc b/src/CollisionEvent.cc index 20b667b4..e684b200 100644 --- a/src/CollisionEvent.cc +++ b/src/CollisionEvent.cc @@ -11,6 +11,7 @@ #include "PhysicalConstants.hh" #include "DeclareMacro.hh" #include "AtomicMacro.hh" +#include "mathHelp.hh" #define MAX_PRODUCTION_SIZE 4 @@ -21,19 +22,19 @@ // Return true if the particle will continue. //---------------------------------------------------------------------------------------------------------------------- -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void updateTrajectory( double energy, double angle, MC_Particle& particle ) { particle.kinetic_energy = energy; double cosTheta = angle; double randomNumber = rngSample(&particle.random_number_seed); double phi = 2 * 3.14159265 * randomNumber; - double sinPhi = sin(phi); - double cosPhi = cos(phi); - double sinTheta = sqrt((1.0 - (cosTheta*cosTheta))); + double sinPhi = SIN(phi); + double cosPhi = COS(phi); + double sinTheta = SQRT((1.0 - (cosTheta*cosTheta))); particle.direction_cosine.Rotate3DVector(sinTheta, cosTheta, sinPhi, cosPhi); double speed = (PhysicalConstants::_speedOfLight * - sqrt((1.0 - ((PhysicalConstants::_neutronRestMassEnergy * + SQRT((1.0 - ((PhysicalConstants::_neutronRestMassEnergy * PhysicalConstants::_neutronRestMassEnergy) / ((energy + PhysicalConstants::_neutronRestMassEnergy) * (energy + PhysicalConstants::_neutronRestMassEnergy)))))); @@ -41,12 +42,11 @@ void updateTrajectory( double energy, double angle, MC_Particle& particle ) particle.velocity.y = speed * particle.direction_cosine.beta; particle.velocity.z = speed * particle.direction_cosine.gamma; randomNumber = rngSample(&particle.random_number_seed); - particle.num_mean_free_paths = -1.0*log(randomNumber); + particle.num_mean_free_paths = -1.0*LOG(randomNumber); } HOST_DEVICE_END -HOST_DEVICE - +HOST_DEVICE SYCL_EXTERNAL bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned int tally_index) { const MC_Cell_State &cell = monteCarlo->domain[mc_particle.domain].cell_state[mc_particle.cell]; @@ -116,7 +116,9 @@ bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned i ATOMIC_ADD( monteCarlo->_tallies->_balanceTask[tally_index]._produce, nOut); break; case NuclearDataReaction::Undefined: +#ifndef HAVE_SYCL printf("reactionType invalid\n"); +#endif qs_assert(false); } diff --git a/src/CollisionEvent.hh b/src/CollisionEvent.hh index 270ae382..d1d42162 100644 --- a/src/CollisionEvent.hh +++ b/src/CollisionEvent.hh @@ -5,7 +5,7 @@ class MonteCarlo; class MC_Particle; #include "DeclareMacro.hh" -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned int tally_index ); HOST_DEVICE_END diff --git a/src/CycleTracking.cc b/src/CycleTracking.cc index e6238a83..1286bc4a 100644 --- a/src/CycleTracking.cc +++ b/src/CycleTracking.cc @@ -11,7 +11,7 @@ #include "macros.hh" #include "qs_assert.hh" -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void CycleTrackingGuts( MonteCarlo *monteCarlo, int particle_index, ParticleVault *processingVault, ParticleVault *processedVault ) { MC_Particle mc_particle; @@ -30,7 +30,7 @@ void CycleTrackingGuts( MonteCarlo *monteCarlo, int particle_index, ParticleVaul } HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void CycleTrackingFunction( MonteCarlo *monteCarlo, MC_Particle &mc_particle, int particle_index, ParticleVault* processingVault, ParticleVault* processedVault) { bool keepTrackingThisParticle = false; diff --git a/src/CycleTracking.hh b/src/CycleTracking.hh index 4dea37a0..f1e04426 100644 --- a/src/CycleTracking.hh +++ b/src/CycleTracking.hh @@ -5,10 +5,10 @@ class ParticleVault; class MonteCarlo; class MC_Particle; -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void CycleTrackingGuts( MonteCarlo *monteCarlo, int particle_index, ParticleVault *processingVault, ParticleVault *processedVault ); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void CycleTrackingFunction( MonteCarlo *monteCarlo, MC_Particle &mc_particle, int particle_index, ParticleVault* processingVault, ParticleVault* processedVault); HOST_DEVICE_END diff --git a/src/DeclareMacro.hh b/src/DeclareMacro.hh index 3599190f..9efa61e1 100644 --- a/src/DeclareMacro.hh +++ b/src/DeclareMacro.hh @@ -1,6 +1,10 @@ #ifndef DECLAREMACRO_HH #define DECLAREMACRO_HH +#ifdef HAVE_SYCL +#include +#endif + #ifdef HAVE_CUDA #define HOST_DEVICE __host__ __device__ #define HOST_DEVICE_CUDA __host__ __device__ diff --git a/src/DirectionCosine.cc b/src/DirectionCosine.cc index ded7a30b..fc3f5571 100644 --- a/src/DirectionCosine.cc +++ b/src/DirectionCosine.cc @@ -1,13 +1,14 @@ #include "DirectionCosine.hh" #include "MC_RNG_State.hh" #include "PhysicalConstants.hh" +#include "mathHelp.hh" void DirectionCosine::Sample_Isotropic(uint64_t *seed) { this->gamma = 1.0 - 2.0*rngSample(seed); - double sine_gamma = sqrt((1.0 - (gamma*gamma))); + double sine_gamma = SQRT((1.0 - (gamma*gamma))); double phi = PhysicalConstants::_pi*(2.0*rngSample(seed) - 1.0); - this->alpha = sine_gamma * cos(phi); - this->beta = sine_gamma * sin(phi); + this->alpha = sine_gamma * COS(phi); + this->beta = sine_gamma * SIN(phi); } diff --git a/src/DirectionCosine.hh b/src/DirectionCosine.hh index e7dd7363..325721dc 100644 --- a/src/DirectionCosine.hh +++ b/src/DirectionCosine.hh @@ -1,9 +1,9 @@ #ifndef DIRECTION_COSINE_INCLUDE #define DIRECTION_COSINE_INCLUDE -#include #include "portability.hh" #include "DeclareMacro.hh" +#include "mathHelp.hh" HOST_DEVICE_CLASS class DirectionCosine @@ -31,7 +31,7 @@ public: void Sample_Isotropic(uint64_t *seed); // rotate a direction cosine given the sine/cosine of theta and phi - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL inline void Rotate3DVector( double sine_Theta, double cosine_Theta, double sine_Phi, @@ -119,12 +119,12 @@ HOST_DEVICE_END // direction_cosine.beta = cos_theta*sin_phi*Alpha + cos_phi*Beta + sin_theta*sin_phi*Gamma; // direction_cosine.gamma = -sin_theta *Alpha + cos_theta *Gamma; //---------------------------------------------------------------------------------------------------------------------- -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL inline void DirectionCosine::Rotate3DVector(double sin_Theta, double cos_Theta, double sin_Phi, double cos_Phi) { // Calculate additional variables in the rotation matrix. double cos_theta = this->gamma; - double sin_theta = sqrt((1.0 - (cos_theta*cos_theta))); + double sin_theta = SQRT((1.0 - (cos_theta*cos_theta))); double cos_phi; double sin_phi; diff --git a/src/MCT.cc b/src/MCT.cc index 3faf3592..200a57bc 100644 --- a/src/MCT.cc +++ b/src/MCT.cc @@ -397,7 +397,7 @@ namespace /// Reflects the particle off of a reflection boundary. -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void MCT_Reflect_Particle(MonteCarlo *monteCarlo, MC_Particle &particle) { DirectionCosine *direction_cosine = particle.Get_Direction_Cosine(); diff --git a/src/MCT.hh b/src/MCT.hh index e6852acd..1e155cb0 100644 --- a/src/MCT.hh +++ b/src/MCT.hh @@ -14,7 +14,7 @@ class Subfacet_Adjacency; class MonteCarlo; -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL MC_Nearest_Facet MCT_Nearest_Facet( MC_Particle *mc_particle, MC_Location &location, @@ -22,12 +22,12 @@ MC_Nearest_Facet MCT_Nearest_Facet( const DirectionCosine *direction_cosine, double distance_threshold, double current_best_distance, - bool new_segment, + bool new_segment, MonteCarlo* monteCarlo); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void MCT_Generate_Coordinate_3D_G( uint64_t *random_number_seed, int domain_num, @@ -36,17 +36,17 @@ void MCT_Generate_Coordinate_3D_G( MonteCarlo* monteCarlo); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL MC_Vector MCT_Cell_Position_3D_G( const MC_Domain &domain, int cell_index); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL Subfacet_Adjacency &MCT_Adjacent_Facet(const MC_Location &location, MC_Particle &mc_particle, MonteCarlo* monteCarlo); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL void MCT_Reflect_Particle(MonteCarlo *mcco, MC_Particle &particle); HOST_DEVICE_END diff --git a/src/MC_Facet_Crossing_Event.hh b/src/MC_Facet_Crossing_Event.hh index 98170036..8b0743e7 100644 --- a/src/MC_Facet_Crossing_Event.hh +++ b/src/MC_Facet_Crossing_Event.hh @@ -7,7 +7,7 @@ class ParticleVault; class MC_Particle; -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL MC_Tally_Event::Enum MC_Facet_Crossing_Event(MC_Particle &mc_particle, MonteCarlo* monteCarlo, int particle_index, ParticleVault* processingVault); HOST_DEVICE_END diff --git a/src/MC_Facet_Geometry.hh b/src/MC_Facet_Geometry.hh index 943e86da..12b9e809 100644 --- a/src/MC_Facet_Geometry.hh +++ b/src/MC_Facet_Geometry.hh @@ -22,7 +22,7 @@ public: C = ((r1.x - r0.x)*(r2.y - r0.y)) - ((r1.y - r0.y)*(r2.x - r0.x)); D = -1.0*(A*r0.x + B*r0.y + C*r0.z); - double magnitude = sqrt(A * A + B * B + C * C); + double magnitude = std::sqrt(A * A + B * B + C * C); if ( magnitude == 0.0 ) { diff --git a/src/MC_Fast_Timer.cc b/src/MC_Fast_Timer.cc index 70aa7bf9..0ea8366e 100644 --- a/src/MC_Fast_Timer.cc +++ b/src/MC_Fast_Timer.cc @@ -25,7 +25,7 @@ static double mc_std_dev(uint64_t const data[], int const nelm) for(int ndx=0; ndx -900.0 ) { -#if 1 - printf(" MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n"); - #else +#if defined(HAVE_SYCL) + // FIXME +#elif 0 std::string output_string; MC_Warning( "Forced Collision: num_mean_free_paths < 0 \n" "Particle record:\n%s", output_string.c_str()); +#else + printf(" MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n"); #endif } @@ -80,7 +82,7 @@ MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Part // the next collision from an exponential distribution. double random_number = rngSample(&mc_particle.random_number_seed); - mc_particle.num_mean_free_paths = -1.0*log(random_number); + mc_particle.num_mean_free_paths = -1.0*LOG(random_number); } // Calculate the distances to collision, nearest facet, and census. diff --git a/src/MC_Segment_Outcome.hh b/src/MC_Segment_Outcome.hh index f8d6a25c..65a60d81 100644 --- a/src/MC_Segment_Outcome.hh +++ b/src/MC_Segment_Outcome.hh @@ -32,7 +32,7 @@ struct MC_Collision_Event_Return }; #include "DeclareMacro.hh" -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned int &flux_tally_index); HOST_DEVICE_END diff --git a/src/MC_SourceNow.cc b/src/MC_SourceNow.cc index 08e23138..2c1f7e96 100644 --- a/src/MC_SourceNow.cc +++ b/src/MC_SourceNow.cc @@ -18,6 +18,7 @@ #include "AtomicMacro.hh" #include "NVTX_Range.hh" #include +#include "mathHelp.hh" namespace { @@ -116,7 +117,7 @@ void MC_SourceNow(MonteCarlo *monteCarlo) particle.weight = source_particle_weight; double randomNumber = rngSample(&particle.random_number_seed); - particle.num_mean_free_paths = -1.0*log(randomNumber); + particle.num_mean_free_paths = -1.0*LOG(randomNumber); randomNumber = rngSample(&particle.random_number_seed); particle.time_to_census = monteCarlo->time_info->time_step * randomNumber; @@ -172,7 +173,7 @@ namespace static const double speed_of_light = PhysicalConstants::_speedOfLight; - return speed_of_light * sqrt(energy * (energy + 2.0*(rest_mass_energy)) / + return speed_of_light * std::sqrt(energy * (energy + 2.0*(rest_mass_energy)) / ((energy + rest_mass_energy) * (energy + rest_mass_energy))); } } diff --git a/src/MC_Vector.hh b/src/MC_Vector.hh index 91b5fdf6..34368d07 100644 --- a/src/MC_Vector.hh +++ b/src/MC_Vector.hh @@ -1,8 +1,8 @@ #ifndef MC_VECTOR_INCLUDE #define MC_VECTOR_INCLUDE -#include #include "DeclareMacro.hh" +#include "mathHelp.hh" HOST_DEVICE_CLASS class MC_Vector @@ -17,7 +17,7 @@ class MC_Vector HOST_DEVICE_CUDA MC_Vector(double a, double b, double c) : x(a), y(b), z(c) {} - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL MC_Vector& operator=( const MC_Vector&tmp ) { if ( this == &tmp ) { return *this; } @@ -29,13 +29,13 @@ class MC_Vector return *this; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL bool operator==( const MC_Vector& tmp ) { return tmp.x == x && tmp.y == y && tmp.z == z; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL MC_Vector& operator+=( const MC_Vector &tmp ) { x += tmp.x; @@ -44,7 +44,7 @@ class MC_Vector return *this; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL MC_Vector& operator-=( const MC_Vector &tmp ) { x -= tmp.x; @@ -53,7 +53,7 @@ class MC_Vector return *this; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL MC_Vector& operator*=(const double scalar) { x *= scalar; @@ -62,7 +62,7 @@ class MC_Vector return *this; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL MC_Vector& operator/=(const double scalar) { x /= scalar; @@ -71,39 +71,39 @@ class MC_Vector return *this; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL const MC_Vector operator+( const MC_Vector &tmp ) const { return MC_Vector(x + tmp.x, y + tmp.y, z + tmp.z); } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL const MC_Vector operator-( const MC_Vector &tmp ) const { return MC_Vector(x - tmp.x, y - tmp.y, z - tmp.z); } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL const MC_Vector operator*(const double scalar) const { return MC_Vector(scalar*x, scalar*y, scalar*z); } - HOST_DEVICE_CUDA - inline double Length() const { return std::sqrt(x*x + y*y + z*z); } + HOST_DEVICE_CUDA SYCL_EXTERNAL + inline double Length() const { return SQRT(x*x + y*y + z*z); } // Distance from this vector to another point. - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL inline double Distance(const MC_Vector& vv) const - { return std::sqrt((x - vv.x)*(x - vv.x) + (y - vv.y)*(y - vv.y)+ (z - vv.z)*(z - vv.z)); } + { return SQRT((x - vv.x)*(x - vv.x) + (y - vv.y)*(y - vv.y)+ (z - vv.z)*(z - vv.z)); } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL inline double Dot(const MC_Vector &tmp) const { return this->x*tmp.x + this->y*tmp.y + this->z*tmp.z; } - HOST_DEVICE_CUDA + HOST_DEVICE_CUDA SYCL_EXTERNAL inline MC_Vector Cross(const MC_Vector &v) const { return MC_Vector(y * v.z - z * v.y, diff --git a/src/MacroscopicCrossSection.hh b/src/MacroscopicCrossSection.hh index 68ab7e1b..b059556e 100644 --- a/src/MacroscopicCrossSection.hh +++ b/src/MacroscopicCrossSection.hh @@ -5,12 +5,12 @@ class MonteCarlo; -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL double macroscopicCrossSection(MonteCarlo* monteCarlo, int reactionIndex, int domainIndex, int cellIndex, int isoIndex, int energyGroup); HOST_DEVICE_END -HOST_DEVICE +HOST_DEVICE SYCL_EXTERNAL double weightedMacroscopicCrossSection(MonteCarlo* monteCarlo, int taskIndex, int domainIndex, int cellIndex, int energyGroup); HOST_DEVICE_END diff --git a/src/Makefile b/src/Makefile index 5f42dc12..5bbf141b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -99,19 +99,46 @@ CXXFLAGS = CPPFLAGS = LDFLAGS = +############################################################################### +# Cuda on LLNL CORAL EA nodes +############################################################################### +## Choose one Cuda path +CUDA_PATH = /usr +#CUDA_PATH = /usr/tcetmp/packages/cuda-9.0.176 + +HOST_COMPILER = g++ + +OPTFLAGS = -O2 +## Version below for debugging +##OPTFLAGS = -DUSE_NVTX -g -G -lineinfo -O0 + +CUDA_FLAGS = -I${CUDA_PATH}/include/ +CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart +# +CXX=$(CUDA_PATH)/bin/nvcc +CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) #-Xptxas -v +CXXFLAGS += -ccbin clang-8 +CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\" +#CXXFLAGS += --compiler-bindir=$(HOST_COMPILER) +CPPFLAGS = -x cu -dc +#CPPFLAGS += -DHAVE_MPI -DHAVE_ASYNC_MPI +LDFLAGS = $(CUDA_LDFLAGS) +LDFLAGS += -lstdc++ +##LDFLAGS += ${CUDA_PATH}/lib64/libnvToolsExt.so + ############################################################################### # Very simple GCC build with OpenMP but without MPI. # This works on a Macbook (if gcc is installed) ############################################################################### # -#OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp -#OPENMP_LDFLAGS = -fopenmp -#OPTFLAGS = -g -O2 +OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp +OPENMP_LDFLAGS = -fopenmp +OPTFLAGS = -g -O2 # -#CXX=g++ -#CXXFLAGS = -std=c++11 $(OPTFLAGS) -Wpedantic -#CPPFLAGS = $(OPENMP_FLAGS) -#LDFLAGS = $(OPENMP_LDFLAGS) +CXX=g++ +CXXFLAGS = -std=c++11 $(OPTFLAGS) -Wpedantic +CPPFLAGS = $(OPENMP_FLAGS) +LDFLAGS = $(OPENMP_LDFLAGS) ############################################################################### @@ -128,18 +155,6 @@ LDFLAGS = #LDFLAGS = $(OPENMP_LDFLAGS) -############################################################################### -# LLNL LC BG/Q Comilers # -############################################################################### -### BGQ GNU -#OPTFLAGS = -g -O2 -## -#CXX=/usr/local/tools/compilers/ibm/mpicxx-4.8.4 -#CXXFLAGS = -std=c++11 $(OPTFLAGS) -#CPPFLAGS = -DCHRONO_MISSING -DHAVE_MPI -DHAVE_OPENMP -fopenmp -#LDFLAGS = -fopenmp - - ############################################################################### # OpenMP 4.5 on LLNL CORAL EA nodes ############################################################################### @@ -167,86 +182,19 @@ LDFLAGS = ############################################################################### -# Cuda on LLNL CORAL EA nodes -############################################################################### -## Choose one Cuda path -##CUDA_PATH = /usr/local/cuda-8.0 -#CUDA_PATH = /usr/tcetmp/packages/cuda-9.0.176 - -#HOST_COMPILER = /usr/tce/packages/spectrum-mpi/spectrum-mpi-2017.04.03-xl-beta-2017.09.13/bin/mpixlC - -#OPTFLAGS = -O2 -## Version below for debugging -##OPTFLAGS = -DUSE_NVTX -g -G -lineinfo -O0 - -#CUDA_FLAGS = -I${CUDA_PATH}/include/ -#CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart -# -#CXX=$(CUDA_PATH)/bin/nvcc -#CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) -Xptxas -v -#CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\" -#CXXFLAGS += --compiler-bindir=$(HOST_COMPILER) -#CPPFLAGS = -x cu -dc -DHAVE_MPI -DHAVE_ASYNC_MPI -#LDFLAGS = $(CUDA_LDFLAGS) -##LDFLAGS += ${CUDA_PATH}/lib64/libnvToolsExt.so - - - - -############################################################################### -# LLNL TOSS GCC + OpenMP (mvapich 2 - version 1.7) [cab] -############################################################################### -#OPTFLAGS = -g -O2 -#OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp -#OPENMP_LDFLAGS = -fopenmp -# -#CXX = /usr/apps/gnu/4.9.3/bin/mpig++ -#CXXFLAGS = -std=c++0x $(OPTFLAGS) -mpi=mvapich2-gnu-1.7 -#CPPFLAGS = -DHAVE_MPI $(OPENMP_FLAGS) -#LDFLAGS = $(OPENMP_LDFLAGS) - -############################################################################### -# LLNL TOSS Intel + OpenMP (mvapich 2 - version 2.1) [quartz] -############################################################################### -#OPENMP_FLAGS = -DHAVE_OPENMP -qopenmp -#OPENMP_LDFLAGS = -qopenmp -#OPTFLAGS = -g -O2 -# -#CXX=/usr/local/bin/mpiicpc-17.0.174 -#CXXFLAGS = -std=c++11 -mpi=mvapich2-intel-2.1 -DHAVE_MPI $(OPENMP_FLAGS) -#CXXFLAGS += -wd1128 -wd64 -wd21 -#LDFLAGS = $(OPENMP_LDFLAGS) - - -############################################################################### -# LLNL TOSS Clang (cab) -############################################################################### -#CLANGPATH = /usr/global/tools/clang/chaos_5_x86_64_ib/clang-omp-3.5.0 -#OPTFLAGS = -g -O2 -# -#CXX=${CLANGPATH}/bin/mpiclang++ -#CXXFLAGS = -std=c++11 $(OPTFLAGS) -#CPPFLAGS = -DHAVE_MPI -#LDFLAGS = -Wl,-rpath,${CLANGPATH}/lib - - -############################################################################### -# Trinity Compilers # -# # -# One must 'swap' modules on this machine to access different compilers. # +# SYCL/DPC++ ############################################################################### - -### Defaults to Intel. -#OPTFLAGS = -g -O2 -xmic-avx512 -ipo -#OPENMP_FLAGS = -DHAVE_OPENMP -qopenmp -pthread -DUSE_OPENMP_NO_GPU -#OPENMP_LDFLAGS = -qopenmp -pthread -# -#CXX=CC -#CXXFLAGS = -std=c++11 $(OPTFLAGS) -#CPPFLAGS = -DHAVE_MPI -DCHRONO_MISSING $(OPENMP_FLAGS) -#LDFLAGS = $(OPENMP_LDFLAGS) - - +CXX = dpcpp +CXXFLAGS = -std=c++14 +CXXFLAGS += -g -O2 +CXXFLAGS += -ferror-limit=1 +CPPFLAGS = -DHAVE_SYCL +LDFLAGS = + +# GPU offline compile +# https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/gpu-flow/offline-compilation-for-gpu.html +#CXXFLAGS += -fsycl-targets=spir64_gen-unknown-linux-sycldevice +#LDFLAGS += -Xsycl-target-backend=spir64_gen-unknown-linux-sycldevice "-device skl" ################################################################################ ### Below here, it is pitch black. ### diff --git a/src/MemoryControl.hh b/src/MemoryControl.hh index 6b13aaf5..19d1ed71 100644 --- a/src/MemoryControl.hh +++ b/src/MemoryControl.hh @@ -14,7 +14,7 @@ namespace MemoryControl { if (size == 0) { return NULL;} T* tmp = NULL; - + switch (policy) { case AllocationPolicy::HOST_MEM: @@ -22,9 +22,13 @@ namespace MemoryControl break; #ifdef HAVE_UVM case AllocationPolicy::UVM_MEM: - void *ptr; + void * ptr; +#ifdef HAVE_SYCL + ptr = (void *)sycl::malloc_shared(size * sizeof(T), q); +#else cudaMallocManaged(&ptr, size*sizeof(T), cudaMemAttachGlobal); - tmp = new(ptr) T[size]; +#endif + tmp = new(ptr) T[size]; break; #endif default: @@ -40,15 +44,18 @@ namespace MemoryControl switch (policy) { case MemoryControl::AllocationPolicy::HOST_MEM: - delete[] data; + delete[] data; break; #ifdef HAVE_UVM case UVM_MEM: - for (int i=0; i < size; ++i) - data[i].~T(); + for (int i=0; i < size; ++i) data[i].~T(); +#ifdef HAVE_SYCL + sycl::free(data, q); +#else cudaFree(data); +#endif break; -#endif +#endif default: qs_assert(false); break; diff --git a/src/MonteCarlo.cc b/src/MonteCarlo.cc index 28c9c47a..f9fb116c 100644 --- a/src/MonteCarlo.cc +++ b/src/MonteCarlo.cc @@ -25,13 +25,19 @@ MonteCarlo::MonteCarlo(const Parameters& params) _nuclearData = 0; _materialDatabase = 0; - #if defined (HAVE_UVM) +#if defined(HAVE_UVM) void *ptr1, *ptr2, *ptr3, *ptr4; - +#ifdef HAVE_SYCL + ptr1 = (void *)sycl::malloc_shared(sizeof(Tallies), q); + ptr2 = (void *)sycl::malloc_shared(sizeof(MC_Processor_Info), q); + ptr3 = (void *)sycl::malloc_shared(sizeof(MC_Time_Info), q); + ptr4 = (void *)sycl::malloc_shared(sizeof(MC_Fast_Timer_Container), q); +#else cudaMallocManaged( &ptr1, sizeof(Tallies), cudaMemAttachHost ); cudaMallocManaged( &ptr2, sizeof(MC_Processor_Info), cudaMemAttachHost ); cudaMallocManaged( &ptr3, sizeof(MC_Time_Info), cudaMemAttachHost ); cudaMallocManaged( &ptr4, sizeof(MC_Fast_Timer_Container) ); +#endif _tallies = new(ptr1) Tallies( params.simulationParams.balanceTallyReplications, params.simulationParams.fluxTallyReplications, @@ -98,8 +104,13 @@ MonteCarlo::MonteCarlo(const Parameters& params) #if defined(HAVE_UVM) void *ptr5, *ptr6; +#ifdef HAVE_SYCL + ptr5 = (void *)sycl::malloc_shared(sizeof(MC_Particle_Buffer), q); + ptr6 = (void *)sycl::malloc_shared(sizeof(ParticleVaultContainer), q); +#else cudaMallocManaged( &ptr5, sizeof(MC_Particle_Buffer) ); cudaMallocManaged( &ptr6, sizeof(ParticleVaultContainer), cudaMemAttachHost ); +#endif particle_buffer = new(ptr5) MC_Particle_Buffer(this, batch_size); _particleVaultContainer = new(ptr6) ParticleVaultContainer(batch_size, num_batches, num_extra_vaults); #else @@ -125,6 +136,16 @@ MonteCarlo::~MonteCarlo() fast_timer->~MC_Fast_Timer_Container(); particle_buffer->~MC_Particle_Buffer(); +#ifdef HAVE_SYCL + sycl::free(_nuclearData, q); + sycl::free(_particleVaultContainer, q); + sycl::free(_materialDatabase, q); + sycl::free(_tallies, q); + sycl::free(processor_info, q); + sycl::free(time_info, q); + sycl::free(fast_timer, q); + sycl::free(particle_buffer, q); +#else cudaFree( _nuclearData ); cudaFree( _particleVaultContainer); cudaFree( _materialDatabase); @@ -133,8 +154,9 @@ MonteCarlo::~MonteCarlo() cudaFree( time_info); cudaFree( fast_timer); cudaFree( particle_buffer); +#endif - #else +#else delete _nuclearData; delete _particleVaultContainer; delete _materialDatabase; @@ -143,7 +165,7 @@ MonteCarlo::~MonteCarlo() delete time_info; delete fast_timer; delete particle_buffer; - #endif +#endif } void MonteCarlo::clearCrossSectionCache() diff --git a/src/NuclearData.cc b/src/NuclearData.cc index fe9ad072..b7bb4906 100644 --- a/src/NuclearData.cc +++ b/src/NuclearData.cc @@ -3,9 +3,7 @@ #include "MC_RNG_State.hh" #include "DeclareMacro.hh" #include "qs_assert.hh" - -using std::log10; -using std::pow; +#include "mathHelp.hh" // Set the cross section values and reaction type // Cross sections are scaled to produce the supplied reactionCrossSection at 1MeV. @@ -21,7 +19,7 @@ NuclearDataReaction::NuclearDataReaction( for (int ii=0; ii #include #include #endif +#if defined(HAVE_SYCL) +#include +extern sycl::queue q; // global variable for device queue +#else +#define SYCL_EXTERNAL +#endif + #ifdef HAVE_OPENMP_TARGET #ifdef USE_OPENMP_NO_GPU #define VAR_MEM MemoryControl::AllocationPolicy::HOST_MEM @@ -17,11 +24,14 @@ #elif HAVE_CUDA #define VAR_MEM MemoryControl::AllocationPolicy::UVM_MEM #define HAVE_UVM +#elif HAVE_SYCL + #define VAR_MEM MemoryControl::AllocationPolicy::UVM_MEM + #define HAVE_UVM #else #define VAR_MEM MemoryControl::AllocationPolicy::HOST_MEM #endif -enum ExecutionPolicy{ cpu, gpuWithCUDA, gpuWithOpenMP }; +enum ExecutionPolicy{ cpu, gpuWithCUDA, gpuWithOpenMP, SYCL }; inline ExecutionPolicy getExecutionPolicy( int useGPU ) { @@ -33,6 +43,8 @@ inline ExecutionPolicy getExecutionPolicy( int useGPU ) execPolicy = ExecutionPolicy::gpuWithCUDA; #elif defined (HAVE_OPENMP_TARGET) execPolicy = ExecutionPolicy::gpuWithOpenMP; + #elif defined (HAVE_SYCL) + execPolicy = ExecutionPolicy::SYCL; #endif } return execPolicy; diff --git a/src/initMC.cc b/src/initMC.cc index 1be428fd..02b38201 100644 --- a/src/initMC.cc +++ b/src/initMC.cc @@ -54,7 +54,11 @@ MonteCarlo* initMC(const Parameters& params) MonteCarlo* monteCarlo; #ifdef HAVE_UVM void* ptr; +#ifdef HAVE_SYCL + ptr = (void *)sycl::malloc_shared(sizeof(MonteCarlo), q); +#else cudaMallocManaged( &ptr, sizeof(MonteCarlo), cudaMemAttachGlobal ); +#endif monteCarlo = new(ptr) MonteCarlo(params); #else monteCarlo = new MonteCarlo(params); @@ -77,28 +81,32 @@ namespace //Init GPU usage information void initGPUInfo( MonteCarlo* monteCarlo) { + int Ngpus = 0; #if defined(HAVE_OPENMP_TARGET) - int Ngpus = omp_get_num_devices(); + Ngpus = omp_get_num_devices(); #elif defined(HAVE_CUDA) - int Ngpus; cudaGetDeviceCount(&Ngpus); - #else - int Ngpus = 0; + #elif defined(HAVE_SYCL) + Ngpus = 1; #endif if( Ngpus != 0 ) { - #if defined(HAVE_OPENMP_TARGET) || defined(HAVE_CUDA) + #if defined(HAVE_OPENMP_TARGET) || defined(HAVE_CUDA) || defined(HAVE_SYCL) + monteCarlo->processor_info->use_gpu = 1; int GPUID = monteCarlo->processor_info->rank%Ngpus; monteCarlo->processor_info->gpu_id = GPUID; - + #if defined(HAVE_OPENMP_TARGET) omp_set_default_device(GPUID); #endif - cudaSetDevice(GPUID); - //cudaDeviceSetLimit( cudaLimitStackSize, 64*1024 ); + #ifndef HAVE_SYCL + cudaSetDevice(GPUID); + //cudaDeviceSetLimit( cudaLimitStackSize, 64*1024 ); + #endif + #endif } else @@ -132,8 +140,13 @@ namespace { #if defined HAVE_UVM void *ptr1, *ptr2; +#ifdef HAVE_SYCL + ptr1 = (void *)sycl::malloc_shared(sizeof(NuclearData), q); + ptr2 = (void *)sycl::malloc_shared(sizeof(MaterialDatabase), q); +#else cudaMallocManaged( &ptr1, sizeof(NuclearData), cudaMemAttachGlobal ); cudaMallocManaged( &ptr2, sizeof(MaterialDatabase), cudaMemAttachGlobal ); +#endif monteCarlo->_nuclearData = new(ptr1) NuclearData(params.simulationParams.nGroups, params.simulationParams.eMin, diff --git a/src/main.cc b/src/main.cc index bb9517b2..dbc05105 100644 --- a/src/main.cc +++ b/src/main.cc @@ -26,6 +26,10 @@ #include "git_hash.hh" #include "git_vers.hh" +#ifdef HAVE_SYCL +sycl::queue q; +#endif + void gameOver(); void cycleInit( bool loadBalance ); void cycleTracking(MonteCarlo* monteCarlo); @@ -43,8 +47,45 @@ int main(int argc, char** argv) Parameters params = getParameters(argc, argv); printParameters(params, cout); - // mcco stores just about everything. - mcco = initMC(params); +#ifdef HAVE_SYCL + // HOIST INTO SETUP FUNCTION EVENTUALLY + char * devchar = std::getenv("QS_DEVICE"); + std::string devname = (devchar==NULL ? "None" : devchar); + if (devname == "CPU") { + q = cl::sycl::cpu_selector{}; + } + else + if (devname == "GPU") { + q = cl::sycl::gpu_selector{}; + } + else + if (devname == "HOST") { + q = cl::sycl::host_selector{}; + } + else + { + std::cout << "QS_DEVICE must be CPU, GPU or HOST" << std::endl; + std::abort(); + } + + // DEBUG - REMOVE LATER + if ( q.get_device().is_cpu() ) std::cout << "is cpu" << std::endl; + if ( q.get_device().is_gpu() ) std::cout << "is gpu" << std::endl; + if ( q.get_device().is_host() ) std::cout << "is host" << std::endl; + if ( q.get_device().is_accelerator() ) std::cout << "is accelerator" << std::endl; +#endif + + // mcco stores just about everything. + mcco = initMC(params); + +#ifdef HAVE_SYCL // DEBUG - REMOVE LATER + if (mcco->processor_info->use_gpu) { + printf("Using SYCL device\n"); + } else { + printf("Bad\n"); + std::abort(); + } +#endif int loadBalance = params.simulationParams.loadBalance; @@ -74,13 +115,17 @@ int main(int argc, char** argv) #ifdef HAVE_UVM mcco->~MonteCarlo(); +#ifdef HAVE_SYCL + sycl::free(mcco, q); +#else cudaFree( mcco ); +#endif #else delete mcco; #endif mpiFinalize(); - + return 0; } @@ -112,7 +157,7 @@ void cycleInit( bool loadBalance ) mcco->particle_buffer->Initialize(); MC_SourceNow(mcco); - + PopulationControl(mcco, loadBalance); // controls particle population RouletteLowWeightParticles(mcco); // Delete particles with low statistical weight @@ -125,7 +170,7 @@ void cycleInit( bool loadBalance ) __global__ void CycleTrackingKernel( MonteCarlo* monteCarlo, int num_particles, ParticleVault* processingVault, ParticleVault* processedVault ) { - int global_index = getGlobalThreadID(); + int global_index = getGlobalThreadID(); if( global_index < num_particles ) { @@ -167,9 +212,9 @@ void cycleTracking(MonteCarlo *monteCarlo) ParticleVault *processingVault = my_particle_vault.getTaskProcessingVault(processing_vault); ParticleVault *processedVault = my_particle_vault.getTaskProcessedVault(processed_vault); - + int numParticles = processingVault->size(); - + if ( numParticles != 0 ) { NVTX_Range trackingKernel("cycleTracking_TrackingKernel"); // range ends at end of scope @@ -187,28 +232,28 @@ void cycleTracking(MonteCarlo *monteCarlo) dim3 grid(1,1,1); dim3 block(1,1,1); int runKernel = ThreadBlockLayout( grid, block, numParticles); - + //Call Cycle Tracking Kernel - if( runKernel ) + if ( runKernel ) CycleTrackingKernel<<>>( monteCarlo, numParticles, processingVault, processedVault ); - + //Synchronize the stream so that memory is copied back before we begin MPI section cudaPeekAtLastError(); cudaDeviceSynchronize(); #endif } break; - + case gpuWithOpenMP: { int nthreads=128; - if (numParticles < 64*56 ) + if (numParticles < 64*56 ) nthreads = 64; int nteams = (numParticles + nthreads - 1 ) / nthreads; nteams = nteams > 1 ? nteams : 1; #ifdef HAVE_OPENMP_TARGET - #pragma omp target enter data map(to:monteCarlo[0:1]) - #pragma omp target enter data map(to:processingVault[0:1]) + #pragma omp target enter data map(to:monteCarlo[0:1]) + #pragma omp target enter data map(to:processingVault[0:1]) #pragma omp target enter data map(to:processedVault[0:1]) #pragma omp target teams distribute parallel for num_teams(nteams) thread_limit(128) #endif @@ -224,6 +269,21 @@ void cycleTracking(MonteCarlo *monteCarlo) } break; + case SYCL: + { + const size_t N = numParticles; + #ifdef HAVE_SYCL + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{N}, [=](sycl::item<1> it) { + int particle_index = it[0]; + CycleTrackingGuts( monteCarlo, particle_index, processingVault, processedVault ); + }); + }); + q.wait(); + #endif + } + break; + case cpu: #include "mc_omp_parallel_for_schedule_static.hh" for ( int particle_index = 0; particle_index < numParticles; particle_index++ ) @@ -245,7 +305,7 @@ void cycleTracking(MonteCarlo *monteCarlo) // Next, communicate particles that have crossed onto // other MPI ranks. NVTX_Range cleanAndComm("cycleTracking_clean_and_comm"); - + SendQueue &sendQueue = *(my_particle_vault.getSendQueue()); monteCarlo->particle_buffer->Allocate_Send_Buffer( sendQueue ); @@ -314,7 +374,7 @@ void cycleFinalize() mcco->_tallies->_balanceTask[0]._end = mcco->_particleVaultContainer->sizeProcessed(); // Update the cumulative tally data. - mcco->_tallies->CycleFinalize(mcco); + mcco->_tallies->CycleFinalize(mcco); mcco->time_info->cycle++; diff --git a/src/mathHelp.hh b/src/mathHelp.hh new file mode 100644 index 00000000..a2672aae --- /dev/null +++ b/src/mathHelp.hh @@ -0,0 +1,18 @@ +#ifndef MATH_HELP_HH +#define MATH_HELP_HH + +#ifdef HAVE_SYCL +#include +#define SIN sycl::sin +#define COS sycl::cos +#define SQRT sycl::sqrt +#define LOG sycl::log +#else +#include +#define SIN sin +#define COS cos +#define SQRT sqrt +#define LOG log +#endif + +#endif // MATH_HELP_HH diff --git a/src/qs_assert.hh b/src/qs_assert.hh index 55e1f004..7c0059d5 100644 --- a/src/qs_assert.hh +++ b/src/qs_assert.hh @@ -1,6 +1,7 @@ #include #ifdef __CUDA_ARCH__ + #define qs_assert( cond) \ do \ { \ @@ -9,7 +10,19 @@ printf("ERROR\n"); \ } \ } while(0) + +#elif defined(HAVE_SYCL) + +#define qs_assert( cond) \ + do \ + { \ + if (!(cond)) \ + { \ + } \ + } while(0) + #else + #define qs_assert( cond) \ do \ { \ @@ -18,4 +31,5 @@ printf("file=%s: line=%d ERROR\n",__FILE__,__LINE__); \ } \ } while(0) + #endif From 516be3e223d528e2f6b7cac4a46dbbbee6555f74 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 3 Jun 2020 16:45:19 -0700 Subject: [PATCH 02/12] better compiler flags --- src/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 5bbf141b..2effc631 100644 --- a/src/Makefile +++ b/src/Makefile @@ -186,7 +186,9 @@ LDFLAGS = $(OPENMP_LDFLAGS) ############################################################################### CXX = dpcpp CXXFLAGS = -std=c++14 -CXXFLAGS += -g -O2 +CXXFLAGS += -O3 +CXXFLAGS += -ffast-math +#CXXFLAGS += -g CXXFLAGS += -ferror-limit=1 CPPFLAGS = -DHAVE_SYCL LDFLAGS = From 7fa9b9d3ce2792727067d490a354cf40c15b129c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 5 Jun 2020 07:53:36 -0700 Subject: [PATCH 03/12] implement SYCL atomics --- src/AtomicMacro.hh | 261 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 254 insertions(+), 7 deletions(-) diff --git a/src/AtomicMacro.hh b/src/AtomicMacro.hh index 4c31c853..3a67f605 100644 --- a/src/AtomicMacro.hh +++ b/src/AtomicMacro.hh @@ -1,3 +1,8 @@ +#ifndef AtomicMacro_HH_ +#define AtomicMacro_HH_ + +#define USE_MACRO_FUNCTIONS 1 + //Determine which atomics to use based on platform being compiled for // //If compiling with CUDA @@ -8,26 +13,260 @@ #define USE_OPENMP_ATOMICS #endif +#ifdef HAVE_SYCL +#include +#include +#endif + +// -------------------------------------------------- +// Original Names -> Inline function names +// -------------------------------------------------- +// ATOMIC_WRITE( x, v ) -> ATOMIC_WRITE +// ATOMIC_UPDATE( x ) -> ATOMIC_INCREMENT +// ATOMIC_ADD( x, v ) -> ATOMIC_ADD +// ATOMIC_CAPTURE( x, v, p ) -> ATOMIC_FETCH_ADD +// -------------------------------------------------- + +#if defined (USE_MACRO_FUNCTIONS) + +#define ATOMIC_CAPTURE( x, v, p ) ATOMIC_FETCH_ADD((x),(v),(p)) +#define ATOMIC_UPDATE( x ) ATOMIC_INCREMENT((x)) + +#if defined(HAVE_SYCL) + +static const cl::sycl::memory_order order = cl::sycl::memory_order::relaxed; +static const cl::sycl::access::address_space space = cl::sycl::access::address_space::global_space; + +template +inline void ATOMIC_WRITE(T & x, T v) { + //x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + //atomicAdd( &x, 1 ); + T one{1}; + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + cl::sycl::atomic_fetch_add(y, one, order); +} + +template +inline void ATOMIC_ADD(T& x, T v) { + //atomicAdd( &x, v ); + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + cl::sycl::atomic_fetch_add(y, v, order); +} + +template <> +inline void ATOMIC_ADD(double& x, double v) { + static_assert(sizeof(double) == sizeof(uint64_t), "Unsafe: double is not 64-bits"); + //atomicAdd( &x, v ); + cl::sycl::atomic t( (cl::sycl::multi_ptr( reinterpret_cast(&x)) )); + uint64_t old_i = t.load(order); + double old_d; + do { + old_d = *reinterpret_cast(&old_i); + const double new_d = old_d + v; + const uint64_t new_i = *reinterpret_cast(&new_d); + if (t.compare_exchange_strong(old_i, new_i, order)) break; + } while (true); + // p = old_d; +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + //atomicAdd( &x, v ); + T1 val = static_cast(v); + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + cl::sycl::atomic_fetch_add(y, val, order); +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + //p = atomicAdd( &x, v ); + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + p = cl::sycl::atomic_fetch_add(y, v, order); +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + //p = atomicAdd( &x, v ); + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + T1 val = static_cast(v); + p = cl::sycl::atomic_fetch_add(y, val, order); +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + //p = atomicAdd( &x, v ); + T1 val = static_cast(v); + cl::sycl::atomic y( (cl::sycl::multi_ptr(&x))); + p = cl::sycl::atomic_fetch_add(y, val, order); +} + +#elif defined(HAVE_CUDA) && defined(__CUDA_ARCH__) + +template +inline void ATOMIC_WRITE(T & x, T v) { + x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + atomicAdd( &x, 1 ); +} + +template +inline void ATOMIC_ADD(T& x, T v) { + atomicAdd( &x, v ); +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + atomicAdd( &x, v ); +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + p = atomicAdd( &x, v ); +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + p = atomicAdd( &x, v ); +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + p = atomicAdd( &x, v ); +} + +#elif defined(USE_OPENMP_ATOMICS) + +template +inline void ATOMIC_WRITE(T & x, T v) { + _Pragma("omp atomic write") + x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + _Pragma("omp atomic update") + x++; +} + +template +inline void ATOMIC_ADD(T& x, T v) { + _Pragma("omp atomic") + x += v; +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + _Pragma("omp atomic") + x += v; +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + _Pragma("omp atomic capture") + {p = x; x = x + v;} +} -#if defined (HAVE_CUDA) +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + _Pragma("omp atomic capture") + {p = x; x = x + v;} +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + _Pragma("omp atomic capture") + {p = x; x = x + v;} +} + +#else // SEQUENTIAL + +template +inline void ATOMIC_WRITE(T & x, T v) { + x = v; +} + +template +inline void ATOMIC_INCREMENT(T& x) { + x++; +} + +template +inline void ATOMIC_ADD(T& x, T v) { + x += v; +} + +template +inline void ATOMIC_ADD(T1& x, T2 v) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + x += v; +} + +template +inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) { + {p = x; x = x + v;} +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + {p = x; x = x + v;} +} + +template +inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) { + static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large"); + static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large"); + {p = x; x = x + v;} +} + +#endif // BACKENDS + +#else // ! USE_MACRO_FUNCTIONS + +#if defined (HAVE_SYCL) + +#error You must define USE_MACRO_FUNCTIONS with SYCL! + +#elif defined (HAVE_CUDA) //If in a CUDA GPU section use the CUDA atomics #ifdef __CUDA_ARCH__ //Currently not atomic here. But its only used when it does not necissarially need to be atomic. #define ATOMIC_WRITE( x, v ) \ - x = v; + x = v; #define ATOMIC_ADD( x, v ) \ atomicAdd( &x, v ); - + #define ATOMIC_UPDATE( x ) \ atomicAdd( &x, 1 ); #define ATOMIC_CAPTURE( x, v, p ) \ p = atomicAdd( &x, v ); + //If in a CPU OpenMP section use the OpenMP atomics #elif defined (USE_OPENMP_ATOMICS) + #define ATOMIC_WRITE( x, v ) \ _Pragma("omp atomic write") \ x = v; @@ -46,6 +285,7 @@ //If in a serial section, no need to use atomics #else + #define ATOMIC_WRITE( x, v ) \ x = v; @@ -62,6 +302,7 @@ //If in a OpenMP section use the OpenMP atomics #elif defined (USE_OPENMP_ATOMICS) + #define ATOMIC_WRITE( x, v ) \ _Pragma("omp atomic write") \ x = v; @@ -74,12 +315,13 @@ _Pragma("omp atomic update") \ x++; - #define ATOMIC_CAPTURE( x, v, p ) \ - _Pragma("omp atomic capture") \ - {p = x; x = x + v;} + #define ATOMIC_CAPTURE( x, v, p ) \ + _Pragma("omp atomic capture") \ + {p = x; x = x + v;} //If in a serial section, no need to use atomics #else + #define ATOMIC_WRITE( x, v ) \ x = v; @@ -91,4 +333,9 @@ #define ATOMIC_CAPTURE( x, v, p ) \ {p = x; x = x + v;} -#endif + +#endif // BACKENDS + +#endif // USE_MACRO_FUNCTIONS + +#endif // AtomicMacro_HH_ From d14bbb5b65545858afba6abe0fcf7cc2f7614cb7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 27 Jun 2020 09:31:22 -0700 Subject: [PATCH 04/12] re-enable printf in device code using Intel extension Signed-off-by: Jeff Hammond --- src/CollisionEvent.cc | 6 +++++- src/NuclearData.cc | 6 +++++- src/cudaUtils.hh | 11 +++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/CollisionEvent.cc b/src/CollisionEvent.cc index e684b200..d66b9a57 100644 --- a/src/CollisionEvent.cc +++ b/src/CollisionEvent.cc @@ -12,6 +12,7 @@ #include "DeclareMacro.hh" #include "AtomicMacro.hh" #include "mathHelp.hh" +#include "cudaUtils.hh" #define MAX_PRODUCTION_SIZE 4 @@ -116,7 +117,10 @@ bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned i ATOMIC_ADD( monteCarlo->_tallies->_balanceTask[tally_index]._produce, nOut); break; case NuclearDataReaction::Undefined: -#ifndef HAVE_SYCL +#ifdef HAVE_SYCL + static const OPENCL_CONSTANT char format[] = "reactionType invalid\n"; + sycl::intel::experimental::printf(format); +#else printf("reactionType invalid\n"); #endif qs_assert(false); diff --git a/src/NuclearData.cc b/src/NuclearData.cc index b7bb4906..2ed3224d 100644 --- a/src/NuclearData.cc +++ b/src/NuclearData.cc @@ -4,6 +4,7 @@ #include "DeclareMacro.hh" #include "qs_assert.hh" #include "mathHelp.hh" +#include "cudaUtils.hh" // Set the cross section values and reaction type // Cross sections are scaled to produce the supplied reactionCrossSection at 1MeV. @@ -80,7 +81,10 @@ void NuclearDataReaction::sampleCollision( } break; case Undefined: -#ifndef HAVE_SYCL +#ifdef HAVE_SYCL + static const OPENCL_CONSTANT char format[] = "_reactionType invalid\n"; + sycl::intel::experimental::printf(format); +#else printf("_reactionType invalid\n"); #endif qs_assert(false); diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh index 1db62cd2..808146e1 100644 --- a/src/cudaUtils.hh +++ b/src/cudaUtils.hh @@ -8,10 +8,21 @@ #endif #if defined(HAVE_SYCL) + #include + extern sycl::queue q; // global variable for device queue + +#ifdef __SYCL_DEVICE_ONLY__ +#define OPENCL_CONSTANT __attribute__((opencl_constant)) #else +#define OPENCL_CONSTANT +#endif + +#else // not SYCL + #define SYCL_EXTERNAL + #endif #ifdef HAVE_OPENMP_TARGET From c6f20a60e8d1a9a05709b3f5f2e61e4a76f6f574 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 27 Jun 2020 09:35:01 -0700 Subject: [PATCH 05/12] re-enable printf in device code using Intel extension Signed-off-by: Jeff Hammond --- src/MC_Segment_Outcome.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/MC_Segment_Outcome.cc b/src/MC_Segment_Outcome.cc index 3c629514..12a892c5 100644 --- a/src/MC_Segment_Outcome.cc +++ b/src/MC_Segment_Outcome.cc @@ -14,6 +14,7 @@ #include "PhysicalConstants.hh" #include "DeclareMacro.hh" #include "mathHelp.hh" +#include "cudaUtils.hh" HOST_DEVICE static inline unsigned int MC_Find_Min(const double *array, @@ -46,12 +47,9 @@ MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Part if ( mc_particle.num_mean_free_paths > -900.0 ) { -#if defined(HAVE_SYCL) - // FIXME -#elif 0 - std::string output_string; - MC_Warning( "Forced Collision: num_mean_free_paths < 0 \n" - "Particle record:\n%s", output_string.c_str()); +#ifdef HAVE_SYCL + static const OPENCL_CONSTANT char format[] = " MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n"; + sycl::intel::experimental::printf(format); #else printf(" MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n"); #endif From a50696bc5424747ac1dec50dc14501ecdae2d7d4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 27 Jun 2020 09:38:48 -0700 Subject: [PATCH 06/12] re-enable printf in device code using Intel extension Signed-off-by: Jeff Hammond --- src/qs_assert.hh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/qs_assert.hh b/src/qs_assert.hh index 7c0059d5..5eebf080 100644 --- a/src/qs_assert.hh +++ b/src/qs_assert.hh @@ -1,4 +1,5 @@ #include +#include "cudaUtils.hh" #ifdef __CUDA_ARCH__ @@ -18,6 +19,8 @@ { \ if (!(cond)) \ { \ + static const OPENCL_CONSTANT char format[] = "file=%s: line=%d ERROR\n"; \ + sycl::intel::experimental::printf(format,__FILE__,__LINE__); \ } \ } while(0) From 7794a66f2a3d1ea6446519615dcadb7a15c2a707 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 27 Jun 2020 09:44:01 -0700 Subject: [PATCH 07/12] globally rename q->sycl_device_queue Signed-off-by: Jeff Hammond --- src/MemoryControl.hh | 4 ++-- src/MonteCarlo.cc | 28 ++++++++++++++-------------- src/cudaUtils.hh | 2 +- src/initMC.cc | 6 +++--- src/main.cc | 22 +++++++++++----------- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/MemoryControl.hh b/src/MemoryControl.hh index 19d1ed71..6ffb92ce 100644 --- a/src/MemoryControl.hh +++ b/src/MemoryControl.hh @@ -24,7 +24,7 @@ namespace MemoryControl case AllocationPolicy::UVM_MEM: void * ptr; #ifdef HAVE_SYCL - ptr = (void *)sycl::malloc_shared(size * sizeof(T), q); + ptr = (void *)sycl::malloc_shared(size * sizeof(T), sycl_device_queue); #else cudaMallocManaged(&ptr, size*sizeof(T), cudaMemAttachGlobal); #endif @@ -50,7 +50,7 @@ namespace MemoryControl case UVM_MEM: for (int i=0; i < size; ++i) data[i].~T(); #ifdef HAVE_SYCL - sycl::free(data, q); + sycl::free(data, sycl_device_queue); #else cudaFree(data); #endif diff --git a/src/MonteCarlo.cc b/src/MonteCarlo.cc index f9fb116c..aca40e45 100644 --- a/src/MonteCarlo.cc +++ b/src/MonteCarlo.cc @@ -28,10 +28,10 @@ MonteCarlo::MonteCarlo(const Parameters& params) #if defined(HAVE_UVM) void *ptr1, *ptr2, *ptr3, *ptr4; #ifdef HAVE_SYCL - ptr1 = (void *)sycl::malloc_shared(sizeof(Tallies), q); - ptr2 = (void *)sycl::malloc_shared(sizeof(MC_Processor_Info), q); - ptr3 = (void *)sycl::malloc_shared(sizeof(MC_Time_Info), q); - ptr4 = (void *)sycl::malloc_shared(sizeof(MC_Fast_Timer_Container), q); + ptr1 = (void *)sycl::malloc_shared(sizeof(Tallies), sycl_device_queue); + ptr2 = (void *)sycl::malloc_shared(sizeof(MC_Processor_Info), sycl_device_queue); + ptr3 = (void *)sycl::malloc_shared(sizeof(MC_Time_Info), sycl_device_queue); + ptr4 = (void *)sycl::malloc_shared(sizeof(MC_Fast_Timer_Container), sycl_device_queue); #else cudaMallocManaged( &ptr1, sizeof(Tallies), cudaMemAttachHost ); cudaMallocManaged( &ptr2, sizeof(MC_Processor_Info), cudaMemAttachHost ); @@ -105,8 +105,8 @@ MonteCarlo::MonteCarlo(const Parameters& params) #if defined(HAVE_UVM) void *ptr5, *ptr6; #ifdef HAVE_SYCL - ptr5 = (void *)sycl::malloc_shared(sizeof(MC_Particle_Buffer), q); - ptr6 = (void *)sycl::malloc_shared(sizeof(ParticleVaultContainer), q); + ptr5 = (void *)sycl::malloc_shared(sizeof(MC_Particle_Buffer), sycl_device_queue); + ptr6 = (void *)sycl::malloc_shared(sizeof(ParticleVaultContainer), sycl_device_queue); #else cudaMallocManaged( &ptr5, sizeof(MC_Particle_Buffer) ); cudaMallocManaged( &ptr6, sizeof(ParticleVaultContainer), cudaMemAttachHost ); @@ -137,14 +137,14 @@ MonteCarlo::~MonteCarlo() particle_buffer->~MC_Particle_Buffer(); #ifdef HAVE_SYCL - sycl::free(_nuclearData, q); - sycl::free(_particleVaultContainer, q); - sycl::free(_materialDatabase, q); - sycl::free(_tallies, q); - sycl::free(processor_info, q); - sycl::free(time_info, q); - sycl::free(fast_timer, q); - sycl::free(particle_buffer, q); + sycl::free(_nuclearData, sycl_device_queue); + sycl::free(_particleVaultContainer, sycl_device_queue); + sycl::free(_materialDatabase, sycl_device_queue); + sycl::free(_tallies, sycl_device_queue); + sycl::free(processor_info, sycl_device_queue); + sycl::free(time_info, sycl_device_queue); + sycl::free(fast_timer, sycl_device_queue); + sycl::free(particle_buffer, sycl_device_queue); #else cudaFree( _nuclearData ); cudaFree( _particleVaultContainer); diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh index 808146e1..b475dc1d 100644 --- a/src/cudaUtils.hh +++ b/src/cudaUtils.hh @@ -11,7 +11,7 @@ #include -extern sycl::queue q; // global variable for device queue +extern sycl::queue sycl_device_queue; // global variable for device queue #ifdef __SYCL_DEVICE_ONLY__ #define OPENCL_CONSTANT __attribute__((opencl_constant)) diff --git a/src/initMC.cc b/src/initMC.cc index 02b38201..5c41f6de 100644 --- a/src/initMC.cc +++ b/src/initMC.cc @@ -55,7 +55,7 @@ MonteCarlo* initMC(const Parameters& params) #ifdef HAVE_UVM void* ptr; #ifdef HAVE_SYCL - ptr = (void *)sycl::malloc_shared(sizeof(MonteCarlo), q); + ptr = (void *)sycl::malloc_shared(sizeof(MonteCarlo), sycl_device_queue); #else cudaMallocManaged( &ptr, sizeof(MonteCarlo), cudaMemAttachGlobal ); #endif @@ -141,8 +141,8 @@ namespace #if defined HAVE_UVM void *ptr1, *ptr2; #ifdef HAVE_SYCL - ptr1 = (void *)sycl::malloc_shared(sizeof(NuclearData), q); - ptr2 = (void *)sycl::malloc_shared(sizeof(MaterialDatabase), q); + ptr1 = (void *)sycl::malloc_shared(sizeof(NuclearData), sycl_device_queue); + ptr2 = (void *)sycl::malloc_shared(sizeof(MaterialDatabase), sycl_device_queue); #else cudaMallocManaged( &ptr1, sizeof(NuclearData), cudaMemAttachGlobal ); cudaMallocManaged( &ptr2, sizeof(MaterialDatabase), cudaMemAttachGlobal ); diff --git a/src/main.cc b/src/main.cc index dbc05105..49972c61 100644 --- a/src/main.cc +++ b/src/main.cc @@ -27,7 +27,7 @@ #include "git_vers.hh" #ifdef HAVE_SYCL -sycl::queue q; +sycl::queue sycl_device_queue; #endif void gameOver(); @@ -52,15 +52,15 @@ int main(int argc, char** argv) char * devchar = std::getenv("QS_DEVICE"); std::string devname = (devchar==NULL ? "None" : devchar); if (devname == "CPU") { - q = cl::sycl::cpu_selector{}; + sycl_device_queue = cl::sycl::cpu_selector{}; } else if (devname == "GPU") { - q = cl::sycl::gpu_selector{}; + sycl_device_queue = cl::sycl::gpu_selector{}; } else if (devname == "HOST") { - q = cl::sycl::host_selector{}; + sycl_device_queue = cl::sycl::host_selector{}; } else { @@ -69,10 +69,10 @@ int main(int argc, char** argv) } // DEBUG - REMOVE LATER - if ( q.get_device().is_cpu() ) std::cout << "is cpu" << std::endl; - if ( q.get_device().is_gpu() ) std::cout << "is gpu" << std::endl; - if ( q.get_device().is_host() ) std::cout << "is host" << std::endl; - if ( q.get_device().is_accelerator() ) std::cout << "is accelerator" << std::endl; + if ( sycl_device_queue.get_device().is_cpu() ) std::cout << "is cpu" << std::endl; + if ( sycl_device_queue.get_device().is_gpu() ) std::cout << "is gpu" << std::endl; + if ( sycl_device_queue.get_device().is_host() ) std::cout << "is host" << std::endl; + if ( sycl_device_queue.get_device().is_accelerator() ) std::cout << "is accelerator" << std::endl; #endif // mcco stores just about everything. @@ -116,7 +116,7 @@ int main(int argc, char** argv) #ifdef HAVE_UVM mcco->~MonteCarlo(); #ifdef HAVE_SYCL - sycl::free(mcco, q); + sycl::free(mcco, sycl_device_queue); #else cudaFree( mcco ); #endif @@ -273,13 +273,13 @@ void cycleTracking(MonteCarlo *monteCarlo) { const size_t N = numParticles; #ifdef HAVE_SYCL - q.submit([&](sycl::handler &h) { + sycl_device_queue.submit([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{N}, [=](sycl::item<1> it) { int particle_index = it[0]; CycleTrackingGuts( monteCarlo, particle_index, processingVault, processedVault ); }); }); - q.wait(); + sycl_device_queue.wait(); #endif } break; From 96f836815c6d3efe0362f5e1dc87ab271ca88f81 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 9 Oct 2020 13:06:31 -0400 Subject: [PATCH 08/12] add DPC++ PTX stuff Signed-off-by: Jeff Hammond --- src/Makefile | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/Makefile b/src/Makefile index 2effc631..9045ec1a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -184,20 +184,40 @@ LDFLAGS = $(OPENMP_LDFLAGS) ############################################################################### # SYCL/DPC++ ############################################################################### -CXX = dpcpp -CXXFLAGS = -std=c++14 -CXXFLAGS += -O3 -CXXFLAGS += -ffast-math -#CXXFLAGS += -g -CXXFLAGS += -ferror-limit=1 -CPPFLAGS = -DHAVE_SYCL -LDFLAGS = + +CPPFLAGS = -DHAVE_SYCL + +CXXFLAGS = +CXXFLAGS += -std=c++17 +CXXFLAGS += -g +CXXFLAGS += -O3 +CXXFLAGS += -ffast-math +CXXFLAGS += -ferror-limit=3 + +CXXFLAGS += -fsycl +CXXFLAGS += -fsycl-unnamed-lambda + +# ------------------------------------------------------- +# Intel oneAPI DPC++ compiler that supports Intel devices +# ------------------------------------------------------- +#CXX = dpcpp +# NVIDIA back-end does not support printf yet... +#CPPFLAGS = -DHAVE_SYCL_PRINTF # GPU offline compile # https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/gpu-flow/offline-compilation-for-gpu.html #CXXFLAGS += -fsycl-targets=spir64_gen-unknown-linux-sycldevice #LDFLAGS += -Xsycl-target-backend=spir64_gen-unknown-linux-sycldevice "-device skl" +# ------------------------------------------------------- +# Intel DPC++ open-source compiler built for NVIDIA +# ------------------------------------------------------- +DPCPP_DIR = /media/disk2/jrhammon/llvm/build +CXX = $(DPCPP_DIR)/bin/clang++ +CXXFLAGS += -Wno-unknown-cuda-version +CXXFLAGS += -fsycl-targets=nvptx64-nvidia-cuda-sycldevice +LDFLAGS += -L$(DPCPP_DIR)/lib -Wl,-rpath=$(DPCPP_DIR)/lib + ################################################################################ ### Below here, it is pitch black. ### ### You are likely to be eaten by a grue. ### From 5512778c2d803b3a625630dd41072a79e7375206 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 9 Oct 2020 13:06:53 -0400 Subject: [PATCH 09/12] deal with broken printf w/ DPC++ NVIDIA backend Signed-off-by: Jeff Hammond --- src/CollisionEvent.cc | 7 ++++++- src/MC_Segment_Outcome.cc | 4 +++- src/NuclearData.cc | 4 +++- src/cudaUtils.hh | 3 +++ src/qs_assert.hh | 18 +++++++++++++++++- 5 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/CollisionEvent.cc b/src/CollisionEvent.cc index d66b9a57..b0522016 100644 --- a/src/CollisionEvent.cc +++ b/src/CollisionEvent.cc @@ -117,13 +117,18 @@ bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned i ATOMIC_ADD( monteCarlo->_tallies->_balanceTask[tally_index]._produce, nOut); break; case NuclearDataReaction::Undefined: + { #ifdef HAVE_SYCL +#if HAVE_SYCL_PRINTF static const OPENCL_CONSTANT char format[] = "reactionType invalid\n"; - sycl::intel::experimental::printf(format); + syclx::printf(format); +#endif #else printf("reactionType invalid\n"); #endif qs_assert(false); + break; + } } if( nOut == 0 ) return false; diff --git a/src/MC_Segment_Outcome.cc b/src/MC_Segment_Outcome.cc index 12a892c5..c7fb7f41 100644 --- a/src/MC_Segment_Outcome.cc +++ b/src/MC_Segment_Outcome.cc @@ -48,8 +48,10 @@ MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Part if ( mc_particle.num_mean_free_paths > -900.0 ) { #ifdef HAVE_SYCL +#ifdef HAVE_SYCL_PRINTF static const OPENCL_CONSTANT char format[] = " MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n"; - sycl::intel::experimental::printf(format); + syclx::printf(format); +#endif #else printf(" MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n"); #endif diff --git a/src/NuclearData.cc b/src/NuclearData.cc index 2ed3224d..f6f46326 100644 --- a/src/NuclearData.cc +++ b/src/NuclearData.cc @@ -82,8 +82,10 @@ void NuclearDataReaction::sampleCollision( break; case Undefined: #ifdef HAVE_SYCL +#ifdef HAVE_SYCL_PRINTF static const OPENCL_CONSTANT char format[] = "_reactionType invalid\n"; - sycl::intel::experimental::printf(format); + syclx::printf(format); +#endif #else printf("_reactionType invalid\n"); #endif diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh index b475dc1d..08ec23f1 100644 --- a/src/cudaUtils.hh +++ b/src/cudaUtils.hh @@ -19,6 +19,9 @@ extern sycl::queue sycl_device_queue; // global variable for device queue #define OPENCL_CONSTANT #endif +// printf +namespace syclx = sycl::ONEAPI::experimental; + #else // not SYCL #define SYCL_EXTERNAL diff --git a/src/qs_assert.hh b/src/qs_assert.hh index 5eebf080..f617c904 100644 --- a/src/qs_assert.hh +++ b/src/qs_assert.hh @@ -14,18 +14,34 @@ #elif defined(HAVE_SYCL) +#ifdef HAVE_SYCL_PRINTF + #define qs_assert( cond) \ do \ { \ if (!(cond)) \ { \ static const OPENCL_CONSTANT char format[] = "file=%s: line=%d ERROR\n"; \ - sycl::intel::experimental::printf(format,__FILE__,__LINE__); \ + syclx::printf(format,__FILE__,__LINE__); \ } \ } while(0) #else +#define qs_assert( cond) \ + do \ + { \ + if (!(cond)) \ + { \ + /* FIXME eventually... + * printf("file=%s: line=%d ERROR\n",__FILE__,__LINE__); */ \ + } \ + } while(0) + +#endif + +#else + #define qs_assert( cond) \ do \ { \ From 5d992a5971330bc4e426c202090c1508bd73ce71 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 13 Oct 2020 10:52:41 -0700 Subject: [PATCH 10/12] build system changes --- src/Makefile | 31 ++++++++++++++++--------------- src/cudaUtils.hh | 3 ++- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/Makefile b/src/Makefile index 9045ec1a..40ad886b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -131,14 +131,14 @@ LDFLAGS += -lstdc++ # This works on a Macbook (if gcc is installed) ############################################################################### # -OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp -OPENMP_LDFLAGS = -fopenmp -OPTFLAGS = -g -O2 +#OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp +#OPENMP_LDFLAGS = -fopenmp +#OPTFLAGS = -g -O2 # -CXX=g++ -CXXFLAGS = -std=c++11 $(OPTFLAGS) -Wpedantic -CPPFLAGS = $(OPENMP_FLAGS) -LDFLAGS = $(OPENMP_LDFLAGS) +#CXX=g++ +#CXXFLAGS = -std=c++11 $(OPTFLAGS) -Wpedantic +#CPPFLAGS = $(OPENMP_FLAGS) +#LDFLAGS = $(OPENMP_LDFLAGS) ############################################################################### @@ -189,7 +189,6 @@ CPPFLAGS = -DHAVE_SYCL CXXFLAGS = CXXFLAGS += -std=c++17 -CXXFLAGS += -g CXXFLAGS += -O3 CXXFLAGS += -ffast-math CXXFLAGS += -ferror-limit=3 @@ -197,12 +196,14 @@ CXXFLAGS += -ferror-limit=3 CXXFLAGS += -fsycl CXXFLAGS += -fsycl-unnamed-lambda +#CXXFLAGS += -g # SPIR-V backend generates debug output when this is used + # ------------------------------------------------------- # Intel oneAPI DPC++ compiler that supports Intel devices # ------------------------------------------------------- -#CXX = dpcpp +CXX = dpcpp # NVIDIA back-end does not support printf yet... -#CPPFLAGS = -DHAVE_SYCL_PRINTF +CPPFLAGS += -DHAVE_SYCL_PRINTF # GPU offline compile # https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/gpu-flow/offline-compilation-for-gpu.html @@ -212,11 +213,11 @@ CXXFLAGS += -fsycl-unnamed-lambda # ------------------------------------------------------- # Intel DPC++ open-source compiler built for NVIDIA # ------------------------------------------------------- -DPCPP_DIR = /media/disk2/jrhammon/llvm/build -CXX = $(DPCPP_DIR)/bin/clang++ -CXXFLAGS += -Wno-unknown-cuda-version -CXXFLAGS += -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -LDFLAGS += -L$(DPCPP_DIR)/lib -Wl,-rpath=$(DPCPP_DIR)/lib +#DPCPP_DIR = /home/jrhammon/ISYCL/llvm/build/install +#CXX = $(DPCPP_DIR)/bin/clang++ +#CXXFLAGS += -Wno-unknown-cuda-version +#CXXFLAGS += -fsycl-targets=nvptx64-nvidia-cuda-sycldevice +#LDFLAGS += -L$(DPCPP_DIR)/lib -Wl,-rpath=$(DPCPP_DIR)/lib ################################################################################ ### Below here, it is pitch black. ### diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh index 08ec23f1..60bfe009 100644 --- a/src/cudaUtils.hh +++ b/src/cudaUtils.hh @@ -20,7 +20,8 @@ extern sycl::queue sycl_device_queue; // global variable for device queue #endif // printf -namespace syclx = sycl::ONEAPI::experimental; +//namespace syclx = sycl::ONEAPI::experimental; // latest OSS DPC++ +namespace syclx = sycl::intel::experimental; // dpcpp beta09 #else // not SYCL From 3ae32fa00b4af044c2b662ec949fdfceefffb9a9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 13 Oct 2020 12:48:35 -0700 Subject: [PATCH 11/12] flag workaround compiler bug --- src/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Makefile b/src/Makefile index 40ad886b..6064a998 100644 --- a/src/Makefile +++ b/src/Makefile @@ -205,6 +205,9 @@ CXX = dpcpp # NVIDIA back-end does not support printf yet... CPPFLAGS += -DHAVE_SYCL_PRINTF +# works around https://github.com/intel/llvm/issues/2629 +CXXFLAGS += -fno-sycl-early-optimizations + # GPU offline compile # https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/gpu-flow/offline-compilation-for-gpu.html #CXXFLAGS += -fsycl-targets=spir64_gen-unknown-linux-sycldevice From 0b9c632ed7669de1aa49ddd131dfb419acf85a92 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 15 Oct 2020 16:56:13 -0700 Subject: [PATCH 12/12] disable unrelated flags --- src/Makefile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/Makefile b/src/Makefile index 6064a998..54b73d54 100644 --- a/src/Makefile +++ b/src/Makefile @@ -103,27 +103,27 @@ LDFLAGS = # Cuda on LLNL CORAL EA nodes ############################################################################### ## Choose one Cuda path -CUDA_PATH = /usr +#CUDA_PATH = /usr #CUDA_PATH = /usr/tcetmp/packages/cuda-9.0.176 -HOST_COMPILER = g++ +#HOST_COMPILER = g++ -OPTFLAGS = -O2 +#OPTFLAGS = -O2 ## Version below for debugging ##OPTFLAGS = -DUSE_NVTX -g -G -lineinfo -O0 -CUDA_FLAGS = -I${CUDA_PATH}/include/ -CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart +#CUDA_FLAGS = -I${CUDA_PATH}/include/ +#CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart # -CXX=$(CUDA_PATH)/bin/nvcc -CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) #-Xptxas -v -CXXFLAGS += -ccbin clang-8 -CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\" +#CXX=$(CUDA_PATH)/bin/nvcc +#CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) #-Xptxas -v +#CXXFLAGS += -ccbin clang-8 +#CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\" #CXXFLAGS += --compiler-bindir=$(HOST_COMPILER) -CPPFLAGS = -x cu -dc +#CPPFLAGS = -x cu -dc #CPPFLAGS += -DHAVE_MPI -DHAVE_ASYNC_MPI -LDFLAGS = $(CUDA_LDFLAGS) -LDFLAGS += -lstdc++ +#LDFLAGS = $(CUDA_LDFLAGS) +#LDFLAGS += -lstdc++ ##LDFLAGS += ${CUDA_PATH}/lib64/libnvToolsExt.so ###############################################################################