From 451f9ba9cd39cd405d1d87f2316bd1d73d8e849e Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 26 May 2020 17:32:17 -0700
Subject: [PATCH 01/12] port to SYCL/DPC++

look for HAVE_SYCL and SYCL_EXTERNAL for most of the changes.
this works on GPU and HOST but not CPU (OpenCL issue).

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>
---
 src/CollisionEvent.cc          |  18 +++--
 src/CollisionEvent.hh          |   2 +-
 src/CycleTracking.cc           |   4 +-
 src/CycleTracking.hh           |   4 +-
 src/DeclareMacro.hh            |   4 +
 src/DirectionCosine.cc         |   7 +-
 src/DirectionCosine.hh         |   8 +-
 src/MCT.cc                     |   2 +-
 src/MCT.hh                     |  12 +--
 src/MC_Facet_Crossing_Event.hh |   2 +-
 src/MC_Facet_Geometry.hh       |   2 +-
 src/MC_Fast_Timer.cc           |   2 +-
 src/MC_Location.hh             |   4 +-
 src/MC_Particle.hh             |   2 +
 src/MC_RNG_State.cc            |   2 +-
 src/MC_RNG_State.hh            |   4 +-
 src/MC_Segment_Outcome.cc      |  14 ++--
 src/MC_Segment_Outcome.hh      |   2 +-
 src/MC_SourceNow.cc            |   5 +-
 src/MC_Vector.hh               |  32 ++++----
 src/MacroscopicCrossSection.hh |   4 +-
 src/Makefile                   | 144 +++++++++++----------------------
 src/MemoryControl.hh           |  21 +++--
 src/MonteCarlo.cc              |  30 ++++++-
 src/NuclearData.cc             |  14 ++--
 src/NuclearData.hh             |  12 +--
 src/ParticleVault.hh           |   4 +-
 src/ParticleVaultContainer.cc  |   2 +-
 src/ParticleVaultContainer.hh  |   4 +-
 src/SendQueue.hh               |   2 +-
 src/cudaUtils.hh               |  16 +++-
 src/initMC.cc                  |  29 +++++--
 src/main.cc                    |  92 +++++++++++++++++----
 src/mathHelp.hh                |  18 +++++
 src/qs_assert.hh               |  14 ++++
 35 files changed, 323 insertions(+), 215 deletions(-)
 create mode 100644 src/mathHelp.hh

diff --git a/src/CollisionEvent.cc b/src/CollisionEvent.cc
index 20b667b4..e684b200 100644
--- a/src/CollisionEvent.cc
+++ b/src/CollisionEvent.cc
@@ -11,6 +11,7 @@
 #include "PhysicalConstants.hh"
 #include "DeclareMacro.hh"
 #include "AtomicMacro.hh"
+#include "mathHelp.hh"
 
 #define MAX_PRODUCTION_SIZE 4
 
@@ -21,19 +22,19 @@
 //  Return true if the particle will continue.
 //----------------------------------------------------------------------------------------------------------------------
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 void updateTrajectory( double energy, double angle, MC_Particle& particle )
 {
     particle.kinetic_energy = energy;
     double cosTheta = angle;
     double randomNumber = rngSample(&particle.random_number_seed);
     double phi = 2 * 3.14159265 * randomNumber;
-    double sinPhi = sin(phi);
-    double cosPhi = cos(phi);
-    double sinTheta = sqrt((1.0 - (cosTheta*cosTheta)));
+    double sinPhi = SIN(phi);
+    double cosPhi = COS(phi);
+    double sinTheta = SQRT((1.0 - (cosTheta*cosTheta)));
     particle.direction_cosine.Rotate3DVector(sinTheta, cosTheta, sinPhi, cosPhi);
     double speed = (PhysicalConstants::_speedOfLight *
-            sqrt((1.0 - ((PhysicalConstants::_neutronRestMassEnergy *
+            SQRT((1.0 - ((PhysicalConstants::_neutronRestMassEnergy *
             PhysicalConstants::_neutronRestMassEnergy) /
             ((energy + PhysicalConstants::_neutronRestMassEnergy) *
             (energy + PhysicalConstants::_neutronRestMassEnergy))))));
@@ -41,12 +42,11 @@ void updateTrajectory( double energy, double angle, MC_Particle& particle )
     particle.velocity.y = speed * particle.direction_cosine.beta;
     particle.velocity.z = speed * particle.direction_cosine.gamma;
     randomNumber = rngSample(&particle.random_number_seed);
-    particle.num_mean_free_paths = -1.0*log(randomNumber);
+    particle.num_mean_free_paths = -1.0*LOG(randomNumber);
 }
 HOST_DEVICE_END
 
-HOST_DEVICE
-
+HOST_DEVICE SYCL_EXTERNAL
 bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned int tally_index)
 {
    const MC_Cell_State &cell = monteCarlo->domain[mc_particle.domain].cell_state[mc_particle.cell];
@@ -116,7 +116,9 @@ bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned i
          ATOMIC_ADD( monteCarlo->_tallies->_balanceTask[tally_index]._produce, nOut);
          break;
       case NuclearDataReaction::Undefined:
+#ifndef HAVE_SYCL
          printf("reactionType invalid\n");
+#endif
          qs_assert(false);
    }
 
diff --git a/src/CollisionEvent.hh b/src/CollisionEvent.hh
index 270ae382..d1d42162 100644
--- a/src/CollisionEvent.hh
+++ b/src/CollisionEvent.hh
@@ -5,7 +5,7 @@ class MonteCarlo;
 class MC_Particle;
 
 #include "DeclareMacro.hh"
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned int tally_index );
 HOST_DEVICE_END
 
diff --git a/src/CycleTracking.cc b/src/CycleTracking.cc
index e6238a83..1286bc4a 100644
--- a/src/CycleTracking.cc
+++ b/src/CycleTracking.cc
@@ -11,7 +11,7 @@
 #include "macros.hh"
 #include "qs_assert.hh"
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 void CycleTrackingGuts( MonteCarlo *monteCarlo, int particle_index, ParticleVault *processingVault, ParticleVault *processedVault )
 {
     MC_Particle mc_particle;
@@ -30,7 +30,7 @@ void CycleTrackingGuts( MonteCarlo *monteCarlo, int particle_index, ParticleVaul
 }
 HOST_DEVICE_END
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 void CycleTrackingFunction( MonteCarlo *monteCarlo, MC_Particle &mc_particle, int particle_index, ParticleVault* processingVault, ParticleVault* processedVault)
 {
     bool keepTrackingThisParticle = false;
diff --git a/src/CycleTracking.hh b/src/CycleTracking.hh
index 4dea37a0..f1e04426 100644
--- a/src/CycleTracking.hh
+++ b/src/CycleTracking.hh
@@ -5,10 +5,10 @@ class ParticleVault;
 class MonteCarlo;
 class MC_Particle;
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 void CycleTrackingGuts( MonteCarlo *monteCarlo, int particle_index, ParticleVault *processingVault, ParticleVault *processedVault );
 HOST_DEVICE_END
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 void CycleTrackingFunction( MonteCarlo *monteCarlo, MC_Particle &mc_particle, int particle_index, ParticleVault* processingVault, ParticleVault* processedVault);
 HOST_DEVICE_END
diff --git a/src/DeclareMacro.hh b/src/DeclareMacro.hh
index 3599190f..9efa61e1 100644
--- a/src/DeclareMacro.hh
+++ b/src/DeclareMacro.hh
@@ -1,6 +1,10 @@
 #ifndef DECLAREMACRO_HH
 #define DECLAREMACRO_HH
 
+#ifdef HAVE_SYCL
+#include <CL/sycl.hpp>
+#endif
+
 #ifdef HAVE_CUDA
     #define HOST_DEVICE __host__ __device__
     #define HOST_DEVICE_CUDA __host__ __device__
diff --git a/src/DirectionCosine.cc b/src/DirectionCosine.cc
index ded7a30b..fc3f5571 100644
--- a/src/DirectionCosine.cc
+++ b/src/DirectionCosine.cc
@@ -1,13 +1,14 @@
 #include "DirectionCosine.hh"
 #include "MC_RNG_State.hh"
 #include "PhysicalConstants.hh"
+#include "mathHelp.hh"
 
 void DirectionCosine::Sample_Isotropic(uint64_t *seed)
 {
     this->gamma  = 1.0 - 2.0*rngSample(seed);
-    double sine_gamma  = sqrt((1.0 - (gamma*gamma)));
+    double sine_gamma  = SQRT((1.0 - (gamma*gamma)));
     double phi         = PhysicalConstants::_pi*(2.0*rngSample(seed) - 1.0);
 
-    this->alpha  = sine_gamma * cos(phi);
-    this->beta   = sine_gamma * sin(phi);
+    this->alpha  = sine_gamma * COS(phi);
+    this->beta   = sine_gamma * SIN(phi);
 }
diff --git a/src/DirectionCosine.hh b/src/DirectionCosine.hh
index e7dd7363..325721dc 100644
--- a/src/DirectionCosine.hh
+++ b/src/DirectionCosine.hh
@@ -1,9 +1,9 @@
 #ifndef DIRECTION_COSINE_INCLUDE
 #define DIRECTION_COSINE_INCLUDE
 
-#include <cmath>
 #include "portability.hh"
 #include "DeclareMacro.hh"
+#include "mathHelp.hh"
 
 HOST_DEVICE_CLASS
 class DirectionCosine
@@ -31,7 +31,7 @@ public:
    void Sample_Isotropic(uint64_t *seed);
 
    // rotate a direction cosine given the sine/cosine of theta and phi
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    inline void Rotate3DVector( double sine_Theta,
                                double cosine_Theta,
                                double sine_Phi,
@@ -119,12 +119,12 @@ HOST_DEVICE_END
 //        direction_cosine.beta =   cos_theta*sin_phi*Alpha + cos_phi*Beta + sin_theta*sin_phi*Gamma;
 //        direction_cosine.gamma = -sin_theta        *Alpha +                cos_theta        *Gamma;
 //----------------------------------------------------------------------------------------------------------------------
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 inline void DirectionCosine::Rotate3DVector(double sin_Theta, double cos_Theta, double sin_Phi, double cos_Phi)
 {
     // Calculate additional variables in the rotation matrix.
     double cos_theta = this->gamma;
-    double sin_theta = sqrt((1.0 - (cos_theta*cos_theta)));
+    double sin_theta = SQRT((1.0 - (cos_theta*cos_theta)));
 
     double cos_phi;
     double sin_phi;
diff --git a/src/MCT.cc b/src/MCT.cc
index 3faf3592..200a57bc 100644
--- a/src/MCT.cc
+++ b/src/MCT.cc
@@ -397,7 +397,7 @@ namespace
 
 
 ///  Reflects the particle off of a reflection boundary.
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 void MCT_Reflect_Particle(MonteCarlo *monteCarlo, MC_Particle &particle)
 {
     DirectionCosine *direction_cosine = particle.Get_Direction_Cosine();
diff --git a/src/MCT.hh b/src/MCT.hh
index e6852acd..1e155cb0 100644
--- a/src/MCT.hh
+++ b/src/MCT.hh
@@ -14,7 +14,7 @@ class Subfacet_Adjacency;
 class MonteCarlo;
 
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 MC_Nearest_Facet MCT_Nearest_Facet(
    MC_Particle *mc_particle,
    MC_Location &location,
@@ -22,12 +22,12 @@ MC_Nearest_Facet MCT_Nearest_Facet(
    const DirectionCosine *direction_cosine,
    double distance_threshold,
    double current_best_distance,
-   bool new_segment, 
+   bool new_segment,
    MonteCarlo* monteCarlo);
 HOST_DEVICE_END
 
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 void MCT_Generate_Coordinate_3D_G(
    uint64_t *random_number_seed,
    int domain_num,
@@ -36,17 +36,17 @@ void MCT_Generate_Coordinate_3D_G(
    MonteCarlo* monteCarlo);
 HOST_DEVICE_END
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 MC_Vector MCT_Cell_Position_3D_G(
    const MC_Domain   &domain,
    int cell_index);
 HOST_DEVICE_END
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 Subfacet_Adjacency &MCT_Adjacent_Facet(const MC_Location &location, MC_Particle &mc_particle, MonteCarlo* monteCarlo);
 HOST_DEVICE_END
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 void MCT_Reflect_Particle(MonteCarlo *mcco, MC_Particle &particle);
 HOST_DEVICE_END
 
diff --git a/src/MC_Facet_Crossing_Event.hh b/src/MC_Facet_Crossing_Event.hh
index 98170036..8b0743e7 100644
--- a/src/MC_Facet_Crossing_Event.hh
+++ b/src/MC_Facet_Crossing_Event.hh
@@ -7,7 +7,7 @@
 class ParticleVault;
 class MC_Particle;
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 MC_Tally_Event::Enum MC_Facet_Crossing_Event(MC_Particle &mc_particle, MonteCarlo* monteCarlo, int particle_index, ParticleVault* processingVault);
 HOST_DEVICE_END
 
diff --git a/src/MC_Facet_Geometry.hh b/src/MC_Facet_Geometry.hh
index 943e86da..12b9e809 100644
--- a/src/MC_Facet_Geometry.hh
+++ b/src/MC_Facet_Geometry.hh
@@ -22,7 +22,7 @@ public:
       C = ((r1.x - r0.x)*(r2.y - r0.y)) - ((r1.y - r0.y)*(r2.x - r0.x));
       D = -1.0*(A*r0.x + B*r0.y + C*r0.z);
 
-      double magnitude = sqrt(A * A + B * B + C * C);
+      double magnitude = std::sqrt(A * A + B * B + C * C);
 
       if ( magnitude == 0.0 )
       {
diff --git a/src/MC_Fast_Timer.cc b/src/MC_Fast_Timer.cc
index 70aa7bf9..0ea8366e 100644
--- a/src/MC_Fast_Timer.cc
+++ b/src/MC_Fast_Timer.cc
@@ -25,7 +25,7 @@ static double mc_std_dev(uint64_t const data[], int const nelm)
     for(int ndx=0; ndx<nelm; ++ndx) { mean+=data[ndx]; }
     mean=mean/nelm;
     for(int ndx=0; ndx<nelm; ++ndx) { sum_deviation += (data[ndx]-mean)*(data[ndx]-mean); }
-    return sqrt((double)sum_deviation / nelm);
+    return std::sqrt((double)sum_deviation / nelm);
 }
 
 void MC_Fast_Timer_Container::Print_Last_Cycle_Heading(int mpi_rank)
diff --git a/src/MC_Location.hh b/src/MC_Location.hh
index d6d082b3..0490a0db 100644
--- a/src/MC_Location.hh
+++ b/src/MC_Location.hh
@@ -33,12 +33,12 @@ HOST_DEVICE_CUDA
      facet(-1)
    {}
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    const MC_Domain& get_domain(MonteCarlo *mcco) const;
 };
 HOST_DEVICE_END
 
-HOST_DEVICE_CUDA
+HOST_DEVICE_CUDA SYCL_EXTERNAL
 inline bool operator==(const MC_Location& a, const MC_Location b)
 {
    return
diff --git a/src/MC_Particle.hh b/src/MC_Particle.hh
index 668cc19b..ed89e65e 100644
--- a/src/MC_Particle.hh
+++ b/src/MC_Particle.hh
@@ -127,6 +127,7 @@ public:
 //----------------------------------------------------------------------------------------------------------------------
 //  Return a MC_Location given domain, cell, facet.
 //----------------------------------------------------------------------------------------------------------------------
+   HOST_DEVICE_CUDA
 inline MC_Location MC_Particle::Get_Location() const
 {
     return MC_Location(domain, cell, facet);
@@ -135,6 +136,7 @@ inline MC_Location MC_Particle::Get_Location() const
 //----------------------------------------------------------------------------------------------------------------------
 //  Move the particle a straight-line distance along a specified cosine.
 //----------------------------------------------------------------------------------------------------------------------
+   HOST_DEVICE_CUDA
 inline void MC_Particle::Move_Particle( const DirectionCosine &my_direction_cosine,
                                       const double distance)
 {
diff --git a/src/MC_RNG_State.cc b/src/MC_RNG_State.cc
index 96c37c7e..b6f3de67 100644
--- a/src/MC_RNG_State.cc
+++ b/src/MC_RNG_State.cc
@@ -103,7 +103,7 @@ HOST_DEVICE_END
 //  This routine spawns a "child" random number seed from a "parent" random number seed.
 //----------------------------------------------------------------------------------------------------------------------
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 uint64_t rngSpawn_Random_Number_Seed(uint64_t *parent_seed)
 {
   uint64_t spawned_seed = hash_state(*parent_seed);
diff --git a/src/MC_RNG_State.hh b/src/MC_RNG_State.hh
index 8399d912..5e21f3a1 100644
--- a/src/MC_RNG_State.hh
+++ b/src/MC_RNG_State.hh
@@ -11,7 +11,7 @@
 //----------------------------------------------------------------------------------------------------------------------
 
 // Generate a new random number seed
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 uint64_t rngSpawn_Random_Number_Seed(uint64_t *parent_seed);
 HOST_DEVICE_END
 
@@ -19,7 +19,7 @@ HOST_DEVICE_END
 //  Sample returns the pseudo-random number produced by a call to a random
 //  number generator.
 //----------------------------------------------------------------------------------------------------------------------
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 inline double rngSample(uint64_t *seed)
 {
    // Reset the state from the previous value.
diff --git a/src/MC_Segment_Outcome.cc b/src/MC_Segment_Outcome.cc
index 81325697..3c629514 100644
--- a/src/MC_Segment_Outcome.cc
+++ b/src/MC_Segment_Outcome.cc
@@ -13,7 +13,7 @@
 #include "MCT.hh"
 #include "PhysicalConstants.hh"
 #include "DeclareMacro.hh"
-
+#include "mathHelp.hh"
 
 HOST_DEVICE
 static inline unsigned int MC_Find_Min(const double *array,
@@ -27,7 +27,7 @@ HOST_DEVICE_END
 //  (iii) census at the end of the time step.
 //--------------------------------------------------------------------------------------------------
 
-HOST_DEVICE 
+HOST_DEVICE SYCL_EXTERNAL
 MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned int &flux_tally_index)
 {
     // initialize distances to large number
@@ -46,12 +46,14 @@ MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Part
 
         if ( mc_particle.num_mean_free_paths > -900.0 )
         {
-#if 1
-            printf(" MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n");
- #else
+#if defined(HAVE_SYCL)
+            // FIXME
+#elif 0
             std::string output_string;
             MC_Warning( "Forced Collision: num_mean_free_paths < 0 \n"
                              "Particle record:\n%s", output_string.c_str());
+#else
+            printf(" MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n");
 #endif
         }
 
@@ -80,7 +82,7 @@ MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Part
         // the next collision from an exponential distribution.
         double random_number = rngSample(&mc_particle.random_number_seed);
 
-        mc_particle.num_mean_free_paths = -1.0*log(random_number);
+        mc_particle.num_mean_free_paths = -1.0*LOG(random_number);
     }
 
     // Calculate the distances to collision, nearest facet, and census.
diff --git a/src/MC_Segment_Outcome.hh b/src/MC_Segment_Outcome.hh
index f8d6a25c..65a60d81 100644
--- a/src/MC_Segment_Outcome.hh
+++ b/src/MC_Segment_Outcome.hh
@@ -32,7 +32,7 @@ struct MC_Collision_Event_Return
 };
 
 #include "DeclareMacro.hh"
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned int &flux_tally_index);
 HOST_DEVICE_END
 
diff --git a/src/MC_SourceNow.cc b/src/MC_SourceNow.cc
index 08e23138..2c1f7e96 100644
--- a/src/MC_SourceNow.cc
+++ b/src/MC_SourceNow.cc
@@ -18,6 +18,7 @@
 #include "AtomicMacro.hh"
 #include "NVTX_Range.hh"
 #include <vector>
+#include "mathHelp.hh"
 
 namespace
 {
@@ -116,7 +117,7 @@ void MC_SourceNow(MonteCarlo *monteCarlo)
                 particle.weight = source_particle_weight;
 
                 double randomNumber = rngSample(&particle.random_number_seed);
-                particle.num_mean_free_paths = -1.0*log(randomNumber);
+                particle.num_mean_free_paths = -1.0*LOG(randomNumber);
 
                 randomNumber = rngSample(&particle.random_number_seed);
                 particle.time_to_census = monteCarlo->time_info->time_step * randomNumber;
@@ -172,7 +173,7 @@ namespace
       static const double speed_of_light  = PhysicalConstants::_speedOfLight;
 
 
-      return speed_of_light * sqrt(energy * (energy + 2.0*(rest_mass_energy)) /
+      return speed_of_light * std::sqrt(energy * (energy + 2.0*(rest_mass_energy)) /
                                    ((energy + rest_mass_energy) * (energy + rest_mass_energy)));
    }
 }
diff --git a/src/MC_Vector.hh b/src/MC_Vector.hh
index 91b5fdf6..34368d07 100644
--- a/src/MC_Vector.hh
+++ b/src/MC_Vector.hh
@@ -1,8 +1,8 @@
 #ifndef MC_VECTOR_INCLUDE
 #define MC_VECTOR_INCLUDE
 
-#include <cmath>
 #include "DeclareMacro.hh"
+#include "mathHelp.hh"
 
 HOST_DEVICE_CLASS
 class MC_Vector
@@ -17,7 +17,7 @@ class MC_Vector
    HOST_DEVICE_CUDA
    MC_Vector(double a, double b, double c) : x(a), y(b), z(c) {}
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    MC_Vector& operator=( const MC_Vector&tmp )
    {
       if ( this == &tmp ) { return *this; }
@@ -29,13 +29,13 @@ class MC_Vector
       return *this;
    }
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    bool operator==( const MC_Vector& tmp )
    {
       return tmp.x == x && tmp.y == y && tmp.z == z;
    }
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    MC_Vector& operator+=( const MC_Vector &tmp )
    {
       x += tmp.x;
@@ -44,7 +44,7 @@ class MC_Vector
       return *this;
    }
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    MC_Vector& operator-=( const MC_Vector &tmp )
    {
       x -= tmp.x;
@@ -53,7 +53,7 @@ class MC_Vector
       return *this;
    }
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    MC_Vector& operator*=(const double scalar)
    {
       x *= scalar;
@@ -62,7 +62,7 @@ class MC_Vector
       return *this;
    }
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    MC_Vector& operator/=(const double scalar)
    {
       x /= scalar;
@@ -71,39 +71,39 @@ class MC_Vector
       return *this;
    }
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    const MC_Vector operator+( const MC_Vector &tmp ) const
    {
       return MC_Vector(x + tmp.x, y + tmp.y, z + tmp.z);
    }
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    const MC_Vector operator-( const MC_Vector &tmp ) const
    {
       return MC_Vector(x - tmp.x, y - tmp.y, z - tmp.z);
    }
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    const MC_Vector operator*(const double scalar) const
    {
       return MC_Vector(scalar*x, scalar*y, scalar*z);
    }
 
-   HOST_DEVICE_CUDA
-   inline double Length() const { return std::sqrt(x*x + y*y + z*z); }
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
+   inline double Length() const { return SQRT(x*x + y*y + z*z); }
 
    // Distance from this vector to another point.
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    inline double Distance(const MC_Vector& vv) const
-   { return std::sqrt((x - vv.x)*(x - vv.x) + (y - vv.y)*(y - vv.y)+ (z - vv.z)*(z - vv.z)); }
+   { return SQRT((x - vv.x)*(x - vv.x) + (y - vv.y)*(y - vv.y)+ (z - vv.z)*(z - vv.z)); }
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    inline double Dot(const MC_Vector &tmp) const
    {
       return this->x*tmp.x + this->y*tmp.y + this->z*tmp.z;
    }
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    inline MC_Vector Cross(const MC_Vector &v) const
    {
       return MC_Vector(y * v.z - z * v.y,
diff --git a/src/MacroscopicCrossSection.hh b/src/MacroscopicCrossSection.hh
index 68ab7e1b..b059556e 100644
--- a/src/MacroscopicCrossSection.hh
+++ b/src/MacroscopicCrossSection.hh
@@ -5,12 +5,12 @@
 
 class MonteCarlo;
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 double macroscopicCrossSection(MonteCarlo* monteCarlo, int reactionIndex, int domainIndex, int cellIndex,
                                int isoIndex, int energyGroup);
 HOST_DEVICE_END
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 double weightedMacroscopicCrossSection(MonteCarlo* monteCarlo, int taskIndex, int domainIndex,
                                        int cellIndex, int energyGroup);
 HOST_DEVICE_END
diff --git a/src/Makefile b/src/Makefile
index 5f42dc12..5bbf141b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -99,19 +99,46 @@ CXXFLAGS =
 CPPFLAGS = 
 LDFLAGS =
 
+###############################################################################
+# Cuda on LLNL CORAL EA nodes
+###############################################################################
+## Choose one Cuda path
+CUDA_PATH = /usr
+#CUDA_PATH = /usr/tcetmp/packages/cuda-9.0.176
+
+HOST_COMPILER = g++
+
+OPTFLAGS = -O2
+## Version below for debugging
+##OPTFLAGS = -DUSE_NVTX -g -G -lineinfo -O0
+
+CUDA_FLAGS = -I${CUDA_PATH}/include/
+CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart
+#
+CXX=$(CUDA_PATH)/bin/nvcc
+CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) #-Xptxas -v
+CXXFLAGS += -ccbin clang-8
+CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
+#CXXFLAGS += --compiler-bindir=$(HOST_COMPILER)
+CPPFLAGS = -x cu -dc
+#CPPFLAGS += -DHAVE_MPI -DHAVE_ASYNC_MPI
+LDFLAGS  = $(CUDA_LDFLAGS)
+LDFLAGS  += -lstdc++
+##LDFLAGS += ${CUDA_PATH}/lib64/libnvToolsExt.so
+
 ###############################################################################
 # Very simple GCC build with OpenMP but without MPI.  
 # This works on a Macbook (if gcc is installed)
 ###############################################################################
 #
-#OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp 
-#OPENMP_LDFLAGS = -fopenmp
-#OPTFLAGS = -g -O2
+OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp 
+OPENMP_LDFLAGS = -fopenmp
+OPTFLAGS = -g -O2
 #
-#CXX=g++
-#CXXFLAGS = -std=c++11  $(OPTFLAGS) -Wpedantic
-#CPPFLAGS = $(OPENMP_FLAGS)
-#LDFLAGS = $(OPENMP_LDFLAGS) 
+CXX=g++
+CXXFLAGS = -std=c++11  $(OPTFLAGS) -Wpedantic
+CPPFLAGS = $(OPENMP_FLAGS)
+LDFLAGS = $(OPENMP_LDFLAGS) 
 
 
 ###############################################################################
@@ -128,18 +155,6 @@ LDFLAGS =
 #LDFLAGS = $(OPENMP_LDFLAGS) 
 
 
-###############################################################################
-# LLNL LC BG/Q Comilers                                                       #
-###############################################################################
-### BGQ GNU
-#OPTFLAGS = -g -O2
-##
-#CXX=/usr/local/tools/compilers/ibm/mpicxx-4.8.4
-#CXXFLAGS = -std=c++11 $(OPTFLAGS) 
-#CPPFLAGS = -DCHRONO_MISSING -DHAVE_MPI -DHAVE_OPENMP -fopenmp
-#LDFLAGS  = -fopenmp
-
-
 ###############################################################################
 # OpenMP 4.5 on LLNL CORAL EA nodes
 ###############################################################################
@@ -167,86 +182,19 @@ LDFLAGS =
 
 
 ###############################################################################
-# Cuda on LLNL CORAL EA nodes
-###############################################################################
-## Choose one Cuda path
-##CUDA_PATH = /usr/local/cuda-8.0
-#CUDA_PATH = /usr/tcetmp/packages/cuda-9.0.176
-
-#HOST_COMPILER = /usr/tce/packages/spectrum-mpi/spectrum-mpi-2017.04.03-xl-beta-2017.09.13/bin/mpixlC
-
-#OPTFLAGS = -O2 
-## Version below for debugging
-##OPTFLAGS = -DUSE_NVTX -g -G -lineinfo -O0
-
-#CUDA_FLAGS = -I${CUDA_PATH}/include/
-#CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart
-#
-#CXX=$(CUDA_PATH)/bin/nvcc
-#CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) -Xptxas -v 
-#CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\" 
-#CXXFLAGS += --compiler-bindir=$(HOST_COMPILER)
-#CPPFLAGS = -x cu -dc -DHAVE_MPI -DHAVE_ASYNC_MPI
-#LDFLAGS  = $(CUDA_LDFLAGS) 
-##LDFLAGS += ${CUDA_PATH}/lib64/libnvToolsExt.so
-
-
-
-
-###############################################################################
-# LLNL TOSS GCC + OpenMP (mvapich 2 - version 1.7) [cab]
-###############################################################################
-#OPTFLAGS = -g -O2 
-#OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp 
-#OPENMP_LDFLAGS = -fopenmp
-#
-#CXX = /usr/apps/gnu/4.9.3/bin/mpig++
-#CXXFLAGS = -std=c++0x $(OPTFLAGS) -mpi=mvapich2-gnu-1.7 
-#CPPFLAGS = -DHAVE_MPI $(OPENMP_FLAGS)
-#LDFLAGS = $(OPENMP_LDFLAGS)
-
-###############################################################################
-# LLNL TOSS Intel + OpenMP (mvapich 2 - version 2.1) [quartz] 
-###############################################################################
-#OPENMP_FLAGS = -DHAVE_OPENMP -qopenmp 
-#OPENMP_LDFLAGS = -qopenmp 
-#OPTFLAGS = -g -O2
-#
-#CXX=/usr/local/bin/mpiicpc-17.0.174
-#CXXFLAGS = -std=c++11 -mpi=mvapich2-intel-2.1 -DHAVE_MPI $(OPENMP_FLAGS)
-#CXXFLAGS += -wd1128 -wd64 -wd21
-#LDFLAGS  = $(OPENMP_LDFLAGS)
-
-
-###############################################################################
-# LLNL TOSS Clang (cab)
-###############################################################################
-#CLANGPATH = /usr/global/tools/clang/chaos_5_x86_64_ib/clang-omp-3.5.0
-#OPTFLAGS = -g -O2 
-#
-#CXX=${CLANGPATH}/bin/mpiclang++
-#CXXFLAGS = -std=c++11 $(OPTFLAGS)
-#CPPFLAGS = -DHAVE_MPI
-#LDFLAGS  = -Wl,-rpath,${CLANGPATH}/lib
-
-
-###############################################################################
-# Trinity Compilers                                                           #
-#                                                                             #
-# One must 'swap'  modules on this machine to access different compilers.     #
+# SYCL/DPC++
 ###############################################################################
-
-### Defaults to Intel.
-#OPTFLAGS = -g -O2 -xmic-avx512 -ipo
-#OPENMP_FLAGS = -DHAVE_OPENMP -qopenmp -pthread -DUSE_OPENMP_NO_GPU
-#OPENMP_LDFLAGS = -qopenmp -pthread
-#
-#CXX=CC
-#CXXFLAGS = -std=c++11 $(OPTFLAGS)
-#CPPFLAGS = -DHAVE_MPI -DCHRONO_MISSING  $(OPENMP_FLAGS)
-#LDFLAGS  = $(OPENMP_LDFLAGS) 
-
-
+CXX       = dpcpp
+CXXFLAGS  = -std=c++14
+CXXFLAGS += -g -O2
+CXXFLAGS += -ferror-limit=1
+CPPFLAGS  = -DHAVE_SYCL
+LDFLAGS   =
+
+# GPU offline compile
+# https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/gpu-flow/offline-compilation-for-gpu.html
+#CXXFLAGS += -fsycl-targets=spir64_gen-unknown-linux-sycldevice
+#LDFLAGS  += -Xsycl-target-backend=spir64_gen-unknown-linux-sycldevice "-device skl"
 
 ################################################################################
 ### Below here, it is pitch black.                                           ###
diff --git a/src/MemoryControl.hh b/src/MemoryControl.hh
index 6b13aaf5..19d1ed71 100644
--- a/src/MemoryControl.hh
+++ b/src/MemoryControl.hh
@@ -14,7 +14,7 @@ namespace MemoryControl
    {
       if (size == 0) { return NULL;}
       T* tmp = NULL;
-      
+
       switch (policy)
       {
         case AllocationPolicy::HOST_MEM:
@@ -22,9 +22,13 @@ namespace MemoryControl
          break;
 #ifdef HAVE_UVM
         case AllocationPolicy::UVM_MEM:
-         void *ptr;
+         void * ptr;
+#ifdef HAVE_SYCL
+         ptr = (void *)sycl::malloc_shared(size * sizeof(T), q);
+#else
          cudaMallocManaged(&ptr, size*sizeof(T), cudaMemAttachGlobal);
-         tmp = new(ptr) T[size]; 
+#endif
+         tmp = new(ptr) T[size];
          break;
 #endif
         default:
@@ -40,15 +44,18 @@ namespace MemoryControl
       switch (policy)
       {
         case MemoryControl::AllocationPolicy::HOST_MEM:
-         delete[] data; 
+         delete[] data;
          break;
 #ifdef HAVE_UVM
         case UVM_MEM:
-         for (int i=0; i < size; ++i)
-            data[i].~T();
+         for (int i=0; i < size; ++i) data[i].~T();
+#ifdef HAVE_SYCL
+         sycl::free(data, q);
+#else
          cudaFree(data);
+#endif
          break;
-#endif         
+#endif
         default:
          qs_assert(false);
          break;
diff --git a/src/MonteCarlo.cc b/src/MonteCarlo.cc
index 28c9c47a..f9fb116c 100644
--- a/src/MonteCarlo.cc
+++ b/src/MonteCarlo.cc
@@ -25,13 +25,19 @@ MonteCarlo::MonteCarlo(const Parameters& params)
    _nuclearData            = 0;
    _materialDatabase       = 0;
 
-    #if defined (HAVE_UVM)
+#if defined(HAVE_UVM)
         void *ptr1, *ptr2, *ptr3, *ptr4;
-
+#ifdef HAVE_SYCL
+        ptr1 = (void *)sycl::malloc_shared(sizeof(Tallies), q);
+        ptr2 = (void *)sycl::malloc_shared(sizeof(MC_Processor_Info), q);
+        ptr3 = (void *)sycl::malloc_shared(sizeof(MC_Time_Info), q);
+        ptr4 = (void *)sycl::malloc_shared(sizeof(MC_Fast_Timer_Container), q);
+#else
         cudaMallocManaged( &ptr1, sizeof(Tallies), cudaMemAttachHost );
         cudaMallocManaged( &ptr2, sizeof(MC_Processor_Info), cudaMemAttachHost );
         cudaMallocManaged( &ptr3, sizeof(MC_Time_Info), cudaMemAttachHost );
         cudaMallocManaged( &ptr4, sizeof(MC_Fast_Timer_Container) );
+#endif
 
         _tallies                = new(ptr1) Tallies( params.simulationParams.balanceTallyReplications, 
                                                      params.simulationParams.fluxTallyReplications,
@@ -98,8 +104,13 @@ MonteCarlo::MonteCarlo(const Parameters& params)
 
     #if defined(HAVE_UVM)
         void *ptr5, *ptr6;
+#ifdef HAVE_SYCL
+        ptr5 = (void *)sycl::malloc_shared(sizeof(MC_Particle_Buffer), q);
+        ptr6 = (void *)sycl::malloc_shared(sizeof(ParticleVaultContainer), q);
+#else
         cudaMallocManaged( &ptr5, sizeof(MC_Particle_Buffer) );
         cudaMallocManaged( &ptr6, sizeof(ParticleVaultContainer), cudaMemAttachHost );
+#endif
         particle_buffer         = new(ptr5) MC_Particle_Buffer(this, batch_size);
         _particleVaultContainer = new(ptr6) ParticleVaultContainer(batch_size, num_batches, num_extra_vaults);
     #else
@@ -125,6 +136,16 @@ MonteCarlo::~MonteCarlo()
         fast_timer->~MC_Fast_Timer_Container();
         particle_buffer->~MC_Particle_Buffer();
 
+#ifdef HAVE_SYCL
+        sycl::free(_nuclearData, q);
+        sycl::free(_particleVaultContainer, q);
+        sycl::free(_materialDatabase, q);
+        sycl::free(_tallies, q);
+        sycl::free(processor_info, q);
+        sycl::free(time_info, q);
+        sycl::free(fast_timer, q);
+        sycl::free(particle_buffer, q);
+#else
         cudaFree( _nuclearData );
         cudaFree( _particleVaultContainer);
         cudaFree( _materialDatabase);
@@ -133,8 +154,9 @@ MonteCarlo::~MonteCarlo()
         cudaFree( time_info);
         cudaFree( fast_timer);
         cudaFree( particle_buffer);
+#endif
 
-    #else
+#else
         delete _nuclearData;
         delete _particleVaultContainer;
         delete _materialDatabase;
@@ -143,7 +165,7 @@ MonteCarlo::~MonteCarlo()
         delete time_info;
         delete fast_timer;
         delete particle_buffer;
-    #endif
+#endif
 }
 
 void MonteCarlo::clearCrossSectionCache()
diff --git a/src/NuclearData.cc b/src/NuclearData.cc
index fe9ad072..b7bb4906 100644
--- a/src/NuclearData.cc
+++ b/src/NuclearData.cc
@@ -3,9 +3,7 @@
 #include "MC_RNG_State.hh"
 #include "DeclareMacro.hh"
 #include "qs_assert.hh"
-
-using std::log10;
-using std::pow;
+#include "mathHelp.hh"
 
 // Set the cross section values and reaction type
 // Cross sections are scaled to produce the supplied reactionCrossSection at 1MeV.
@@ -21,7 +19,7 @@ NuclearDataReaction::NuclearDataReaction(
    for (int ii=0; ii<nGroups; ++ii)
    {
       double energy = (energies[ii] + energies[ii+1]) / 2.0;
-      _crossSection[ii] = pow( 10, polynomial(log10( energy)));
+      _crossSection[ii] = std::pow( 10, polynomial(std::log10( energy)));
    }
 
    // Find the normalization value for the polynomial.  This is the
@@ -82,7 +80,9 @@ void NuclearDataReaction::sampleCollision(
       }
       break;
      case Undefined:
+#ifndef HAVE_SYCL
       printf("_reactionType invalid\n");
+#endif
       qs_assert(false);
    }
 }
@@ -108,8 +108,8 @@ NuclearData::NuclearData(int numGroups, double energyLow, double energyHigh) : _
    _numEnergyGroups = numGroups;
    _energies[0] = energyLow;
    _energies[numGroups] = energyHigh;
-   double logLow = log(energyLow);
-   double logHigh = log(energyHigh);
+   double logLow = std::log(energyLow);
+   double logHigh = std::log(energyHigh);
    double delta = (logHigh - logLow) / (numGroups + 1.0);
    for (int energyIndex = 1; energyIndex < numGroups; energyIndex++)
    {
@@ -195,7 +195,7 @@ double NuclearDataReaction::getCrossSection(unsigned int group)
 }
 HOST_DEVICE_END
 
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 int NuclearData::getNumberReactions(unsigned int isotopeIndex)
 {
    qs_assert(isotopeIndex < _isotopes.size());
diff --git a/src/NuclearData.hh b/src/NuclearData.hh
index 6760568c..cb0fbc98 100644
--- a/src/NuclearData.hh
+++ b/src/NuclearData.hh
@@ -45,9 +45,9 @@ class NuclearDataReaction
                        const Polynomial& polynomial, double reationCrossSection);
    
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    double getCrossSection(unsigned int group);
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    void sampleCollision(double incidentEnergy, double material_mass, double* energyOut,
                         double* angleOut, int &nOut, uint64_t* seed, int max_production_size);
    
@@ -96,13 +96,13 @@ class NuclearData
                   double totalCrossSection,
                   double fissionWeight, double scatterWeight, double absorptionWeight);
 
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    int getEnergyGroup(double energy);
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    int getNumberReactions(unsigned int isotopeIndex);
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    double getTotalCrossSection(unsigned int isotopeIndex, unsigned int group);
-   HOST_DEVICE_CUDA
+   HOST_DEVICE_CUDA SYCL_EXTERNAL
    double getReactionCrossSection(unsigned int reactIndex, unsigned int isotopeIndex, unsigned int group);
 
    int _numEnergyGroups;
diff --git a/src/ParticleVault.hh b/src/ParticleVault.hh
index 365589f0..9af1729e 100644
--- a/src/ParticleVault.hh
+++ b/src/ParticleVault.hh
@@ -171,6 +171,7 @@ getParticle( MC_Particle &particle, int index )
 }
 
 // -----------------------------------------------------------------------
+   HOST_DEVICE_CUDA
 inline bool ParticleVault::
 putParticle(MC_Particle particle, int index)
 {
@@ -185,6 +186,7 @@ putParticle(MC_Particle particle, int index)
 }
 
 // -----------------------------------------------------------------------
+   HOST_DEVICE_CUDA
 inline void ParticleVault::
 invalidateParticle( int index )
 {
@@ -205,7 +207,7 @@ eraseSwapParticle(int index)
 }
 
 // -----------------------------------------------------------------------
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 void MC_Load_Particle(MonteCarlo *mcco, MC_Particle &mc_particle, ParticleVault *particleVault, int particle_index);
 HOST_DEVICE_END
 
diff --git a/src/ParticleVaultContainer.cc b/src/ParticleVaultContainer.cc
index 3ac6780e..a7cfa0f7 100644
--- a/src/ParticleVaultContainer.cc
+++ b/src/ParticleVaultContainer.cc
@@ -337,7 +337,7 @@ addProcessingParticle( MC_Base_Particle &particle, uint64_t &fill_vault_index )
 //------------addExtraParticle----------------------------------
 //adds a particle to the extra particle vaults (used in kernel)
 //--------------------------------------------------------------
-HOST_DEVICE
+HOST_DEVICE SYCL_EXTERNAL
 void ParticleVaultContainer::
 addExtraParticle( MC_Particle &particle)
 {
diff --git a/src/ParticleVaultContainer.hh b/src/ParticleVaultContainer.hh
index b5fb038c..0aeb4734 100644
--- a/src/ParticleVaultContainer.hh
+++ b/src/ParticleVaultContainer.hh
@@ -52,7 +52,7 @@ class ParticleVaultContainer
     uint64_t getFirstEmptyProcessedVault();
 
     //Returns a pointer to the Send Queue
-    HOST_DEVICE
+    HOST_DEVICE SYCL_EXTERNAL
     SendQueue* getSendQueue();
     HOST_DEVICE_END
 
@@ -74,7 +74,7 @@ class ParticleVaultContainer
     //Adds a particle to the processing particle vault
     void addProcessingParticle( MC_Base_Particle &particle, uint64_t &fill_vault_index );
     //Adds a particle to the extra particle vault
-    HOST_DEVICE
+    HOST_DEVICE SYCL_EXTERNAL
     void addExtraParticle( MC_Particle &particle );
     HOST_DEVICE_END
  
diff --git a/src/SendQueue.hh b/src/SendQueue.hh
index 11d4ea5b..255303ee 100644
--- a/src/SendQueue.hh
+++ b/src/SendQueue.hh
@@ -29,7 +29,7 @@ class SendQueue
     sendQueueTuple& getTuple( int index_ );
 
     //Add items to the send queue in a kernel
-    HOST_DEVICE_CUDA
+    HOST_DEVICE_CUDA SYCL_EXTERNAL
     void push( int neighbor_, int vault_index_ );
 
     //Clear send queue before after use
diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh
index 868c5477..1db62cd2 100644
--- a/src/cudaUtils.hh
+++ b/src/cudaUtils.hh
@@ -1,12 +1,19 @@
 #ifndef CUDAUTILS_HH
 #define CUDAUTILS_HH
 
-#if defined(HAVE_CUDA) || defined(HAVE_OPENMP_TARGET) 
+#if defined(HAVE_CUDA) || defined(HAVE_OPENMP_TARGET)
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 #endif
 
+#if defined(HAVE_SYCL)
+#include <CL/sycl.hpp>
+extern sycl::queue q; // global variable for device queue
+#else
+#define SYCL_EXTERNAL
+#endif
+
 #ifdef HAVE_OPENMP_TARGET
     #ifdef USE_OPENMP_NO_GPU
         #define VAR_MEM MemoryControl::AllocationPolicy::HOST_MEM
@@ -17,11 +24,14 @@
 #elif HAVE_CUDA
     #define VAR_MEM MemoryControl::AllocationPolicy::UVM_MEM
     #define HAVE_UVM
+#elif HAVE_SYCL
+    #define VAR_MEM MemoryControl::AllocationPolicy::UVM_MEM
+    #define HAVE_UVM
 #else
     #define VAR_MEM MemoryControl::AllocationPolicy::HOST_MEM
 #endif
 
-enum ExecutionPolicy{ cpu, gpuWithCUDA, gpuWithOpenMP };
+enum ExecutionPolicy{ cpu, gpuWithCUDA, gpuWithOpenMP, SYCL };
 
 inline ExecutionPolicy getExecutionPolicy( int useGPU )
 {
@@ -33,6 +43,8 @@ inline ExecutionPolicy getExecutionPolicy( int useGPU )
         execPolicy = ExecutionPolicy::gpuWithCUDA;
         #elif defined (HAVE_OPENMP_TARGET)
         execPolicy = ExecutionPolicy::gpuWithOpenMP;
+        #elif defined (HAVE_SYCL)
+        execPolicy = ExecutionPolicy::SYCL;
         #endif
     }
     return execPolicy;
diff --git a/src/initMC.cc b/src/initMC.cc
index 1be428fd..02b38201 100644
--- a/src/initMC.cc
+++ b/src/initMC.cc
@@ -54,7 +54,11 @@ MonteCarlo* initMC(const Parameters& params)
    MonteCarlo* monteCarlo;
    #ifdef HAVE_UVM
       void* ptr;
+#ifdef HAVE_SYCL
+      ptr = (void *)sycl::malloc_shared(sizeof(MonteCarlo), q);
+#else
       cudaMallocManaged( &ptr, sizeof(MonteCarlo), cudaMemAttachGlobal );
+#endif
       monteCarlo = new(ptr) MonteCarlo(params);
    #else
      monteCarlo = new MonteCarlo(params);
@@ -77,28 +81,32 @@ namespace
 //Init GPU usage information
    void initGPUInfo( MonteCarlo* monteCarlo)
    {
+      int Ngpus = 0;
       #if defined(HAVE_OPENMP_TARGET)
-         int Ngpus = omp_get_num_devices();
+         Ngpus = omp_get_num_devices();
       #elif defined(HAVE_CUDA)
-         int Ngpus;
          cudaGetDeviceCount(&Ngpus);
-      #else
-         int Ngpus = 0;
+      #elif defined(HAVE_SYCL)
+         Ngpus = 1;
       #endif
 
          if( Ngpus != 0 )
          {
-            #if defined(HAVE_OPENMP_TARGET) || defined(HAVE_CUDA)
+            #if defined(HAVE_OPENMP_TARGET) || defined(HAVE_CUDA) || defined(HAVE_SYCL)
+
             monteCarlo->processor_info->use_gpu = 1;
             int GPUID = monteCarlo->processor_info->rank%Ngpus;
             monteCarlo->processor_info->gpu_id = GPUID;
-            
+
             #if defined(HAVE_OPENMP_TARGET)
                 omp_set_default_device(GPUID);
             #endif
 
-            cudaSetDevice(GPUID);
-            //cudaDeviceSetLimit( cudaLimitStackSize, 64*1024 );
+            #ifndef HAVE_SYCL
+                cudaSetDevice(GPUID);
+                //cudaDeviceSetLimit( cudaLimitStackSize, 64*1024 );
+            #endif
+
             #endif
          }
          else
@@ -132,8 +140,13 @@ namespace
    {
       #if defined HAVE_UVM
          void *ptr1, *ptr2;
+#ifdef HAVE_SYCL
+         ptr1 = (void *)sycl::malloc_shared(sizeof(NuclearData), q);
+         ptr2 = (void *)sycl::malloc_shared(sizeof(MaterialDatabase), q);
+#else
          cudaMallocManaged( &ptr1, sizeof(NuclearData), cudaMemAttachGlobal );
          cudaMallocManaged( &ptr2, sizeof(MaterialDatabase), cudaMemAttachGlobal );
+#endif
 
          monteCarlo->_nuclearData = new(ptr1) NuclearData(params.simulationParams.nGroups,
                                                           params.simulationParams.eMin,
diff --git a/src/main.cc b/src/main.cc
index bb9517b2..dbc05105 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -26,6 +26,10 @@
 #include "git_hash.hh"
 #include "git_vers.hh"
 
+#ifdef HAVE_SYCL
+sycl::queue q;
+#endif
+
 void gameOver();
 void cycleInit( bool loadBalance );
 void cycleTracking(MonteCarlo* monteCarlo);
@@ -43,8 +47,45 @@ int main(int argc, char** argv)
    Parameters params = getParameters(argc, argv);
    printParameters(params, cout);
 
-   // mcco stores just about everything. 
-   mcco = initMC(params); 
+#ifdef HAVE_SYCL
+   // HOIST INTO SETUP FUNCTION EVENTUALLY
+   char * devchar = std::getenv("QS_DEVICE");
+   std::string devname = (devchar==NULL ? "None" : devchar);
+   if (devname == "CPU") {
+       q = cl::sycl::cpu_selector{};
+   }
+   else
+   if (devname == "GPU") {
+       q = cl::sycl::gpu_selector{};
+   }
+   else
+   if (devname == "HOST") {
+       q = cl::sycl::host_selector{};
+   }
+   else
+   {
+       std::cout << "QS_DEVICE must be CPU, GPU or HOST" << std::endl;
+       std::abort();
+   }
+
+   // DEBUG - REMOVE LATER
+   if ( q.get_device().is_cpu() )         std::cout << "is cpu"         << std::endl;
+   if ( q.get_device().is_gpu() )         std::cout << "is gpu"         << std::endl;
+   if ( q.get_device().is_host() )        std::cout << "is host"        << std::endl;
+   if ( q.get_device().is_accelerator() ) std::cout << "is accelerator" << std::endl;
+#endif
+
+   // mcco stores just about everything.
+   mcco = initMC(params);
+
+#ifdef HAVE_SYCL // DEBUG - REMOVE LATER
+    if (mcco->processor_info->use_gpu) {
+        printf("Using SYCL device\n");
+    } else {
+        printf("Bad\n");
+        std::abort();
+    }
+#endif
 
    int loadBalance = params.simulationParams.loadBalance;
 
@@ -74,13 +115,17 @@ int main(int argc, char** argv)
 
 #ifdef HAVE_UVM
     mcco->~MonteCarlo();
+#ifdef HAVE_SYCL
+   sycl::free(mcco, q);
+#else
     cudaFree( mcco );
+#endif
 #else
    delete mcco;
 #endif
 
    mpiFinalize();
-   
+
    return 0;
 }
 
@@ -112,7 +157,7 @@ void cycleInit( bool loadBalance )
     mcco->particle_buffer->Initialize();
 
     MC_SourceNow(mcco);
-   
+
     PopulationControl(mcco, loadBalance); // controls particle population
 
     RouletteLowWeightParticles(mcco); // Delete particles with low statistical weight
@@ -125,7 +170,7 @@ void cycleInit( bool loadBalance )
 
 __global__ void CycleTrackingKernel( MonteCarlo* monteCarlo, int num_particles, ParticleVault* processingVault, ParticleVault* processedVault )
 {
-   int global_index = getGlobalThreadID(); 
+   int global_index = getGlobalThreadID();
 
     if( global_index < num_particles )
     {
@@ -167,9 +212,9 @@ void cycleTracking(MonteCarlo *monteCarlo)
 
                 ParticleVault *processingVault = my_particle_vault.getTaskProcessingVault(processing_vault);
                 ParticleVault *processedVault =  my_particle_vault.getTaskProcessedVault(processed_vault);
-            
+
                 int numParticles = processingVault->size();
-            
+
                 if ( numParticles != 0 )
                 {
                     NVTX_Range trackingKernel("cycleTracking_TrackingKernel"); // range ends at end of scope
@@ -187,28 +232,28 @@ void cycleTracking(MonteCarlo *monteCarlo)
                           dim3 grid(1,1,1);
                           dim3 block(1,1,1);
                           int runKernel = ThreadBlockLayout( grid, block, numParticles);
-                          
+
                           //Call Cycle Tracking Kernel
-                          if( runKernel )
+                          if ( runKernel )
                              CycleTrackingKernel<<<grid, block >>>( monteCarlo, numParticles, processingVault, processedVault );
-                          
+
                           //Synchronize the stream so that memory is copied back before we begin MPI section
                           cudaPeekAtLastError();
                           cudaDeviceSynchronize();
                           #endif
                        }
                        break;
-                       
+
                       case gpuWithOpenMP:
                        {
                           int nthreads=128;
-                          if (numParticles <  64*56 ) 
+                          if (numParticles <  64*56 )
                              nthreads = 64;
                           int nteams = (numParticles + nthreads - 1 ) / nthreads;
                           nteams = nteams > 1 ? nteams : 1;
                           #ifdef HAVE_OPENMP_TARGET
-                          #pragma omp target enter data map(to:monteCarlo[0:1]) 
-                          #pragma omp target enter data map(to:processingVault[0:1]) 
+                          #pragma omp target enter data map(to:monteCarlo[0:1])
+                          #pragma omp target enter data map(to:processingVault[0:1])
                           #pragma omp target enter data map(to:processedVault[0:1])
                           #pragma omp target teams distribute parallel for num_teams(nteams) thread_limit(128)
                           #endif
@@ -224,6 +269,21 @@ void cycleTracking(MonteCarlo *monteCarlo)
                        }
                        break;
 
+                      case SYCL:
+                       {
+                          const size_t N = numParticles;
+                      #ifdef HAVE_SYCL
+                          q.submit([&](sycl::handler &h) {
+                              h.parallel_for(sycl::range<1>{N},  [=](sycl::item<1> it) {
+                                 int particle_index = it[0];
+                                 CycleTrackingGuts( monteCarlo, particle_index, processingVault, processedVault );
+                              });
+                          });
+                          q.wait();
+                      #endif
+                       }
+                       break;
+
                       case cpu:
                        #include "mc_omp_parallel_for_schedule_static.hh"
                        for ( int particle_index = 0; particle_index < numParticles; particle_index++ )
@@ -245,7 +305,7 @@ void cycleTracking(MonteCarlo *monteCarlo)
                 // Next, communicate particles that have crossed onto
                 // other MPI ranks.
                 NVTX_Range cleanAndComm("cycleTracking_clean_and_comm");
-                
+
                 SendQueue &sendQueue = *(my_particle_vault.getSendQueue());
                 monteCarlo->particle_buffer->Allocate_Send_Buffer( sendQueue );
 
@@ -314,7 +374,7 @@ void cycleFinalize()
     mcco->_tallies->_balanceTask[0]._end = mcco->_particleVaultContainer->sizeProcessed();
 
     // Update the cumulative tally data.
-    mcco->_tallies->CycleFinalize(mcco); 
+    mcco->_tallies->CycleFinalize(mcco);
 
     mcco->time_info->cycle++;
 
diff --git a/src/mathHelp.hh b/src/mathHelp.hh
new file mode 100644
index 00000000..a2672aae
--- /dev/null
+++ b/src/mathHelp.hh
@@ -0,0 +1,18 @@
+#ifndef MATH_HELP_HH
+#define MATH_HELP_HH
+
+#ifdef HAVE_SYCL
+#include <CL/sycl.hpp>
+#define SIN  sycl::sin
+#define COS  sycl::cos
+#define SQRT sycl::sqrt
+#define LOG  sycl::log
+#else
+#include <cmath>
+#define SIN  sin
+#define COS  cos
+#define SQRT sqrt
+#define LOG  log
+#endif
+
+#endif // MATH_HELP_HH
diff --git a/src/qs_assert.hh b/src/qs_assert.hh
index 55e1f004..7c0059d5 100644
--- a/src/qs_assert.hh
+++ b/src/qs_assert.hh
@@ -1,6 +1,7 @@
 #include <cstdio>
 
 #ifdef __CUDA_ARCH__
+
 #define qs_assert( cond) \
    do \
    { \
@@ -9,7 +10,19 @@
         printf("ERROR\n"); \
       } \
    } while(0)
+
+#elif defined(HAVE_SYCL)
+
+#define qs_assert( cond) \
+   do \
+   { \
+      if (!(cond)) \
+      { \
+      } \
+   } while(0)
+
 #else
+
 #define qs_assert( cond)                        \
    do \
    { \
@@ -18,4 +31,5 @@
         printf("file=%s: line=%d ERROR\n",__FILE__,__LINE__); \
       } \
    } while(0)
+
 #endif

From 516be3e223d528e2f6b7cac4a46dbbbee6555f74 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Wed, 3 Jun 2020 16:45:19 -0700
Subject: [PATCH 02/12] better compiler flags

---
 src/Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 5bbf141b..2effc631 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -186,7 +186,9 @@ LDFLAGS = $(OPENMP_LDFLAGS)
 ###############################################################################
 CXX       = dpcpp
 CXXFLAGS  = -std=c++14
-CXXFLAGS += -g -O2
+CXXFLAGS += -O3
+CXXFLAGS += -ffast-math
+#CXXFLAGS += -g
 CXXFLAGS += -ferror-limit=1
 CPPFLAGS  = -DHAVE_SYCL
 LDFLAGS   =

From 7fa9b9d3ce2792727067d490a354cf40c15b129c Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Fri, 5 Jun 2020 07:53:36 -0700
Subject: [PATCH 03/12] implement SYCL atomics

---
 src/AtomicMacro.hh | 261 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 254 insertions(+), 7 deletions(-)

diff --git a/src/AtomicMacro.hh b/src/AtomicMacro.hh
index 4c31c853..3a67f605 100644
--- a/src/AtomicMacro.hh
+++ b/src/AtomicMacro.hh
@@ -1,3 +1,8 @@
+#ifndef AtomicMacro_HH_
+#define AtomicMacro_HH_
+
+#define USE_MACRO_FUNCTIONS 1
+
 //Determine which atomics to use based on platform being compiled for
 //
 //If compiling with CUDA
@@ -8,26 +13,260 @@
     #define USE_OPENMP_ATOMICS
 #endif
 
+#ifdef HAVE_SYCL
+#include <CL/sycl.hpp>
+#include <cstdint>
+#endif
+
+// --------------------------------------------------
+// Original Names            -> Inline function names
+// --------------------------------------------------
+// ATOMIC_WRITE( x, v )      -> ATOMIC_WRITE
+// ATOMIC_UPDATE( x )        -> ATOMIC_INCREMENT
+// ATOMIC_ADD( x, v )        -> ATOMIC_ADD
+// ATOMIC_CAPTURE( x, v, p ) -> ATOMIC_FETCH_ADD
+// --------------------------------------------------
+
+#if defined (USE_MACRO_FUNCTIONS)
+
+#define ATOMIC_CAPTURE( x, v, p )  ATOMIC_FETCH_ADD((x),(v),(p))
+#define ATOMIC_UPDATE( x )         ATOMIC_INCREMENT((x))
+
+#if defined(HAVE_SYCL)
+
+static const cl::sycl::memory_order          order = cl::sycl::memory_order::relaxed;
+static const cl::sycl::access::address_space space = cl::sycl::access::address_space::global_space;
+
+template <typename T>
+inline void ATOMIC_WRITE(T & x, T v) {
+    //x = v;
+}
+
+template <typename T>
+inline void ATOMIC_INCREMENT(T& x) {
+    //atomicAdd( &x, 1 );
+    T one{1};
+    cl::sycl::atomic<T, space> y( (cl::sycl::multi_ptr<T, space>(&x)));
+    cl::sycl::atomic_fetch_add(y, one, order);
+}
+
+template <typename T>
+inline void ATOMIC_ADD(T& x, T v) {
+    //atomicAdd( &x, v );
+    cl::sycl::atomic<T, space> y( (cl::sycl::multi_ptr<T, space>(&x)));
+    cl::sycl::atomic_fetch_add(y, v, order);
+}
+
+template <>
+inline void ATOMIC_ADD(double& x, double v) {
+    static_assert(sizeof(double) == sizeof(uint64_t), "Unsafe: double is not 64-bits");
+    //atomicAdd( &x, v );
+    cl::sycl::atomic<uint64_t, space> t( (cl::sycl::multi_ptr<uint64_t, space>( reinterpret_cast<uint64_t*>(&x)) ));
+    uint64_t old_i = t.load(order);
+    double   old_d;
+    do {
+      old_d = *reinterpret_cast<const double*>(&old_i);
+      const double   new_d = old_d + v;
+      const uint64_t new_i = *reinterpret_cast<const uint64_t *>(&new_d);
+      if (t.compare_exchange_strong(old_i, new_i, order)) break;
+    } while (true);
+    // p = old_d;
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_ADD(T1& x, T2 v) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    //atomicAdd( &x, v );
+    T1 val = static_cast<T2>(v);
+    cl::sycl::atomic<T1, space> y( (cl::sycl::multi_ptr<T1, space>(&x)));
+    cl::sycl::atomic_fetch_add(y, val, order);
+}
+
+template <typename T>
+inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+    //p = atomicAdd( &x, v );
+    cl::sycl::atomic<T, space> y( (cl::sycl::multi_ptr<T, space>(&x)));
+    p = cl::sycl::atomic_fetch_add(y, v, order);
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    //p = atomicAdd( &x, v );
+    cl::sycl::atomic<T1, space> y( (cl::sycl::multi_ptr<T1, space>(&x)));
+    T1 val = static_cast<T2>(v);
+    p = cl::sycl::atomic_fetch_add(y, val, order);
+}
+
+template <typename T1, typename T2, typename T3>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+    //p = atomicAdd( &x, v );
+    T1 val = static_cast<T2>(v);
+    cl::sycl::atomic<T1, space> y( (cl::sycl::multi_ptr<T1, space>(&x)));
+    p = cl::sycl::atomic_fetch_add(y, val, order);
+}
+
+#elif defined(HAVE_CUDA) && defined(__CUDA_ARCH__)
+
+template <typename T>
+inline void ATOMIC_WRITE(T & x, T v) {
+    x = v;
+}
+
+template <typename T>
+inline void ATOMIC_INCREMENT(T& x) {
+    atomicAdd( &x, 1 );
+}
+
+template <typename T>
+inline void ATOMIC_ADD(T& x, T v) {
+    atomicAdd( &x, v );
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_ADD(T1& x, T2 v) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    atomicAdd( &x, v );
+}
+
+template <typename T>
+inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+    p = atomicAdd( &x, v );
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    p = atomicAdd( &x, v );
+}
+
+template <typename T1, typename T2, typename T3>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+    p = atomicAdd( &x, v );
+}
+
+#elif defined(USE_OPENMP_ATOMICS)
+
+template <typename T>
+inline void ATOMIC_WRITE(T & x, T v) {
+    _Pragma("omp atomic write")
+    x = v;
+}
+
+template <typename T>
+inline void ATOMIC_INCREMENT(T& x) {
+    _Pragma("omp atomic update")
+    x++;
+}
+
+template <typename T>
+inline void ATOMIC_ADD(T& x, T v) {
+    _Pragma("omp atomic")
+    x += v;
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_ADD(T1& x, T2 v) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    _Pragma("omp atomic")
+    x += v;
+}
+
+template <typename T>
+inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+    _Pragma("omp atomic capture")
+    {p = x; x = x + v;}
+}
 
-#if defined (HAVE_CUDA)
+template <typename T1, typename T2>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    _Pragma("omp atomic capture")
+    {p = x; x = x + v;}
+}
+
+template <typename T1, typename T2, typename T3>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+    _Pragma("omp atomic capture")
+    {p = x; x = x + v;}
+}
+
+#else // SEQUENTIAL
+
+template <typename T>
+inline void ATOMIC_WRITE(T & x, T v) {
+    x = v;
+}
+
+template <typename T>
+inline void ATOMIC_INCREMENT(T& x) {
+    x++;
+}
+
+template <typename T>
+inline void ATOMIC_ADD(T& x, T v) {
+    x += v;
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_ADD(T1& x, T2 v) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    x += v;
+}
+
+template <typename T>
+inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+    {p = x; x = x + v;}
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    {p = x; x = x + v;}
+}
+
+template <typename T1, typename T2, typename T3>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+    {p = x; x = x + v;}
+}
+
+#endif // BACKENDS
+
+#else // ! USE_MACRO_FUNCTIONS
+
+#if defined (HAVE_SYCL)
+
+#error You must define USE_MACRO_FUNCTIONS with SYCL!
+
+#elif defined (HAVE_CUDA)
 
     //If in a CUDA GPU section use the CUDA atomics
     #ifdef  __CUDA_ARCH__
 
         //Currently not atomic here. But its only used when it does not necissarially need to be atomic.
         #define ATOMIC_WRITE( x, v ) \
-            x = v;          
+            x = v;
 
         #define ATOMIC_ADD( x, v ) \
             atomicAdd( &x, v );
-        
+
         #define ATOMIC_UPDATE( x ) \
             atomicAdd( &x, 1 );
 
         #define ATOMIC_CAPTURE( x, v, p ) \
             p = atomicAdd( &x, v );
+
     //If in a CPU OpenMP section use the OpenMP atomics
     #elif defined (USE_OPENMP_ATOMICS)
+
         #define ATOMIC_WRITE( x, v ) \
             _Pragma("omp atomic write") \
             x = v;
@@ -46,6 +285,7 @@
 
     //If in a serial section, no need to use atomics
     #else
+
         #define ATOMIC_WRITE( x, v ) \
             x = v;
 
@@ -62,6 +302,7 @@
 
 //If in a OpenMP section use the OpenMP atomics
 #elif defined (USE_OPENMP_ATOMICS)
+
     #define ATOMIC_WRITE( x, v ) \
         _Pragma("omp atomic write") \
         x = v;
@@ -74,12 +315,13 @@
         _Pragma("omp atomic update") \
         x++;
 
-        #define ATOMIC_CAPTURE( x, v, p ) \
-            _Pragma("omp atomic capture") \
-            {p = x; x = x + v;}
+    #define ATOMIC_CAPTURE( x, v, p ) \
+        _Pragma("omp atomic capture") \
+        {p = x; x = x + v;}
 
 //If in a serial section, no need to use atomics
 #else
+
     #define ATOMIC_WRITE( x, v ) \
         x = v;
 
@@ -91,4 +333,9 @@
 
     #define ATOMIC_CAPTURE( x, v, p ) \
         {p = x; x = x + v;}
-#endif
+
+#endif // BACKENDS
+
+#endif // USE_MACRO_FUNCTIONS
+
+#endif // AtomicMacro_HH_

From d14bbb5b65545858afba6abe0fcf7cc2f7614cb7 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 27 Jun 2020 09:31:22 -0700
Subject: [PATCH 04/12] re-enable printf in device code using Intel extension

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>
---
 src/CollisionEvent.cc |  6 +++++-
 src/NuclearData.cc    |  6 +++++-
 src/cudaUtils.hh      | 11 +++++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/CollisionEvent.cc b/src/CollisionEvent.cc
index e684b200..d66b9a57 100644
--- a/src/CollisionEvent.cc
+++ b/src/CollisionEvent.cc
@@ -12,6 +12,7 @@
 #include "DeclareMacro.hh"
 #include "AtomicMacro.hh"
 #include "mathHelp.hh"
+#include "cudaUtils.hh"
 
 #define MAX_PRODUCTION_SIZE 4
 
@@ -116,7 +117,10 @@ bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned i
          ATOMIC_ADD( monteCarlo->_tallies->_balanceTask[tally_index]._produce, nOut);
          break;
       case NuclearDataReaction::Undefined:
-#ifndef HAVE_SYCL
+#ifdef HAVE_SYCL
+         static const OPENCL_CONSTANT char format[] = "reactionType invalid\n";
+         sycl::intel::experimental::printf(format);
+#else
          printf("reactionType invalid\n");
 #endif
          qs_assert(false);
diff --git a/src/NuclearData.cc b/src/NuclearData.cc
index b7bb4906..2ed3224d 100644
--- a/src/NuclearData.cc
+++ b/src/NuclearData.cc
@@ -4,6 +4,7 @@
 #include "DeclareMacro.hh"
 #include "qs_assert.hh"
 #include "mathHelp.hh"
+#include "cudaUtils.hh"
 
 // Set the cross section values and reaction type
 // Cross sections are scaled to produce the supplied reactionCrossSection at 1MeV.
@@ -80,7 +81,10 @@ void NuclearDataReaction::sampleCollision(
       }
       break;
      case Undefined:
-#ifndef HAVE_SYCL
+#ifdef HAVE_SYCL
+      static const OPENCL_CONSTANT char format[] = "_reactionType invalid\n";
+      sycl::intel::experimental::printf(format);
+#else
       printf("_reactionType invalid\n");
 #endif
       qs_assert(false);
diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh
index 1db62cd2..808146e1 100644
--- a/src/cudaUtils.hh
+++ b/src/cudaUtils.hh
@@ -8,10 +8,21 @@
 #endif
 
 #if defined(HAVE_SYCL)
+
 #include <CL/sycl.hpp>
+
 extern sycl::queue q; // global variable for device queue
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define OPENCL_CONSTANT __attribute__((opencl_constant))
 #else
+#define OPENCL_CONSTANT
+#endif
+
+#else // not SYCL
+
 #define SYCL_EXTERNAL
+
 #endif
 
 #ifdef HAVE_OPENMP_TARGET

From c6f20a60e8d1a9a05709b3f5f2e61e4a76f6f574 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 27 Jun 2020 09:35:01 -0700
Subject: [PATCH 05/12] re-enable printf in device code using Intel extension

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>
---
 src/MC_Segment_Outcome.cc | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/MC_Segment_Outcome.cc b/src/MC_Segment_Outcome.cc
index 3c629514..12a892c5 100644
--- a/src/MC_Segment_Outcome.cc
+++ b/src/MC_Segment_Outcome.cc
@@ -14,6 +14,7 @@
 #include "PhysicalConstants.hh"
 #include "DeclareMacro.hh"
 #include "mathHelp.hh"
+#include "cudaUtils.hh"
 
 HOST_DEVICE
 static inline unsigned int MC_Find_Min(const double *array,
@@ -46,12 +47,9 @@ MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Part
 
         if ( mc_particle.num_mean_free_paths > -900.0 )
         {
-#if defined(HAVE_SYCL)
-            // FIXME
-#elif 0
-            std::string output_string;
-            MC_Warning( "Forced Collision: num_mean_free_paths < 0 \n"
-                             "Particle record:\n%s", output_string.c_str());
+#ifdef HAVE_SYCL
+            static const OPENCL_CONSTANT char format[] = " MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n";
+            sycl::intel::experimental::printf(format);
 #else
             printf(" MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n");
 #endif

From a50696bc5424747ac1dec50dc14501ecdae2d7d4 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 27 Jun 2020 09:38:48 -0700
Subject: [PATCH 06/12] re-enable printf in device code using Intel extension

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>
---
 src/qs_assert.hh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/qs_assert.hh b/src/qs_assert.hh
index 7c0059d5..5eebf080 100644
--- a/src/qs_assert.hh
+++ b/src/qs_assert.hh
@@ -1,4 +1,5 @@
 #include <cstdio>
+#include "cudaUtils.hh"
 
 #ifdef __CUDA_ARCH__
 
@@ -18,6 +19,8 @@
    { \
       if (!(cond)) \
       { \
+          static const OPENCL_CONSTANT char format[] = "file=%s: line=%d ERROR\n"; \
+          sycl::intel::experimental::printf(format,__FILE__,__LINE__); \
       } \
    } while(0)
 

From 7794a66f2a3d1ea6446519615dcadb7a15c2a707 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Sat, 27 Jun 2020 09:44:01 -0700
Subject: [PATCH 07/12] globally rename q->sycl_device_queue

Signed-off-by: Jeff Hammond <jeff.r.hammond@intel.com>
---
 src/MemoryControl.hh |  4 ++--
 src/MonteCarlo.cc    | 28 ++++++++++++++--------------
 src/cudaUtils.hh     |  2 +-
 src/initMC.cc        |  6 +++---
 src/main.cc          | 22 +++++++++++-----------
 5 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/src/MemoryControl.hh b/src/MemoryControl.hh
index 19d1ed71..6ffb92ce 100644
--- a/src/MemoryControl.hh
+++ b/src/MemoryControl.hh
@@ -24,7 +24,7 @@ namespace MemoryControl
         case AllocationPolicy::UVM_MEM:
          void * ptr;
 #ifdef HAVE_SYCL
-         ptr = (void *)sycl::malloc_shared(size * sizeof(T), q);
+         ptr = (void *)sycl::malloc_shared(size * sizeof(T), sycl_device_queue);
 #else
          cudaMallocManaged(&ptr, size*sizeof(T), cudaMemAttachGlobal);
 #endif
@@ -50,7 +50,7 @@ namespace MemoryControl
         case UVM_MEM:
          for (int i=0; i < size; ++i) data[i].~T();
 #ifdef HAVE_SYCL
-         sycl::free(data, q);
+         sycl::free(data, sycl_device_queue);
 #else
          cudaFree(data);
 #endif
diff --git a/src/MonteCarlo.cc b/src/MonteCarlo.cc
index f9fb116c..aca40e45 100644
--- a/src/MonteCarlo.cc
+++ b/src/MonteCarlo.cc
@@ -28,10 +28,10 @@ MonteCarlo::MonteCarlo(const Parameters& params)
 #if defined(HAVE_UVM)
         void *ptr1, *ptr2, *ptr3, *ptr4;
 #ifdef HAVE_SYCL
-        ptr1 = (void *)sycl::malloc_shared(sizeof(Tallies), q);
-        ptr2 = (void *)sycl::malloc_shared(sizeof(MC_Processor_Info), q);
-        ptr3 = (void *)sycl::malloc_shared(sizeof(MC_Time_Info), q);
-        ptr4 = (void *)sycl::malloc_shared(sizeof(MC_Fast_Timer_Container), q);
+        ptr1 = (void *)sycl::malloc_shared(sizeof(Tallies), sycl_device_queue);
+        ptr2 = (void *)sycl::malloc_shared(sizeof(MC_Processor_Info), sycl_device_queue);
+        ptr3 = (void *)sycl::malloc_shared(sizeof(MC_Time_Info), sycl_device_queue);
+        ptr4 = (void *)sycl::malloc_shared(sizeof(MC_Fast_Timer_Container), sycl_device_queue);
 #else
         cudaMallocManaged( &ptr1, sizeof(Tallies), cudaMemAttachHost );
         cudaMallocManaged( &ptr2, sizeof(MC_Processor_Info), cudaMemAttachHost );
@@ -105,8 +105,8 @@ MonteCarlo::MonteCarlo(const Parameters& params)
     #if defined(HAVE_UVM)
         void *ptr5, *ptr6;
 #ifdef HAVE_SYCL
-        ptr5 = (void *)sycl::malloc_shared(sizeof(MC_Particle_Buffer), q);
-        ptr6 = (void *)sycl::malloc_shared(sizeof(ParticleVaultContainer), q);
+        ptr5 = (void *)sycl::malloc_shared(sizeof(MC_Particle_Buffer), sycl_device_queue);
+        ptr6 = (void *)sycl::malloc_shared(sizeof(ParticleVaultContainer), sycl_device_queue);
 #else
         cudaMallocManaged( &ptr5, sizeof(MC_Particle_Buffer) );
         cudaMallocManaged( &ptr6, sizeof(ParticleVaultContainer), cudaMemAttachHost );
@@ -137,14 +137,14 @@ MonteCarlo::~MonteCarlo()
         particle_buffer->~MC_Particle_Buffer();
 
 #ifdef HAVE_SYCL
-        sycl::free(_nuclearData, q);
-        sycl::free(_particleVaultContainer, q);
-        sycl::free(_materialDatabase, q);
-        sycl::free(_tallies, q);
-        sycl::free(processor_info, q);
-        sycl::free(time_info, q);
-        sycl::free(fast_timer, q);
-        sycl::free(particle_buffer, q);
+        sycl::free(_nuclearData, sycl_device_queue);
+        sycl::free(_particleVaultContainer, sycl_device_queue);
+        sycl::free(_materialDatabase, sycl_device_queue);
+        sycl::free(_tallies, sycl_device_queue);
+        sycl::free(processor_info, sycl_device_queue);
+        sycl::free(time_info, sycl_device_queue);
+        sycl::free(fast_timer, sycl_device_queue);
+        sycl::free(particle_buffer, sycl_device_queue);
 #else
         cudaFree( _nuclearData );
         cudaFree( _particleVaultContainer);
diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh
index 808146e1..b475dc1d 100644
--- a/src/cudaUtils.hh
+++ b/src/cudaUtils.hh
@@ -11,7 +11,7 @@
 
 #include <CL/sycl.hpp>
 
-extern sycl::queue q; // global variable for device queue
+extern sycl::queue sycl_device_queue; // global variable for device queue
 
 #ifdef __SYCL_DEVICE_ONLY__
 #define OPENCL_CONSTANT __attribute__((opencl_constant))
diff --git a/src/initMC.cc b/src/initMC.cc
index 02b38201..5c41f6de 100644
--- a/src/initMC.cc
+++ b/src/initMC.cc
@@ -55,7 +55,7 @@ MonteCarlo* initMC(const Parameters& params)
    #ifdef HAVE_UVM
       void* ptr;
 #ifdef HAVE_SYCL
-      ptr = (void *)sycl::malloc_shared(sizeof(MonteCarlo), q);
+      ptr = (void *)sycl::malloc_shared(sizeof(MonteCarlo), sycl_device_queue);
 #else
       cudaMallocManaged( &ptr, sizeof(MonteCarlo), cudaMemAttachGlobal );
 #endif
@@ -141,8 +141,8 @@ namespace
       #if defined HAVE_UVM
          void *ptr1, *ptr2;
 #ifdef HAVE_SYCL
-         ptr1 = (void *)sycl::malloc_shared(sizeof(NuclearData), q);
-         ptr2 = (void *)sycl::malloc_shared(sizeof(MaterialDatabase), q);
+         ptr1 = (void *)sycl::malloc_shared(sizeof(NuclearData), sycl_device_queue);
+         ptr2 = (void *)sycl::malloc_shared(sizeof(MaterialDatabase), sycl_device_queue);
 #else
          cudaMallocManaged( &ptr1, sizeof(NuclearData), cudaMemAttachGlobal );
          cudaMallocManaged( &ptr2, sizeof(MaterialDatabase), cudaMemAttachGlobal );
diff --git a/src/main.cc b/src/main.cc
index dbc05105..49972c61 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -27,7 +27,7 @@
 #include "git_vers.hh"
 
 #ifdef HAVE_SYCL
-sycl::queue q;
+sycl::queue sycl_device_queue;
 #endif
 
 void gameOver();
@@ -52,15 +52,15 @@ int main(int argc, char** argv)
    char * devchar = std::getenv("QS_DEVICE");
    std::string devname = (devchar==NULL ? "None" : devchar);
    if (devname == "CPU") {
-       q = cl::sycl::cpu_selector{};
+       sycl_device_queue = cl::sycl::cpu_selector{};
    }
    else
    if (devname == "GPU") {
-       q = cl::sycl::gpu_selector{};
+       sycl_device_queue = cl::sycl::gpu_selector{};
    }
    else
    if (devname == "HOST") {
-       q = cl::sycl::host_selector{};
+       sycl_device_queue = cl::sycl::host_selector{};
    }
    else
    {
@@ -69,10 +69,10 @@ int main(int argc, char** argv)
    }
 
    // DEBUG - REMOVE LATER
-   if ( q.get_device().is_cpu() )         std::cout << "is cpu"         << std::endl;
-   if ( q.get_device().is_gpu() )         std::cout << "is gpu"         << std::endl;
-   if ( q.get_device().is_host() )        std::cout << "is host"        << std::endl;
-   if ( q.get_device().is_accelerator() ) std::cout << "is accelerator" << std::endl;
+   if ( sycl_device_queue.get_device().is_cpu() )         std::cout << "is cpu"         << std::endl;
+   if ( sycl_device_queue.get_device().is_gpu() )         std::cout << "is gpu"         << std::endl;
+   if ( sycl_device_queue.get_device().is_host() )        std::cout << "is host"        << std::endl;
+   if ( sycl_device_queue.get_device().is_accelerator() ) std::cout << "is accelerator" << std::endl;
 #endif
 
    // mcco stores just about everything.
@@ -116,7 +116,7 @@ int main(int argc, char** argv)
 #ifdef HAVE_UVM
     mcco->~MonteCarlo();
 #ifdef HAVE_SYCL
-   sycl::free(mcco, q);
+   sycl::free(mcco, sycl_device_queue);
 #else
     cudaFree( mcco );
 #endif
@@ -273,13 +273,13 @@ void cycleTracking(MonteCarlo *monteCarlo)
                        {
                           const size_t N = numParticles;
                       #ifdef HAVE_SYCL
-                          q.submit([&](sycl::handler &h) {
+                          sycl_device_queue.submit([&](sycl::handler &h) {
                               h.parallel_for(sycl::range<1>{N},  [=](sycl::item<1> it) {
                                  int particle_index = it[0];
                                  CycleTrackingGuts( monteCarlo, particle_index, processingVault, processedVault );
                               });
                           });
-                          q.wait();
+                          sycl_device_queue.wait();
                       #endif
                        }
                        break;

From 96f836815c6d3efe0362f5e1dc87ab271ca88f81 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 9 Oct 2020 13:06:31 -0400
Subject: [PATCH 08/12] add DPC++ PTX stuff

Signed-off-by: Jeff Hammond <jeff.science@gmail.com>
---
 src/Makefile | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 2effc631..9045ec1a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -184,20 +184,40 @@ LDFLAGS = $(OPENMP_LDFLAGS)
 ###############################################################################
 # SYCL/DPC++
 ###############################################################################
-CXX       = dpcpp
-CXXFLAGS  = -std=c++14
-CXXFLAGS += -O3
-CXXFLAGS += -ffast-math
-#CXXFLAGS += -g
-CXXFLAGS += -ferror-limit=1
-CPPFLAGS  = -DHAVE_SYCL
-LDFLAGS   =
+
+CPPFLAGS	 = -DHAVE_SYCL
+
+CXXFLAGS	 =
+CXXFLAGS	+= -std=c++17
+CXXFLAGS	+= -g
+CXXFLAGS	+= -O3
+CXXFLAGS	+= -ffast-math
+CXXFLAGS	+= -ferror-limit=3
+
+CXXFLAGS	+= -fsycl
+CXXFLAGS	+= -fsycl-unnamed-lambda
+
+# -------------------------------------------------------
+# Intel oneAPI DPC++ compiler that supports Intel devices
+# -------------------------------------------------------
+#CXX		 = dpcpp
+# NVIDIA back-end does not support printf yet...
+#CPPFLAGS	 = -DHAVE_SYCL_PRINTF
 
 # GPU offline compile
 # https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/gpu-flow/offline-compilation-for-gpu.html
 #CXXFLAGS += -fsycl-targets=spir64_gen-unknown-linux-sycldevice
 #LDFLAGS  += -Xsycl-target-backend=spir64_gen-unknown-linux-sycldevice "-device skl"
 
+# -------------------------------------------------------
+# Intel DPC++ open-source compiler built for NVIDIA
+# -------------------------------------------------------
+DPCPP_DIR	 = /media/disk2/jrhammon/llvm/build
+CXX		 = $(DPCPP_DIR)/bin/clang++
+CXXFLAGS	+= -Wno-unknown-cuda-version
+CXXFLAGS	+= -fsycl-targets=nvptx64-nvidia-cuda-sycldevice
+LDFLAGS		+= -L$(DPCPP_DIR)/lib -Wl,-rpath=$(DPCPP_DIR)/lib
+
 ################################################################################
 ### Below here, it is pitch black.                                           ###
 ### You are likely to be eaten by a grue.                                    ###

From 5512778c2d803b3a625630dd41072a79e7375206 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 9 Oct 2020 13:06:53 -0400
Subject: [PATCH 09/12] deal with broken printf w/ DPC++ NVIDIA backend

Signed-off-by: Jeff Hammond <jeff.science@gmail.com>
---
 src/CollisionEvent.cc     |  7 ++++++-
 src/MC_Segment_Outcome.cc |  4 +++-
 src/NuclearData.cc        |  4 +++-
 src/cudaUtils.hh          |  3 +++
 src/qs_assert.hh          | 18 +++++++++++++++++-
 5 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/src/CollisionEvent.cc b/src/CollisionEvent.cc
index d66b9a57..b0522016 100644
--- a/src/CollisionEvent.cc
+++ b/src/CollisionEvent.cc
@@ -117,13 +117,18 @@ bool CollisionEvent(MonteCarlo* monteCarlo, MC_Particle &mc_particle, unsigned i
          ATOMIC_ADD( monteCarlo->_tallies->_balanceTask[tally_index]._produce, nOut);
          break;
       case NuclearDataReaction::Undefined:
+      {
 #ifdef HAVE_SYCL
+#if HAVE_SYCL_PRINTF
          static const OPENCL_CONSTANT char format[] = "reactionType invalid\n";
-         sycl::intel::experimental::printf(format);
+         syclx::printf(format);
+#endif
 #else
          printf("reactionType invalid\n");
 #endif
          qs_assert(false);
+         break;
+      }
    }
 
    if( nOut == 0 ) return false;
diff --git a/src/MC_Segment_Outcome.cc b/src/MC_Segment_Outcome.cc
index 12a892c5..c7fb7f41 100644
--- a/src/MC_Segment_Outcome.cc
+++ b/src/MC_Segment_Outcome.cc
@@ -48,8 +48,10 @@ MC_Segment_Outcome_type::Enum MC_Segment_Outcome(MonteCarlo* monteCarlo, MC_Part
         if ( mc_particle.num_mean_free_paths > -900.0 )
         {
 #ifdef HAVE_SYCL
+#ifdef HAVE_SYCL_PRINTF
             static const OPENCL_CONSTANT char format[] = " MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n";
-            sycl::intel::experimental::printf(format);
+            syclx::printf(format);
+#endif
 #else
             printf(" MC_Segment_Outcome: mc_particle.num_mean_free_paths > -900.0 \n");
 #endif
diff --git a/src/NuclearData.cc b/src/NuclearData.cc
index 2ed3224d..f6f46326 100644
--- a/src/NuclearData.cc
+++ b/src/NuclearData.cc
@@ -82,8 +82,10 @@ void NuclearDataReaction::sampleCollision(
       break;
      case Undefined:
 #ifdef HAVE_SYCL
+#ifdef HAVE_SYCL_PRINTF
       static const OPENCL_CONSTANT char format[] = "_reactionType invalid\n";
-      sycl::intel::experimental::printf(format);
+      syclx::printf(format);
+#endif
 #else
       printf("_reactionType invalid\n");
 #endif
diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh
index b475dc1d..08ec23f1 100644
--- a/src/cudaUtils.hh
+++ b/src/cudaUtils.hh
@@ -19,6 +19,9 @@ extern sycl::queue sycl_device_queue; // global variable for device queue
 #define OPENCL_CONSTANT
 #endif
 
+// printf
+namespace syclx = sycl::ONEAPI::experimental;
+
 #else // not SYCL
 
 #define SYCL_EXTERNAL
diff --git a/src/qs_assert.hh b/src/qs_assert.hh
index 5eebf080..f617c904 100644
--- a/src/qs_assert.hh
+++ b/src/qs_assert.hh
@@ -14,18 +14,34 @@
 
 #elif defined(HAVE_SYCL)
 
+#ifdef HAVE_SYCL_PRINTF
+
 #define qs_assert( cond) \
    do \
    { \
       if (!(cond)) \
       { \
           static const OPENCL_CONSTANT char format[] = "file=%s: line=%d ERROR\n"; \
-          sycl::intel::experimental::printf(format,__FILE__,__LINE__); \
+          syclx::printf(format,__FILE__,__LINE__); \
       } \
    } while(0)
 
 #else
 
+#define qs_assert( cond) \
+   do \
+   { \
+      if (!(cond)) \
+      { \
+          /* FIXME eventually...
+           * printf("file=%s: line=%d ERROR\n",__FILE__,__LINE__); */ \
+      } \
+   } while(0)
+
+#endif
+
+#else
+
 #define qs_assert( cond)                        \
    do \
    { \

From 5d992a5971330bc4e426c202090c1508bd73ce71 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 13 Oct 2020 10:52:41 -0700
Subject: [PATCH 10/12] build system changes

---
 src/Makefile     | 31 ++++++++++++++++---------------
 src/cudaUtils.hh |  3 ++-
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 9045ec1a..40ad886b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -131,14 +131,14 @@ LDFLAGS  += -lstdc++
 # This works on a Macbook (if gcc is installed)
 ###############################################################################
 #
-OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp 
-OPENMP_LDFLAGS = -fopenmp
-OPTFLAGS = -g -O2
+#OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp 
+#OPENMP_LDFLAGS = -fopenmp
+#OPTFLAGS = -g -O2
 #
-CXX=g++
-CXXFLAGS = -std=c++11  $(OPTFLAGS) -Wpedantic
-CPPFLAGS = $(OPENMP_FLAGS)
-LDFLAGS = $(OPENMP_LDFLAGS) 
+#CXX=g++
+#CXXFLAGS = -std=c++11  $(OPTFLAGS) -Wpedantic
+#CPPFLAGS = $(OPENMP_FLAGS)
+#LDFLAGS = $(OPENMP_LDFLAGS) 
 
 
 ###############################################################################
@@ -189,7 +189,6 @@ CPPFLAGS	 = -DHAVE_SYCL
 
 CXXFLAGS	 =
 CXXFLAGS	+= -std=c++17
-CXXFLAGS	+= -g
 CXXFLAGS	+= -O3
 CXXFLAGS	+= -ffast-math
 CXXFLAGS	+= -ferror-limit=3
@@ -197,12 +196,14 @@ CXXFLAGS	+= -ferror-limit=3
 CXXFLAGS	+= -fsycl
 CXXFLAGS	+= -fsycl-unnamed-lambda
 
+#CXXFLAGS	+= -g # SPIR-V backend generates debug output when this is used
+
 # -------------------------------------------------------
 # Intel oneAPI DPC++ compiler that supports Intel devices
 # -------------------------------------------------------
-#CXX		 = dpcpp
+CXX		 = dpcpp
 # NVIDIA back-end does not support printf yet...
-#CPPFLAGS	 = -DHAVE_SYCL_PRINTF
+CPPFLAGS	+= -DHAVE_SYCL_PRINTF
 
 # GPU offline compile
 # https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/gpu-flow/offline-compilation-for-gpu.html
@@ -212,11 +213,11 @@ CXXFLAGS	+= -fsycl-unnamed-lambda
 # -------------------------------------------------------
 # Intel DPC++ open-source compiler built for NVIDIA
 # -------------------------------------------------------
-DPCPP_DIR	 = /media/disk2/jrhammon/llvm/build
-CXX		 = $(DPCPP_DIR)/bin/clang++
-CXXFLAGS	+= -Wno-unknown-cuda-version
-CXXFLAGS	+= -fsycl-targets=nvptx64-nvidia-cuda-sycldevice
-LDFLAGS		+= -L$(DPCPP_DIR)/lib -Wl,-rpath=$(DPCPP_DIR)/lib
+#DPCPP_DIR	 = /home/jrhammon/ISYCL/llvm/build/install
+#CXX		 = $(DPCPP_DIR)/bin/clang++
+#CXXFLAGS	+= -Wno-unknown-cuda-version
+#CXXFLAGS	+= -fsycl-targets=nvptx64-nvidia-cuda-sycldevice
+#LDFLAGS		+= -L$(DPCPP_DIR)/lib -Wl,-rpath=$(DPCPP_DIR)/lib
 
 ################################################################################
 ### Below here, it is pitch black.                                           ###
diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh
index 08ec23f1..60bfe009 100644
--- a/src/cudaUtils.hh
+++ b/src/cudaUtils.hh
@@ -20,7 +20,8 @@ extern sycl::queue sycl_device_queue; // global variable for device queue
 #endif
 
 // printf
-namespace syclx = sycl::ONEAPI::experimental;
+//namespace syclx = sycl::ONEAPI::experimental; // latest OSS DPC++
+namespace syclx = sycl::intel::experimental;    // dpcpp beta09
 
 #else // not SYCL
 

From 3ae32fa00b4af044c2b662ec949fdfceefffb9a9 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 13 Oct 2020 12:48:35 -0700
Subject: [PATCH 11/12] flag workaround compiler bug

---
 src/Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index 40ad886b..6064a998 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -205,6 +205,9 @@ CXX		 = dpcpp
 # NVIDIA back-end does not support printf yet...
 CPPFLAGS	+= -DHAVE_SYCL_PRINTF
 
+# works around https://github.com/intel/llvm/issues/2629
+CXXFLAGS	+= -fno-sycl-early-optimizations
+
 # GPU offline compile
 # https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/gpu-flow/offline-compilation-for-gpu.html
 #CXXFLAGS += -fsycl-targets=spir64_gen-unknown-linux-sycldevice

From 0b9c632ed7669de1aa49ddd131dfb419acf85a92 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Thu, 15 Oct 2020 16:56:13 -0700
Subject: [PATCH 12/12] disable unrelated flags

---
 src/Makefile | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 6064a998..54b73d54 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -103,27 +103,27 @@ LDFLAGS =
 # Cuda on LLNL CORAL EA nodes
 ###############################################################################
 ## Choose one Cuda path
-CUDA_PATH = /usr
+#CUDA_PATH = /usr
 #CUDA_PATH = /usr/tcetmp/packages/cuda-9.0.176
 
-HOST_COMPILER = g++
+#HOST_COMPILER = g++
 
-OPTFLAGS = -O2
+#OPTFLAGS = -O2
 ## Version below for debugging
 ##OPTFLAGS = -DUSE_NVTX -g -G -lineinfo -O0
 
-CUDA_FLAGS = -I${CUDA_PATH}/include/
-CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart
+#CUDA_FLAGS = -I${CUDA_PATH}/include/
+#CUDA_LDFLAGS = -L${CUDA_PATH}/lib64/ -lcuda -lcudart
 #
-CXX=$(CUDA_PATH)/bin/nvcc
-CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) #-Xptxas -v
-CXXFLAGS += -ccbin clang-8
-CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
+#CXX=$(CUDA_PATH)/bin/nvcc
+#CXXFLAGS = -DHAVE_CUDA -std=c++11 $(OPTFLAGS) #-Xptxas -v
+#CXXFLAGS += -ccbin clang-8
+#CXXFLAGS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
 #CXXFLAGS += --compiler-bindir=$(HOST_COMPILER)
-CPPFLAGS = -x cu -dc
+#CPPFLAGS = -x cu -dc
 #CPPFLAGS += -DHAVE_MPI -DHAVE_ASYNC_MPI
-LDFLAGS  = $(CUDA_LDFLAGS)
-LDFLAGS  += -lstdc++
+#LDFLAGS  = $(CUDA_LDFLAGS)
+#LDFLAGS  += -lstdc++
 ##LDFLAGS += ${CUDA_PATH}/lib64/libnvToolsExt.so
 
 ###############################################################################