diff --git a/src/AtomicMacro.hh b/src/AtomicMacro.hh
index 4c31c853..3e01a15f 100644
--- a/src/AtomicMacro.hh
+++ b/src/AtomicMacro.hh
@@ -1,6 +1,7 @@
-//Determine which atomics to use based on platform being compiled for
-//
-//If compiling with CUDA
+#ifndef AtomicMacro_HH_
+#define AtomicMacro_HH_
+
+#define USE_MACRO_FUNCTIONS 1
 
 #ifdef HAVE_OPENMP
     #define USE_OPENMP_ATOMICS
@@ -8,6 +9,228 @@
     #define USE_OPENMP_ATOMICS
 #endif
 
+#ifdef HAVE_STDPAR
+    #define USE_CXX20_ATOMICS
+#endif
+
+// --------------------------------------------------
+// Original Names            -> Inline function names
+// --------------------------------------------------
+// ATOMIC_WRITE( x, v )      -> ATOMIC_WRITE
+// ATOMIC_UPDATE( x )        -> ATOMIC_INCREMENT
+// ATOMIC_ADD( x, v )        -> ATOMIC_ADD
+// ATOMIC_CAPTURE( x, v, p ) -> ATOMIC_FETCH_ADD
+// --------------------------------------------------
+
+#if defined (USE_MACRO_FUNCTIONS)
+
+#define ATOMIC_CAPTURE( x, v, p )  ATOMIC_FETCH_ADD((x),(v),(p))  // legacy name, forwards to ATOMIC_FETCH_ADD
+#define ATOMIC_UPDATE( x )         ATOMIC_INCREMENT((x))  // legacy name, forwards to ATOMIC_INCREMENT
+
+#if defined(USE_CXX20_ATOMICS)
+
+    #if (__cplusplus > 201703L)
+
+        #include <atomic>
+
+        #if defined(__cpp_lib_atomic_float) && defined(__cpp_lib_atomic_ref)
+
+            template <typename T>
+            inline void ATOMIC_WRITE(T & x, T v) {
+                // Atomic equivalent of `x = v`, performed through an atomic view of x.
+                std::atomic_ref<T> target{x};
+                target.store(v);
+            }
+
+            template <typename T>
+            inline void ATOMIC_INCREMENT(T& x) {
+                // Atomically adds one to x. fetch_add is used instead of operator++ because
+                // std::atomic_ref offers ++ only for integral/pointer T, not floating point.
+                std::atomic_ref<T>{x}.fetch_add(1);
+            }
+
+            template <typename T>
+            inline void ATOMIC_ADD(T& x, T v) {
+                // Atomically performs x += v; the previous value is not needed.
+                std::atomic_ref<T> target{x};
+                target.fetch_add(v);
+            }
+
+            template <typename T1, typename T2>
+            inline void ATOMIC_ADD(T1& x, T2 v) {
+                // Mixed-type form: the addend type may not be wider than the destination.
+                static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+                std::atomic_ref<T1> target{x};
+                target.fetch_add(v);
+            }
+
+            template <typename T>
+            inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+                // Atomically adds v to x and hands the pre-add value of x back through p.
+                const T previous = std::atomic_ref<T>{x}.fetch_add(v);
+                p = previous;
+            }
+
+            template <typename T1, typename T2>
+            inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+                // Same operation with an addend type no wider than x.
+                static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+                const T1 previous = std::atomic_ref<T1>{x}.fetch_add(v);
+                p = previous;
+            }
+
+            template <typename T1, typename T2, typename T3>
+            inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+                // Fully mixed form: p must be at least as wide as x to receive its old value.
+                static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+                static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+                const T1 previous = std::atomic_ref<T1>{x}.fetch_add(v);
+                p = previous;
+            }
+
+        #else
+            #error Your supposedly C++20 compiler does not support atomic_ref<double>.
+        #endif
+
+    #else
+        #error Sorry, you need C++20.
+    #endif
+
+#elif defined(HAVE_CUDA) && defined(__CUDA_ARCH__)
+
+template <typename T>
+inline void ATOMIC_WRITE(T & x, T v) {
+    x = v;  // plain assignment: no atomic primitive is used for writes in this branch
+}
+
+template <typename T>
+inline void ATOMIC_INCREMENT(T& x) {  // NOTE(review): no __device__ qualifier although this branch requires __CUDA_ARCH__ -- confirm it is callable from device code
+    atomicAdd( &x, 1 );  // adds 1 to x; the value atomicAdd returns is discarded
+}
+
+template <typename T>
+inline void ATOMIC_ADD(T& x, T v) {
+    atomicAdd( &x, v );  // adds v to x; the returned value is discarded
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_ADD(T1& x, T2 v) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");  // addend type may not be wider than x
+    atomicAdd( &x, v );
+}
+
+template <typename T>
+inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+    p = atomicAdd( &x, v );  // adds v to x and stores atomicAdd's return value in p
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");  // addend type may not be wider than x
+    p = atomicAdd( &x, v );
+}
+
+template <typename T1, typename T2, typename T3>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");  // p must be at least as wide as x
+    p = atomicAdd( &x, v );
+}
+
+#elif defined(USE_OPENMP_ATOMICS)
+
+// OpenMP back end (selected when USE_OPENMP_ATOMICS is defined): each operation is an `omp atomic` construct.
+
+template <typename T>
+inline void ATOMIC_WRITE(T & x, T v) {
+    _Pragma("omp atomic write")
+    x = v;  // atomic store of v into x
+}
+
+template <typename T>
+inline void ATOMIC_INCREMENT(T& x) {
+    _Pragma("omp atomic update")
+    ++x;  // atomic increment of x by one
+}
+
+template <typename T>
+inline void ATOMIC_ADD(T& x, T v) {
+    _Pragma("omp atomic")
+    x = x + v;  // atomic accumulate of v into x
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_ADD(T1& x, T2 v) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    _Pragma("omp atomic")
+    x = x + v;  // mixed-type accumulate; addend type is no wider than x
+}
+
+template <typename T>
+inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+    _Pragma("omp atomic capture")
+    { p = x; x += v; }  // p receives the value x held before the add
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    _Pragma("omp atomic capture")
+    { p = x; x += v; }  // same capture, addend type no wider than x
+}
+
+template <typename T1, typename T2, typename T3>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+    _Pragma("omp atomic capture")
+    { p = x; x += v; }  // p is at least as wide as x, so the old value fits
+}
+
+#else // SEQUENTIAL
+
+template <typename T>
+inline void ATOMIC_WRITE(T & x, T v) {
+    x = v;  // sequential fallback: ordinary assignment, no synchronization
+}
+
+template <typename T>
+inline void ATOMIC_INCREMENT(T& x) {
+    ++x;  // sequential fallback: ordinary increment
+}
+
+template <typename T>
+inline void ATOMIC_ADD(T& x, T v) {
+    x = x + v;  // sequential fallback: ordinary accumulate
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_ADD(T1& x, T2 v) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    x = x + v;  // mixed-type accumulate; addend type is no wider than x
+}
+
+template <typename T>
+inline void ATOMIC_FETCH_ADD(T& x, T v, T& p) {
+    p = x; x += v;  // p receives the value x held before the add
+}
+
+template <typename T1, typename T2>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T1& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    p = x; x += v;  // same, addend type no wider than x
+}
+
+template <typename T1, typename T2, typename T3>
+inline void ATOMIC_FETCH_ADD(T1& x, T2 v, T3& p) {
+    static_assert( sizeof(T1) >= sizeof(T2), "Unsafe: small += large");
+    static_assert( sizeof(T3) >= sizeof(T1), "Unsafe: small := large");
+    p = x; x += v;  // p is at least as wide as x, so the old value fits
+}
+
+#endif // BACKENDS
+
+#else // ! USE_MACRO_FUNCTIONS
 
 #if defined (HAVE_CUDA)
 
@@ -16,18 +239,20 @@
 
         //Currently not atomic here. But its only used when it does not necissarially need to be atomic.
         #define ATOMIC_WRITE( x, v ) \
-            x = v;          
+            x = v;
 
         #define ATOMIC_ADD( x, v ) \
             atomicAdd( &x, v );
-        
+
         #define ATOMIC_UPDATE( x ) \
             atomicAdd( &x, 1 );
 
         #define ATOMIC_CAPTURE( x, v, p ) \
             p = atomicAdd( &x, v );
+
     //If in a CPU OpenMP section use the OpenMP atomics
     #elif defined (USE_OPENMP_ATOMICS)
+
         #define ATOMIC_WRITE( x, v ) \
             _Pragma("omp atomic write") \
             x = v;
@@ -46,6 +271,7 @@
 
     //If in a serial section, no need to use atomics
     #else
+
         #define ATOMIC_WRITE( x, v ) \
             x = v;
 
@@ -62,6 +288,7 @@
 
 //If in a OpenMP section use the OpenMP atomics
 #elif defined (USE_OPENMP_ATOMICS)
+
     #define ATOMIC_WRITE( x, v ) \
         _Pragma("omp atomic write") \
         x = v;
@@ -74,12 +301,13 @@
         _Pragma("omp atomic update") \
         x++;
 
-        #define ATOMIC_CAPTURE( x, v, p ) \
-            _Pragma("omp atomic capture") \
-            {p = x; x = x + v;}
+    #define ATOMIC_CAPTURE( x, v, p ) \
+        _Pragma("omp atomic capture") \
+        {p = x; x = x + v;}
 
 //If in a serial section, no need to use atomics
 #else
+
     #define ATOMIC_WRITE( x, v ) \
         x = v;
 
@@ -91,4 +319,9 @@
 
     #define ATOMIC_CAPTURE( x, v, p ) \
         {p = x; x = x + v;}
-#endif
+
+#endif // BACKENDS
+
+#endif // USE_MACRO_FUNCTIONS
+
+#endif // AtomicMacro_HH_
diff --git a/src/Makefile b/src/Makefile
index 5f42dc12..a9d04510 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -114,6 +114,18 @@ LDFLAGS =
 #LDFLAGS = $(OPENMP_LDFLAGS) 
 
 
+###############################################################################
+### C++ parallelism
+###############################################################################
+
+#OPTFLAGS = -g -O3
+
+#CXX=clang++-12
+#CXXFLAGS = -std=c++20  $(OPTFLAGS) -Wpedantic
+#CPPFLAGS = -DHAVE_STDPAR
+## GCC and Clang sometimes do not link TBB automatically; add it explicitly.
+#LDFLAGS  = -ltbb
+
 ###############################################################################
 ### GCC -- with MPI and OpenMP 
 ###############################################################################
diff --git a/src/StdParUtils.hpp b/src/StdParUtils.hpp
new file mode 100644
index 00000000..6b946016
--- /dev/null
+++ b/src/StdParUtils.hpp
@@ -0,0 +1,117 @@
+/*
+Copyright (c) 2021, NVIDIA
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef StdParUtils_HH_
+#define StdParUtils_HH_
+
+#include <thread>
+
+#include <algorithm>
+#include <execution>
+
+// This implementation was authored by David Olsen
+
+#include <type_traits>
+
+// Iterator over consecutive integer values: dereferencing yields the current counter,
+// so a pair of these can stand in for an index range in standard algorithms.
+template <class T>
+struct counting_iterator {
+
+private:
+  typedef counting_iterator<T> self;
+
+public:
+  typedef T value_type;
+  typedef typename std::make_signed<T>::type difference_type;
+  typedef T const* pointer;
+  typedef T const& reference;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  // Forward (and stronger) iterators must be default constructible; start at zero.
+  counting_iterator() : value() { }
+  explicit counting_iterator(value_type v) : value(v) { }
+
+  // Access returns the counter by value; no element is stored anywhere.
+  value_type operator*() const { return value; }
+  value_type operator[](difference_type n) const { return value + n; }
+
+  // Stepping; the postfix forms return a copy taken before the change.
+  self& operator++() { ++value; return *this; }
+  self operator++(int) { self previous(*this); ++value; return previous; }
+  self& operator--() { --value; return *this; }
+  self operator--(int) { self previous(*this); --value; return previous; }
+  self& operator+=(difference_type n) { value += n; return *this; }
+  self& operator-=(difference_type n) { value -= n; return *this; }
+
+  friend self operator+(self const& i, difference_type n) {
+    return self(i.value + n);
+  }
+  friend self operator+(difference_type n, self const& i) {
+    return self(i.value + n);
+  }
+  // Distance between two iterators is the difference of their counters.
+  friend difference_type operator-(self const& x, self const& y) {
+    return x.value - y.value;
+  }
+  friend self operator-(self const& i, difference_type n) {
+    return self(i.value - n);
+  }
+
+  // Comparisons order iterators by counter value.
+  friend bool operator==(self const& x, self const& y) {
+    return x.value == y.value;
+  }
+  friend bool operator!=(self const& x, self const& y) {
+    return x.value != y.value;
+  }
+  friend bool operator<(self const& x, self const& y) {
+    return x.value < y.value;
+  }
+  friend bool operator<=(self const& x, self const& y) {
+    return x.value <= y.value;
+  }
+  friend bool operator>(self const& x, self const& y) {
+    return x.value > y.value;
+  }
+  friend bool operator>=(self const& x, self const& y) {
+    return x.value >= y.value;
+  }
+private:
+  value_type value;
+};
+
+// Convenience factory: deduces T from `value`; only participates for integral T.
+template <class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
+inline counting_iterator<T> make_counter(T value) {
+  return counting_iterator<T>(value);
+}
+
+#endif // StdParUtils_HH_
diff --git a/src/cudaUtils.hh b/src/cudaUtils.hh
index 868c5477..5982e4cf 100644
--- a/src/cudaUtils.hh
+++ b/src/cudaUtils.hh
@@ -21,7 +21,7 @@
     #define VAR_MEM MemoryControl::AllocationPolicy::HOST_MEM
 #endif
 
-enum ExecutionPolicy{ cpu, gpuWithCUDA, gpuWithOpenMP };
+enum ExecutionPolicy{ cpu, cpuWithStdPar, gpuWithCUDA, gpuWithOpenMP };  // value returned by getExecutionPolicy()
 
 inline ExecutionPolicy getExecutionPolicy( int useGPU )
 {
@@ -34,6 +34,10 @@ inline ExecutionPolicy getExecutionPolicy( int useGPU )
         #elif defined (HAVE_OPENMP_TARGET)
         execPolicy = ExecutionPolicy::gpuWithOpenMP;
         #endif
+    } else {
+        #if defined (HAVE_STDPAR)
+        execPolicy = ExecutionPolicy::cpuWithStdPar;
+        #endif
     }
     return execPolicy;
 }
diff --git a/src/main.cc b/src/main.cc
index bb9517b2..c0e35544 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -2,6 +2,7 @@
 #include "utils.hh"
 #include "Parameters.hh"
 #include "utilsMpi.hh"
+#include "StdParUtils.hpp"
 #include "MonteCarlo.hh"
 #include "initMC.hh"
 #include "Tallies.hh"
@@ -198,7 +199,7 @@ void cycleTracking(MonteCarlo *monteCarlo)
                           #endif
                        }
                        break;
-                       
+
                       case gpuWithOpenMP:
                        {
                           int nthreads=128;
@@ -224,6 +225,23 @@ void cycleTracking(MonteCarlo *monteCarlo)
                        }
                        break;
 
+                      case cpuWithStdPar:
+                       {
+                          // Compiled only for HAVE_STDPAR builds: std::execution policies need C++17 parallel algorithms.
+                          #if defined (HAVE_STDPAR)
+                          static int printed{0};
+                          if (!printed) { std::cout << "cpuWithStdPar" << std::endl; printed++; }
+                          // One CycleTrackingGuts call per particle index in [0, numParticles).
+                          // NOTE(review): par_unseq forbids vectorization-unsafe calls (e.g. atomics) in the body -- confirm, or use par.
+                          auto begin = counting_iterator<int>(0);
+                          auto end   = counting_iterator<int>(numParticles);
+                          std::for_each( std::execution::par_unseq, begin, end, [=](int particle_index) {
+                             CycleTrackingGuts( monteCarlo, particle_index, processingVault, processedVault );
+                          });
+                          #endif
+                       }
+                       break;
+
                       case cpu:
                        #include "mc_omp_parallel_for_schedule_static.hh"
                        for ( int particle_index = 0; particle_index < numParticles; particle_index++ )
@@ -231,6 +249,7 @@ void cycleTracking(MonteCarlo *monteCarlo)
                           CycleTrackingGuts( monteCarlo, particle_index, processingVault, processedVault );
                        }
                        break;
+
                       default:
                        qs_assert(false);
                     } // end switch
diff --git a/src/utils.cc b/src/utils.cc
index d6bf879d..685fb3f8 100644
--- a/src/utils.cc
+++ b/src/utils.cc
@@ -2,6 +2,7 @@
 #include <cstdio>
 #include "qs_assert.hh"
 #include "utilsMpi.hh"
+#include "StdParUtils.hpp"
 #include "macros.hh"
 #include <vector>
 #include <stdarg.h>
@@ -71,8 +72,12 @@ void printBanner(const char *git_version, const char *git_hash)
         printf("Quicksilver Git Hash    : %s\n",git_hash);
         printf("MPI Version             : %d.%d\n",mpi_major,mpi_minor);
         printf("Number of MPI ranks     : %d\n",size);
+#ifdef HAVE_STDPAR
+        printf("Number of C++ HW Threads: %d\n\n",(int)std::thread::hardware_concurrency());
+#else
         printf("Number of OpenMP Threads: %d\n",(int)omp_get_max_threads());
         printf("Number of OpenMP CPUs   : %d\n\n",(int)omp_get_num_procs());
+#endif
     }
 }