diff --git a/include/dca/linalg/matrix.hpp b/include/dca/linalg/matrix.hpp index bd7d96ced..8325b008a 100644 --- a/include/dca/linalg/matrix.hpp +++ b/include/dca/linalg/matrix.hpp @@ -30,6 +30,7 @@ #include "dca/linalg/util/memory.hpp" #include "dca/linalg/util/stream_functions.hpp" #include "dca/util/type_help.hpp" +#include "device_type.hpp" namespace dca { namespace linalg { @@ -38,7 +39,8 @@ namespace linalg { /** Matrix class for interfacing with Blas, Cublas, Rocblas * it is column-major, i.e. the row index is fast. */ -template > +template > class Matrix : public ALLOC { public: using ThisType = Matrix; @@ -72,7 +74,8 @@ class Matrix : public ALLOC { // Constructs a matrix with name name, size rhs.size() and a copy of the elements of rhs, where rhs // elements are stored on a different device. template - Matrix(const Matrix& rhs, const std::string& = default_name_); + Matrix(const Matrix& rhs, + const std::string& = default_name_); // Constructs a matrix with name name, size rhs.size() and a copy of the elements of rhs, where rhs // elements are stored on a different device. @@ -92,10 +95,12 @@ class Matrix : public ALLOC { // Resizes the matrix to rhs.size() and copies the elements, stored on a different device, of rhs. // Postcondition: The name of the matrix is unchanged. template - Matrix& operator=(const Matrix& rhs); + Matrix& operator=( + const Matrix& rhs); template - Matrix& operator=(const Matrix& rhs); + Matrix& operator=( + const Matrix& rhs); // Returns true if this is equal to other, false otherwise. // Two matrices are equal, if they have the same size and contain the same elements. Name and @@ -113,9 +118,9 @@ class Matrix : public ALLOC { template > ScalarType& operator()(int i, int j) { #ifndef NDEBUG - if(!(i >= 0 && i <= size_.first)) + if (!(i >= 0 && i < size_.first)) throw std::runtime_error("assertion of i >= 0 && i < size_.first failed!"); - if(!(j >= 0 && j <= size_.second)) + if (!(j >= 0 && j < size_.second)) throw std::runtime_error("assertion of j >= 0 && j < size_.second failed!"); #endif return data_[i + j * leadingDimension()]; @@ -123,9 +128,9 @@ class Matrix : public ALLOC { template > const ScalarType& operator()(int i, int j) const { #ifndef NDEBUG - if(!(i >= 0 && i <= size_.first)) + if (!(i >= 0 && i < size_.first)) throw std::runtime_error("assertion of i >= 0 && i < size_.first failed!"); - if(!(j >= 0 && j <= size_.second)) + if (!(j >= 0 && j < size_.second)) throw std::runtime_error("assertion of j >= 0 && j < size_.second failed!"); #endif return data_[i + j * leadingDimension()]; @@ -150,20 +155,20 @@ class Matrix : public ALLOC { // a pointer past the end of the range if i == size().first or j == size().second. // Preconditions: 0 <= i <= size().first, 0 <= j <= size().second.
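// A small worked example of these preconditions, assuming a CPU-resident Matrix m:
// the half-open range [m.ptr(0, j), m.ptr(m.size().first, j)) spans exactly column j,
// because storage is column-major with leadingDimension() >= size().first, e.g.
//
//   for (ValueType* p = m.ptr(0, j); p != m.ptr(m.size().first, j); ++p)
//     *p = 0;  // touches all of column j and nothing else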
ValueType* ptr(int i, int j) { - //These can be an annoyance debugging so making them "manual" asserts + // These can be an annoyance debugging so making them "manual" asserts #ifndef NDEBUG - if(!(i >= 0 && i <= size_.first)) + if (!(i >= 0 && i <= size_.first)) throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!"); - if(!(j >= 0 && j <= size_.second)) + if (!(j >= 0 && j <= size_.second)) throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!"); #endif return data_ + i + j * leadingDimension(); } const ValueType* ptr(int i, int j) const { #ifndef NDEBUG - if(!(i >= 0 && i <= size_.first)) + if (!(i >= 0 && i <= size_.first)) throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!"); - if(!(j >= 0 && j <= size_.second)) + if (!(j >= 0 && j <= size_.second)) throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!"); #endif return data_ + i + j * leadingDimension(); @@ -240,11 +245,13 @@ class Matrix : public ALLOC { // Asynchronous assignment. template - void setAsync(const Matrix& rhs, const util::GpuStream& stream); + void setAsync(const Matrix& rhs, + const util::GpuStream& stream); // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) template - void setAsync(const Matrix& rhs, int thread_id, int stream_id); + void setAsync(const Matrix& rhs, int thread_id, + int stream_id); void setToZero(const util::GpuStream& stream); @@ -256,6 +263,7 @@ class Matrix : public ALLOC { std::size_t deviceFingerprint() const; std::string toStr() const; + private: static std::pair capacityMultipleOfBlockSize(std::pair size); inline static size_t nrElements(std::pair size) { @@ -276,41 +284,41 @@ class Matrix : public ALLOC { }; template -const std::string Matrix::default_name_ = "no-name"; +const std::string Matrix::default_name_ = "no-name"; template -Matrix::Matrix(const std::string& name) : Matrix(name, 0) {} +Matrix::Matrix(const std::string& name) : Matrix(name, 0) {} template -Matrix::Matrix(int size) : Matrix(std::make_pair(size, size)) {} +Matrix::Matrix(int size) : Matrix(std::make_pair(size, size)) {} template -Matrix::Matrix(const std::string& name, int size) +Matrix::Matrix(const std::string& name, int size) : Matrix(name, std::make_pair(size, size)) {} template -Matrix::Matrix(int size, int capacity) +Matrix::Matrix(int size, int capacity) : Matrix(std::make_pair(size, size), std::make_pair(capacity, capacity)) {} template -Matrix::Matrix(const std::string& name, int size, int capacity) +Matrix::Matrix(const std::string& name, int size, int capacity) : Matrix(name, std::make_pair(size, size), std::make_pair(capacity, capacity)) {} template -Matrix::Matrix(std::pair size) : Matrix(size, size) {} +Matrix::Matrix(std::pair size) : Matrix(size, size) {} template -Matrix::Matrix(const std::string& name, std::pair size) +Matrix::Matrix(const std::string& name, std::pair size) : Matrix(name, size, size) {} template -Matrix::Matrix(std::pair size, std::pair capacity) +Matrix::Matrix(std::pair size, std::pair capacity) : Matrix(default_name_, size, capacity) {} template template Matrix::Matrix(const Matrix& rhs, - const std::string& name) + const std::string& name) : name_(name), size_(rhs.size_), capacity_(rhs.capacity_) { data_ = Allocator::allocate(nrElements(capacity_)); util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); @@ -318,18 +326,19 @@ Matrix::Matrix(const Matrix template -Matrix::Matrix(const Matrix& rhs, - const std::string& name) 
+Matrix::Matrix(const Matrix& rhs, + const std::string& name) : name_(name), size_(rhs.size_), capacity_(rhs.capacity_) { if (sizeof(ScalarType) != sizeof(ScalarRhs)) - throw std::runtime_error("conversion of both type and location of Matrix not currently possible!"); + throw std::runtime_error( + "conversion of both type and location of Matrix not currently possible!"); data_ = ALLOC::allocate(nrElements(capacity_)); util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); } template Matrix::Matrix(const std::string& name, std::pair size, - std::pair capacity) + std::pair capacity) : name_(name), size_(size), capacity_(capacityMultipleOfBlockSize(capacity)) { assert(size_.first >= 0 && size_.second >= 0); assert(capacity.first >= 0 && capacity.second >= 0); @@ -341,14 +350,15 @@ Matrix::Matrix(const std::string& name, std::pai } template -Matrix::Matrix(const Matrix& rhs, - const std::string& name) +Matrix::Matrix(const Matrix& rhs, + const std::string& name) : name_(name) { *this = rhs; } template -Matrix::Matrix(Matrix&& rhs, const std::string& name) +Matrix::Matrix(Matrix&& rhs, + const std::string& name) : name_(name), size_(rhs.size_), capacity_(rhs.capacity_), data_(rhs.data_) { rhs.capacity_ = std::make_pair(0, 0); rhs.size_ = std::make_pair(0, 0); @@ -356,20 +366,26 @@ Matrix::Matrix(Matrix -Matrix::~Matrix() { +Matrix::~Matrix() { Allocator::deallocate(data_); } template -void Matrix::resize(std::pair new_size) { - if (new_size.first == 0 || new_size.second ==0) { +void Matrix::resize(std::pair new_size) { + if (new_size.first == 0 || new_size.second == 0) { size_ = new_size; return; - } else if (new_size.first > capacity_.first || new_size.second > capacity_.second) { + } + else if (new_size.first > capacity_.first || new_size.second > capacity_.second) { std::pair new_capacity = capacityMultipleOfBlockSize(new_size); ValueType* new_data = nullptr; new_data = Allocator::allocate(nrElements(new_capacity)); +#ifndef NDEBUG + // This makes debugging matrix issues easier since there is never garbage in expanded matrices + if constexpr (device_name == linalg::DeviceType::CPU) + std::fill_n(new_data, nrElements(new_capacity), 0); +#endif // hip memcpy2D routines don't tolerate leadingDimension = 0 const std::pair copy_size(std::min(new_size.first, size_.first), std::min(new_size.second, size_.second)); @@ -379,14 +395,26 @@ void Matrix::resize(std::pair new_siz capacity_ = new_capacity; size_ = new_size; } - else { +#ifndef NDEBUG + else if (new_size.first > size_.first || new_size.second > size_.second) { + // We need to cover the case where we expand the matrix but no allocation is necessary. + // This is slow, but we're still thinking about what to do here: the legacy behavior is fine + // if we're just overwriting the new sections, but not if it's assumed they are 0, and not all + // client code is clear about this.
+ if constexpr (device_name == linalg::DeviceType::CPU) + // Zero the exposed region with column-major addressing: each new column in full, + // plus the new rows of the pre-existing columns. + for (int j = 0; j < new_size.second; ++j) + for (int i = (j < size_.second ? size_.first : 0); i < new_size.first; ++i) + data_[i + j * leadingDimension()] = 0; size_ = new_size; } +#endif + else + size_ = new_size; } template -Matrix& Matrix::operator=( - const Matrix& rhs) { +Matrix& Matrix::operator=( + const Matrix& rhs) { resizeNoCopy(rhs.size_); if (device_name == CPU) util::memoryCopyCpu(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); @@ -396,15 +424,15 @@ Matrix& Matrix: } template -Matrix& Matrix::operator=( - Matrix&& rhs) { +Matrix& Matrix::operator=( + Matrix&& rhs) { swap(rhs); return *this; } template template -Matrix& Matrix::operator=( +Matrix& Matrix::operator=( const Matrix& rhs) { resizeNoCopy(rhs.size_); util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); @@ -414,7 +442,7 @@ Matrix& Matrix #ifdef DCA_HAVE_GPU template template -Matrix& Matrix::operator=( +Matrix& Matrix::operator=( const Matrix& rhs) { static_assert(sizeof(ScalarType) == sizeof(ScalarRhs), "sizeof ScalarType and ScalarRhs are not equal"); @@ -426,7 +454,8 @@ Matrix& Matrix #endif template -bool Matrix::operator==(const Matrix& other) const { +bool Matrix::operator==( + const Matrix& other) const { if (device_name == GPU) return Matrix(*this) == Matrix(other); @@ -442,12 +471,13 @@ bool Matrix::operator==(const Matrix -bool Matrix::operator!=(const Matrix& other) const { +bool Matrix::operator!=( + const Matrix& other) const { return not(*this == other); } template -void Matrix::resizeNoCopy(std::pair new_size) { +void Matrix::resizeNoCopy(std::pair new_size) { if (new_size.first > capacity_.first || new_size.second > capacity_.second) { size_ = new_size; capacity_ = capacityMultipleOfBlockSize(new_size); @@ -461,28 +491,28 @@ void Matrix::resizeNoCopy(std::pair n } template -void Matrix::clear() { +void Matrix::clear() { Allocator::deallocate(data_); size_ = capacity_ = std::make_pair(0, 0); } template -void Matrix::swap(Matrix& rhs) { +void Matrix::swap(Matrix& rhs) { std::swap(size_, rhs.size_); std::swap(capacity_, rhs.capacity_); std::swap(data_, rhs.data_); } template -void Matrix::swapWithName(Matrix& rhs) { +void Matrix::swapWithName(Matrix& rhs) { std::swap(name_, rhs.name_); swap(rhs); } template template -void Matrix::set(const Matrix& rhs, - int thread_id, int stream_id) { +void Matrix::set( + const Matrix& rhs, int thread_id, int stream_id) { resize(rhs.size_); // This specialization is required since without unified memory CUDA doesn't know which memory locality the pointer has.
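// A minimal usage sketch of the compile-time dispatch below, assuming a hypothetical
// host matrix `h` and device matrix `d` holding the same scalar type:
//
//   d.set(h, thread_id, stream_id);  // resolves to the host-to-device copy path
//   h.set(d, thread_id, stream_id);  // resolves to the device-to-host copy path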
if constexpr (device_name == DeviceType::GPU && rhs_device_name == DeviceType::CPU) @@ -498,8 +528,9 @@ void Matrix::set(const Matrix template -void Matrix::set(const Matrix& rhs, - const util::GpuStream& stream [[maybe_unused]]) { +void Matrix::set( + const Matrix& rhs, + const util::GpuStream& stream [[maybe_unused]]) { resize(rhs.size_); if constexpr (device_name == DeviceType::GPU && rhs_device_name == DeviceType::CPU) util::memoryCopyH2D(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); @@ -509,26 +540,27 @@ void Matrix::set(const Matrix template -void Matrix::setAsync(const Matrix& rhs, - const util::GpuStream& stream) { +void Matrix::setAsync( + const Matrix& rhs, const util::GpuStream& stream) { resizeNoCopy(rhs.size_); util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream); } template template -void Matrix::setAsync(const Matrix& rhs, - const int thread_id, const int stream_id) { +void Matrix::setAsync( + const Matrix& rhs, const int thread_id, + const int stream_id) { setAsync(rhs, util::getStream(thread_id, stream_id)); } template -void Matrix::setToZero(const util::GpuStream& stream) { +void Matrix::setToZero(const util::GpuStream& stream) { util::Memory::setToZeroAsync(data_, leadingDimension() * nrCols(), stream); } template -void Matrix::print() const { +void Matrix::print() const { if (device_name == GPU) return Matrix(*this).print(); @@ -549,7 +581,7 @@ void Matrix::print() const { } template -std::string Matrix::toStr() const { +std::string Matrix::toStr() const { if (device_name == GPU) return Matrix(*this).toStr(); @@ -566,9 +598,9 @@ std::string Matrix::toStr() const { return ss.str(); } - + template -void Matrix::printFingerprint() const { +void Matrix::printFingerprint() const { std::stringstream ss; ss << "\n"; @@ -581,7 +613,7 @@ void Matrix::printFingerprint() const { } template -std::pair Matrix::capacityMultipleOfBlockSize( +std::pair Matrix::capacityMultipleOfBlockSize( std::pair size) { assert(size.first >= 0); assert(size.second >= 0); @@ -597,7 +629,7 @@ std::pair Matrix::capacityMultipleOfB } template -std::size_t Matrix::deviceFingerprint() const { +std::size_t Matrix::deviceFingerprint() const { if (device_name == GPU) return capacity_.first * capacity_.second * sizeof(ScalarType); else @@ -608,7 +640,7 @@ std::size_t Matrix::deviceFingerprint() const { template auto makeDiagonalMatrix(Vector& diag) { int dsize = diag.size(); - Matrix matrix("diag_matrix", dsize); + Matrix matrix("diag_matrix", dsize); for (int i = 0; i < dsize; ++i) { matrix(i, i) = diag[i]; } @@ -619,7 +651,7 @@ auto makeDiagonalMatrix(Vector& diag) { template auto makeDiagonalMatrixInv(Vector& diag) { int dsize = diag.size(); - Matrix matrix("diag_matrix", dsize); + Matrix matrix("diag_matrix", dsize); // insure that if ScalarType is complex the 1 is as well. // then std::complex will give us a proper complex multiplicative inverse ScalarType the_one{}; diff --git a/include/dca/linalg/vector.hpp b/include/dca/linalg/vector.hpp index 3e8882b77..a21fd0203 100644 --- a/include/dca/linalg/vector.hpp +++ b/include/dca/linalg/vector.hpp @@ -34,7 +34,7 @@ namespace dca { namespace linalg { // dca::linalg:: - template > class Vector : public Allocator { public: @@ -52,9 +52,8 @@ class Vector : public Allocator { /** copy constructor except for name. * this is strange but for historical reasons is kept. 
- * has needed to be explicit because with the `const ThisType&` somehow lead to an implicit conversion - * from an int to a Vector& argument that landed here. - * This occurred in Debug with + * has needed to be explicit because the `const ThisType&` somehow led to an implicit + * conversion from an int to a Vector& argument that landed here. This occurred in Debug with */ explicit Vector(const ThisType& rhs, const std::string& name = default_name_); @@ -323,7 +322,8 @@ void Vector::setAsync(const Container& rhs, } template -void Vector::setToZeroAsync(const util::GpuStream& stream [[maybe_unused]]) { +void Vector::setToZeroAsync(const util::GpuStream& stream + [[maybe_unused]]) { // TODO: implement in copy.hpp. #ifdef DCA_HAVE_GPU checkRC(cudaMemsetAsync(data_, 0, size_ * sizeof(ScalarType), stream)); @@ -333,12 +333,14 @@ } template -void Vector::setToZero(const util::GpuStream& stream [[maybe_unused]]) { +void Vector::setToZero(const util::GpuStream& stream + [[maybe_unused]]) { dca::linalg::util::Memory::setToZero(data_, size_, stream); } // template -// void Vector::setToZero(const util::GpuStream& stream [[maybe_unused]]) { +// void Vector::setToZero(const util::GpuStream& stream +// [[maybe_unused]]) { // // TODO: implement in copy.hpp. // dca::linalg::util::memory::setToZero(data_, size_, stream); // } diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp index c63d70f18..94d9753bc 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp @@ -33,6 +33,8 @@ namespace solver { namespace ctint { // dca::phys::solver::ctint:: + +// I think the class hierarchy isn't helpful here.
class SolverConfiguration : public MatrixConfiguration { public: using BaseClass = MatrixConfiguration; diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp index 70cfb39c4..db2e560d5 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp @@ -41,6 +41,8 @@ #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp" #endif +//#define DEBUG_SUBMATRIX + namespace dca { namespace phys { namespace solver { @@ -238,7 +240,7 @@ CtintWalkerBase::CtintWalkerBase(const Parameters& parameters_ thread_id_(id), streams_{&linalg::util::getStreamContainer()(thread_id_, 0), - &linalg::util::getStreamContainer()(thread_id_, 1)}, + &linalg::util::getStreamContainer()(thread_id_, 0)}, rng_(rng_ref), diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp index 861df5cb6..c650d1e9c 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp @@ -55,7 +55,8 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase; - CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, DMatrixBuilder& d_matrix_builder, int id = 0); + CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, + DMatrixBuilder& d_matrix_builder, int id = 0); virtual ~CtintWalkerSubmatrixCpu() = default; @@ -64,11 +65,29 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase& d_matrix_builder_; + + BaseClass::MatrixPair getM(); + + /** The following methods are really only here to get decent unit testing + they shouldn't really be called outside of the base implementations + */ + void computeMInit() override; + void computeGInit() override; + BaseClass::MatrixPair getRawM(); + BaseClass::MatrixPair getRawG(); + +private: + /** This does a bunch of things, this is the majority of a step * For each delayed_move it: * * computes the acceptance prob * * compares std::abs(acceptance prob) and random value @@ -82,17 +101,6 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase& d_matrix_builder_; -private: - void doSubmatrixUpdate(); /** returns [acceptance_probability , mc_weight_ratio ] @@ -103,10 +111,7 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase::CtintWalkerSubmatrixCpu( const Parameters& parameters_ref, const Data& data, Rng& rng_ref, DMatrixBuilder& d_matrix_builder, int id) : SubmatrixBase(parameters_ref, data, rng_ref, id), d_matrix_builder_(d_matrix_builder) { - - for (int b = 0; b < n_bands_; ++b) { + for (int b = 0; b < n_bands_; ++b) { for (int i = 1; i <= 3; ++i) { f_[i][b] = d_matrix_builder_.computeF(i, b); f_[-i][b] = d_matrix_builder_.computeF(-i, b); @@ -202,17 +206,23 @@ CtintWalkerSubmatrixCpu::CtintWalkerSubmatrixCpu( } f_[0][b] = 1; } - } template void CtintWalkerSubmatrixCpu::setMFromConfig() { BaseClass::setMFromConfigImpl(d_matrix_builder_); - transformM(); + SubmatrixBase::transformM(); +#ifdef DEBUG_SUBMATRIX + std::cout << "cpu M post setMFromConfigImpl: \n"; + M_[0].set_name("cpu_M_0"); + M_[1].set_name("cpu_M_1"); + M_[0].print(); + M_[1].print(); +#endif } template -void 
CtintWalkerSubmatrixCpu::markThermalized() { +void CtintWalkerSubmatrixCpu::markThermalized() { thermalized_ = true; nb_steps_per_sweep_ = std::max(1., std::ceil(sweeps_per_meas_ * partial_order_avg_.mean())); @@ -225,7 +235,7 @@ void CtintWalkerSubmatrixCpu::markThermalized() { // Recompute the Monte Carlo weight. setMFromConfig(); #ifndef NDEBUG - //writeAlphas(); + // writeAlphas(); #endif } @@ -242,26 +252,55 @@ void CtintWalkerSubmatrixCpu::computeMInit() { Scalar f_j; D_.resize(std::make_pair(delta, n_init_[s])); + if (delta == 0 || n_init_[s] == 0) + throw std::runtime_error( + "expansion factor dropped to 0 or below use a higher beta or larger interaction!"); + d_matrix_builder_.computeG0(D_, configuration_.getSector(s), n_init_[s], n_max_[s], 0); +#ifdef DEBUG_SUBMATRIX + D_.print(); +#endif + + std::array, 2> f_values; + f_values[s].resize(n_init_[s]); for (int j = 0; j < n_init_[s]; ++j) { const auto field_type = configuration_.getSector(s).getAuxFieldType(j); const auto b = configuration_.getSector(s).getRightB(j); f_j = f_[field_type][b] - 1; - + f_values[s][j] = f_[field_type][b]; for (int i = 0; i < delta; ++i) { D_(i, j) *= f_j; } } +#ifdef DEBUG_SUBMATRIX + f_values[0].set_name("cpu_f_values_0"); + f_values[1].set_name("cpu_f_values_1"); + f_values[0].print(); + f_values[1].print(); + std::cout << "cpu D post f factor mult\n"; + D_.print(); + using namespace dca::addt_str_oper; + std::cout << "M_[" << s << "] size: " << M_[s].size() << '\n'; +#endif + M_[s].resize(n_max_[s]); MatrixView M(M_[s], 0, 0, n_init_[s], n_init_[s]); MatrixView D_M(M_[s], n_init_[s], 0, delta, n_init_[s]); +#ifdef DEBUG_SUBMATRIX + std::cout << "cpu M pre gemm\n"; + M_[s].print(); +#endif + linalg::matrixop::gemm(D_, M, D_M); flop_ += 2 * D_.nrRows() * D_.nrCols() * M.nrCols(); +#ifdef DEBUG_SUBMATRIX + D_M.print(); +#endif for (int i = 0; i < n_max_[s]; ++i) { for (int j = n_init_[s]; j < n_max_[s]; ++j) { M_[s](i, j) = 0; @@ -270,6 +309,9 @@ void CtintWalkerSubmatrixCpu::computeMInit() { for (int i = n_init_[s]; i < n_max_[s]; ++i) M_[s](i, i) = 1; +#ifdef DEBUG_SUBMATRIX + M_[s].print(); +#endif } } } @@ -386,6 +428,8 @@ void CtintWalkerSubmatrixCpu::updateM() { linalg::matrixop::copyCols(M_[s], source_list_[s], M_[s], removal_list_[s]); M_[s].resize(configuration_.getSector(s).size()); } + + } template @@ -493,20 +537,6 @@ void CtintWalkerSubmatrixCpu::recomputeGammaInv() { } } -template -void CtintWalkerSubmatrixCpu::transformM() { - for (int s = 0; s < 2; ++s) { - for (int j = 0; j < M_[s].size().second; ++j) { - for (int i = 0; i < M_[s].size().first; ++i) { - const auto field_type = configuration_.getSector(s).getAuxFieldType(i); - const auto b = configuration_.getSector(s).getLeftB(i); - const Scalar f_i = -(f_[field_type][b] - 1); - M_[s](i, j) /= f_i; - } - } - } -} - template void CtintWalkerSubmatrixCpu::computeM(typename BaseClass::MatrixPair& m_accum) { for (int s = 0; s < 2; ++s) { @@ -591,6 +621,34 @@ void CtintWalkerSubmatrixCpu::computeMixedInsertionAndRemoval( computeRemovalMatrix(s); } +template +CtintWalkerSubmatrixCpu::BaseClass::MatrixPair CtintWalkerSubmatrixCpu< + Parameters, DIST>::getRawM() { + typename BaseClass::MatrixPair M; + M = M_; + M[0].set_name("subMatrixCPU::M[0]"); + M[1].set_name("subMatrixCPU::M[1]"); + return M; +} + +template +CtintWalkerSubmatrixCpu::BaseClass::MatrixPair CtintWalkerSubmatrixCpu< + Parameters, DIST>::getRawG() { + typename BaseClass::MatrixPair G; + G = G_; + G[0].set_name("subMatrixCPU::G[0]"); + G[1].set_name("subMatrixCPU::G[1]"); + 
return G; +} + +template +CtintWalkerSubmatrixCpu::BaseClass::MatrixPair CtintWalkerSubmatrixCpu::getM() { + typename BaseClass::MatrixPair M; + computeM(M); + return M; +} + } // namespace ctint } // namespace solver } // namespace phys diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp index e098599e6..a6096373b 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp @@ -29,6 +29,7 @@ #include "dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/walker/kernels_interface.hpp" +#include "dca/util/to_string.hpp" namespace dca { namespace phys { namespace solver { @@ -65,6 +66,8 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase& m_accum); + MatrixPair getM(); + void doSweep() override; void synchronize() const override; @@ -75,19 +78,29 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase getRawG(); + MatrixPair getRawM(); + + void updateM() override; private: void doStep() override; - void computeMInit(); - void computeGInit(); - void updateM(); - - void uploadConfiguration(); - protected: using BaseClass::configuration_; using BaseClass::M_; @@ -100,8 +113,6 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase::CtintWalkerSubmatrixGpu( if (concurrency_.id() == concurrency_.first() && thread_id_ == 0) std::cout << "\nCT-INT submatrix walker extended to GPU." << std::endl; } - + template void CtintWalkerSubmatrixGpu::setMFromConfig() { BaseClass::setMFromConfigImpl(d_matrix_builder_); + SubmatrixBase::transformM(); +#ifdef DEBUG_SUBMATRIX + M_[0].set_name("gpu_M_0"); + M_[1].set_name("gpu_M_1"); +#endif for (int s = 0; s < 2; ++s) { +#ifdef DEBUG_SUBMATRIX + std::cout << "GPU pre set M: \n"; + M_[s].print(); +#endif M_dev_[s].setAsync(M_[s], get_stream(s)); } } @@ -181,9 +201,7 @@ void CtintWalkerSubmatrixGpu::setMFromConfig() { template void CtintWalkerSubmatrixGpu::synchronize() const { Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_); - checkRC(cudaStreamSynchronize(get_stream(0))); - checkRC(cudaStreamSynchronize(get_stream(1))); } template @@ -197,8 +215,9 @@ void CtintWalkerSubmatrixGpu::doSweep() { template void CtintWalkerSubmatrixGpu::doStep(const int n_moves_to_delay) { SubmatrixBase::nbr_of_moves_to_delay_ = n_moves_to_delay; - doStep(); uploadConfiguration(); + synchronize(); + doStep(); } template @@ -232,6 +251,12 @@ void CtintWalkerSubmatrixGpu::uploadConfiguration() { f_dev_[s].setAsync(values, get_stream(s)); } +#ifdef DEBUG_SUBMATRIX + std::cout << "GPU f_values: \n"; + f_values_[0].print(); + f_values_[1].print(); +#endif + for (int s = 0; s < 2; ++s) config_copied_[s].record(get_stream(s)); } @@ -240,24 +265,58 @@ template void CtintWalkerSubmatrixGpu::computeMInit() { // Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_); - for (int s = 0; s < 2; ++s) + for (int s = 0; s < 2; ++s) { + const GpuStream& stream = get_stream(s); + stream.sync(); M_dev_[s].resize(n_max_[s]); + } for (int s = 0; s < 2; ++s) { const int delta = n_max_[s] - n_init_[s]; if (delta > 0) { D_dev_[s].resizeNoCopy(std::make_pair(delta, n_init_[s])); + + if (delta == 0 || n_init_[s] == 0) + throw
std::runtime_error( + "expansion factor dropped to 0 or below; use a higher beta or a larger interaction!"); + d_matrix_builder_.computeG0(D_dev_[s], device_config_.getDeviceData(s), n_init_[s], false, - get_stream(s)); + get_stream(s)); flop_ += D_dev_[s].nrCols() * D_dev_[s].nrRows() * 10; +#ifdef DEBUG_SUBMATRIX + // debugging + SubmatrixBase::D_ = D_dev_[s]; + SubmatrixBase::D_.print(); +#endif MatrixView D_view(D_dev_[s]); details::multiplyByFColFactor(D_view, f_dev_[s].ptr(), get_stream(s)); +#ifdef DEBUG_SUBMATRIX + std::cout << "GPU D multiplied by FCol Factor\n"; + SubmatrixBase::D_ = D_dev_[s]; + SubmatrixBase::D_.print(); +#endif MatrixView M(M_dev_[s], 0, 0, n_init_[s], n_init_[s]); MatrixView D_M(M_dev_[s], n_init_[s], 0, delta, n_init_[s]); +#ifdef DEBUG_SUBMATRIX + SubmatrixBase::M_[s] = M_dev_[s]; + std::cout << "GPU M pre gemm M\n"; + SubmatrixBase::M_[s].print(); +#endif + linalg::matrixop::gemm(D_dev_[s], M, D_M, thread_id_, s); + +#ifdef DEBUG_SUBMATRIX + SubmatrixBase::M_[s] = M_dev_[s]; + std::cout << "GPU M post gemm M\n"; + SubmatrixBase::M_[s].print(); + MatrixView D_M_host(SubmatrixBase::M_[s], n_init_[s], 0, delta, n_init_[s]); + std::cout << "gpu D_M_ print\n"; + D_M_host.print(); +#endif + flop_ += 2 * D_dev_[s].nrRows() * D_dev_[s].nrCols() * M.nrCols(); details::setRightSectorToId(M_dev_[s].ptr(), M_dev_[s].leadingDimension(), n_init_[s], @@ -272,27 +331,31 @@ void CtintWalkerSubmatrixGpu::computeGInit() { for (int s = 0; s < 2; ++s) { const int delta = n_max_[s] - n_init_[s]; + + // On the CPU we only do all this if delta > 0. auto& f_dev = f_dev_[s]; G_dev_[s].resizeNoCopy(n_max_[s]); - MatrixView G(G_dev_[s]); + MatrixView G_view_dev(G_dev_[s]); const MatrixView M(M_dev_[s]); - details::computeGLeft(G, M, f_dev.ptr(), n_init_[s], get_stream(s)); + details::computeGLeft(G_view_dev, M, f_dev.ptr(), n_init_[s], get_stream(s)); flop_ += n_init_[s] * n_max_[s] * 2; if (delta > 0) { G0_dev_[s].resizeNoCopy(std::make_pair(n_max_[s], delta)); + // This doesn't do flops itself, but the g0 interpolation data it uses does somewhere. d_matrix_builder_.computeG0(G0_dev_[s], device_config_.getDeviceData(s), n_init_[s], true, - get_stream(s)); + get_stream(s)); flop_ += G0_dev_[s].nrCols() * G0_dev_[s].nrRows() * 10; - MatrixView G(G_dev_[s], 0, n_init_[s], n_max_[s], delta); + MatrixView G_view_reduced(G_dev_[s], 0, n_init_[s], n_max_[s], delta); // compute G right. - linalg::matrixop::gemm(M_dev_[s], G0_dev_[s], G, thread_id_, s); + linalg::matrixop::gemm(M_dev_[s], G0_dev_[s], G_view_reduced, thread_id_, s); flop_ += 2 * M_dev_[s].nrRows() * M_dev_[s].nrCols() * G0_dev_[s].nrCols(); + + G_[s].setAsync(G_dev_[s], get_stream(s)); } - G_[s].setAsync(G_dev_[s], get_stream(s)); } } @@ -325,7 +388,10 @@ void CtintWalkerSubmatrixGpu::updateM() { old_G.resizeNoCopy(std::make_pair(n_max_[s], gamma_size)); old_M.resizeNoCopy(std::make_pair(gamma_size, n_max_[s])); - assert(dca::linalg::util::getStream(thread_id_, s) == get_stream(s)); + // We no longer use one stream per spin. + // This isn't portable, and e.g. AMD doesn't really benefit from multiple streams per thread, + // and it adds much complexity. + // assert(dca::linalg::util::getStream(thread_id_, s) == get_stream(s)); move_indices_dev_[s].setAsync(move_indices_[s], get_stream(s)); // Note: an event synchronization might be necessary if the order of operation is changed.
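// A minimal sketch of the event synchronization mentioned in the note above, assuming
// the plain CUDA runtime API (mapped to HIP by cuda2hip.h); `copy_done` and
// `consumer_stream` are hypothetical:
//
//   cudaEvent_t copy_done;
//   checkRC(cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming));
//   checkRC(cudaEventRecord(copy_done, get_stream(s)));          // after the setAsync
//   checkRC(cudaStreamWaitEvent(consumer_stream, copy_done, 0)); // device-side wait, host not blocked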
linalg::matrixop::copyCols(G_dev_[s], move_indices_dev_[s], old_G, thread_id_, s); @@ -392,6 +458,52 @@ std::size_t CtintWalkerSubmatrixGpu::deviceFingerprint() const return res; } +template +CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSubmatrixGpu< + Parameters, DIST>::getRawM() { + std::array, 2> M_copy{M_dev_[0].size(), M_dev_[1].size()}; + // Synchronous copies for testing + linalg::util::memoryCopyD2H(M_copy[0].ptr(), M_copy[0].leadingDimension(), M_dev_[0].ptr(), + M_dev_[0].leadingDimension(), M_dev_[0].size()); + linalg::util::memoryCopyD2H(M_copy[1].ptr(), M_copy[1].leadingDimension(), M_dev_[1].ptr(), + M_dev_[1].leadingDimension(), M_dev_[1].size()); + M_copy[0].set_name("subMatrixGPU::M[0]"); + M_copy[1].set_name("subMatrixGPU::M[1]"); + + return M_copy; +} + +template +CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSubmatrixGpu< + Parameters, DIST>::getRawG() { + std::array, 2> G_copy{G_dev_[0].size(), G_dev_[1].size()}; + // Synchronous copies for testing + linalg::util::memoryCopyD2H(G_copy[0].ptr(), G_copy[0].leadingDimension(), G_dev_[0].ptr(), + G_dev_[0].leadingDimension(), G_dev_[0].size()); + linalg::util::memoryCopyD2H(G_copy[1].ptr(), G_copy[1].leadingDimension(), G_dev_[1].ptr(), + G_dev_[1].leadingDimension(), G_dev_[1].size()); + G_copy[0].set_name("subMatrixGPU::G[0]"); + G_copy[1].set_name("subMatrixGPU::G[1]"); + + return G_copy; +} + +template +CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSubmatrixGpu< + Parameters, DIST>::getM() { + std::array, 2> M; + synchronize(); + computeM(M); + checkRC(cudaDeviceSynchronize()); + + std::array, 2> M_copy{M[0].size(), M[1].size()}; + linalg::util::memoryCopyD2H(M_copy[0].ptr(), M_copy[0].leadingDimension(), M[0].ptr(), + M[0].leadingDimension(), M[0].size()); + linalg::util::memoryCopyD2H(M_copy[1].ptr(), M_copy[1].leadingDimension(), M[1].ptr(), + M[1].leadingDimension(), M[1].size()); + return M_copy; +} + } // namespace ctint } // namespace solver } // namespace phys diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp index 3b2901aa6..7336a43fe 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp @@ -50,6 +50,7 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { using BaseClass::order; virtual void setMFromConfig() = 0; + auto getF() const { return f_; } protected: virtual void doStep() override; void doSteps(); @@ -80,6 +81,8 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { virtual void updateM() = 0; + virtual void computeMInit() = 0; + private: void doSubmatrixUpdate(); @@ -92,8 +95,6 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { void removeRowAndColOfGammaInv(); - virtual void computeMInit() = 0; - // void computeG0Init(); virtual void computeGInit() = 0; @@ -113,7 +114,7 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { assert(move_idx < sector_indices_[s].size()); return sector_indices_[s][move_idx] >= n_init_[s]; } - + protected: using MatrixView = linalg::MatrixView; using Matrix = linalg::Matrix; @@ -209,7 +210,7 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { std::array Gamma_q_; Matrix workspace_; - Matrix D_; + Matrix D_{"SubMatrixHostD"}; using BaseClass::flop_; }; @@ -248,6 +249,7 @@ void CtintWalkerSubmatrixBase::doSweep() { doSteps(); } + template void 
CtintWalkerSubmatrixBase::doSteps() { // Here nbr_of_steps is the number of single steps/moves during the current sweep, @@ -346,6 +348,20 @@ void CtintWalkerSubmatrixBase::doSubmatrixUpdate() { updateM(); } +template +void CtintWalkerSubmatrixBase::transformM() { + for (int s = 0; s < 2; ++s) { + for (int j = 0; j < M_[s].size().second; ++j) { + for (int i = 0; i < M_[s].size().first; ++i) { + const auto field_type = configuration_.getSector(s).getAuxFieldType(i); + const auto b = configuration_.getSector(s).getLeftB(i); + const Scalar f_i = -(f_[field_type][b] - 1); + M_[s](i, j) /= f_i; + } + } + } +} + template void CtintWalkerSubmatrixBase::mainSubmatrixProcess() { Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_); @@ -778,20 +794,6 @@ void CtintWalkerSubmatrixBase::recomputeGammaInv() { } } -template -void CtintWalkerSubmatrixBase::transformM() { - for (int s = 0; s < 2; ++s) { - for (int j = 0; j < M_[s].size().second; ++j) { - for (int i = 0; i < M_[s].size().first; ++i) { - const auto field_type = configuration_.getSector(s).getAuxFieldType(i); - const auto b = configuration_.getSector(s).getLeftB(i); - const Scalar f_i = -(f_[field_type][b] - 1); - M_[s](i, j) /= f_i; - } - } - } -} - template void CtintWalkerSubmatrixBase::computeInsertionMatrices( const std::vector& insertion_indices, const int s) { diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp index ecec506e0..40bdb92a4 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp @@ -83,6 +83,8 @@ class DMatrixBuilder { void computeG0(Matrix& G0, const Sector& configuration, const int n_init, const int n_max, const int which_section) const; + auto& getSiteDiff() { return site_diff_; } + #ifdef DCA_HAVE_GPU virtual void computeG0(linalg::Matrix& /*G0*/, const details::DeviceConfiguration& /*configuration*/, int /*n_init*/, diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh index b36133118..8da2658ac 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh @@ -96,6 +96,10 @@ protected: // Global instance to be used in the tp accumulation kernel. extern __device__ __constant__ G4Helper g4_helper; +extern __CONSTANT__ int* w_ex_indices_actual; +extern __CONSTANT__ int* k_ex_indices_actual; + + inline __device__ int G4Helper::addWex(const int w_idx, const int w_ex_idx) const { return w_idx + w_ex_indices_[w_ex_idx]; } diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh b/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh index 0dba00d07..83962ef68 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh @@ -39,12 +39,15 @@ public: // returns the index of -id __DEVICE__ inline int minus(int id) const; -private: + // private: int nc_; const int* add_matrix_; const int* sub_matrix_; }; +extern __CONSTANT__ int* cluster_add_matrix; +extern __CONSTANT__ int* cluster_sub_matrix; + // Global instance for real space and momentum clusters. 
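// A minimal sketch of how such a __constant__ pointer gets populated, assuming a device
// allocation `dev_add` that already holds the table (this mirrors what cluster_helper.cu
// does below with cudaMemcpyToSymbol):
//
//   int* dev_add = nullptr;
//   checkRC(cudaMalloc(&dev_add, nc * lda * sizeof(int)));
//   checkRC(cudaMemcpy(dev_add, host_add, nc * lda * sizeof(int), cudaMemcpyHostToDevice));
//   checkRC(cudaMemcpyToSymbol(cluster_add_matrix, &dev_add, sizeof(int*)));  // copies the pointer value, not the table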
extern __DEVICE__ __CONSTANT__ ClusterHelper cluster_real_helper; extern __DEVICE__ __CONSTANT__ ClusterHelper cluster_momentum_helper; diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh b/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh index 8f2b75355..bdc861b3b 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh @@ -44,7 +44,6 @@ public: private: static bool initialized_; - std::size_t subdm_step_[2]; }; diff --git a/include/dca/phys/parameters/mc_solver_parameters.hpp b/include/dca/phys/parameters/mc_solver_parameters.hpp index 595b946b5..c2440a50c 100644 --- a/include/dca/phys/parameters/mc_solver_parameters.hpp +++ b/include/dca/phys/parameters/mc_solver_parameters.hpp @@ -99,7 +99,7 @@ int McSolverParameters::getBufferSize(const Concurrency template void McSolverParameters::pack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) const { + int buffer_size, int& position) const { concurrency.pack(buffer, buffer_size, position, expansion_parameter_K_); concurrency.pack(buffer, buffer_size, position, initial_configuration_size_); concurrency.pack(buffer, buffer_size, position, initial_matrix_size_); @@ -110,7 +110,7 @@ void McSolverParameters::pack(const Concurrency& concur template void McSolverParameters::unpack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) { + int buffer_size, int& position) { concurrency.unpack(buffer, buffer_size, position, expansion_parameter_K_); concurrency.unpack(buffer, buffer_size, position, initial_configuration_size_); concurrency.unpack(buffer, buffer_size, position, initial_matrix_size_); @@ -207,16 +207,18 @@ int McSolverParameters::getBufferSize(const Concurre } template -void McSolverParameters::pack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) const { +void McSolverParameters::pack(const Concurrency& concurrency, + char* buffer, int buffer_size, + int& position) const { concurrency.pack(buffer, buffer_size, position, self_energy_tail_cutoff_); concurrency.pack(buffer, buffer_size, position, steps_per_sweep_); concurrency.pack(buffer, buffer_size, position, shifts_per_sweep_); } template -void McSolverParameters::unpack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) { +void McSolverParameters::unpack(const Concurrency& concurrency, + char* buffer, int buffer_size, + int& position) { concurrency.unpack(buffer, buffer_size, position, self_energy_tail_cutoff_); concurrency.unpack(buffer, buffer_size, position, steps_per_sweep_); concurrency.unpack(buffer, buffer_size, position, shifts_per_sweep_); @@ -252,6 +254,17 @@ void McSolverParameters::readWrite(ReaderOrWriter& r // Specialization for CT-INT template <> class McSolverParameters { +private: + /// code does not currently cover corner case of configuration size 0, as such this isn't a sensible default to use. 
+ int initial_configuration_size_ = 2; + double alpha_dd_pos_ = 0.501; + double alpha_dd_neg_ = 0; + double alpha_ndd_ = 1e-4; + bool adjust_alpha_dd_ = false; + double double_update_probability_ = 0; + bool all_sites_partnership_ = 0; + int max_submatrix_size_ = 1; + public: template int getBufferSize(const Concurrency& concurrency) const; @@ -298,16 +311,6 @@ class McSolverParameters { void setMaxSubmatrixSize(const int size) { max_submatrix_size_ = size; } - -private: - int initial_configuration_size_ = 0; - double alpha_dd_pos_ = 0.501; - double alpha_dd_neg_ = 0; - double alpha_ndd_ = 1e-4; - bool adjust_alpha_dd_ = false; - double double_update_probability_ = 0; - bool all_sites_partnership_ = 0; - int max_submatrix_size_ = 1; }; template @@ -327,7 +330,7 @@ int McSolverParameters::getBufferSize(const Concurrency template void McSolverParameters::pack(const Concurrency& concurrency, char* buffer, - const int buffer_size, int& position) const { + const int buffer_size, int& position) const { concurrency.pack(buffer, buffer_size, position, initial_configuration_size_); concurrency.pack(buffer, buffer_size, position, alpha_dd_pos_); concurrency.pack(buffer, buffer_size, position, alpha_dd_neg_); @@ -340,7 +343,7 @@ void McSolverParameters::pack(const Concurrency& concur template void McSolverParameters::unpack(const Concurrency& concurrency, char* buffer, - const int buffer_size, int& position) { + const int buffer_size, int& position) { concurrency.unpack(buffer, buffer_size, position, initial_configuration_size_); concurrency.unpack(buffer, buffer_size, position, alpha_dd_pos_); concurrency.unpack(buffer, buffer_size, position, alpha_dd_neg_); diff --git a/include/dca/phys/parameters/physics_parameters.hpp b/include/dca/phys/parameters/physics_parameters.hpp index 51f53cba4..12ff59554 100644 --- a/include/dca/phys/parameters/physics_parameters.hpp +++ b/include/dca/phys/parameters/physics_parameters.hpp @@ -57,6 +57,7 @@ class PhysicsParameters { return adjust_chemical_potential_; } + void set_beta(double beta) { beta_ = beta; } private: double beta_; double density_; diff --git a/include/dca/util/cuda2hip.h b/include/dca/util/cuda2hip.h index 4de6d8caa..256897d86 100644 --- a/include/dca/util/cuda2hip.h +++ b/include/dca/util/cuda2hip.h @@ -141,6 +141,7 @@ #define cudaIpcMemHandle_t hipIpcMemHandle_t #define cudaIpcMemLazyEnablePeerAccess hipIpcMemLazyEnablePeerAccess #define cudaIpcOpenMemHandle hipIpcOpenMemHandle +#define cudaGetSymbolSize hipGetSymbolSize #define cudaMalloc hipMalloc #define cudaMallocArray hipMallocArray #define cudaMallocManaged hipMallocManaged diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp index c188ee348..14365bbba 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp +++ b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp @@ -126,7 +126,7 @@ auto DMatrixBuilder::computeGamma(const int aux_spin_type, const in // Compute only the parts of G0 required at a given moment. (Re)Computing every element is not needed in most situations.
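// A sketch of that idea, assuming the configuration grew from n_init to n_max entries:
// only the new rows and the new columns of G0 have to be filled in,
//
//   for (int j = 0; j < n_max; ++j)
//     for (int i = (j < n_init ? n_init : 0); i < n_max; ++i)
//       G0(i, j) = /* interpolated g0 for vertices i and j */;
//
// (hypothetical loop; the which_section argument below selects which of these blocks is computed).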
template -void DMatrixBuilder::computeG0(Matrix& G0, const Sector& configuration, const int n_init, + void DMatrixBuilder::computeG0(Matrix& G0, const Sector& configuration, const int n_init, const int n_max, const int which_section) const { int b_i, b_j, r_i, r_j; Real tau_i, tau_j; diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp index 2b59fffa6..4f2b171f9 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp +++ b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp @@ -32,6 +32,8 @@ DMatrixBuilder::DMatrixBuilder(const G0Interpolation G0, const int b_j = config.getRightB(j); const auto tau_j = config.getTau(j); + const int label = dca::phys::solver::details::solver_helper.index(b_i, b_j, config.getLeftR(i), config.getRightR(j)); diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu b/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu index ef354ba10..a116287c6 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu +++ b/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu @@ -34,6 +34,7 @@ using dca::util::castGPUType; template __global__ void setRightSectorToIdKernel(Scalar* m, const int ldm, const int n0, const int n_max) { const int i = threadIdx.x + blockDim.x * blockIdx.x; + // this preserves the behavior where we skip the lower left sector. const int j = threadIdx.y + blockDim.y * blockIdx.y + n0; if (i >= n_max || j >= n_max) @@ -43,8 +44,9 @@ __global__ void setRightSectorToIdKernel(Scalar* m, const int ldm, const int n0, Scalar the_one{}; the_one += 1.0; Scalar the_zero{}; + //assert(the_one == (the_zero -= 1.0)); - m[i + ldm * j] = (i == j) ? the_one : the_zero; + m[i + ldm * j] = (i == j) ? the_one : the_zero; } template @@ -79,18 +81,18 @@ __global__ void computeGLeftKernel(MatrixView G, const MatrixView -void computeGLeft(MatrixView& G, const MatrixView& M, const Real* f, +void computeGLeft(MatrixView& G_dev, const MatrixView& M_dev, const Real* f_dev, int n_init, cudaStream_t stream) { if (n_init == 0) return; - const int n = G.nrRows(); + const int n = G_dev.nrRows(); constexpr int thread_j = 4; constexpr int thread_i = 64; dim3 threads(thread_i, thread_j); dim3 blocks(std::max(n / (10 * thread_i), 1), dca::util::ceilDiv(n_init, thread_j)); - computeGLeftKernel<<>>(G, M, f, n_init); + computeGLeftKernel<<>>(G_dev, M_dev, f_dev, n_init); } template void computeGLeft(MatrixView&, const MatrixView&, const float*, diff --git a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cu b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cu index 62dc9f69c..73613c596 100644 --- a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cu +++ b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cu @@ -10,6 +10,7 @@ // This file implements G4Helper::set. 
#include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh" +#include "dca/platform/dca_gpu.h" #include #include @@ -27,6 +28,8 @@ namespace details { // dca::phys::solver::accumulator::details:: __device__ __constant__ G4Helper g4_helper; +__CONSTANT__ int* w_ex_indices_actual; +__CONSTANT__ int* k_ex_indices_actual; void G4Helper::set(int nb, int nk, int nw, const std::vector& delta_k, const std::vector& delta_w, const int extension_offset, const int* add_k, int lda, const int* sub_k, @@ -72,7 +75,9 @@ void G4Helper::set(int nb, int nk, int nw, const std::vector& delta_k, #ifndef NDEBUG cudaMalloc(&host_helper.bad_indicies_, sizeof(int) * 1024); #endif - cudaMemcpyToSymbol(g4_helper, &host_helper, sizeof(G4Helper)); + checkRC(cudaMemcpyToSymbol(w_ex_indices_actual, &host_helper.w_ex_indices_, sizeof(int*))); + checkRC(cudaMemcpyToSymbol(k_ex_indices_actual, &host_helper.k_ex_indices_, sizeof(int*))); + checkRC(cudaMemcpyToSymbol(g4_helper, &host_helper, sizeof(G4Helper))); } __device__ bool G4Helper::extendGIndices(int& k1, int& k2, int& w1, int& w2) const { diff --git a/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu b/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu index f6fa66987..4b3debc5f 100644 --- a/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu +++ b/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu @@ -13,6 +13,7 @@ #include #include +#include #include "dca/platform/dca_gpu.h" #include "dca/linalg/util/allocators/vectors_typedefs.hpp" @@ -24,6 +25,21 @@ namespace details { __device__ __constant__ ClusterHelper cluster_real_helper; __device__ __constant__ ClusterHelper cluster_momentum_helper; +__CONSTANT__ int* cluster_add_matrix; +__CONSTANT__ int* cluster_sub_matrix; + + __global__ void checkClusterHelper(int nc, int lds) { + const int i_t = threadIdx.x + blockDim.x * blockIdx.x; + const int j = threadIdx.y + blockDim.y * blockIdx.y; + if (i_t == 0 && j == 0) { + for (int ch_i = 0; ch_i < nc; ++ch_i) { + for (int ch_j = 0; ch_j < lds; ++ch_j) { + printf("%d ", cluster_real_helper.sub_matrix_[ch_i * lds + ch_j]); + } + printf("\n"); + } + } +} void ClusterHelper::set(int nc, const int* add, int lda, const int* sub, int lds, bool momentum) { static std::array flags; @@ -44,13 +60,29 @@ void ClusterHelper::set(int nc, const int* add, int lda, const int* sub, int lds compact_transfer(add, lda, const_cast(&host_helper.add_matrix_)); compact_transfer(sub, lds, const_cast(&host_helper.sub_matrix_)); + // The logic here for CUDA is transfers from unpinned memory only block for the copy from host + // memory to a DMA buffer. This is on the default queue here and so a stream synch will not + // work to insure the ClusterHelper copy is actually complete. + cudaDeviceSynchronize(); if (momentum) { - cudaMemcpyToSymbol(cluster_momentum_helper, &host_helper, sizeof(ClusterHelper)); + + cudaMemcpyToSymbol(cluster_momentum_helper, &host_helper, sizeof(ClusterHelper), + cudaMemcpyHostToDevice); } else { - cudaMemcpyToSymbol(cluster_real_helper, &host_helper, sizeof(ClusterHelper)); + // In debug on sdgx-2 for CTINT I see know evidence this actually works, it appears not to. 
+ size_t cluster_helper_size; + checkRC(cudaGetSymbolSize(&cluster_helper_size, cluster_real_helper)); + assert(cluster_helper_size == sizeof(ClusterHelper)); + checkRC(cudaMemcpyToSymbol(cluster_add_matrix, &host_helper.add_matrix_, sizeof(int*))); + checkRC(cudaMemcpyToSymbol(cluster_sub_matrix, &host_helper.sub_matrix_, sizeof(int*))); + checkRC(cudaMemcpyToSymbol(cluster_real_helper, &host_helper, cluster_helper_size)); + cudaDeviceSynchronize(); } +#ifndef NDEBUG + checkClusterHelper<<<1, 1, 0, 0>>>(nc, lds); +#endif }); } diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp index 1f727cd4e..fd837332a 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp @@ -30,6 +30,9 @@ using McOptions = MockMcOptions; #include "walker_wrapper_submatrix.hpp" #include "dca/linalg/matrixop.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp" +#include "dca/util/to_string.hpp" + +using namespace dca::addt_str_oper; constexpr char input_name[] = DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json"; @@ -59,12 +62,12 @@ using namespace dca::phys::solver; template using CtintWalkerSubmatrixGpuTest = CtINTWalkerSubmatrixGPUTestT; -template +template using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder; // Currently testing float isn't really possible due to the way the Scalar type is // carried through from mc_options. See test_setup.hpp PD -using ScalarTypes = ::testing::Types; //double, +using ScalarTypes = ::testing::Types; // double, TYPED_TEST_CASE(CtintWalkerSubmatrixGpuTest, ScalarTypes); // Compare the submatrix update with a direct computation of the M matrix, and compare the @@ -77,9 +80,9 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { using Matrix = typename Walker::Matrix; using MatrixPair = std::array; using SbmWalkerCpu = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; + testing::phys::solver::ctint::WalkerWrapperSubmatrix; using SbmWalkerGpu = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; + testing::phys::solver::ctint::WalkerWrapperSubmatrix; std::vector setup_rngs{0., 0.00, 0.9, 0.5, 0.01, 0, 0.75, 0.02, 0, 0.6, 0.03, 1, 0.99, 0.04, 0.99}; @@ -97,15 +100,14 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { G0Interpolation g0_gpu(g0_func_gpu); typename CtintWalkerSubmatrixGpuTest::G0Setup::LabelDomain label_dmn; - constexpr int bands = dca::testing::LatticeBilayer::BANDS; DMatrixBuilder d_matrix_cpu(g0_cpu, bands, RDmn()); + d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false); // cpu_parameters.adjustAlphaDd()); SbmWalkerCpu::setInteractionVertices(cpu_data, cpu_parameters); - d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false); //cpu_parameters.adjustAlphaDd()); DMatrixBuilder d_matrix_gpu(g0_gpu, bands, RDmn()); + d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false); // gpu_parameters.adjustAlphaDd()); SbmWalkerGpu::setInteractionVertices(cpu_data, gpu_parameters); - d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false); //gpu_parameters.adjustAlphaDd()); // ************************************ // Test vertex insertion / removal **** @@ -128,10 +130,29 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { 0, 0.99, 0.4, 0.2, -1, // Insertion }; - for (const int initial_size : 
std::array{0, 5}) { + constexpr Scalar tolerance = 10e-12; + auto compareSubMatrix = [tolerance](auto& mat_cpu, auto& mat_gpu, const std::string& mat_tag) { + int fail = 0; + for (int s = 0; s < 2; ++s) { + std::cout << mat_tag << "_cpu[" << s << "] size: " << mat_cpu[s].size() << '\n'; + std::cout << mat_tag << "_gpu[" << s << "] size: " << mat_gpu[s].size() << '\n'; + for (int j = 0; j < mat_cpu[s].size().second; ++j) + for (int i = 0; i < mat_cpu[s].size().first; ++i) { + if (std::abs(mat_cpu[s](i, j) - mat_gpu[s](i, j)) > tolerance) { + std::cout << mat_tag << " element: " << s << "," << i << "," << j << " cpu(" + << mat_cpu[s](i, j) << ") - gpu(" << mat_gpu[s](i, j) << ") > " << tolerance + << '\n'; + ++fail; + } + } + } + return fail; + }; + + for (const int initial_size : std::array{2}) { cpu_parameters.setInitialConfigurationSize(initial_size); gpu_parameters.setInitialConfigurationSize(initial_size); - for (int steps = 1; steps <= 8; ++steps) { + for (int steps = 2; steps <= 8; ++steps) { cpu_rng.setNewValues(setup_rngs); SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng, d_matrix_cpu); gpu_rng.setNewValues(setup_rngs); @@ -139,15 +160,12 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { cpu_rng.setNewValues(rng_vals); walker_cpu.doStep(steps); + gpu_rng.setNewValues(rng_vals); walker_gpu.doStep(steps); - constexpr Scalar tolerance = std::numeric_limits::epsilon() * 100; - - auto M_cpu = walker_cpu.getM(); - auto M_gpu = walker_gpu.getM(); - for (int s = 0; s < 2; ++s) - EXPECT_TRUE(dca::linalg::matrixop::areNear(M_cpu[s], M_gpu[s], tolerance)); + // doSweep does this + walker_gpu.uploadConfiguration(); // The final configuration is the same. const auto& config1 = walker_cpu.getWalkerConfiguration(); @@ -156,6 +174,211 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { for (int i = 0; i < config1.size(); ++i) EXPECT_EQ(config1[i], config2[i]); EXPECT_EQ(walker_cpu.get_sign(), walker_gpu.get_sign()); + + auto M_cpu = walker_cpu.getM(); + auto M_gpu = walker_gpu.getM(); + + auto fail = compareSubMatrix(M_cpu, M_gpu, "M"); + EXPECT_FALSE(fail); } } } + +// This test still fails if its run second in the batch. 
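// If the fixed absolute cutoff in compareSubMatrix proves too tight for float, a relative
// comparison is a common alternative; a minimal sketch with a hypothetical helper (not used
// by these tests):
//
//   auto nearRel = [](double a, double b, double rtol) {
//     return std::abs(a - b) <= rtol * std::max(1.0, std::max(std::abs(a), std::abs(b)));
//   };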
+ +TYPED_TEST(CtintWalkerSubmatrixGpuTest, fineGrained) { + using Scalar = TypeParam; + using Parameters = typename CtintWalkerSubmatrixGpuTest::G0Setup::Parameters; + using Walker = testing::phys::solver::ctint::WalkerWrapper; + using Matrix = typename Walker::Matrix; + using MatrixPair = std::array; + using SbmWalkerCpu = + testing::phys::solver::ctint::WalkerWrapperSubmatrix; + using SbmWalkerGpu = + testing::phys::solver::ctint::WalkerWrapperSubmatrix; + + std::vector setup_rngs{0., 0.00, 0.9, 0.5, 0.01, 0, 0.75, 0.02, + 0, 0.6, 0.03, 1, 0.99, 0.04, 0.99}; + typename CtintWalkerSubmatrixGpuTest::G0Setup::RngType cpu_rng(setup_rngs); + typename CtintWalkerSubmatrixGpuTest::G0Setup::RngType gpu_rng(setup_rngs); + + auto& cpu_data = *this->host_setup.data_; + auto& gpu_data = *this->gpu_setup.data_; + auto& cpu_parameters = this->host_setup.parameters_; + auto& gpu_parameters = this->gpu_setup.parameters_; + + const auto g0_func_cpu = dca::phys::solver::ctint::details::shrinkG0(cpu_data.G0_r_t); + G0Interpolation g0_cpu(g0_func_cpu); + const auto g0_func_gpu = dca::phys::solver::ctint::details::shrinkG0(gpu_data.G0_r_t); + G0Interpolation g0_gpu(g0_func_gpu); + typename CtintWalkerSubmatrixGpuTest::G0Setup::LabelDomain label_dmn; + + constexpr int bands = dca::testing::LatticeBilayer::BANDS; + + DMatrixBuilder d_matrix_cpu(g0_cpu, bands, RDmn()); + d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false); // cpu_parameters.adjustAlphaDd()); + SbmWalkerCpu::setInteractionVertices(cpu_data, cpu_parameters); + DMatrixBuilder d_matrix_gpu(g0_gpu, bands, RDmn()); + d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false); // gpu_parameters.adjustAlphaDd()); + SbmWalkerGpu::setInteractionVertices(cpu_data, gpu_parameters); + + // ************************************ + // Test vertex insertion / removal **** + // ************************************ + // Set rng values. + // + // Insertion, vertex_id, tau, aux_spin, acceptance_rng + // Removal, vertex_id, acceptance_rng + // ... + // Note: if acceptance_rng <= 0 the move is always accepted, if it is > 1 the move is always + // rejected. + const std::vector rng_vals{ + 0, 0, 0.1, 0.8, -1, // Insertion. + 0, 0.99, 0.2, 0.8, -1, // Insertion. + 0, 0, 0.3, 0.8, 2, // Insertion. Rejected. + 1, 0, -1, // Remove pre-existing. + 1, 0.99, -1, // Remove recently inserted. + 1, 0.99, 2, // Remove recently inserted. Rejected + 1, 0, 2, // Remove . 
+      0, 0.99, 0.4, 0.2, -1,  // Insertion.
+  };
+
+  int initial_size = 2;
+
+  cpu_parameters.setInitialConfigurationSize(initial_size);
+  gpu_parameters.setInitialConfigurationSize(initial_size);
+  int steps = 4;
+
+  cpu_rng.setNewValues(setup_rngs);
+  SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng, d_matrix_cpu);
+  gpu_rng.setNewValues(setup_rngs);
+  SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng, d_matrix_gpu);
+
+  constexpr Scalar tolerance = 10e-12;
+
+  cpu_rng.setNewValues(rng_vals);
+  walker_cpu.generateDelayedMoves(steps);
+  walker_cpu.computeMInit();
+
+  walker_gpu.uploadConfiguration();
+  gpu_rng.setNewValues(rng_vals);
+  walker_gpu.generateDelayedMoves(steps);
+  walker_gpu.uploadConfiguration();
+  walker_gpu.computeMInit();
+
+  // walker_gpu.computeGInit();
+  auto M_cpu = walker_cpu.getRawM();
+  auto M_gpu = walker_gpu.getRawM();
+
+  std::cout << "Minit cpu size " << M_cpu[0].size().first << "," << M_cpu[0].size().second
+            << " : " << M_cpu[1].size().first << "," << M_cpu[1].size().second << '\n';
+  std::cout << "Minit gpu size " << M_gpu[0].size().first << "," << M_gpu[0].size().second
+            << " : " << M_gpu[1].size().first << "," << M_gpu[1].size().second << '\n';
+
+  std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n';
+  std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n';
+  std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n';
+  std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n';
+
+  M_cpu[0].print();
+  M_cpu[1].print();
+  M_gpu[0].print();
+  M_gpu[1].print();
+
+  auto compareSubMatrix = [tolerance](auto& mat_cpu, auto& mat_gpu, const std::string& mat_tag) {
+    int fail = 0;
+    for (int s = 0; s < 2; ++s) {
+      std::cout << mat_tag << "_cpu[" << s << "] size: " << mat_cpu[s].size() << '\n';
+      std::cout << mat_tag << "_gpu[" << s << "] size: " << mat_gpu[s].size() << '\n';
+      for (int j = 0; j < mat_cpu[s].size().second; ++j)
+        for (int i = 0; i < mat_cpu[s].size().first; ++i) {
+          if (std::abs(mat_cpu[s](i, j) - mat_gpu[s](i, j)) > tolerance) {
+            std::cout << mat_tag << " element: " << s << "," << i << "," << j << " cpu("
+                      << mat_cpu[s](i, j) << ") - gpu(" << mat_gpu[s](i, j) << ") > " << tolerance
+                      << '\n';
+            ++fail;
+          }
+        }
+    }
+    return fail;
+  };
+
+  auto fail = compareSubMatrix(M_cpu, M_gpu, "M");
+  EXPECT_FALSE(fail);
+
+  walker_cpu.computeGInit();
+  walker_gpu.computeGInit();
+  walker_gpu.synchronize();
+
+  auto G_cpu = walker_cpu.getRawG();
+  auto G_gpu = walker_gpu.getRawG();
+
+  fail = compareSubMatrix(G_cpu, G_gpu, "G");
+  EXPECT_FALSE(fail);
+
+  walker_cpu.mainSubmatrixProcess();
+  walker_gpu.mainSubmatrixProcess();
+
+  walker_cpu.updateM();
+  walker_gpu.updateM();
+
+  M_cpu = walker_cpu.getRawM();
+  M_gpu = walker_gpu.getRawM();
+
+  std::cout << "M post update cpu size " << M_cpu[0].size().first << "," << M_cpu[0].size().second
+            << " : " << M_cpu[1].size().first << "," << M_cpu[1].size().second << '\n';
+  std::cout << "M post update gpu size " << M_gpu[0].size().first << "," << M_gpu[0].size().second
+            << " : " << M_gpu[1].size().first << "," << M_gpu[1].size().second << '\n';
+
+  std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n';
+  std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n';
+  std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n';
+  std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n';
+
+  M_cpu[0].print();
+  M_cpu[1].print();
+  M_gpu[0].print();
+  M_gpu[1].print();
+
+  fail = compareSubMatrix(M_cpu, M_gpu, "updated_M");
+  EXPECT_FALSE(fail);
+
+  // The final configuration is the same.
+  const auto& config1 = walker_cpu.getWalkerConfiguration();
+  const auto& config2 = walker_gpu.getWalkerConfiguration();
+  ASSERT_EQ(config1.size(), config2.size());
+  for (int i = 0; i < config1.size(); ++i)
+    EXPECT_EQ(config1[i], config2[i]);
+  EXPECT_EQ(walker_cpu.get_sign(), walker_gpu.get_sign());
+
+  walker_gpu.uploadConfiguration();
+
+  auto comp_M_cpu = walker_cpu.getM();
+  auto comp_M_gpu = walker_gpu.getM();
+
+  std::cout << "computed M cpu size " << comp_M_cpu[0].size().first << ","
+            << comp_M_cpu[0].size().second << " : " << comp_M_cpu[1].size().first << ","
+            << comp_M_cpu[1].size().second << '\n';
+  std::cout << "computed M gpu size " << comp_M_gpu[0].size().first << ","
+            << comp_M_gpu[0].size().second << " : " << comp_M_gpu[1].size().first << ","
+            << comp_M_gpu[1].size().second << '\n';
+
+  std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n';
+  std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n';
+  std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n';
+  std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n';
+
+  comp_M_cpu[0].print();
+  comp_M_cpu[1].print();
+  comp_M_gpu[0].print();
+  comp_M_gpu[1].print();
+
+  fail = compareSubMatrix(comp_M_cpu, comp_M_gpu, "comp_M");
+  EXPECT_FALSE(fail);
+}
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json b/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json
index a6a5a4b05..88a9aeed0 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json
@@ -9,16 +9,18 @@
 
     "physics" :
     {
-        "beta" : 2,
+        "beta" : 4,
         "chemical-potential" : 0
     },
 
     "bilayer-Hubbard-model":
     {
-        "t" : 0,
-        "U" : 0,
-        "V" : 0,
-        "V-prime" : 2
+        "t" : 1.0,
+        "t_prime" : 0.1,
+        "t_perp" : 0.01,
+        "U" : 1.0,
+        "V" : 1.0,
+        "V-prime" : 2.0
     },
 
     "Hund-model":
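The scripted values consumed by the fineGrained test above follow the layout documented in its rng_vals comment (move type, vertex id, tau, aux spin, acceptance value). A minimal replay RNG in that spirit, a hypothetical stand-in rather than the actual G0Setup::RngType used by the tests, just pops preloaded values in order:

    // Hypothetical replay RNG sketch: operator() returns the scripted values in
    // order, so a move sequence can be encoded exactly as in rng_vals above.
    #include <cstddef>
    #include <stdexcept>
    #include <utility>
    #include <vector>

    class ReplayRng {
    public:
      explicit ReplayRng(std::vector<double> values) : values_(std::move(values)) {}

      void setNewValues(const std::vector<double>& values) {
        values_ = values;
        next_ = 0;
      }

      double operator()() {
        if (next_ >= values_.size())
          throw std::out_of_range("ReplayRng: scripted values exhausted");
        return values_[next_++];
      }

    private:
      std::vector<double> values_;
      std::size_t next_ = 0;
    };

Because an acceptance value <= 0 always accepts and a value > 1 always rejects, such a script pins down the walker's trajectory exactly, which is what makes the step-by-step CPU/GPU comparison deterministic.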
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
index 2d5dc66a8..4c451a525 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
@@ -50,7 +50,7 @@ struct WalkerSelector {
 #endif  // DCA_HAVE_GPU
 
 using namespace dca::phys::solver::ctint;
-  template <typename SCALAR, dca::linalg::DeviceType device_t>
+template <typename SCALAR, dca::linalg::DeviceType device_t>
 struct WalkerWrapperSubmatrix : public WalkerSelector<SCALAR, device_t>::type {
   using BaseClass = typename WalkerSelector<SCALAR, device_t>::type;
   using Scalar = SCALAR;
@@ -59,31 +59,19 @@ struct WalkerWrapperSubmatrix : public WalkerSelector<SCALAR, device_t>::type {
-  WalkerWrapperSubmatrix(const Parameters& parameters_ref, Rng& rng_ref,
-                         DMatrixBuilder<device_t, Scalar>& d_matrix_builder)
-      : BaseClass(parameters_ref, dca::phys::DcaData<Parameters>(parameters_ref), rng_ref, d_matrix_builder, 0),
+  WalkerWrapperSubmatrix(/*const*/ Parameters& parameters_ref, Rng& rng_ref,
+                         DMatrixBuilder<device_t, Scalar>& d_matrix_builder)
+      : BaseClass(parameters_ref, dca::phys::DcaData<Parameters>(parameters_ref), rng_ref,
+                  d_matrix_builder, 0),
         streams_(3) {
     BaseClass::initialize(0);
-
-  }
-
-  // This purposefully shadows
-  void doStep(const int n_steps_to_delay) {
-    BaseClass::SubmatrixBase::doStep(n_steps_to_delay);
   }
 
   using Matrix = dca::linalg::Matrix<Scalar, dca::linalg::CPU>;
   using MatrixPair = std::array<Matrix, 2>;
 
   MatrixPair getM() {
-    std::array<dca::linalg::Matrix<Scalar, device_t>, 2> M;
-
-    BaseClass::computeM(M);
-#ifdef DCA_HAVE_GPU
-    checkRC(cudaDeviceSynchronize());
-#endif
-
-    std::array<Matrix, 2> M_copy{M[0], M[1]};
-    return M_copy;
+    return BaseClass::getM();
   }
 
   const auto& getWalkerConfiguration() const {
@@ -94,6 +82,41 @@ struct WalkerWrapperSubmatrix : public WalkerSelector<SCALAR, device_t>::type {
   std::vector<dca::linalg::util::GpuStream> streams_;
 };
diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json
index 2d27da0f8..6b997974b 100644
--- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json
+++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json
@@ -9,14 +9,18 @@
 
     "physics" :
    {
-        "beta" : 2,
+        "beta" : 4,
         "chemical-potential" : 0
     },
 
     "bilayer-Hubbard-model":
     {
-        "t" : 1,
-        "U" : 2
+        "t" : 1.0,
+        "t_prime" : 0.1,
+        "t_perp" : 0.01,
+        "U" : 1.0,
+        "V" : 1.0,
+        "V-prime" : 2.0
     },
 
     "Hund-model":
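A closing note on the tolerance change: the doSteps test used to scale its bound with the scalar type, std::numeric_limits<Scalar>::epsilon() * 100, while the rewritten checks use the fixed constant 10e-12 (that is, 1e-11). A small standalone sketch, assuming nothing beyond the standard library, shows what that switch means for each precision:

    // Prints the old type-dependent tolerance next to the new fixed one.
    #include <iostream>
    #include <limits>

    template <typename Scalar>
    void printTolerances(const char* name) {
      std::cout << name << ": epsilon * 100 = " << std::numeric_limits<Scalar>::epsilon() * 100
                << ", fixed = " << 10e-12 << '\n';
    }

    int main() {
      printTolerances<float>("float");    // ~1.19e-5, far looser than 1e-11
      printTolerances<double>("double");  // ~2.22e-14, tighter than 1e-11
    }

For double the fixed 1e-11 bound is somewhat looser than the old epsilon-based one; for float it is several orders of magnitude below what single precision can resolve, so the fixed constant is only appropriate if the typed tests run in double precision.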