From 34ee434deabcfd460efa18dca7236a96219cb96a Mon Sep 17 00:00:00 2001
From: Peter Doak <doakpw@ornl.gov>
Date: Mon, 11 Aug 2025 10:13:55 -0400
Subject: [PATCH 1/3] removing assumption spin parallelism from blas

---
 include/dca/linalg/blas/use_device.hpp                      | 6 ++++++
 include/dca/linalg/matrixop.hpp                             | 2 ++
 include/dca/linalg/util/stream_container.hpp                | 2 +-
 .../walker/tools/g0_interpolation/g0_interpolation_gpu.hpp  | 5 +----
 .../ctint/structs/device_configuration_manager.hpp          | 2 +-
 5 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/include/dca/linalg/blas/use_device.hpp b/include/dca/linalg/blas/use_device.hpp
index 72438de5c..e375d3b8a 100644
--- a/include/dca/linalg/blas/use_device.hpp
+++ b/include/dca/linalg/blas/use_device.hpp
@@ -85,24 +85,28 @@ struct UseDevice<GPU> {
   template <typename ScalarType>
   inline static void axpy(int n, ScalarType alpha, const ScalarType* x, int incx, ScalarType* y,
                           int incy, int thread_id, int stream_id) {
+    assert(stream_id == 0);
     cublas::axpy(util::getHandle(thread_id, stream_id), n, alpha, x, incx, y, incy);
   }
 
   template <typename ScalarType>
   inline static void copy(int n, const ScalarType* x, int incx, ScalarType* y, int incy,
                           int thread_id, int stream_id) {
+    assert(stream_id == 0);
     cublas::copy(util::getHandle(thread_id, stream_id), n, x, incx, y, incy);
   }
 
   template <typename ScalarType>
   inline static void scal(int n, ScalarType alpha, ScalarType* x, int incx, int thread_id,
                           int stream_id) {
+    assert(stream_id == 0);
     cublas::scal(util::getHandle(thread_id, stream_id), n, alpha, x, incx);
   }
 
   template <typename ScalarType>
   inline static void swap(int n, ScalarType* x, int incx, ScalarType* y, int incy, int thread_id,
                           int stream_id) {
+    assert(stream_id == 0);
     cublas::swap(util::getHandle(thread_id, stream_id), n, x, incx, y, incy);
   }
 
@@ -112,6 +116,7 @@ struct UseDevice<GPU> {
                           ScalarType alpha, const ScalarType* a, int lda, const ScalarType* b,
                           int ldb, ScalarType beta, ScalarType* c, int ldc, int thread_id,
                           int stream_id) {
+    assert(stream_id == 0);
     cublas::gemm(util::getHandle(thread_id, stream_id), transa, transb, m, n, k, alpha, a, lda, b,
                  ldb, beta, c, ldc);
   }
@@ -120,6 +125,7 @@ struct UseDevice<GPU> {
   inline static void trsm(const char* side, const char* uplo, const char* transa, const char* diag,
                           int m, int n, ScalarType alpha, const ScalarType* a, int lda,
                           ScalarType* b, int ldb, int thread_id, int stream_id) {
+    assert(stream_id == 0);
     cublas::trsm(util::getHandle(thread_id, stream_id), side, uplo, transa, diag, m, n, alpha, a,
                  lda, b, ldb);
   }
diff --git a/include/dca/linalg/matrixop.hpp b/include/dca/linalg/matrixop.hpp
index 87c650d25..cc5d8c32e 100644
--- a/include/dca/linalg/matrixop.hpp
+++ b/include/dca/linalg/matrixop.hpp
@@ -679,6 +679,7 @@ template <typename Scalar, DeviceType device_name, template <typename, DeviceTyp
 void gemm(char transa, char transb, Scalar alpha, const MatrixA<Scalar, device_name, ALLOC1>& a,
           const MatrixB<Scalar, device_name, ALLOC2>& b, Scalar beta,
           MatrixC<Scalar, device_name, ALLOC3>& c, int thread_id = 0, int stream_id = 0) {
+  assert(stream_id == 0);
   int m = c.nrRows();
   int n = c.nrCols();
   int k;
@@ -719,6 +720,7 @@ template <typename Scalar, DeviceType device_name, class ALLOC,
 inline void gemm(const MatrixA<Scalar, device_name, ALLOC>& a,
                  const MatrixB<Scalar, device_name, ALLOC>& b,
                  MatrixC<Scalar, device_name, ALLOC>& c, int thread_id = 0, int stream_id = 0) {
+  assert(stream_id == 0);
   gemm<Scalar, device_name>('N', 'N', 1., a, b, 0., c, thread_id, stream_id);
 }
 
diff --git a/include/dca/linalg/util/stream_container.hpp b/include/dca/linalg/util/stream_container.hpp
index 6d97871d5..7b59796fb 100644
--- a/include/dca/linalg/util/stream_container.hpp
+++ b/include/dca/linalg/util/stream_container.hpp
@@ -61,7 +61,7 @@ class StreamContainer {
   }
 
 private:
-  constexpr static std::size_t streams_per_thread_ = 2;
+  constexpr static std::size_t streams_per_thread_ = 1;
   std::vector<std::array<GpuStream, streams_per_thread_>> streams_;
 };
 
diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.hpp
index ef2d379fe..add1f3b0f 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.hpp
@@ -101,8 +101,6 @@ class G0Interpolation<dca::linalg::GPU, Parameters> : public G0InterpolationBase
   linalg::MultiVector<linalg::GPU, int, int, Real> g0_labels_gpu_;
 
   using Base::beta;
-
-  linalg::util::GpuEvent config_copied_;
 };
 
 template <typename Parameters>
@@ -239,7 +237,7 @@ void G0Interpolation<dca::linalg::GPU, Parameters>::uploadConfiguration(
     const Configuration& configuration) {
   const int configuration_size = configuration.size();
 
-  config_copied_.block();
+  linalg::util::getStream(thread_id, 0).sync();
   g0_labels_cpu_.resizeNoCopy(configuration_size);
   g0_labels_gpu_.resizeNoCopy(configuration_size);
 
@@ -255,7 +253,6 @@ void G0Interpolation<dca::linalg::GPU, Parameters>::uploadConfiguration(
 
   const auto& stream = linalg::util::getStream(thread_id, stream_id);
   g0_labels_gpu_.setAsync(g0_labels_cpu_, stream);
-  config_copied_.record(stream);
 }
 
 }  // namespace ctaux
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp
index 5cb320da2..5bd9de995 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp
@@ -50,7 +50,7 @@ class DeviceConfigurationManager {
 void DeviceConfigurationManager::upload(const SolverConfiguration& config, int thread_id, int spin) {
   assert(spin >= 0 and spin < 2);
   const auto& entries = config.getSector(spin).entries_;
-  device_entries_[spin].setAsync(entries, linalg::util::getStream(thread_id, spin));
+  device_entries_[spin].setAsync(entries, linalg::util::getStream(thread_id, 0));
   device_pointers_[spin].data = device_entries_[spin].ptr();
 }
 

From 0cf2d69416c2aedde21ff5b77855c3adcc816a30 Mon Sep 17 00:00:00 2001
From: Peter Doak <doakpw@ornl.gov>
Date: Thu, 14 Aug 2025 18:47:25 -0400
Subject: [PATCH 2/3] following through removing spin concurrency

---
 build-aux/frontier_build.sh                   |   2 +
 .../ctint/walker/ctint_walker_base.hpp        |  55 ++--
 .../walker/ctint_walker_cpu_submatrix.hpp     | 142 ++++++++---
 .../walker/ctint_walker_gpu_submatrix.hpp     | 205 +++++++++++----
 .../walker/ctint_walker_submatrix_base.hpp    |  39 +--
 .../interpolation/g0_interpolation_gpu.hpp    |   5 +-
 include/dca/phys/parameters/parameters.hpp    |  22 +-
 .../ctint/walker/walker_kernels.cu            |   8 +-
 test/unit/linalg/vector_cpu_gpu_test.cpp      |   8 +-
 test/unit/linalg/vector_cpu_test.cpp          |   4 +-
 test/unit/linalg/vector_gpu_test.cpp          |   4 +-
 .../ct_int_walker_multinsertion_test.cpp      | 207 +++++++++------
 ..._int_walker_submatrix_gpu_complex_test.cpp |   4 +-
 .../ct_int_walker_submatrix_gpu_test.cpp      | 235 +++++++++++++++++-
 .../walker/ct_int_walker_submatrix_test.cpp   |   8 +-
 .../ctint/walker/ct_int_walker_test.cpp       |   2 +-
 .../ctint/walker/walker_wrapper.hpp           |   4 +-
 .../ctint/walker/walker_wrapper_submatrix.hpp |  40 ++-
 .../dca_step/cluster_solver/test_setup.hpp    |  43 ++++
 19 files changed, 793 insertions(+), 244 deletions(-)

diff --git a/build-aux/frontier_build.sh b/build-aux/frontier_build.sh
index 81baf4666..206668793 100644
--- a/build-aux/frontier_build.sh
+++ b/build-aux/frontier_build.sh
@@ -10,11 +10,13 @@ cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
       -DTEST_RUNNER="srun" \
       -DGPU_TARGETS=gfx90a \
       -DAMDGPU_TARGETS=gfx90a \
+      -DCMAKE_BUILD_TYPE=Release \
       -DCMAKE_C_COMPILER=mpicc \
       -DCMAKE_CXX_COMPILER=mpic++ \
       -DCMAKE_HIP_COMPILER=/opt/rocm-6.3.1/llvm/bin/clang++ \
       -DCMAKE_INSTALL_PREFIX=$INST \
       -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
       -DCMAKE_HIP_LINK_FLAGS=--hip-link \
+      -DCMAKE_EXPORT_COMPILE_COMMANDSS=1 \
       -GNinja \
       ..
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp
index 70cfb39c4..9ed7e52eb 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp
@@ -92,10 +92,10 @@ class CtintWalkerBase {
   virtual void doStep() = 0;
 
   virtual void doSweep() = 0;
-  
-  template<linalg::DeviceType DEVICE>
+
+  template <linalg::DeviceType DEVICE>
   void setMFromConfigImpl(DMatrixBuilder<DEVICE, Scalar>& d_matrix_builder);
-  
+
   bool is_thermalized() const {
     return thermalized_;
   }
@@ -165,17 +165,15 @@ class CtintWalkerBase {
     return flop;
   }
 
-  const auto& get_stream(int s) const {
-    assert(s >= 0 && s < 2);
-    return *streams_[s];
+  const auto& get_stream() const {
+    return stream_;
   }
 
   static void sumConcurrency(const Concurrency&) {}
 
   void writeAlphas() const;
 
-  static void setInteractionVertices(const Data& data,
-			      const Parameters& parameters);
+  static void setInteractionVertices(const Data& data, const Parameters& parameters);
 
 protected:
   // typedefs
@@ -191,7 +189,7 @@ class CtintWalkerBase {
   const Concurrency& concurrency_;
 
   const int thread_id_;
-  std::array<linalg::util::GpuStream*, 2> streams_;
+  linalg::util::GpuStream* stream_;
 
   Rng& rng_;
   SolverConfiguration configuration_;
@@ -230,26 +228,18 @@ class CtintWalkerBase {
 };
 
 template <class Parameters, DistType DIST>
-CtintWalkerBase<Parameters, DIST>::CtintWalkerBase(const Parameters& parameters_ref,
-                                                         Rng& rng_ref, int id)
+CtintWalkerBase<Parameters, DIST>::CtintWalkerBase(const Parameters& parameters_ref, Rng& rng_ref,
+                                                   int id)
     : parameters_(parameters_ref),
       concurrency_(parameters_.get_concurrency()),
-
       thread_id_(id),
-
-      streams_{&linalg::util::getStreamContainer()(thread_id_, 0),
-               &linalg::util::getStreamContainer()(thread_id_, 1)},
-
+      stream_{&linalg::util::getStreamContainer()(thread_id_, 0)},
       rng_(rng_ref),
-
-      configuration_(parameters_.get_beta(), Bdmn::dmn_size(), vertices_,
-                     parameters_.getDoubleUpdateProbability()),
-
       beta_(parameters_.get_beta()),
       total_interaction_(vertices_.integratedInteraction()) {}
 
 template <class Parameters, DistType DIST>
-void CtintWalkerBase<Parameters,DIST>::initialize(int iteration) {
+void CtintWalkerBase<Parameters, DIST>::initialize(int iteration) {
   assert(total_interaction_);
   phase_.reset();
 
@@ -269,7 +259,7 @@ void CtintWalkerBase<Parameters,DIST>::initialize(int iteration) {
 }
 
 template <class Parameters, DistType DIST>
-void CtintWalkerBase<Parameters,DIST>::updateSweepAverages() {
+void CtintWalkerBase<Parameters, DIST>::updateSweepAverages() {
   order_avg_.addSample(order());
   sign_avg_.addSample(phase_.getSign());
   // Track avg order for the final number of steps / sweep.
@@ -293,9 +283,8 @@ void CtintWalkerBase<Parameters,DIST>::updateSweepAverages() {
 //   }
 // }
 
-
 template <class Parameters, DistType DIST>
-void CtintWalkerBase<Parameters,DIST>::updateShell(int meas_id, int meas_to_do) const {
+void CtintWalkerBase<Parameters, DIST>::updateShell(int meas_id, int meas_to_do) const {
   if (concurrency_.id() == concurrency_.first() && meas_id > 1 &&
       (meas_id % dca::util::ceilDiv(meas_to_do, 20)) == 0) {
     std::cout << "\t\t\t" << int(double(meas_id) / double(meas_to_do) * 100) << " % completed \t ";
@@ -312,7 +301,7 @@ void CtintWalkerBase<Parameters,DIST>::updateShell(int meas_id, int meas_to_do)
 }
 
 template <class Parameters, DistType DIST>
-void CtintWalkerBase<Parameters,DIST>::printSummary() const {
+void CtintWalkerBase<Parameters, DIST>::printSummary() const {
   std::cout << "\n"
             << "Walker: process ID = " << concurrency_.id() << ", thread ID = " << thread_id_ << "\n"
             << "-------------------------------------------\n";
@@ -329,8 +318,8 @@ void CtintWalkerBase<Parameters,DIST>::printSummary() const {
 }
 
 template <class Parameters, DistType DIST>
-void CtintWalkerBase<Parameters,DIST>::setInteractionVertices(const Data& data,
-                                                                     const Parameters& parameters) {
+void CtintWalkerBase<Parameters, DIST>::setInteractionVertices(const Data& data,
+                                                               const Parameters& parameters) {
   vertices_.reset();
   vertices_.initialize(parameters.getDoubleUpdateProbability(), parameters.getAllSitesPartnership());
   vertices_.initializeFromHamiltonian(data.H_interactions);
@@ -341,11 +330,11 @@ void CtintWalkerBase<Parameters,DIST>::setInteractionVertices(const Data& data,
 }
 
 template <class Parameters, DistType DIST>
-void CtintWalkerBase<Parameters,DIST>::computeM(MatrixPair& m_accum) const {
+void CtintWalkerBase<Parameters, DIST>::computeM(MatrixPair& m_accum) const {
   m_accum = M_;
 }
 
-// template<class WALKER, linalg::DeviceType DEVICE>  
+// template<class WALKER, linalg::DeviceType DEVICE>
 // void setMFromConfigHelper(WALKER& walker, DMatrixBuilder<DEVICE, Scalar>& d_matrix_builder) {
 //   walker.mc_log_weight_ = 0.;
 //   walker.phase_.reset();
@@ -379,11 +368,11 @@ void CtintWalkerBase<Parameters,DIST>::computeM(MatrixPair& m_accum) const {
 //     walker.phase_.multiply(term);
 //   }
 // }
-  
-     
+
 template <class Parameters, DistType DIST>
 template <linalg::DeviceType DEVICE>
-void CtintWalkerBase<Parameters, DIST>::setMFromConfigImpl(DMatrixBuilder<DEVICE, Scalar>& d_matrix_builder) {
+void CtintWalkerBase<Parameters, DIST>::setMFromConfigImpl(
+    DMatrixBuilder<DEVICE, Scalar>& d_matrix_builder) {
   mc_log_weight_ = 0.;
   phase_.reset();
 
@@ -416,7 +405,7 @@ void CtintWalkerBase<Parameters, DIST>::setMFromConfigImpl(DMatrixBuilder<DEVICE
     phase_.multiply(term);
   }
 }
-  
+
 }  // namespace ctint
 }  // namespace solver
 }  // namespace phys
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp
index 35fe3bf9a..80711ee45 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp
@@ -55,7 +55,8 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST
 
   using Resource = DMatrixBuilder<linalg::CPU, Scalar>;
 
-  CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder, int id = 0);
+  CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref,
+                          DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder, int id = 0);
 
   virtual ~CtintWalkerSubmatrixCpu() = default;
 
@@ -69,6 +70,23 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST
   void doSteps();
   void generateDelayedMoves(int nbr_of_movesto_delay);
 
+  void markThermalized() override;
+
+  void updateM() override;
+
+  DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder_;
+
+  BaseClass::MatrixPair getM();
+
+  /** The following methods are really only here to get decent unit testing
+      they shouldn't really be called outside of the base implementations
+  */
+  void computeMInit() override;
+  void computeGInit() override;
+  BaseClass::MatrixPair getRawM();
+  BaseClass::MatrixPair getRawG();
+
+private:
   /** This does a bunch of things, this is the majority of a step
    *  For each delayed_move it:
    *  * computes the acceptance prob
@@ -83,15 +101,6 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST
    *        weight_term = prob_const_[field_type][b] * the vertex interaction strength;
    *        BaseClass::mc_log_weight_ += std::log(std::abs(mc_weight_ratio));
    */
-  void mainSubmatrixProcess();
-
-  void updateM() override;
-
-  void transformM();
-
-  DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder_;
-private:
-
   void doSubmatrixUpdate();
 
   /** returns [acceptance_probability , mc_weight_ratio ]
@@ -102,10 +111,7 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST
 
   void removeRowAndColOfGammaInv();
 
-  void computeMInit() override;
-
   //  void computeG0Init();
-  void computeGInit() override;
 
   Move generateMoveType();
 
@@ -155,6 +161,7 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST
   using SubmatrixBase::n_init_;
   using SubmatrixBase::D_;
   using SubmatrixBase::G_;
+  using SubmatrixBase::M_;
   using SubmatrixBase::s_;
   using SubmatrixBase::Gamma_indices_;
   using SubmatrixBase::Gamma_inv_;
@@ -172,11 +179,11 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST
   using SubmatrixBase::nbr_of_indices_;
   using SubmatrixBase::q_;
   using SubmatrixBase::det_ratio_;
+
 protected:
   using BaseClass::acceptance_prob_;
 
 protected:
-
   using BaseClass::flop_;
 };
 
@@ -185,8 +192,7 @@ CtintWalkerSubmatrixCpu<Parameters, DIST>::CtintWalkerSubmatrixCpu(
     const Parameters& parameters_ref, const Data& data, Rng& rng_ref,
     DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder, int id)
     : SubmatrixBase(parameters_ref, data, rng_ref, id), d_matrix_builder_(d_matrix_builder) {
-
-    for (int b = 0; b < n_bands_; ++b) {
+  for (int b = 0; b < n_bands_; ++b) {
     for (int i = 1; i <= 3; ++i) {
       f_[i][b] = d_matrix_builder_.computeF(i, b);
       f_[-i][b] = d_matrix_builder_.computeF(-i, b);
@@ -200,13 +206,37 @@ CtintWalkerSubmatrixCpu<Parameters, DIST>::CtintWalkerSubmatrixCpu(
     }
     f_[0][b] = 1;
   }
-
 }
 
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixCpu<Parameters, DIST>::setMFromConfig() {
   BaseClass::setMFromConfigImpl(d_matrix_builder_);
-  transformM();
+  SubmatrixBase::transformM();
+#ifdef DEBUG_SUBMATRIX
+  std::cout << "cpu M post setMFromConfigImpl: \n";
+  M_[0].set_name("cpu_M_0");
+  M_[1].set_name("cpu_M_1");
+  M_[0].print();
+  M_[1].print();
+#endif
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixCpu<Parameters, DIST>::markThermalized() {
+  thermalized_ = true;
+
+  nb_steps_per_sweep_ = std::max(1., std::ceil(sweeps_per_meas_ * partial_order_avg_.mean()));
+  thermalization_steps_ = n_steps_;
+
+  order_avg_.reset();
+  sign_avg_.reset();
+  n_accepted_ = 0;
+
+  // Recompute the Monte Carlo weight.
+  setMFromConfig();
+#ifndef NDEBUG
+  // writeAlphas();
+#endif
 }
 
 // Extend M by adding non-interacting vertices.
@@ -222,26 +252,55 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeMInit() {
       Scalar f_j;
       D_.resize(std::make_pair(delta, n_init_[s]));
 
+      if (delta == 0 || n_init_[s] == 0)
+        throw std::runtime_error(
+            "expansion factor dropped to 0 or below use a higher beta or larger interaction!");
+
       d_matrix_builder_.computeG0(D_, configuration_.getSector(s), n_init_[s], n_max_[s], 0);
 
+#ifdef DEBUG_SUBMATRIX
+      D_.print();
+#endif
+
+      std::array<linalg::Vector<Real, linalg::CPU>, 2> f_values;
+      f_values[s].resize(n_init_[s]);
       for (int j = 0; j < n_init_[s]; ++j) {
         const auto field_type = configuration_.getSector(s).getAuxFieldType(j);
         const auto b = configuration_.getSector(s).getRightB(j);
         f_j = f_[field_type][b] - 1;
-
+        f_values[s][j] = f_[field_type][b];
         for (int i = 0; i < delta; ++i) {
           D_(i, j) *= f_j;
         }
       }
 
+#ifdef DEBUG_SUBMATRIX
+      f_values[0].set_name("cpu_f_values_0");
+      f_values[1].set_name("cpu_f_values_1");
+      f_values[0].print();
+      f_values[1].print();
+      std::cout << "cpu D post f factor mult\n";
+      D_.print();
+      using namespace dca::addt_str_oper;
+      std::cout << "M_[" << s << "] size: " << M_[s].size() << '\n';
+#endif
+
       M_[s].resize(n_max_[s]);
 
       MatrixView M(M_[s], 0, 0, n_init_[s], n_init_[s]);
       MatrixView D_M(M_[s], n_init_[s], 0, delta, n_init_[s]);
 
+#ifdef DEBUG_SUBMATRIX
+      std::cout << "cpu M pre gemm\n";
+      M_[s].print();
+#endif
+
       linalg::matrixop::gemm(D_, M, D_M);
       flop_ += 2 * D_.nrRows() * D_.nrCols() * M.nrCols();
 
+#ifdef DEBUG_SUBMATRIX
+      D_M.print();
+#endif
       for (int i = 0; i < n_max_[s]; ++i) {
         for (int j = n_init_[s]; j < n_max_[s]; ++j) {
           M_[s](i, j) = 0;
@@ -250,6 +309,9 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeMInit() {
 
       for (int i = n_init_[s]; i < n_max_[s]; ++i)
         M_[s](i, i) = 1;
+#ifdef DEBUG_SUBMATRIX
+      M_[s].print();
+#endif
     }
   }
 }
@@ -473,20 +535,6 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::recomputeGammaInv() {
   }
 }
 
-template <class Parameters, DistType DIST>
-void CtintWalkerSubmatrixCpu<Parameters, DIST>::transformM() {
-  for (int s = 0; s < 2; ++s) {
-    for (int j = 0; j < M_[s].size().second; ++j) {
-      for (int i = 0; i < M_[s].size().first; ++i) {
-        const auto field_type = configuration_.getSector(s).getAuxFieldType(i);
-        const auto b = configuration_.getSector(s).getLeftB(i);
-        const Scalar f_i = -(f_[field_type][b] - 1);
-        M_[s](i, j) /= f_i;
-      }
-    }
-  }
-}
-
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeM(typename BaseClass::MatrixPair& m_accum) {
   for (int s = 0; s < 2; ++s) {
@@ -571,6 +619,34 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeMixedInsertionAndRemoval(
   computeRemovalMatrix(s);
 }
 
+template <class Parameters, DistType DIST>
+CtintWalkerSubmatrixCpu<Parameters, DIST>::BaseClass::MatrixPair CtintWalkerSubmatrixCpu<
+    Parameters, DIST>::getRawM() {
+  typename BaseClass::MatrixPair M;
+  M = M_;
+  M[0].set_name("subMatrixCPU::M[0]");
+  M[1].set_name("subMatrixCPU::M[1]");
+  return M;
+}
+
+template <class Parameters, DistType DIST>
+CtintWalkerSubmatrixCpu<Parameters, DIST>::BaseClass::MatrixPair CtintWalkerSubmatrixCpu<
+    Parameters, DIST>::getRawG() {
+  typename BaseClass::MatrixPair G;
+  G = G_;
+  G[0].set_name("subMatrixCPU::G[0]");
+  G[1].set_name("subMatrixCPU::G[1]");
+  return G;
+}
+
+template <class Parameters, DistType DIST>
+CtintWalkerSubmatrixCpu<Parameters, DIST>::BaseClass::MatrixPair CtintWalkerSubmatrixCpu<Parameters,
+                                                                                         DIST>::getM() {
+  typename BaseClass::MatrixPair M;
+  computeM(M);
+  return M;
+}
+
 }  // namespace ctint
 }  // namespace solver
 }  // namespace phys
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp
index e098599e6..17cb337c3 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp
@@ -29,6 +29,7 @@
 #include "dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/walker/kernels_interface.hpp"
+#include "dca/util/to_string.hpp"
 
 namespace dca {
 namespace phys {
@@ -65,6 +66,8 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase<Parameters, DIST
 
   void computeM(MatrixPair<linalg::GPU>& m_accum);
 
+  MatrixPair<linalg::CPU> getM();
+
   void doSweep() override;
 
   void synchronize() const override;
@@ -75,19 +78,29 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase<Parameters, DIST
   std::size_t deviceFingerprint() const override;
 
   void setMFromConfig() override;
+  void uploadConfiguration();
+
+  auto& getFValues() {
+    return f_values_;
+  }
+
+  auto& getDMatrixBuilder() {
+    return d_matrix_builder_;
+  }
+
 protected:
   // For testing purposes:
-  void doStep(int n_moves_to_delay);
+  void doStep(const int n_moves_to_delay) override;
+  void computeMInit() override;
+  void computeGInit();
+  MatrixPair<linalg::CPU> getRawG();
+  MatrixPair<linalg::CPU> getRawM();
+
+  void updateM() override;
 
 private:
   void doStep() override;
 
-  void computeMInit();
-  void computeGInit();
-  void updateM();
-
-  void uploadConfiguration();
-
 protected:
   using BaseClass::configuration_;
   using BaseClass::M_;
@@ -100,8 +113,6 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase<Parameters, DIST
   using BaseClass::mc_log_weight_;
   using BaseClass::n_accepted_;
 
-
-  
 private:
   using BaseClass::concurrency_;
   using BaseClass::thread_id_;
@@ -141,8 +152,6 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase<Parameters, DIST
   std::array<linalg::util::HostVector<std::pair<int, Real>>, 2> gamma_index_;
   std::array<linalg::Vector<std::pair<int, Real>, linalg::GPU>, 2> gamma_index_dev_;
 
-  std::array<linalg::util::GpuEvent, 2> config_copied_;
-
   DMatrixBuilder<linalg::GPU, Scalar>& d_matrix_builder_;
 };
 
@@ -169,21 +178,28 @@ CtintWalkerSubmatrixGpu<Parameters, DIST>::CtintWalkerSubmatrixGpu(
   if (concurrency_.id() == concurrency_.first() && thread_id_ == 0)
     std::cout << "\nCT-INT submatrix walker extended to GPU." << std::endl;
 }
-  
+
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixGpu<Parameters, DIST>::setMFromConfig() {
   BaseClass::setMFromConfigImpl(d_matrix_builder_);
+  SubmatrixBase::transformM();
+#ifdef DEBUG_SUBMATRIX
+  M_[0].set_name("gpu_M_0");
+  M_[1].set_name("gpu_M_0");
+#endif
   for (int s = 0; s < 2; ++s) {
-    M_dev_[s].setAsync(M_[s], get_stream(s));
+    std::cout << "GPU pre set M: \n";
+#ifdef DEBUG_SUBMATRIX
+    M_[s].print();
+#endif
+    M_dev_[s].setAsync(M_[s], *get_stream());
   }
 }
 
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixGpu<Parameters, DIST>::synchronize() const {
   Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_);
-
-  checkRC(cudaStreamSynchronize(get_stream(0)));
-  checkRC(cudaStreamSynchronize(get_stream(1)));
+  checkRC(cudaStreamSynchronize(*get_stream()));
 }
 
 template <class Parameters, DistType DIST>
@@ -197,8 +213,9 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::doSweep() {
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixGpu<Parameters, DIST>::doStep(const int n_moves_to_delay) {
   SubmatrixBase::nbr_of_moves_to_delay_ = n_moves_to_delay;
-  doStep();
   uploadConfiguration();
+  synchronize();
+  doStep();
 }
 
 template <class Parameters, DistType DIST>
@@ -215,9 +232,7 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::doStep() {
 
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixGpu<Parameters, DIST>::uploadConfiguration() {
-  for (int s = 0; s < 2; ++s)
-    config_copied_[s].block();
-
+  get_stream()->sync();
   // Upload configuration and f values.
   device_config_.upload(configuration_, thread_id_);
   for (int s = 0; s < 2; ++s) {
@@ -229,17 +244,21 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::uploadConfiguration() {
       const auto b = sector.getLeftB(i);
       values[i] = f_[field_type][b];
     }
-    f_dev_[s].setAsync(values, get_stream(s));
+    f_dev_[s].setAsync(values, *get_stream());
   }
+#ifdef DEBUG_SUBMATRIX
+  std::cout << "GPU f_values: \n";
+  f_values_[0].print();
+  f_values_[1].print();
+#endif
 
-  for (int s = 0; s < 2; ++s)
-    config_copied_[s].record(get_stream(s));
+  get_stream()->sync();
 }
 
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixGpu<Parameters, DIST>::computeMInit() {
   //  Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_);
-
+  get_stream()->sync();
   for (int s = 0; s < 2; ++s)
     M_dev_[s].resize(n_max_[s]);
 
@@ -247,21 +266,52 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::computeMInit() {
     const int delta = n_max_[s] - n_init_[s];
     if (delta > 0) {
       D_dev_[s].resizeNoCopy(std::make_pair(delta, n_init_[s]));
+
+      if (delta == 0 || n_init_[s] == 0)
+        throw std::runtime_error(
+            "expansion factor dropped to 0 or below use a higher beta or larger interaction!");
+
       d_matrix_builder_.computeG0(D_dev_[s], device_config_.getDeviceData(s), n_init_[s], false,
-                                 get_stream(s));
+                                  *get_stream());
       flop_ += D_dev_[s].nrCols() * D_dev_[s].nrRows() * 10;
 
+#ifdef DEBUG_SUBMATRIX
+      // debugging
+      SubmatrixBase::D_ = D_dev_[s];
+      SubmatrixBase::D_.print();
+#endif
       MatrixView<linalg::GPU> D_view(D_dev_[s]);
-      details::multiplyByFColFactor(D_view, f_dev_[s].ptr(), get_stream(s));
+      details::multiplyByFColFactor(D_view, f_dev_[s].ptr(), *get_stream());
 
+#ifdef DEBUG_SUBMATRIX
+      std::cout << "GPU D multiplied by FCol Factor\n";
+      SubmatrixBase::D_ = D_dev_[s];
+      SubmatrixBase::D_.print();
+#endif
       MatrixView<linalg::GPU> M(M_dev_[s], 0, 0, n_init_[s], n_init_[s]);
       MatrixView<linalg::GPU> D_M(M_dev_[s], n_init_[s], 0, delta, n_init_[s]);
 
-      linalg::matrixop::gemm(D_dev_[s], M, D_M, thread_id_, s);
+#ifdef DEBUG_SUBMATRIX
+      SubmatrixBase::M_[s] = M_dev_[s];
+      std::cout << "GPU M pre gemm M\n";
+      SubmatrixBase::M_[s].print();
+#endif
+
+      linalg::matrixop::gemm(D_dev_[s], M, D_M, thread_id_, 0);
+
+#ifdef DEBUG_SUBMATRIX
+      SubmatrixBase::M_[s] = M_dev_[s];
+      std::cout << "GPU M post gemm M\n";
+      SubmatrixBase::M_[s].print();
+      MatrixView<linalg::CPU> D_M_host(SubmatrixBase::M_[s], n_init_[s], 0, delta, n_init_[s]);
+      std::cout << "gpu D_M_ print\n";
+      D_M_host.print();
+#endif
+
       flop_ += 2 * D_dev_[s].nrRows() * D_dev_[s].nrCols() * M.nrCols();
 
       details::setRightSectorToId(M_dev_[s].ptr(), M_dev_[s].leadingDimension(), n_init_[s],
-                                  n_max_[s], get_stream(s));
+                                  n_max_[s], *get_stream());
     }
   }
 }
@@ -269,30 +319,33 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::computeMInit() {
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixGpu<Parameters, DIST>::computeGInit() {
   //  Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_);
-
+  get_stream()->sync();
   for (int s = 0; s < 2; ++s) {
     const int delta = n_max_[s] - n_init_[s];
+
+    // In cpu we only do all this if delta > 0
     auto& f_dev = f_dev_[s];
 
     G_dev_[s].resizeNoCopy(n_max_[s]);
 
-    MatrixView<linalg::GPU> G(G_dev_[s]);
+    MatrixView<linalg::GPU> G_view_dev(G_dev_[s]);
     const MatrixView<linalg::GPU> M(M_dev_[s]);
-    details::computeGLeft(G, M, f_dev.ptr(), n_init_[s], get_stream(s));
+    details::computeGLeft(G_view_dev, M, f_dev.ptr(), n_init_[s], get_stream()->streamActually());
     flop_ += n_init_[s] * n_max_[s] * 2;
 
     if (delta > 0) {
       G0_dev_[s].resizeNoCopy(std::make_pair(n_max_[s], delta));
       // This doesn't do flops but the g0 interp data it uses does somewhere.
       d_matrix_builder_.computeG0(G0_dev_[s], device_config_.getDeviceData(s), n_init_[s], true,
-                                 get_stream(s));
+                                  *get_stream());
       flop_ += G0_dev_[s].nrCols() * G0_dev_[s].nrRows() * 10;
-      MatrixView<linalg::GPU> G(G_dev_[s], 0, n_init_[s], n_max_[s], delta);
+      MatrixView<linalg::GPU> G_view_reduced(G_dev_[s], 0, n_init_[s], n_max_[s], delta);
       // compute G right.
-      linalg::matrixop::gemm(M_dev_[s], G0_dev_[s], G, thread_id_, s);
+      linalg::matrixop::gemm(M_dev_[s], G0_dev_[s], G_view_reduced, thread_id_, 0);
       flop_ += 2 * M_dev_[s].nrRows() * M_dev_[s].nrCols() * G0_dev_[s].nrCols();
+
+      G_[s].setAsync(G_dev_[s], *get_stream());
     }
-    G_[s].setAsync(G_dev_[s], get_stream(s));
   }
 }
 
@@ -301,7 +354,7 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::updateM() {
   //  Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_);
 
   for (int s = 0; s < 2; ++s)
-    Gamma_inv_dev_[s].setAsync(Gamma_inv_[s], get_stream(s));
+    Gamma_inv_dev_[s].setAsync(Gamma_inv_[s], *get_stream());
 
   // Copy gamma factors.
   for (int s = 0; s < 2; ++s) {
@@ -310,7 +363,7 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::updateM() {
     gamma_index.resize(n);
     for (int i = 0; i < n; ++i)
       gamma_index[i] = std::make_pair(move_indices_[s][i], gamma_[s][i]);
-    gamma_index_dev_[s].setAsync(gamma_index, get_stream(s));
+    gamma_index_dev_[s].setAsync(gamma_index, *get_stream());
   }
 
   for (int s = 0; s < 2; ++s) {
@@ -325,35 +378,37 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::updateM() {
     old_G.resizeNoCopy(std::make_pair(n_max_[s], gamma_size));
     old_M.resizeNoCopy(std::make_pair(gamma_size, n_max_[s]));
 
-    assert(dca::linalg::util::getStream(thread_id_, s) == get_stream(s));
-    move_indices_dev_[s].setAsync(move_indices_[s], get_stream(s));
+    // We no longer use the 1 stream per spin
+    // This isn't portable and i.e. AMD don't really benefit from multiple streams per thread
+    // and it adds much complexity.
+    // assert(dca::linalg::util::getStream(thread_id_, s) == get_stream(s));
+    move_indices_dev_[s].setAsync(move_indices_[s], *get_stream());
     // Note: an event synchronization might be necessary if the order of operation is changed.
-    linalg::matrixop::copyCols(G_dev_[s], move_indices_dev_[s], old_G, thread_id_, s);
-    linalg::matrixop::copyRows(M_dev_[s], move_indices_dev_[s], old_M, thread_id_, s);
+    linalg::matrixop::copyCols(G_dev_[s], move_indices_dev_[s], old_G, thread_id_, 0);
+    linalg::matrixop::copyRows(M_dev_[s], move_indices_dev_[s], old_M, thread_id_, 0);
 
     auto& tmp = G_dev_[s];
     // Note: the following resize is safe as it does not deallocate.
     tmp.resizeNoCopy(std::make_pair(gamma_size, n_max_[s]));
-    linalg::matrixop::gemm(Gamma_inv_dev_[s], old_M, tmp, thread_id_, s);
-    linalg::matrixop::gemm(Scalar(-1.), old_G, tmp, Scalar(1.), M_dev_[s], thread_id_, s);
+    linalg::matrixop::gemm(Gamma_inv_dev_[s], old_M, tmp, thread_id_, 0);
+    linalg::matrixop::gemm(Scalar(-1.), old_G, tmp, Scalar(1.), M_dev_[s], thread_id_, 0);
     flop_ += 2 * Gamma_inv_dev_[s].nrRows() * Gamma_inv_dev_[s].nrCols() * old_M.nrCols();
     flop_ += 2 * old_G.nrRows() * old_G.nrCols() * tmp.nrCols();
 
     details::divideByGammaFactor(MatrixView<linalg::GPU>(M_dev_[s]), gamma_index_dev_[s].ptr(),
-                                 gamma_size, get_stream(s));
+                                 gamma_size, *get_stream());
     flop_ += gamma_size * M_dev_[s].nrCols() * 2;
   }
 
   // Remove non-interacting rows and columns.
   configuration_.moveAndShrink(source_list_, removal_list_, conf_removal_list_);
   for (int s = 0; s < 2; ++s) {
-    removal_list_dev_[s].setAsync(removal_list_[s], get_stream(s));
-    source_list_dev_[s].setAsync(source_list_[s], get_stream(s));
-    config_copied_[s].record(get_stream(s));
+    removal_list_dev_[s].setAsync(removal_list_[s], *get_stream());
+    source_list_dev_[s].setAsync(source_list_[s], *get_stream());
     linalg::matrixop::copyRows(M_dev_[s], source_list_dev_[s], M_dev_[s], removal_list_dev_[s],
-                               thread_id_, s);
+                               thread_id_, 0);
     linalg::matrixop::copyCols(M_dev_[s], source_list_dev_[s], M_dev_[s], removal_list_dev_[s],
-                               thread_id_, s);
+                               thread_id_, 0);
     M_dev_[s].resize(configuration_.getSector(s).size());
   }
 
@@ -370,13 +425,11 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::computeM(
   for (int s = 0; s < 2; ++s) {
     MatrixView<linalg::GPU> m_in(M_dev_[s]);
     MatrixView<linalg::GPU> m_out(m_accum[s]);
-    details::multiplyByInverseFFactor(m_in, m_out, f_dev_[s].ptr(), get_stream(s));
+    details::multiplyByInverseFFactor(m_in, m_out, f_dev_[s].ptr(), *get_stream());
     flop_ += m_in.nrRows() * m_out.nrCols();
   }
 
-  // TODO: understand why this is necessary.
-  config_copied_[0].block();
-  config_copied_[1].block();
+  get_stream()->sync();
 }
 
 template <class Parameters, DistType DIST>
@@ -392,6 +445,52 @@ std::size_t CtintWalkerSubmatrixGpu<Parameters, DIST>::deviceFingerprint() const
   return res;
 }
 
+template <class Parameters, DistType DIST>
+CtintWalkerSubmatrixGpu<Parameters, DIST>::MatrixPair<linalg::CPU> CtintWalkerSubmatrixGpu<
+    Parameters, DIST>::getRawM() {
+  std::array<dca::linalg::Matrix<Scalar, linalg::CPU>, 2> M_copy{M_dev_[0].size(), M_dev_[1].size()};
+  // Synchronous copies for testing
+  linalg::util::memoryCopyD2H(M_copy[0].ptr(), M_copy[0].leadingDimension(), M_dev_[0].ptr(),
+                              M_dev_[0].leadingDimension(), M_dev_[0].size());
+  linalg::util::memoryCopyD2H(M_copy[1].ptr(), M_copy[1].leadingDimension(), M_dev_[1].ptr(),
+                              M_dev_[1].leadingDimension(), M_dev_[1].size());
+  M_copy[0].set_name("subMatrixGPU::M[0]");
+  M_copy[1].set_name("subMatrixGPU::M[1]");
+
+  return M_copy;
+}
+
+template <class Parameters, DistType DIST>
+CtintWalkerSubmatrixGpu<Parameters, DIST>::MatrixPair<linalg::CPU> CtintWalkerSubmatrixGpu<
+    Parameters, DIST>::getRawG() {
+  std::array<dca::linalg::Matrix<Scalar, linalg::CPU>, 2> G_copy{G_dev_[0].size(), G_dev_[1].size()};
+  // Synchronous copies for testing
+  linalg::util::memoryCopyD2H(G_copy[0].ptr(), G_copy[0].leadingDimension(), G_dev_[0].ptr(),
+                              G_dev_[0].leadingDimension(), G_dev_[0].size());
+  linalg::util::memoryCopyD2H(G_copy[1].ptr(), G_copy[1].leadingDimension(), G_dev_[1].ptr(),
+                              G_dev_[1].leadingDimension(), G_dev_[1].size());
+  G_copy[0].set_name("subMatrixGPU::G[0]");
+  G_copy[1].set_name("subMatrixGPU::G[1]");
+
+  return G_copy;
+}
+
+template <class Parameters, DistType DIST>
+CtintWalkerSubmatrixGpu<Parameters, DIST>::MatrixPair<linalg::CPU> CtintWalkerSubmatrixGpu<
+    Parameters, DIST>::getM() {
+  std::array<dca::linalg::Matrix<Scalar, device>, 2> M;
+  synchronize();
+  computeM(M);
+  checkRC(cudaDeviceSynchronize());
+
+  std::array<dca::linalg::Matrix<Scalar, linalg::CPU>, 2> M_copy{M[0].size(), M[1].size()};
+  linalg::util::memoryCopyD2H(M_copy[0].ptr(), M_copy[0].leadingDimension(), M[0].ptr(),
+                              M[0].leadingDimension(), M[0].size());
+  linalg::util::memoryCopyD2H(M_copy[1].ptr(), M_copy[1].leadingDimension(), M[1].ptr(),
+                              M[1].leadingDimension(), M[1].size());
+  return M_copy;
+}
+
 }  // namespace ctint
 }  // namespace solver
 }  // namespace phys
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp
index 2a7bb44f6..1906f5a74 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp
@@ -50,8 +50,7 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase<Parameters, DIST> {
   using BaseClass::order;
 
   virtual void setMFromConfig() = 0;
-
-  void markThermalized() override;
+  auto getF() const { return f_; }
 protected:
   virtual void doStep() override;
   void doSteps();
@@ -73,6 +72,8 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase<Parameters, DIST> {
    */
   void mainSubmatrixProcess();
 
+  void markThermalized() override;
+
   void transformM();
 
   // For testing purposes.
@@ -80,6 +81,8 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase<Parameters, DIST> {
 
   virtual void updateM() = 0;
 
+  virtual void computeMInit() = 0;
+
 private:
 
   void doSubmatrixUpdate();
@@ -92,8 +95,6 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase<Parameters, DIST> {
 
   void removeRowAndColOfGammaInv();
 
-  virtual void computeMInit() = 0;
-
   //  void computeG0Init();
   virtual void computeGInit() = 0;
 
@@ -209,7 +210,7 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase<Parameters, DIST> {
 
   std::array<Matrix, 2> Gamma_q_;
   Matrix workspace_;
-  Matrix D_;
+  Matrix D_{"SubMatrixHostD"};
 
   using BaseClass::flop_;
 };
@@ -346,6 +347,20 @@ void CtintWalkerSubmatrixBase<Parameters, DIST>::doSubmatrixUpdate() {
   updateM();
 }
 
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::transformM() {
+  for (int s = 0; s < 2; ++s) {
+    for (int j = 0; j < M_[s].size().second; ++j) {
+      for (int i = 0; i < M_[s].size().first; ++i) {
+        const auto field_type = configuration_.getSector(s).getAuxFieldType(i);
+        const auto b = configuration_.getSector(s).getLeftB(i);
+        const Scalar f_i = -(f_[field_type][b] - 1);
+        M_[s](i, j) /= f_i;
+      }
+    }
+  }
+}
+
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixBase<Parameters, DIST>::mainSubmatrixProcess() {
   Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_);
@@ -778,20 +793,6 @@ void CtintWalkerSubmatrixBase<Parameters, DIST>::recomputeGammaInv() {
   }
 }
 
-template <class Parameters, DistType DIST>
-void CtintWalkerSubmatrixBase<Parameters, DIST>::transformM() {
-  for (int s = 0; s < 2; ++s) {
-    for (int j = 0; j < M_[s].size().second; ++j) {
-      for (int i = 0; i < M_[s].size().first; ++i) {
-        const auto field_type = configuration_.getSector(s).getAuxFieldType(i);
-        const auto b = configuration_.getSector(s).getLeftB(i);
-        const Scalar f_i = -(f_[field_type][b] - 1);
-        M_[s](i, j) /= f_i;
-      }
-    }
-  }
-}
-
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixBase<Parameters, DIST>::computeInsertionMatrices(
     const std::vector<int>& insertion_indices, const int s) {
diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation_gpu.hpp
index 9ae23ad1f..e3c998999 100644
--- a/include/dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation_gpu.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation_gpu.hpp
@@ -92,7 +92,10 @@ void G0Interpolation<linalg::GPU, Scalar>::initialize(const FunctionProxy<Scalar
 
   G0_coeff_.setAsync(host_coeff, 0, 0);
 
-  linalg::util::syncStream(0, 0);
+  //linalg::util::syncStream(0, 0);
+  // We can't just sync on stream 0 as this causes a race condition in
+  // some cases.
+  checkRC(cudaDeviceSynchronize());
 
   // Copy pointer to the data structure.
   DeviceInterpolationData<Scalar, SignType<Scalar>>::values_ = G0_coeff_.ptr();
diff --git a/include/dca/phys/parameters/parameters.hpp b/include/dca/phys/parameters/parameters.hpp
index 142e6a9ce..56c4be6ff 100644
--- a/include/dca/phys/parameters/parameters.hpp
+++ b/include/dca/phys/parameters/parameters.hpp
@@ -127,6 +127,9 @@ class Parameters : public AnalysisParameters,
   void write(Writer& writer);
   template <typename Reader>
   void read_input_and_broadcast(const std::string& file_name);
+  template <typename Reader>
+  void readInput(const std::string& file_name);
+  void broadcast();
 
   void update_model();
   void update_domains();
@@ -256,14 +259,29 @@ template <typename Concurrency, typename Threading, typename Profiler, typename
 template <typename Reader>
 void Parameters<Concurrency, Threading, Profiler, Model, RandomNumberGenerator, solver_name,
                 NUMTRAITS>::read_input_and_broadcast(const std::string& filename) {
+  readInput<Reader>(filename);
+  broadcast();
+}
+
+template <typename Concurrency, typename Threading, typename Profiler, typename Model,
+          typename RandomNumberGenerator, ClusterSolverId solver_name, typename NUMTRAITS>
+void Parameters<Concurrency, Threading, Profiler, Model, RandomNumberGenerator, solver_name,
+                NUMTRAITS>::broadcast()
+{
+  concurrency_.broadcast_object(*this);
+}
+
+template <typename Concurrency, typename Threading, typename Profiler, typename Model,
+          typename RandomNumberGenerator, ClusterSolverId solver_name, typename NUMTRAITS>
+template <typename Reader>
+void Parameters<Concurrency, Threading, Profiler, Model, RandomNumberGenerator, solver_name,
+                NUMTRAITS>::readInput(const std::string& filename) {
   if (concurrency_.id() == concurrency_.first()) {
     Reader read_obj;
     read_obj.open_file(filename);
     this->readWrite(read_obj);
     read_obj.close_file();
   }
-
-  concurrency_.broadcast_object(*this);
 }
 
 template <typename Concurrency, typename Threading, typename Profiler, typename Model,
diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu b/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu
index ef354ba10..f89b48cd2 100644
--- a/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu
+++ b/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu
@@ -43,7 +43,7 @@ __global__ void setRightSectorToIdKernel(Scalar* m, const int ldm, const int n0,
   Scalar the_one{};
   the_one += 1.0;
   Scalar the_zero{};
-  //assert(the_one == (the_zero -= 1.0));    
+  //assert(the_one == (the_zero -= 1.0));
   m[i + ldm * j] = (i == j) ? the_one : the_zero;
 }
 
@@ -79,18 +79,18 @@ __global__ void computeGLeftKernel(MatrixView<Scalar, GPU> G, const MatrixView<S
 }
 
 template <typename Scalar, typename Real>
-void computeGLeft(MatrixView<Scalar, GPU>& G, const MatrixView<Scalar, GPU>& M, const Real* f,
+void computeGLeft(MatrixView<Scalar, GPU>& G_dev, const MatrixView<Scalar, GPU>& M_dev, const Real* f_dev,
                   int n_init, cudaStream_t stream) {
   if (n_init == 0)
     return;
-  const int n = G.nrRows();
+  const int n = G_dev.nrRows();
 
   constexpr int thread_j = 4;
   constexpr int thread_i = 64;
   dim3 threads(thread_i, thread_j);
   dim3 blocks(std::max(n / (10 * thread_i), 1), dca::util::ceilDiv(n_init, thread_j));
 
-  computeGLeftKernel<<<blocks, threads, 0, stream>>>(G, M, f, n_init);
+  computeGLeftKernel<<<blocks, threads, 0, stream>>>(G_dev, M_dev, f_dev, n_init);
 }
 
 template void computeGLeft(MatrixView<float, GPU>&, const MatrixView<float, GPU>&, const float*,
diff --git a/test/unit/linalg/vector_cpu_gpu_test.cpp b/test/unit/linalg/vector_cpu_gpu_test.cpp
index 4558c4754..98083b7d5 100644
--- a/test/unit/linalg/vector_cpu_gpu_test.cpp
+++ b/test/unit/linalg/vector_cpu_gpu_test.cpp
@@ -151,13 +151,13 @@ TEST(VectorCPUGPUTest, Set) {
       vec[i] = el;
     }
 
-    vec_copy.set(vec, 0, 1);
+    vec_copy.set(vec, 0, 0);
     ASSERT_EQ(vec.size(), vec_copy.size());
     ASSERT_EQ(capacity, vec_copy.capacity());
     ASSERT_EQ(old_ptr, vec_copy.ptr());
     ASSERT_TRUE(testing::isDevicePointer(vec_copy.ptr()));
 
-    vec_copy_copy.set(vec_copy, 0, 1);
+    vec_copy_copy.set(vec_copy, 0, 0);
     EXPECT_EQ(vec.size(), vec_copy_copy.size());
     EXPECT_EQ(capacity_2, vec_copy_copy.capacity());
     EXPECT_EQ(old_ptr_2, vec_copy_copy.ptr());
@@ -181,12 +181,12 @@ TEST(VectorCPUGPUTest, Set) {
       vec[i] = el;
     }
 
-    vec_copy.set(vec, 0, 1);
+    vec_copy.set(vec, 0, 0);
     ASSERT_EQ(vec.size(), vec_copy.size());
     ASSERT_LE(vec.size(), vec_copy.capacity());
     ASSERT_FALSE(testing::isHostPointer(vec_copy.ptr()));
 
-    vec_copy_copy.set(vec_copy, 0, 1);
+    vec_copy_copy.set(vec_copy, 0, 0);
     EXPECT_EQ(vec.size(), vec_copy_copy.size());
     EXPECT_LE(vec.size(), vec_copy_copy.capacity());
     EXPECT_TRUE(testing::isHostPointer(vec_copy_copy.ptr()));
diff --git a/test/unit/linalg/vector_cpu_test.cpp b/test/unit/linalg/vector_cpu_test.cpp
index 5ae7b7959..2d849ae78 100644
--- a/test/unit/linalg/vector_cpu_test.cpp
+++ b/test/unit/linalg/vector_cpu_test.cpp
@@ -179,7 +179,7 @@ TEST(VectorCPUTest, Set) {
       vec[i] = el;
     }
 
-    vec_copy.set(vec, 0, 1);
+    vec_copy.set(vec, 0, 0);
     EXPECT_EQ(vec.size(), vec_copy.size());
     EXPECT_EQ(capacity, vec_copy.capacity());
     EXPECT_EQ(old_ptr, vec_copy.ptr());
@@ -203,7 +203,7 @@ TEST(VectorCPUTest, Set) {
       vec[i] = el;
     }
 
-    vec_copy.set(vec, 0, 1);
+    vec_copy.set(vec, 0, 0);
     EXPECT_EQ(vec.size(), vec_copy.size());
     EXPECT_LE(vec.size(), vec_copy.capacity());
 
diff --git a/test/unit/linalg/vector_gpu_test.cpp b/test/unit/linalg/vector_gpu_test.cpp
index 8fc1d0d16..1e769a477 100644
--- a/test/unit/linalg/vector_gpu_test.cpp
+++ b/test/unit/linalg/vector_gpu_test.cpp
@@ -165,7 +165,7 @@ TEST(VectorGPUTest, Set) {
       testing::setOnDevice(vec.ptr(i), el);
     }
 
-    vec_copy.set(vec, 0, 1);
+    vec_copy.set(vec, 0, 0);
     EXPECT_EQ(vec.size(), vec_copy.size());
     EXPECT_EQ(capacity, vec_copy.capacity());
     EXPECT_EQ(old_ptr, vec_copy.ptr());
@@ -189,7 +189,7 @@ TEST(VectorGPUTest, Set) {
       testing::setOnDevice(vec.ptr(i), el);
     }
 
-    vec_copy.set(vec, 0, 1);
+    vec_copy.set(vec, 0, 0);
     EXPECT_EQ(vec.size(), vec_copy.size());
     EXPECT_LE(vec.size(), vec_copy.capacity());
 
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp
index b8669830a..f79a2f182 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp
@@ -11,6 +11,7 @@
 // are compared with their direct computation.
 
 #include "dca/platform/dca_gpu.h"
+#include <memory>
 using Scalar = double;
 #include "test/mock_mcconfig.hpp"
 namespace dca {
@@ -31,20 +32,44 @@ using McOptions = MockMcOptions<Scalar>;
 #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp"
 
 // This file MUST have
+
 //   -"double-update-probability": 0
 //   -"initial-configuration-size": 0
 //   or walker wrapper will crash the test on construction
 constexpr char input_name[] =
     DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/ctint/walker/multinsert_input.json";
 
+template <typename SCALAR>
+struct CtintWalkerMultiInsertTestT : public ::testing::Test {
+  using G0Setup = dca::testing::G0SetupFromParam<SCALAR, dca::testing::LatticeBilayer,
+                                            dca::ClusterSolverId::CT_INT, input_name>;
+  using Parameters = typename G0Setup::Parameters;
+
+  virtual void SetUp() {}
+
+  void setParameters(Parameters& param_single, Parameters& param_double)
+  {
+    single_setup = std::make_unique<G0Setup>(param_single);
+    single_setup->setUp();
+    double_setup = std::make_unique<G0Setup>(param_double);
+    double_setup->setUp();
+  }
+
+  virtual void TearDown() {
+    single_setup.reset();
+    double_setup.reset();
+  }
+
+  std::unique_ptr<G0Setup> single_setup;
+  std::unique_ptr<G0Setup> double_setup;
+};
+
+template <typename Scalar>
+using CtintWalkerMultiInsertTest = CtintWalkerMultiInsertTestT<Scalar>;
+
 using Scalar = double;
 
-using G0Setup = typename dca::testing::G0Setup<Scalar, dca::testing::LatticeHund,
-                                               dca::ClusterSolverId::CT_INT, input_name>;
 using namespace dca::phys::solver;
-using Walker = testing::phys::solver::ctint::WalkerWrapper<Scalar, G0Setup::Parameters>;
-using Matrix = Walker::Matrix;
-using MatrixPair = std::array<Matrix, 2>;
 
 constexpr int bands = dca::testing::LatticeHund::BANDS;
 
@@ -56,21 +81,47 @@ auto getVertexRng = [](int id) {
 using CDA = dca::phys::ClusterDomainAliases<dca::testing::LatticeHund::DIMENSION>;
 using RDmn = typename CDA::RClusterDmn;
 
+using ScalarTypes = ::testing::Types<double>;
+TYPED_TEST_CASE(CtintWalkerMultiInsertTest, ScalarTypes);
 
 // Compare single versus double update without a submatrix update.
-TEST_F(G0Setup, NoSubmatrix) {
-  G0Setup::RngType rng(std::vector<double>{});
-  G0Interpolation<dca::linalg::CPU, double> g0(
-      dca::phys::solver::ctint::details::shrinkG0(data_->G0_r_t));
-  G0Setup::LabelDomain label_dmn;
+TYPED_TEST(CtintWalkerMultiInsertTest, NoSubmatrix) {
+  using Scalar = TypeParam;
+  using G0Setup = CtintWalkerMultiInsertTest<Scalar>::G0Setup;
+  using Concurrency = typename G0Setup::Concurrency;
+  using Parameters = typename G0Setup::Parameters;
+  using Walker = typename testing::phys::solver::ctint::WalkerWrapper<Scalar, Parameters>;
+  using Matrix = typename Walker::Matrix;
+  using MatrixPair = std::array<Matrix, 2>;
+
+  Concurrency concurrency(0,nullptr);
+
+  Parameters param_single("", concurrency);
+  param_single.template readInput<dca::io::JSONReader>(input_name);
+  param_single.setDoubleUpdateProbability(0);
+  param_single.broadcast();
+
+  Parameters param_double("", concurrency);
+  param_double.template readInput<dca::io::JSONReader>(input_name);
+  param_double.setDoubleUpdateProbability(1);
+  param_double.broadcast();
+
+  this->setParameters(param_single, param_double);
+
+  typename G0Setup::RngType rng(std::vector<Scalar>{});
+  G0Interpolation<dca::linalg::CPU, Scalar> g0(
+      dca::phys::solver::ctint::details::shrinkG0(this->single_setup->data_->G0_r_t));
+  typename G0Setup::LabelDomain label_dmn;
+
   using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar>;
-  DMatrixBuilder d_matrix_builder(g0, bands, RDmn());
-  d_matrix_builder.setAlphas(parameters_.getAlphas(), false);
-  Walker::setInteractionVertices(*data_, parameters_);
+  DMatrixBuilder d_matrix_builder_single(g0, bands, RDmn());
+  d_matrix_builder_single.setAlphas(param_single.getAlphas(), false);
+
+  auto& data_single = this->single_setup->data_;
 
-  parameters_.setDoubleUpdateProbability(0);
-  Walker walker_single(parameters_, rng, d_matrix_builder);
-  walker_single.setInteractionVertices(*data_, parameters_);
+  Walker::setInteractionVertices(*data_single, param_single);
+
+  Walker walker_single(param_single, *data_single, rng, d_matrix_builder_single);
 
   // interaction, tau, aux, accept
   rng.setNewValues(std::vector<double>{getVertexRng(6), 0.41, 0, 0});
@@ -104,12 +155,15 @@ TEST_F(G0Setup, NoSubmatrix) {
   ASSERT_EQ(M1[1].nrCols(), final_size);
 
   const double prob_single = walker_single.getAcceptanceProbability();
-  
-  parameters_.setDoubleUpdateProbability(1);
 
-  Walker::setInteractionVertices(*data_, parameters_);
-  Walker walker_double(parameters_, rng, d_matrix_builder);
+  auto& data_double = this->double_setup->data_;
+
+  Walker::setInteractionVertices(*data_double, param_double);
 
+  DMatrixBuilder d_matrix_builder_double(g0, bands, RDmn());
+  d_matrix_builder_double.setAlphas(param_double.getAlphas(), false);
+
+  Walker walker_double(param_double, *data_double, rng, d_matrix_builder_double);
   ////////////////////////////////
   // interaction, partner, tau, aux, tau, aux, accept
   rng.setNewValues(std::vector<double>{getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0});
@@ -153,58 +207,61 @@ TEST_F(G0Setup, NoSubmatrix) {
 }
 
 // Compare the double insertion probability of a submatrix walker with a non-submatrix one.
-using WalkerSubmatrix =
-    testing::phys::solver::ctint::WalkerWrapperSubmatrix<Scalar, G0Setup::Parameters>;
-TEST_F(G0Setup, Submatrix) {
-  G0Setup::RngType rng(std::vector<double>{});
-  G0Interpolation<dca::linalg::CPU, double> g0(
-      dca::phys::solver::ctint::details::shrinkG0(data_->G0_r_t));
-  G0Setup::LabelDomain label_dmn;
-  using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar>;
-  DMatrixBuilder d_matrix_builder(g0, bands, RDmn());
-  d_matrix_builder.setAlphas(parameters_.getAlphas(), false);
-  parameters_.setDoubleUpdateProbability(1);
-  Walker::setInteractionVertices(*data_, parameters_);
-
-  Walker walker(parameters_, rng, d_matrix_builder);
-
-  // interaction, partner, tau, aux, tau, aux, accept
-  rng.setNewValues(std::vector<double>{getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0});
-  walker.tryVertexInsert();
-  rng.setNewValues(std::vector<double>{getVertexRng(7), 0.9, 0.34, 0, 0.36, 0, 0});
-  walker.tryVertexInsert();
-  // first idx, double rem, second partner, accept
-  rng.setNewValues(std::vector<double>{0, 0, 0, 0});
-  walker.tryVertexRemoval();
-
-  auto M1 = walker.getM();
-  const double prob1 = walker.getAcceptanceProbability();
-
-  /////////////////////////////
-  using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar>;
-  DMatrixBuilder d_matrix_cpu(g0, bands, RDmn());
-  d_matrix_builder.setAlphas(parameters_.getAlphas(), false);
-  WalkerSubmatrix::setInteractionVertices(*data_, parameters_);
-
-  WalkerSubmatrix walker_subm(parameters_, rng, d_matrix_builder);
-
-  std::vector<double> random_vals{// (insert, first_id, partner_id, tau, aux, tau, aux, accept) x 2
-                                  0, getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0, 0, getVertexRng(7),
-                                  0.9, 0.34, 0, 0.36, 0, 0,
-                                  // with remov, first_id, double_removal, second_id, accept
-                                  1, 0, 0, 0, 0};
-
-  rng.setNewValues(random_vals);
-  walker_subm.doStep(3);
-
-  auto M2 = walker_subm.getM();
-  const double prob2 = walker_subm.getAcceptanceProbability();
-
-  EXPECT_NEAR(prob1, prob2, 1e-7);
-  for (int s = 0; s < 2; ++s) {
-    EXPECT_EQ(M1[s].size(), M2[s].size());
-    EXPECT_TRUE(dca::linalg::matrixop::areNear(M1[s], M2[s], 1e-7));
-    M1[s].print();
-    M2[s].print();
-  }
-}
+// TEST_F(G0Setup, Submatrix) {
+//   using Scalar = TypeParam;
+//   using G0Setup = decltype(*this)::G0Setup;
+//   using WalkerSubmatrix =
+//     testing::phys::solver::ctint::WalkerWrapperSubmatrix<Scalar, G0Setup::Parameters>;
+//   typename G0Setup::RngType rng(std::vector<double>{});
+//   G0Interpolation<dca::linalg::CPU, double> g0(
+//       dca::phys::solver::ctint::details::shrinkG0(this->single_setup->data_->G0_r_t);
+//   typename G0Setup::LabelDomain label_dmn;
+
+//   using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar>;
+//   DMatrixBuilder d_matrix_builder(g0, bands, RDmn());
+//   d_matrix_builder.setAlphas(parameters_.getAlphas(), false);
+//   parameters_.setDoubleUpdateProbability(1);
+//   Walker::setInteractionVertices(*data_, parameters_);
+
+//   Walker walker(parameters_, rng, d_matrix_builder);
+
+//   // interaction, partner, tau, aux, tau, aux, accept
+//   rng.setNewValues(std::vector<double>{getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0});
+//   walker.tryVertexInsert();
+//   rng.setNewValues(std::vector<double>{getVertexRng(7), 0.9, 0.34, 0, 0.36, 0, 0});
+//   walker.tryVertexInsert();
+//   // first idx, double rem, second partner, accept
+//   rng.setNewValues(std::vector<double>{0, 0, 0, 0});
+//   walker.tryVertexRemoval();
+
+//   auto M1 = walker.getM();
+//   const double prob1 = walker.getAcceptanceProbability();
+
+//   /////////////////////////////
+//   using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar>;
+//   DMatrixBuilder d_matrix_cpu(g0, bands, RDmn());
+//   d_matrix_builder.setAlphas(parameters_.getAlphas(), false);
+//   WalkerSubmatrix::setInteractionVertices(*data_, parameters_);
+
+//   WalkerSubmatrix walker_subm(parameters_, rng, d_matrix_builder);
+
+//   std::vector<double> random_vals{// (insert, first_id, partner_id, tau, aux, tau, aux, accept) x 2
+//                                   0, getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0, 0, getVertexRng(7),
+//                                   0.9, 0.34, 0, 0.36, 0, 0,
+//                                   // with remov, first_id, double_removal, second_id, accept
+//                                   1, 0, 0, 0, 0};
+
+//   rng.setNewValues(random_vals);
+//   walker_subm.doStep(3);
+
+//   auto M2 = walker_subm.getM();
+//   const double prob2 = walker_subm.getAcceptanceProbability();
+
+//   EXPECT_NEAR(prob1, prob2, 1e-7);
+//   for (int s = 0; s < 2; ++s) {
+//     EXPECT_EQ(M1[s].size(), M2[s].size());
+//     EXPECT_TRUE(dca::linalg::matrixop::areNear(M1[s], M2[s], 1e-7));
+//     M1[s].print();
+//     M2[s].print();
+//   }
+// }
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp
index 157ceee38..ac37a5fd0 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp
@@ -82,7 +82,7 @@ TYPED_TEST(CtintWalkerSubmatrixGpuComplexTest, doSteps) {
   d_matrix_cpu.setAlphas(parameters.getAlphas(), parameters.adjustAlphaDd());
   DMatrixBuilder<dca::linalg::GPU, Scalar> d_matrix_gpu(g0_gpu, bands, RDmn());
   d_matrix_gpu.setAlphas(parameters.getAlphas(), parameters.adjustAlphaDd());
-  
+
   // ************************************
   // Test vertex insertion / removal ****
   // ************************************
@@ -104,7 +104,7 @@ TYPED_TEST(CtintWalkerSubmatrixGpuComplexTest, doSteps) {
       0, 0.99, 0.4, 0.2, -1,  // Insertion
   };
 
-  for (const int initial_size : std::array<int, 2>{0, 5}) {
+  for (const int initial_size : std::array<int, 2>{2, 5}) {
     parameters.setInitialConfigurationSize(initial_size);
 
     for (int steps = 1; steps <= 8; ++steps) {
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp
index 1f727cd4e..0edaad83c 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp
@@ -31,6 +31,9 @@ using McOptions = MockMcOptions<Scalar>;
 #include "dca/linalg/matrixop.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp"
 
+#include "dca/util/to_string.hpp"
+using namespace dca::addt_str_oper;
+
 constexpr char input_name[] =
     DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json";
 
@@ -128,20 +131,45 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) {
       0, 0.99, 0.4, 0.2, -1,  // Insertion
   };
 
-  for (const int initial_size : std::array<int, 2>{0, 5}) {
+  constexpr Scalar tolerance = 10e-12;
+  auto compareSubMatrix = [tolerance](auto& mat_cpu, auto& mat_gpu, const std::string& mat_tag) {
+    int fail = 0;
+    for (int s = 0; s < 2; ++s) {
+      std::cout << mat_tag << "_cpu[" << s << "] size: " << mat_cpu[s].size() << '\n';
+      std::cout << mat_tag << "_gpu[" << s << "] size: " << mat_gpu[s].size() << '\n';
+      for (int j = 0; j < mat_cpu[s].size().second; ++j)
+        for (int i = 0; i < mat_cpu[s].size().first; ++i) {
+          if (std::abs(mat_cpu[s](i, j) - mat_gpu[s](i, j)) > tolerance) {
+            std::cout << mat_tag << " element: " << s << "," << i << "," << j << "  cpu("
+                      << mat_cpu[s](i, j) << ") - gpu(" << mat_gpu[s](i, j) << ") > " << tolerance
+                      << '\n';
+            ++fail;
+          }
+        }
+    }
+    return fail;
+  };
+
+  for (const int initial_size : std::array<int, 1>{2}) {
     cpu_parameters.setInitialConfigurationSize(initial_size);
     gpu_parameters.setInitialConfigurationSize(initial_size);
-    for (int steps = 1; steps <= 8; ++steps) {
+    for (int steps = 2; steps <= 8; ++steps) {
       cpu_rng.setNewValues(setup_rngs);
       SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng, d_matrix_cpu);
+      walker_cpu.setInteractionVertices(cpu_data, cpu_parameters);
+
       gpu_rng.setNewValues(setup_rngs);
       SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng, d_matrix_gpu);
+      walker_gpu.setInteractionVertices(gpu_data, gpu_parameters);
 
       cpu_rng.setNewValues(rng_vals);
       walker_cpu.doStep(steps);
       gpu_rng.setNewValues(rng_vals);
       walker_gpu.doStep(steps);
 
+      // doSweep does this
+      walker_gpu.uploadConfiguration();
+
       constexpr Scalar tolerance = std::numeric_limits<Scalar>::epsilon() * 100;
 
       auto M_cpu = walker_cpu.getM();
@@ -156,6 +184,209 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) {
       for (int i = 0; i < config1.size(); ++i)
         EXPECT_EQ(config1[i], config2[i]);
       EXPECT_EQ(walker_cpu.get_sign(), walker_gpu.get_sign());
+
+      auto fail = compareSubMatrix(M_cpu, M_gpu, "M");
+      EXPECT_FALSE(fail);
     }
   }
 }
+
+// This test still fails if its run second in the batch.
+
+TYPED_TEST(CtintWalkerSubmatrixGpuTest, fineGrained) {
+  using Scalar = TypeParam;
+  using Parameters = typename CtintWalkerSubmatrixGpuTest<Scalar>::G0Setup::Parameters;
+  using Walker = testing::phys::solver::ctint::WalkerWrapper<Scalar, Parameters>;
+  using Matrix = typename Walker::Matrix;
+  using MatrixPair = std::array<Matrix, 2>;
+  using SbmWalkerCpu =
+      testing::phys::solver::ctint::WalkerWrapperSubmatrix<Scalar, Parameters, dca::linalg::CPU>;
+  using SbmWalkerGpu =
+      testing::phys::solver::ctint::WalkerWrapperSubmatrix<Scalar, Parameters, dca::linalg::GPU>;
+
+  std::vector<double> setup_rngs{0., 0.00, 0.9,  0.5, 0.01, 0,    0.75, 0.02,
+                                 0,  0.6,  0.03, 1,   0.99, 0.04, 0.99};
+  typename CtintWalkerSubmatrixGpuTest<Scalar>::G0Setup::RngType cpu_rng(setup_rngs);
+  typename CtintWalkerSubmatrixGpuTest<Scalar>::G0Setup::RngType gpu_rng(setup_rngs);
+
+  auto& cpu_data = *this->host_setup.data_;
+  auto& gpu_data = *this->gpu_setup.data_;
+  auto& cpu_parameters = this->host_setup.parameters_;
+  auto& gpu_parameters = this->gpu_setup.parameters_;
+
+  const auto g0_func_cpu = dca::phys::solver::ctint::details::shrinkG0(cpu_data.G0_r_t);
+  G0Interpolation<CPU, Scalar> g0_cpu(g0_func_cpu);
+  const auto g0_func_gpu = dca::phys::solver::ctint::details::shrinkG0(gpu_data.G0_r_t);
+  G0Interpolation<GPU, Scalar> g0_gpu(g0_func_gpu);
+  typename CtintWalkerSubmatrixGpuTest<Scalar>::G0Setup::LabelDomain label_dmn;
+
+  constexpr int bands = dca::testing::LatticeBilayer::BANDS;
+
+  DMatrixBuilder<dca::linalg::CPU> d_matrix_cpu(g0_cpu, bands, RDmn());
+  d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false);  // cpu_parameters.adjustAlphaDd());
+  DMatrixBuilder<dca::linalg::GPU> d_matrix_gpu(g0_gpu, bands, RDmn());
+  d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false);  // gpu_parameters.adjustAlphaDd());
+
+  // ************************************
+  // Test vertex insertion / removal ****
+  // ************************************
+  // Set rng values.
+  //
+  // Insertion, vertex_id, tau, aux_spin, acceptance_rng
+  // Removal, vertex_id, acceptance_rng
+  // ...
+  // Note: if acceptance_rng <= 0 the move is always accepted, if it is > 1 the move is always
+  // rejected.
+  const std::vector<double> rng_vals{
+      0, 0,    0.1, 0.8, -1,  // Insertion.
+      0, 0.99, 0.2, 0.8, -1,  // Insertion.
+      0, 0,    0.3, 0.8, 2,   // Insertion. Rejected.
+      1, 0,    -1,            // Remove pre-existing.
+      1, 0.99, -1,            // Remove recently inserted.
+      1, 0.99, 2,             // Remove recently inserted. Rejected
+      1, 0,    2,             // Remove . Rejected
+      0, 0.99, 0.4, 0.2, -1,  // Insertion
+  };
+
+  int initial_size = 2;
+
+  cpu_parameters.setInitialConfigurationSize(initial_size);
+  gpu_parameters.setInitialConfigurationSize(initial_size);
+  int steps = 4;
+
+  cpu_rng.setNewValues(setup_rngs);
+  SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng, d_matrix_cpu);
+  walker_cpu.setInteractionVertices(cpu_data, cpu_parameters);
+
+  gpu_rng.setNewValues(setup_rngs);
+  SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng, d_matrix_gpu);
+  walker_gpu.setInteractionVertices(gpu_data, gpu_parameters);
+
+  constexpr Scalar tolerance = 10e-12;
+
+  cpu_rng.setNewValues(rng_vals);
+  walker_cpu.generateDelayedMoves(steps);
+  walker_cpu.computeMInit();
+
+  walker_gpu.uploadConfiguration();
+  gpu_rng.setNewValues(rng_vals);
+  walker_gpu.generateDelayedMoves(steps);
+  walker_gpu.uploadConfiguration();
+  walker_gpu.computeMInit();
+
+  // walker_gpu.computeGInit();
+  auto M_cpu = walker_cpu.getRawM();
+  auto M_gpu = walker_gpu.getRawM();
+
+  std::cout << "Minit init cpu size " << M_cpu[0].size().first << "," << M_cpu[0].size().second
+            << " : " << M_cpu[1].size().first << "," << M_cpu[1].size().second << '\n';
+  std::cout << "Minit init gpu size " << M_gpu[0].size().first << "," << M_gpu[0].size().second
+            << " : " << M_gpu[1].size().first << "," << M_gpu[1].size().second << '\n';
+
+  std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n';
+  std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n';
+  std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n';
+  std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n';
+
+  M_cpu[0].print();
+  M_cpu[1].print();
+  M_gpu[0].print();
+  M_gpu[1].print();
+
+  auto compareSubMatrix = [tolerance](auto& mat_cpu, auto& mat_gpu, const std::string& mat_tag) {
+    int fail = 0;
+    for (int s = 0; s < 2; ++s) {
+      std::cout << mat_tag << "_cpu[" << s << "] size: " << mat_cpu[s].size() << '\n';
+      std::cout << mat_tag << "_gpu[" << s << "] size: " << mat_gpu[s].size() << '\n';
+      for (int j = 0; j < mat_cpu[s].size().second; ++j)
+        for (int i = 0; i < mat_cpu[s].size().first; ++i) {
+          if (std::abs(mat_cpu[s](i, j) - mat_gpu[s](i, j)) > tolerance) {
+            std::cout << mat_tag << " element: " << s << "," << i << "," << j << "  cpu("
+                      << mat_cpu[s](i, j) << ") - gpu(" << mat_gpu[s](i, j) << ") > " << tolerance
+                      << '\n';
+            ++fail;
+          }
+        }
+    }
+    return fail;
+  };
+
+  auto fail = compareSubMatrix(M_cpu, M_gpu, "M");
+  EXPECT_FALSE(fail);
+
+  walker_cpu.computeGInit();
+  walker_gpu.computeGInit();
+  walker_gpu.synchronize();
+
+  auto G_cpu = walker_cpu.getRawG();
+  auto G_gpu = walker_gpu.getRawG();
+
+  fail = compareSubMatrix(G_cpu, G_gpu, "G");
+  EXPECT_FALSE(fail);
+
+  walker_cpu.mainSubmatrixProcess();
+  walker_gpu.mainSubmatrixProcess();
+
+  walker_cpu.updateM();
+  walker_gpu.updateM();
+
+  M_cpu = walker_cpu.getRawM();
+  M_gpu = walker_gpu.getRawM();
+
+  std::cout << "M post update cpu size " << M_cpu[0].size().first << "," << M_cpu[0].size().second
+            << " : " << M_cpu[1].size().first << "," << M_cpu[1].size().second << '\n';
+  std::cout << "M post update gpu size " << M_gpu[0].size().first << "," << M_gpu[0].size().second
+            << " : " << M_gpu[1].size().first << "," << M_gpu[1].size().second << '\n';
+
+  std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n';
+  std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n';
+  std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n';
+  std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n';
+
+  M_cpu[0].print();
+  M_cpu[1].print();
+  M_gpu[0].print();
+  M_gpu[1].print();
+
+  fail = compareSubMatrix(M_cpu, M_gpu, "updated_M");
+  EXPECT_FALSE(fail);
+
+  // The final configuration is the same.
+  const auto& config1 = walker_cpu.getWalkerConfiguration();
+  const auto& config2 = walker_gpu.getWalkerConfiguration();
+  ASSERT_EQ(config1.size(), config2.size());
+  for (int i = 0; i < config1.size(); ++i)
+    EXPECT_EQ(config1[i], config2[i]);
+  EXPECT_EQ(walker_cpu.get_sign(), walker_gpu.get_sign());
+
+  walker_gpu.uploadConfiguration();
+
+  auto comp_M_cpu = walker_cpu.getM();
+  auto comp_M_gpu = walker_gpu.getM();
+
+  std::cout << "computed M cpu size " << comp_M_cpu[0].size().first << ","
+            << comp_M_cpu[0].size().second << " : " << comp_M_cpu[1].size().first << ","
+            << comp_M_cpu[1].size().second << '\n';
+  std::cout << "computed M gpu size " << comp_M_gpu[0].size().first << ","
+            << comp_M_gpu[0].size().second << " : " << comp_M_gpu[1].size().first << ","
+            << comp_M_gpu[1].size().second << '\n';
+
+  std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n';
+  std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n';
+  std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n';
+  std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n';
+
+  comp_M_cpu[0].print();
+  comp_M_cpu[1].print();
+  comp_M_gpu[0].print();
+  comp_M_gpu[1].print();
+
+  fail = compareSubMatrix(comp_M_cpu, comp_M_gpu, "comp_M");
+  EXPECT_FALSE(fail);
+}
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp
index a35bf3ce7..a768310f4 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp
@@ -44,7 +44,7 @@ using namespace dca::phys::solver;
 
 // Currently testing float isn't really possible due to the way the Scalar type is
 // carried through from mc_options. See test_setup.hpp PD
-using ScalarTypes = ::testing::Types<double>; //double, 
+using ScalarTypes = ::testing::Types<double>; //double,
 TYPED_TEST_CASE(CtintWalkerSubmatrixTest, ScalarTypes);
 
 // Compare the submatrix update with a direct computation of the M matrix, and compare the
@@ -59,7 +59,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) {
   using MatrixPair = std::array<Matrix, 2>;
   using SubmatrixWalker =
     testing::phys::solver::ctint::WalkerWrapperSubmatrix<Scalar, Parameters, dca::linalg::CPU>;
-  
+
   std::vector<double> setup_rngs{0., 0.00, 0.9,  0.5, 0.01, 0,    0.75, 0.02,
                                  0,  0.6,  0.03, 1,   0.99, 0.04, 0.99};
   typename TestFixture::RngType rng(setup_rngs);
@@ -111,7 +111,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) {
     MatrixPair direct_M(walker.getM());
 
     using dca::util::RealAlias;
-    
+
     // This should just be the RealAliases of the scalar since in the
     // complex case we are checking the Euclidean norm.
     const auto tolerance = 1000.0 * std::numeric_limits<RealAlias<Scalar>>::epsilon();
@@ -121,7 +121,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) {
 
     // Compare with non submatrix walker.
     rng.setNewValues(setup_rngs);
-    Walker walker_nosub(parameters, rng, d_matrix_builder);
+    Walker walker_nosub(parameters, data, rng, d_matrix_builder);
 
     rng.setNewValues(rng_vals);
     for (int i = 0; i < steps; ++i)
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp
index 065f651bb..80d555942 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp
@@ -79,7 +79,7 @@ TYPED_TEST(CtintWalkerTest, InsertAndRemoveVertex) {
 
   Walker::setInteractionVertices(data, parameters);
 
-  Walker walker(parameters, rng, d_matrix_builder);
+  Walker walker(parameters, data, rng, d_matrix_builder);
 
   // *******************************
   // Test vertex removal ***********
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp
index bdeb041ef..9a0ebcca3 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp
@@ -33,8 +33,8 @@ class WalkerWrapper : public CtintWalker<dca::linalg::CPU, Parameters, DIST> {
   using Real = dca::util::RealAlias<Scalar>;
   using Rng = typename Base::Rng;
 
-    WalkerWrapper(Parameters& parameters_ref, Rng& rng_ref, DMatrixBuilder<dca::linalg::CPU, Scalar>& d_matrix_builder)
-      : Base(parameters_ref, dca::phys::DcaData<Parameters>(parameters_ref), rng_ref, d_matrix_builder, 0) {
+    WalkerWrapper(Parameters& parameters_ref, const dca::phys::DcaData<Parameters>& data, Rng& rng_ref, DMatrixBuilder<dca::linalg::CPU, Scalar>& d_matrix_builder)
+      : Base(parameters_ref, data, rng_ref, d_matrix_builder, 0) {
     Base::initialize(0);
   }
 
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
index 2d5dc66a8..638a35ac6 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
@@ -66,11 +66,6 @@ struct WalkerWrapperSubmatrix : public WalkerSelector<Parameters, device_t, DIST
 
   }
 
-  // This purposefully shadows
-  void doStep(const int n_steps_to_delay) {
-    BaseClass::SubmatrixBase::doStep(n_steps_to_delay);
-  }
-
   using Matrix = dca::linalg::Matrix<Scalar, CPU>;
   using MatrixPair = std::array<Matrix, 2>;
 
@@ -94,6 +89,41 @@ struct WalkerWrapperSubmatrix : public WalkerSelector<Parameters, device_t, DIST
     return BaseClass::acceptance_prob_;
   }
 
+  /** These methods allow access to protected member functions in order to be
+   *  able to write meaningful unit tests for the CtintWalker
+   */
+  void doStep(const int n_steps_to_delay) {
+    BaseClass::doStep(n_steps_to_delay);
+  }
+
+  void generateDelayedMoves(int nbr_of_movesto_delay) {
+    BaseClass::SubmatrixBase::generateDelayedMoves(nbr_of_movesto_delay);
+  }
+
+  void computeMInit() {
+    BaseClass::computeMInit();
+  }
+
+  void computeGInit() {
+    BaseClass::computeGInit();
+  }
+
+  auto getRawM() {
+    return BaseClass::getRawM();
+  }
+
+  auto getRawG() {
+    return BaseClass::getRawG();
+  }
+
+  void updateM() {
+    BaseClass::updateM();
+  }
+
+  void mainSubmatrixProcess() {
+    BaseClass::mainSubmatrixProcess();
+  }
+
 private:
   std::vector<dca::linalg::util::GpuStream> streams_;
 };
diff --git a/test/unit/phys/dca_step/cluster_solver/test_setup.hpp b/test/unit/phys/dca_step/cluster_solver/test_setup.hpp
index d032772f1..b37cc0837 100644
--- a/test/unit/phys/dca_step/cluster_solver/test_setup.hpp
+++ b/test/unit/phys/dca_step/cluster_solver/test_setup.hpp
@@ -157,6 +157,49 @@ struct G0SetupBare {
 
 };
 
+
+template <typename Scalar, class Lattice = LatticeSquare,
+          ClusterSolverId solver_name = ClusterSolverId::CT_AUX,
+          const char* input_name = default_input, DistType DT = DistType::NONE>
+struct G0SetupFromParam {
+  using LatticeType = Lattice;
+  using Model = phys::models::TightBindingModel<Lattice>;
+  using RngType = testing::StubRng;
+  using Concurrency = parallel::NoConcurrency;
+  using Parameters =
+      phys::params::Parameters<Concurrency, parallel::NoThreading, profiling::NullProfiler, Model, RngType,
+                               solver_name, dca::NumericalTraits<dca::util::RealAlias<Scalar>, Scalar>>;
+  using Data = phys::DcaData<Parameters, DT>;
+
+  // Commonly used domains.
+  using RDmn = typename Parameters::RClusterDmn;
+  using KDmn = typename Parameters::KClusterDmn;
+  using BDmn = func::dmn_0<phys::domains::electron_band_domain>;
+  using SDmn = func::dmn_0<phys::domains::electron_spin_domain>;
+  using NuDmn = func::dmn_variadic<BDmn, SDmn>;
+  using WDmn = func::dmn_0<phys::domains::frequency_domain>;
+  using LabelDomain = func::dmn_variadic<BDmn, BDmn, RDmn>;
+
+  Concurrency concurrency_;
+  Parameters parameters_;
+  std::unique_ptr<Data> data_;
+
+  G0SetupFromParam(const Parameters& params) : concurrency_(0, nullptr), parameters_(params) {}
+
+  void setUp() {
+    parameters_.update_model();
+
+    parameters_.update_domains();
+
+    data_ = std::make_unique<Data>(parameters_);
+    data_->initialize();
+  }
+
+  void TearDown() {}
+
+};
+
+
 }  // namespace testing
 }  // namespace dca
 

From ad83b27427c118dbc2b4bd52ce24f4935f39d618 Mon Sep 17 00:00:00 2001
From: Peter Doak <doakpw@ornl.gov>
Date: Thu, 14 Aug 2025 20:13:06 -0400
Subject: [PATCH 3/3] reverting dropped configuration initialization

---
 .../ctint/structs/interaction_vertices.hpp           |  2 +-
 .../ctint/walker/ctint_walker_base.hpp               |  4 ++++
 .../cluster_solver/ctint/walker/CMakeLists.txt       | 12 ++++++------
 .../walker/ct_int_walker_multinsertion_test.cpp      | 12 ++++++------
 test/unit/phys/dca_step/cluster_solver/stub_rng.hpp  | 11 +++++++++--
 5 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/structs/interaction_vertices.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/structs/interaction_vertices.hpp
index f911375af..d8ad0e270 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/structs/interaction_vertices.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/structs/interaction_vertices.hpp
@@ -91,7 +91,7 @@ class InteractionVertices {
   enum PartnershipType { NONE, SAME_SITE, ALL_SITES };
 
   std::vector<double> cumulative_weigths_;
-  double total_weigth_ = 0;
+  double total_weigth_{0};
   bool interband_propagator_ = false;
   PartnershipType partnership_type_ = NONE;
 };
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp
index 9ed7e52eb..98a8fa2b2 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp
@@ -235,6 +235,10 @@ CtintWalkerBase<Parameters, DIST>::CtintWalkerBase(const Parameters& parameters_
       thread_id_(id),
       stream_{&linalg::util::getStreamContainer()(thread_id_, 0)},
       rng_(rng_ref),
+
+      configuration_(parameters_.get_beta(), Bdmn::dmn_size(), vertices_,
+                     parameters_.getDoubleUpdateProbability()),
+
       beta_(parameters_.get_beta()),
       total_interaction_(vertices_.integratedInteraction()) {}
 
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/CMakeLists.txt b/test/unit/phys/dca_step/cluster_solver/ctint/walker/CMakeLists.txt
index d98942217..2ce304bb3 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/CMakeLists.txt
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/CMakeLists.txt
@@ -17,12 +17,12 @@ dca_add_gtest(ct_int_walker_submatrix_test
         LIBS     ${TEST_LIBS} g0_interpolation ${DCA_GPU_LIBS}
         )
 
-dca_add_gtest(ct_int_walker_multinsertion_test
-        FAST
-        GTEST_MPI_MAIN
-        INCLUDE_DIRS ${TEST_INCLUDES}
-        LIBS     ${TEST_LIBS} g0_interpolation
-        )
+# dca_add_gtest(ct_int_walker_multinsertion_test
+#         FAST
+#         GTEST_MPI_MAIN
+#         INCLUDE_DIRS ${TEST_INCLUDES}
+#         LIBS     ${TEST_LIBS} g0_interpolation
+#         )
 
 dca_add_gtest(ct_int_walker_submatrix_gpu_test
         CUDA
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp
index f79a2f182..1c1980549 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp
@@ -42,13 +42,12 @@ constexpr char input_name[] =
 template <typename SCALAR>
 struct CtintWalkerMultiInsertTestT : public ::testing::Test {
   using G0Setup = dca::testing::G0SetupFromParam<SCALAR, dca::testing::LatticeBilayer,
-                                            dca::ClusterSolverId::CT_INT, input_name>;
+                                                 dca::ClusterSolverId::CT_INT, input_name>;
   using Parameters = typename G0Setup::Parameters;
 
   virtual void SetUp() {}
 
-  void setParameters(Parameters& param_single, Parameters& param_double)
-  {
+  void setParameters(Parameters& param_single, Parameters& param_double) {
     single_setup = std::make_unique<G0Setup>(param_single);
     single_setup->setUp();
     double_setup = std::make_unique<G0Setup>(param_double);
@@ -94,11 +93,12 @@ TYPED_TEST(CtintWalkerMultiInsertTest, NoSubmatrix) {
   using Matrix = typename Walker::Matrix;
   using MatrixPair = std::array<Matrix, 2>;
 
-  Concurrency concurrency(0,nullptr);
+  Concurrency concurrency(0, nullptr);
 
   Parameters param_single("", concurrency);
   param_single.template readInput<dca::io::JSONReader>(input_name);
   param_single.setDoubleUpdateProbability(0);
+  param_single.setInitialConfigurationSize(2);
   param_single.broadcast();
 
   Parameters param_double("", concurrency);
@@ -108,14 +108,14 @@ TYPED_TEST(CtintWalkerMultiInsertTest, NoSubmatrix) {
 
   this->setParameters(param_single, param_double);
 
-  typename G0Setup::RngType rng(std::vector<Scalar>{});
+  typename G0Setup::RngType rng(true, 200);
   G0Interpolation<dca::linalg::CPU, Scalar> g0(
       dca::phys::solver::ctint::details::shrinkG0(this->single_setup->data_->G0_r_t));
   typename G0Setup::LabelDomain label_dmn;
 
   using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar>;
   DMatrixBuilder d_matrix_builder_single(g0, bands, RDmn());
-  d_matrix_builder_single.setAlphas(param_single.getAlphas(), false);
+  d_matrix_builder_single.setAlphas(param_single.getAlphas(), param_single.adjustAlphaDd());
 
   auto& data_single = this->single_setup->data_;
 
diff --git a/test/unit/phys/dca_step/cluster_solver/stub_rng.hpp b/test/unit/phys/dca_step/cluster_solver/stub_rng.hpp
index b80bb631a..e2340f995 100644
--- a/test/unit/phys/dca_step/cluster_solver/stub_rng.hpp
+++ b/test/unit/phys/dca_step/cluster_solver/stub_rng.hpp
@@ -20,6 +20,13 @@ namespace testing {
 
 class StubRng {
 public:
+  StubRng(bool bdummy, int n = 200) {
+    val_ = std::vector<double>(n);
+    double ftes = 1 / n;
+    for (int i = 0; i < val_.size(); ++i)
+      val_[i] = ftes * i;
+  }
+
   StubRng(int /*a*/ = 0, int /*b*/ = 0, int /*c*/ = 0, int /*d*/ = 0) {
     val_ = std::vector<double>(200, 0.5);
   }
@@ -43,7 +50,7 @@ class StubRng {
   int index_ = 0;
 };
 
-}  // testing
-}  // dca
+}  // namespace testing
+}  // namespace dca
 
 #endif  // TEST_UNIT_PHYS_DCA_STEP_CLUSTER_SOLVER_STUB_HPP