diff --git a/.github/workflows/ci-github-actions-self-hosted.yaml b/.github/workflows/ci-github-actions-self-hosted.yaml index 3c342950a..073880006 100644 --- a/.github/workflows/ci-github-actions-self-hosted.yaml +++ b/.github/workflows/ci-github-actions-self-hosted.yaml @@ -9,8 +9,8 @@ on: - master jobs: - v100: - runs-on: [self-hosted, Linux, X64, V100, CUDA] + two_a30: + runs-on: [self-hosted, Linux, X64, A30] env: GH_JOBNAME: ${{matrix.jobname}} @@ -18,11 +18,9 @@ jobs: strategy: fail-fast: false matrix: - jobname: [ - GCC12-MPI-CUDA-Real-Full, - ] -# GCC12-MPI-NoMPI-CUDA-Real-Full, -# GCC12-MPI-NoMPI-CUDA-Real-Debug-Full, + jobname: [GCC12-MPI-CUDA-Real-Full] + # GCC12-MPI-NoMPI-CUDA-Real-Full, + # GCC12-MPI-NoMPI-CUDA-Real-Debug-Full, steps: - name: Checkout PR branch @@ -36,4 +34,3 @@ jobs: - name: Test run: test/test_automation/ci/run_step.sh test - diff --git a/include/dca/linalg/lapack/magma.hpp b/include/dca/linalg/lapack/magma.hpp index fe2da8bb0..e838ffc48 100644 --- a/include/dca/linalg/lapack/magma.hpp +++ b/include/dca/linalg/lapack/magma.hpp @@ -150,9 +150,9 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m int* ldc, const int batch_count, const magma_queue_t queue) { using util::castMAGMAComplex; magmablas_cgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, - convertToMagmaComplex(alpha), castMAGMAComplex(a), lda, - castMAGMAComplex(b), ldb, convertToMagmaComplex(beta), - castMAGMAComplex(c), ldc, batch_count, queue); + convertToMagmaType(alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b), + ldb, convertToMagmaType(beta), castMAGMAComplex(c), ldc, batch_count, + queue); checkErrorsCudaDebug(); } inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k, @@ -163,9 +163,9 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m int* ldc, const int batch_count, const magma_queue_t queue) { using util::castMAGMAComplex; magmablas_zgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, - convertToMagmaComplex(alpha), castMAGMAComplex(a), lda, - castMAGMAComplex(b), ldb, convertToMagmaComplex(beta), - castMAGMAComplex(c), ldc, batch_count, queue); + convertToMagmaType(alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b), + ldb, convertToMagmaType(beta), castMAGMAComplex(c), ldc, batch_count, + queue); checkErrorsCudaDebug(); } diff --git a/include/dca/linalg/matrixop.hpp b/include/dca/linalg/matrixop.hpp index 9db5f6731..0679554f1 100644 --- a/include/dca/linalg/matrixop.hpp +++ b/include/dca/linalg/matrixop.hpp @@ -43,6 +43,7 @@ #include "dca/linalg/blas/use_device.hpp" #include "dca/linalg/lapack/use_device.hpp" #include "dca/linalg/matrix.hpp" +#include "dca/linalg/util/allocators/aligned_allocator.hpp" #include "dca/linalg/util/util_lapack.hpp" #include "dca/linalg/util/util_matrixop.hpp" #include "dca/linalg/vector.hpp" @@ -230,7 +231,6 @@ auto difference(const Matrix& a, const Matrix auto difference(const Matrix& a, const Matrix& b, double diff_threshold = 1e-3) { @@ -241,7 +241,7 @@ auto difference(const Matrix& a, const Matrix& template auto difference(const Matrix& a, const Matrix& b, double diff_threshold = 1e-3) { - Matrix cp_b(b); + Matrix cp_b(b); return difference(a, cp_b, diff_threshold); } @@ -354,7 +354,7 @@ void smallInverse(Matrix& m_inv, Vector& ipiv, break; } case 3: { - const Matrix m(m_inv); + const Matrix> m(m_inv); const Scalar det = m(0, 0) * (m(1, 1) * m(2, 2) - m(2, 1) * m(1, 2)) - m(1, 0) * (m(0, 1) * m(2, 2) - m(0, 2) 
* m(2, 1)) + m(2, 0) * (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1)); @@ -1005,7 +1005,8 @@ inline void multiplyDiagonalLeft(const Vector& d, template inline void multiplyDiagonalLeft(const Vector& d, const Matrix& a, Matrix& b, int thread_id = 0, int stream_id = 0) { - Vector d_gpu(d); + Vector d_gpu; + d_gpu.setAsync(d, thread_id, stream_id); multiplyDiagonalLeft(d_gpu, a, b, thread_id, stream_id); } @@ -1025,7 +1026,8 @@ inline void multiplyDiagonalRight(const Matrix& a, template inline void multiplyDiagonalRight(const Matrix& a, const Vector& d, Matrix& b, int thread_id = 0, int stream_id = 0) { - Vector d_gpu(d); + Vector d_gpu; + d_gpu.setAsync(d, thread_id, stream_id); multiplyDiagonalRight(a, d_gpu, b, thread_id, stream_id); } diff --git a/include/dca/linalg/util/cast_gpu.hpp b/include/dca/linalg/util/cast_gpu.hpp index b764e0c85..b6edab4a3 100644 --- a/include/dca/linalg/util/cast_gpu.hpp +++ b/include/dca/linalg/util/cast_gpu.hpp @@ -202,14 +202,24 @@ template using CudaComplex = typename details::ComplexContainer::type; } // namespace dca::linalg::util -inline double2 convertToMagmaType(std::complex var) { +inline magmaDoubleComplex convertToMagmaType(std::complex var) { return {reinterpret_cast(var)[0], reinterpret_cast(var)[1]}; } -inline float2 convertToMagmaType(std::complex var) { +inline magmaFloatComplex convertToMagmaType(std::complex var) { return {reinterpret_cast(var)[0], reinterpret_cast(var)[1]}; } +#ifdef DCA_HAVE_HIP +inline magmaFloatComplex convertToMagmaType(HIP_vector_type var) { + return {reinterpret_cast(var)[0], reinterpret_cast(var)[1]}; +} + +inline magmaDoubleComplex convertToMagmaType(HIP_vector_type var) { + return {reinterpret_cast(var)[0], reinterpret_cast(var)[1]}; +} +#endif + namespace dca::util { template using MAGMATypeMap = typename std::disjunction< @@ -231,8 +241,13 @@ using MAGMATypeMap = typename std::disjunction< OnTypesEqual**, const magmaDoubleComplex**>, OnTypesEqual* const*, const magmaFloatComplex* const*>, OnTypesEqual* const*, const magmaDoubleComplex* const*>, + OnTypesEqual, + OnTypesEqual, +#ifdef DCA_HAVE_HIP + OnTypesEqual* const*, const magmaFloatComplex* const*>, + OnTypesEqual* const*, const magmaDoubleComplex* const*>, +#endif default_type>::type; - template __device__ __host__ MAGMATypeMap castMagmaType(T var) { return reinterpret_cast>(var); diff --git a/include/dca/linalg/vector.hpp b/include/dca/linalg/vector.hpp index 3e8882b77..a21fd0203 100644 --- a/include/dca/linalg/vector.hpp +++ b/include/dca/linalg/vector.hpp @@ -34,7 +34,7 @@ namespace dca { namespace linalg { // dca::linalg:: - template > class Vector : public Allocator { public: @@ -52,9 +52,8 @@ class Vector : public Allocator { /** copy constructor except for name. * this is strange but for historical reasons is kept. - * has needed to be explicit because with the `const ThisType&` somehow lead to an implicit conversion - * from an int to a Vector& argument that landed here. - * This occurred in Debug with + * has needed to be explicit because with the `const ThisType&` somehow lead to an implicit + * conversion from an int to a Vector& argument that landed here. This occurred in Debug with */ explicit Vector(const ThisType& rhs, const std::string& name = default_name_); @@ -323,7 +322,8 @@ void Vector::setAsync(const Container& rhs, } template -void Vector::setToZeroAsync(const util::GpuStream& stream [[maybe_unused]]) { +void Vector::setToZeroAsync(const util::GpuStream& stream + [[maybe_unused]]) { // TODO: implement in copy.hpp. 
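For reference, the convertToMagmaType overloads touched above in cast_gpu.hpp rely on std::complex<T> being array-oriented accessible (the standard guarantees reinterpret_cast<T(&)[2]>(z)[0] is the real part and [1] the imaginary part), so the value can be repacked into MAGMA's POD complex structs. A minimal sketch of that pattern, using stand-in structs in place of magmaFloatComplex/magmaDoubleComplex since the MAGMA headers are not pulled in here:

```cpp
#include <complex>
#include <iostream>

// Stand-ins for magmaFloatComplex / magmaDoubleComplex: plain {x, y} PODs.
struct FakeMagmaFloatComplex  { float  x, y; };
struct FakeMagmaDoubleComplex { double x, y; };

// Mirrors the cast_gpu.hpp pattern: read real/imag through an array view of
// the std::complex value and aggregate-initialize the MAGMA-style struct.
inline FakeMagmaDoubleComplex convertToMagmaType(std::complex<double> var) {
  return {reinterpret_cast<double(&)[2]>(var)[0], reinterpret_cast<double(&)[2]>(var)[1]};
}

inline FakeMagmaFloatComplex convertToMagmaType(std::complex<float> var) {
  return {reinterpret_cast<float(&)[2]>(var)[0], reinterpret_cast<float(&)[2]>(var)[1]};
}

int main() {
  std::complex<double> alpha(1.5, -2.0);
  auto m = convertToMagmaType(alpha);
  std::cout << m.x << " " << m.y << '\n';  // prints: 1.5 -2
}
```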
#ifdef DCA_HAVE_GPU checkRC(cudaMemsetAsync(data_, 0, size_ * sizeof(ScalarType), stream)); @@ -333,12 +333,14 @@ void Vector::setToZeroAsync(const util::GpuS } template -void Vector::setToZero(const util::GpuStream& stream [[maybe_unused]]) { +void Vector::setToZero(const util::GpuStream& stream + [[maybe_unused]]) { dca::linalg::util::Memory::setToZero(data_, size_, stream); } // template -// void Vector::setToZero(const util::GpuStream& stream [[maybe_unused]]) { +// void Vector::setToZero(const util::GpuStream& stream +// [[maybe_unused]]) { // // TODO: implement in copy.hpp. // dca::linalg::util::memory::setToZero(data_, size_, stream); // } diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/accumulator/ctint_accumulator.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/accumulator/ctint_accumulator.hpp index 69ac57e18..4162d9d82 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/accumulator/ctint_accumulator.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/accumulator/ctint_accumulator.hpp @@ -35,8 +35,7 @@ namespace solver { namespace ctint { // dca::phys::solver::ctint:: -template +template class CtintAccumulator : public MC_accumulator_data { public: constexpr static ClusterSolverId solver_id{ClusterSolverId::CT_INT}; @@ -47,7 +46,7 @@ class CtintAccumulator : public MC_accumulator_data using Base::accumulated_phase_; using Base::current_phase_; using Base::number_of_measurements_; - + using ParametersType = Parameters; using DataType = phys::DcaData; using SpAccumulator = accumulator::SpAccumulator; @@ -112,7 +111,7 @@ class CtintAccumulator : public MC_accumulator_data int get_number_of_measurements() const { std::cout << "number_of_measurements ==" << number_of_measurements_ << '\n'; std::cout << "accumulated_phase_.count() == " << accumulated_phase_.count() << '\n'; - //assert(accumulated_phase_.count() == number_of_measurements_); + // assert(accumulated_phase_.count() == number_of_measurements_); return number_of_measurements_; } @@ -147,7 +146,6 @@ class CtintAccumulator : public MC_accumulator_data MatrixConfiguration configuration_; std::vector streams_; - linalg::util::GpuEvent event_; util::Accumulator accumulated_order_; @@ -170,7 +168,7 @@ class CtintAccumulator : public MC_accumulator_data template template CtintAccumulator::CtintAccumulator(const Parameters& pars, - const Data& data, int id) + const Data& data, int id) : parameters_(pars), thread_id_(id), sp_accumulator_(pars), @@ -187,7 +185,7 @@ void CtintAccumulator::initialize(const int dca_iterat parameters_.dump_every_iteration()); accumulated_order_.reset(); accumulated_phase_.reset(); - + Base::initialize(dca_iteration); sp_accumulator_.resetAccumulation(); sp_accumulator_.clearSingleMeasurement(); @@ -215,13 +213,7 @@ void CtintAccumulator::updateFrom(Walker& walker) { measure_flops_ = M_[0].nrCols() * M_[0].nrCols() * 2 * 2 * 8 * 19; if constexpr (device == linalg::GPU) { - for (int s = 0; s < 2; ++s) { - event_.record(walker.get_stream(s)); - // Synchronize sp accumulator streams with walker. - event_.block(*sp_accumulator_.get_streams()[s]); - // Synchronize both walker streams with tp accumulator. 
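The multiplyDiagonalLeft/multiplyDiagonalRight overloads above now stage the diagonal on the device with setAsync on the caller's thread/stream instead of constructing a device Vector from the host one, which would imply a blocking copy. A minimal sketch of that staging pattern with the raw CUDA runtime API; the helper name stageAsync and the surrounding scaffolding are illustrative, not the DCA++ interface, and the host buffer is assumed to be pinned so the copy can overlap:

```cpp
#include <cuda_runtime.h>
#include <cassert>
#include <cstddef>

// Enqueue a host->device copy on `stream` without blocking the host.
// The host buffer must stay alive (and ideally be pinned) until the
// stream has consumed it.
void stageAsync(const double* host, double* dev, std::size_t n, cudaStream_t stream) {
  cudaError_t rc = cudaMemcpyAsync(dev, host, n * sizeof(double),
                                   cudaMemcpyHostToDevice, stream);
  assert(rc == cudaSuccess);
}

int main() {
  constexpr std::size_t n = 1024;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  double* h_d = nullptr;  // pinned host buffer holding the diagonal
  cudaMallocHost(reinterpret_cast<void**>(&h_d), n * sizeof(double));
  for (std::size_t i = 0; i < n; ++i)
    h_d[i] = 1.0 + static_cast<double>(i);

  double* d_d = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&d_d), n * sizeof(double));

  stageAsync(h_d, d_d, n, stream);  // analogous to d_gpu.setAsync(d, thread_id, stream_id)
  // ... launch the multiplyDiagonal kernel on the same stream here ...
  cudaStreamSynchronize(stream);    // only needed before reusing or freeing h_d

  cudaFree(d_d);
  cudaFreeHost(h_d);
  cudaStreamDestroy(stream);
}
```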
- event_.block(*tp_accumulator_.get_stream()); - } + walker.synchronize(); } configuration_ = walker.getConfiguration(); diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp index adf4b72ad..6db6d167c 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp @@ -37,7 +37,7 @@ #include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_choice.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp" #include "dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation.hpp" -//#include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator.hpp" +// #include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator.hpp" #include "dca/phys/dca_data/dca_data.hpp" #include "dca/phys/dca_loop/dca_loop_data.hpp" #include "dca/phys/dca_step/symmetrization/symmetrize.hpp" @@ -58,7 +58,7 @@ class CtintClusterSolver { static constexpr ClusterSolverId solver_type{ClusterSolverId::CT_INT}; using Real = typename config::McOptions::MC_REAL; - using Scalar = typename dca::util::ScalarSelect::type; + using Scalar = typename dca::util::ScalarSelect::type; using Concurrency = typename Parameters::concurrency_type; using CDA = ClusterDomainAliases; @@ -111,8 +111,11 @@ class CtintClusterSolver { // Returns the function G(k,w) without averaging across MPI ranks. auto local_G_k_w() const; - DMatrixBuilder& getResource() { return *d_matrix_builder_; }; -protected: // thread jacket interface. + DMatrixBuilder& getResource() { + return *d_matrix_builder_; + }; + +protected: // thread jacket interface. using ParametersType = Parameters; using DataType = Data; using Rng = typename Parameters::random_number_generator; @@ -140,8 +143,8 @@ class CtintClusterSolver { /** gather all M and G4 and accumulated sign * \param[out] Returns: average phase - * \param[in,out] G greens function has allreduce or leaveoneoutSum applied to it - * side effect seems undesirable and motivated by saving copy. + * \param[in,out] G greens function has allreduce or leaveoneoutSum applied to + * it side effect seems undesirable and motivated by saving copy. * \param[in] compute_error does leave one out sum removing the local accumulated type. 
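For context, the accumulator lines removed above ordered the streams by recording an event on each walker stream (event_.record) and making the sp/tp accumulator streams wait on it (event_.block); the new code delegates that responsibility to walker.synchronize(). A sketch of the record/wait idiom those removed lines used, written against the plain CUDA runtime API with illustrative stream names:

```cpp
#include <cuda_runtime.h>

// Make `consumer` wait until all work currently enqueued on `producer`
// has finished, without blocking the host thread.
void orderStreams(cudaStream_t producer, cudaStream_t consumer, cudaEvent_t event) {
  cudaEventRecord(event, producer);         // marks the current tail of `producer`
  cudaStreamWaitEvent(consumer, event, 0);  // work enqueued on `consumer` after this waits
}

int main() {
  cudaStream_t walker_stream, accumulator_stream;
  cudaStreamCreate(&walker_stream);
  cudaStreamCreate(&accumulator_stream);

  cudaEvent_t ev;
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);  // no timing -> cheaper event

  // ... enqueue walker kernels on walker_stream ...
  orderStreams(walker_stream, accumulator_stream, ev);
  // ... enqueue accumulation kernels on accumulator_stream; they run after the walker work ...

  cudaStreamSynchronize(accumulator_stream);
  cudaEventDestroy(ev);
  cudaStreamDestroy(walker_stream);
  cudaStreamDestroy(accumulator_stream);
}
```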
*/ auto gatherMAndG4(SpGreensFunction& M, bool compute_error) const; @@ -192,7 +195,7 @@ CtintClusterSolver::CtintClusterSolver( template void CtintClusterSolver::initialize(int dca_iteration) { dca_iteration_ = dca_iteration; - + g0_.initializeShrinked(data_.G0_r_t_cluster_excluded); d_matrix_builder_->setAlphas(parameters_.getAlphas(), parameters_.adjustAlphaDd()); @@ -202,7 +205,6 @@ void CtintClusterSolver::initialize(i if (concurrency_.id() == concurrency_.first()) std::cout << "\n\n\t CT-INT Integrator has initialized (DCA-iteration : " << dca_iteration << ")\n\n"; - } template @@ -353,10 +355,8 @@ void CtintClusterSolver::warmUp() { const int n_sweep = parameters_.get_warm_up_sweeps(); for (int i = 0; i < n_sweep; i++) { walker_->doSweep(); - walker_->updateShell(i, n_sweep); } - walker_->markThermalized(); } diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp index c63d70f18..94d9753bc 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp @@ -33,6 +33,8 @@ namespace solver { namespace ctint { // dca::phys::solver::ctint:: + +// I think the class hierarch isn't helpful here. class SolverConfiguration : public MatrixConfiguration { public: using BaseClass = MatrixConfiguration; diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp index 98a8fa2b2..469785075 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp @@ -41,6 +41,8 @@ #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp" #endif +//#define DEBUG_SUBMATRIX + namespace dca { namespace phys { namespace solver { diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp index 80711ee45..f183c1620 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp @@ -66,12 +66,12 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase& d_matrix_builder_; @@ -258,10 +258,6 @@ void CtintWalkerSubmatrixCpu::computeMInit() { d_matrix_builder_.computeG0(D_, configuration_.getSector(s), n_init_[s], n_max_[s], 0); -#ifdef DEBUG_SUBMATRIX - D_.print(); -#endif - std::array, 2> f_values; f_values[s].resize(n_init_[s]); for (int j = 0; j < n_init_[s]; ++j) { @@ -284,7 +280,6 @@ void CtintWalkerSubmatrixCpu::computeMInit() { using namespace dca::addt_str_oper; std::cout << "M_[" << s << "] size: " << M_[s].size() << '\n'; #endif - M_[s].resize(n_max_[s]); MatrixView M(M_[s], 0, 0, n_init_[s], n_init_[s]); @@ -301,6 +296,7 @@ void CtintWalkerSubmatrixCpu::computeMInit() { #ifdef DEBUG_SUBMATRIX D_M.print(); #endif + for (int i = 0; i < n_max_[s]; ++i) { for (int j = n_init_[s]; j < n_max_[s]; ++j) { M_[s](i, j) = 0; diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp index 17cb337c3..277115fb0 100644 --- 
a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp @@ -24,7 +24,6 @@ #include #include -#include "dca/linalg/util/gpu_event.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp" @@ -91,16 +90,13 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase getRawG(); MatrixPair getRawM(); - void updateM() override; -private: - void doStep() override; - protected: using BaseClass::configuration_; using BaseClass::M_; @@ -140,6 +136,7 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase, 2> source_list_dev_; MatrixPair M_dev_; + MatrixPair M_D_dev_; MatrixPair Gamma_inv_dev_; MatrixPair D_dev_; MatrixPair G_dev_; @@ -259,9 +256,10 @@ template void CtintWalkerSubmatrixGpu::computeMInit() { // Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_); get_stream()->sync(); - for (int s = 0; s < 2; ++s) + for (int s = 0; s < 2; ++s) { M_dev_[s].resize(n_max_[s]); - + M_D_dev_[s].resize(n_max_[s]); + } for (int s = 0; s < 2; ++s) { const int delta = n_max_[s] - n_init_[s]; if (delta > 0) { @@ -320,10 +318,10 @@ template void CtintWalkerSubmatrixGpu::computeGInit() { // Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_); get_stream()->sync(); + for (int s = 0; s < 2; ++s) { const int delta = n_max_[s] - n_init_[s]; - // In cpu we only do all this if delta > 0 auto& f_dev = f_dev_[s]; G_dev_[s].resizeNoCopy(n_max_[s]); @@ -477,6 +475,7 @@ CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSu template CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSubmatrixGpu< + Parameters, DIST>::getM() { std::array, 2> M; synchronize(); diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp index 1906f5a74..bd92c2001 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp @@ -39,7 +39,8 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { using typename BaseClass::Real; using typename BaseClass::Scalar; - CtintWalkerSubmatrixBase(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, int id = 0); + CtintWalkerSubmatrixBase(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, + int id = 0); virtual ~CtintWalkerSubmatrixBase() = default; @@ -50,7 +51,12 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { using BaseClass::order; virtual void setMFromConfig() = 0; - auto getF() const { return f_; } + auto getF() const { + return f_; + } + + virtual void markThermalized() override; + protected: virtual void doStep() override; void doSteps(); @@ -72,8 +78,6 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { */ void mainSubmatrixProcess(); - void markThermalized() override; - void transformM(); // For testing purposes. 
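The reshuffle above promotes markThermalized() to a public virtual override on the submatrix base class. The point of keeping it virtual is that the call may be made through a base-class pointer or reference, so the derived hook (which also recomputes the MC weight via setMFromConfig) must be reached by dynamic dispatch. A stripped-down sketch of that dispatch; the class names are placeholders, not the real walker hierarchy:

```cpp
#include <iostream>

struct WalkerBase {
  virtual ~WalkerBase() = default;
  virtual void markThermalized() { thermalized_ = true; }
  bool thermalized_ = false;
};

struct SubmatrixWalker : WalkerBase {
  // Override: besides flipping the flag, rebuild state that depends on it.
  void markThermalized() override {
    WalkerBase::markThermalized();
    recomputeWeight();
  }
  void recomputeWeight() { std::cout << "recompute M from configuration\n"; }
};

void warmUp(WalkerBase& walker) {
  // ... sweeps ...
  walker.markThermalized();  // reaches SubmatrixWalker::markThermalized via the vtable
}

int main() {
  SubmatrixWalker w;
  warmUp(w);
}
```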
@@ -84,7 +88,6 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { virtual void computeMInit() = 0; private: - void doSubmatrixUpdate(); /** returns [acceptance_probability , mc_weight_ratio ] @@ -217,16 +220,15 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { template CtintWalkerSubmatrixBase::CtintWalkerSubmatrixBase(const Parameters& parameters_ref, - const Data& /*data*/, - Rng& rng_ref, - int id) - : BaseClass(parameters_ref, rng_ref, id) { + const Data& /*data*/, + Rng& rng_ref, int id) + : BaseClass(parameters_ref, rng_ref, id) { if (BaseClass::concurrency_.id() == BaseClass::concurrency_.first() && thread_id_ == 0) std::cout << "\nCT-INT submatrix walker created." << std::endl; } template -void CtintWalkerSubmatrixBase::markThermalized() { +void CtintWalkerSubmatrixBase::markThermalized() { thermalized_ = true; nb_steps_per_sweep_ = std::max(1., std::ceil(sweeps_per_meas_ * partial_order_avg_.mean())); @@ -239,7 +241,7 @@ void CtintWalkerSubmatrixBase::markThermalized() { // Recompute the Monte Carlo weight. setMFromConfig(); #ifndef NDEBUG - //writeAlphas(); + // writeAlphas(); #endif } diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp index ecec506e0..40bdb92a4 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp @@ -83,6 +83,8 @@ class DMatrixBuilder { void computeG0(Matrix& G0, const Sector& configuration, const int n_init, const int n_max, const int which_section) const; + auto& getSiteDiff() { return site_diff_; } + #ifdef DCA_HAVE_GPU virtual void computeG0(linalg::Matrix& /*G0*/, const details::DeviceConfiguration& /*configuration*/, int /*n_init*/, diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh b/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh index 56b2527df..9a47b7973 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh @@ -31,7 +31,7 @@ class ClusterHelper { public: /// Initialize real Cluster static void set(int nc, const int* add, int lda, const int* sub, int lds); - /// Initialize reciprocal cluster + /// Initialize reciprocal cluster static void setMomentum(int nc, const int* add, int lda, const int* sub, int lds); // Returns the index of id_1 + id_2. 
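The ClusterHelper touched above exposes precomputed addition/subtraction tables for cluster sites and momenta, which the GPU code uploads with cudaMemcpyToSymbol: the index of r_i + r_j (modulo the cluster) becomes a single lookup in an nc x nc table instead of coordinate arithmetic. A host-side sketch of that lookup idea on a tiny periodic 1D cluster; the table semantics chosen here are an assumption for illustration, not the exact DCA++ convention:

```cpp
#include <cassert>
#include <vector>

// Minimal analogue of ClusterHelper::set(): precompute add/sub tables for a
// periodic 1D cluster of nc sites, so id(r_i + r_j) is a single table lookup.
struct TinyClusterHelper {
  int nc;
  std::vector<int> add;  // add[i + nc * j] = index of r_i + r_j
  std::vector<int> sub;  // sub[i + nc * j] = index of r_j - r_i

  explicit TinyClusterHelper(int nc_) : nc(nc_), add(nc_ * nc_), sub(nc_ * nc_) {
    for (int j = 0; j < nc; ++j)
      for (int i = 0; i < nc; ++i) {
        add[i + nc * j] = (i + j) % nc;
        sub[i + nc * j] = ((j - i) % nc + nc) % nc;
      }
  }

  int addIndex(int i, int j) const { return add[i + nc * j]; }
  int subIndex(int i, int j) const { return sub[i + nc * j]; }
};

int main() {
  TinyClusterHelper helper(4);
  assert(helper.addIndex(3, 2) == 1);  // (3 + 2) mod 4
  assert(helper.subIndex(3, 2) == 3);  // (2 - 3) mod 4
}
```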
diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh b/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh index 8f2b75355..bdc861b3b 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh @@ -44,7 +44,6 @@ public: private: static bool initialized_; - std::size_t subdm_step_[2]; }; diff --git a/include/dca/phys/parameters/mc_solver_parameters.hpp b/include/dca/phys/parameters/mc_solver_parameters.hpp index 595b946b5..c2440a50c 100644 --- a/include/dca/phys/parameters/mc_solver_parameters.hpp +++ b/include/dca/phys/parameters/mc_solver_parameters.hpp @@ -99,7 +99,7 @@ int McSolverParameters::getBufferSize(const Concurrency template void McSolverParameters::pack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) const { + int buffer_size, int& position) const { concurrency.pack(buffer, buffer_size, position, expansion_parameter_K_); concurrency.pack(buffer, buffer_size, position, initial_configuration_size_); concurrency.pack(buffer, buffer_size, position, initial_matrix_size_); @@ -110,7 +110,7 @@ void McSolverParameters::pack(const Concurrency& concur template void McSolverParameters::unpack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) { + int buffer_size, int& position) { concurrency.unpack(buffer, buffer_size, position, expansion_parameter_K_); concurrency.unpack(buffer, buffer_size, position, initial_configuration_size_); concurrency.unpack(buffer, buffer_size, position, initial_matrix_size_); @@ -207,16 +207,18 @@ int McSolverParameters::getBufferSize(const Concurre } template -void McSolverParameters::pack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) const { +void McSolverParameters::pack(const Concurrency& concurrency, + char* buffer, int buffer_size, + int& position) const { concurrency.pack(buffer, buffer_size, position, self_energy_tail_cutoff_); concurrency.pack(buffer, buffer_size, position, steps_per_sweep_); concurrency.pack(buffer, buffer_size, position, shifts_per_sweep_); } template -void McSolverParameters::unpack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) { +void McSolverParameters::unpack(const Concurrency& concurrency, + char* buffer, int buffer_size, + int& position) { concurrency.unpack(buffer, buffer_size, position, self_energy_tail_cutoff_); concurrency.unpack(buffer, buffer_size, position, steps_per_sweep_); concurrency.unpack(buffer, buffer_size, position, shifts_per_sweep_); @@ -252,6 +254,17 @@ void McSolverParameters::readWrite(ReaderOrWriter& r // Specialization for CT-INT template <> class McSolverParameters { +private: + /// code does not currently cover corner case of configuration size 0, as such this isn't a sensible default to use. 
+ int initial_configuration_size_ = 2; + double alpha_dd_pos_ = 0.501; + double alpha_dd_neg_ = 0; + double alpha_ndd_ = 1e-4; + bool adjust_alpha_dd_ = false; + double double_update_probability_ = 0; + bool all_sites_partnership_ = 0; + int max_submatrix_size_ = 1; + public: template int getBufferSize(const Concurrency& concurrency) const; @@ -298,16 +311,6 @@ class McSolverParameters { void setMaxSubmatrixSize(const int size) { max_submatrix_size_ = size; } - -private: - int initial_configuration_size_ = 0; - double alpha_dd_pos_ = 0.501; - double alpha_dd_neg_ = 0; - double alpha_ndd_ = 1e-4; - bool adjust_alpha_dd_ = false; - double double_update_probability_ = 0; - bool all_sites_partnership_ = 0; - int max_submatrix_size_ = 1; }; template @@ -327,7 +330,7 @@ int McSolverParameters::getBufferSize(const Concurrency template void McSolverParameters::pack(const Concurrency& concurrency, char* buffer, - const int buffer_size, int& position) const { + const int buffer_size, int& position) const { concurrency.pack(buffer, buffer_size, position, initial_configuration_size_); concurrency.pack(buffer, buffer_size, position, alpha_dd_pos_); concurrency.pack(buffer, buffer_size, position, alpha_dd_neg_); @@ -340,7 +343,7 @@ void McSolverParameters::pack(const Concurrency& concur template void McSolverParameters::unpack(const Concurrency& concurrency, char* buffer, - const int buffer_size, int& position) { + const int buffer_size, int& position) { concurrency.unpack(buffer, buffer_size, position, initial_configuration_size_); concurrency.unpack(buffer, buffer_size, position, alpha_dd_pos_); concurrency.unpack(buffer, buffer_size, position, alpha_dd_neg_); diff --git a/include/dca/phys/parameters/physics_parameters.hpp b/include/dca/phys/parameters/physics_parameters.hpp index 51f53cba4..12ff59554 100644 --- a/include/dca/phys/parameters/physics_parameters.hpp +++ b/include/dca/phys/parameters/physics_parameters.hpp @@ -57,6 +57,7 @@ class PhysicsParameters { return adjust_chemical_potential_; } + const void set_beta(double beta) { beta_ = beta; } private: double beta_; double density_; diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp index c188ee348..14365bbba 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp +++ b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp @@ -126,7 +126,7 @@ auto DMatrixBuilder::computeGamma(const int aux_spin_type, const in // Compute only the parts of G0 required at a given moment. (Re)Computing every element is not needed in most situations. 
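The pack/unpack methods reformatted above follow the usual MPI-style contiguous-buffer protocol: getBufferSize reserves space, pack appends each member while advancing a position cursor, and unpack reads the members back in the same order. A self-contained sketch of that round trip with a toy concurrency object; the names here are illustrative, not the DCA++ concurrency interface:

```cpp
#include <cassert>
#include <cstring>
#include <vector>

// Toy stand-in for the concurrency object's pack/unpack interface.
struct ToyConcurrency {
  template <typename T>
  int get_buffer_size(const T&) const { return sizeof(T); }

  template <typename T>
  void pack(char* buffer, int /*size*/, int& position, const T& value) const {
    std::memcpy(buffer + position, &value, sizeof(T));
    position += sizeof(T);
  }

  template <typename T>
  void unpack(const char* buffer, int /*size*/, int& position, T& value) const {
    std::memcpy(&value, buffer + position, sizeof(T));
    position += sizeof(T);
  }
};

struct ToySolverParameters {
  int initial_configuration_size = 2;
  double alpha_dd_pos = 0.501;

  int getBufferSize(const ToyConcurrency& c) const {
    return c.get_buffer_size(initial_configuration_size) + c.get_buffer_size(alpha_dd_pos);
  }
  void pack(const ToyConcurrency& c, char* buf, int size, int& pos) const {
    c.pack(buf, size, pos, initial_configuration_size);
    c.pack(buf, size, pos, alpha_dd_pos);  // order must match unpack()
  }
  void unpack(const ToyConcurrency& c, const char* buf, int size, int& pos) {
    c.unpack(buf, size, pos, initial_configuration_size);
    c.unpack(buf, size, pos, alpha_dd_pos);
  }
};

int main() {
  ToyConcurrency c;
  ToySolverParameters src;
  ToySolverParameters dst;
  dst.initial_configuration_size = 0;
  dst.alpha_dd_pos = 0.0;

  std::vector<char> buffer(src.getBufferSize(c));
  int pos = 0;
  src.pack(c, buffer.data(), static_cast<int>(buffer.size()), pos);

  pos = 0;
  dst.unpack(c, buffer.data(), static_cast<int>(buffer.size()), pos);
  assert(dst.initial_configuration_size == 2 && dst.alpha_dd_pos == 0.501);
}
```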
template -void DMatrixBuilder::computeG0(Matrix& G0, const Sector& configuration, const int n_init, + void DMatrixBuilder::computeG0(Matrix& G0, const Sector& configuration, const int n_init, const int n_max, const int which_section) const { int b_i, b_j, r_i, r_j; Real tau_i, tau_j; diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp index 2b59fffa6..4f2b171f9 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp +++ b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp @@ -32,6 +32,8 @@ DMatrixBuilder::DMatrixBuilder(const G0Interpolation G0, const int b_j = config.getRightB(j); const auto tau_j = config.getTau(j); + const int label = dca::phys::solver::details::solver_helper.index(b_i, b_j, config.getLeftR(i), config.getRightR(j)); diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu b/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu index f89b48cd2..9a27dac9d 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu +++ b/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu @@ -34,6 +34,7 @@ using dca::util::castGPUType; template __global__ void setRightSectorToIdKernel(Scalar* m, const int ldm, const int n0, const int n_max) { const int i = threadIdx.x + blockDim.x * blockIdx.x; + // this preserves the behavior where we skip the lower left sector. const int j = threadIdx.y + blockDim.y * blockIdx.y + n0; if (i >= n_max || j >= n_max) diff --git a/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu b/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu index 62eb122be..072d1af19 100644 --- a/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu +++ b/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu @@ -85,11 +85,14 @@ void ClusterHelper::setMomentum(int nc, const int* add, int lda, const int* sub, size_t cluster_helper_size; - checkRC(cudaMemcpyToSymbol(HIP_SYMBOL(cluster_momentum_add_matrix), &host_helper.add_matrix_, sizeof(int*))); - checkRC(cudaMemcpyToSymbol(HIP_SYMBOL(cluster_momentum_sub_matrix), &host_helper.sub_matrix_, sizeof(int*))); + checkRC(cudaMemcpyToSymbol(HIP_SYMBOL(cluster_momentum_add_matrix), &host_helper.add_matrix_, + sizeof(int*))); + checkRC(cudaMemcpyToSymbol(HIP_SYMBOL(cluster_momentum_sub_matrix), &host_helper.sub_matrix_, + sizeof(int*))); checkRC(cudaGetSymbolSize(&cluster_helper_size, HIP_SYMBOL(cluster_momentum_helper))); assert(cluster_helper_size == sizeof(ClusterHelper)); - checkRC(cudaMemcpyToSymbol(HIP_SYMBOL(cluster_momentum_helper), &host_helper, sizeof(ClusterHelper))); + checkRC(cudaMemcpyToSymbol(HIP_SYMBOL(cluster_momentum_helper), &host_helper, + sizeof(ClusterHelper))); checkRC(cudaDeviceSynchronize()); #ifdef DEBUG_CLUSTER_HELPER checkClusterMomentumHelper<<<1, 1, 0, 0>>>(nc, lds); @@ -127,8 +130,10 @@ void ClusterHelper::set(int nc, const int* add, int lda, const int* sub, int lds checkRC(cudaGetSymbolSize(&cluster_helper_size, HIP_SYMBOL(cluster_real_helper))); assert(cluster_helper_size == sizeof(ClusterHelper)); - checkRC(cudaMemcpyToSymbol(HIP_SYMBOL(cluster_add_matrix), &host_helper.add_matrix_, sizeof(int*))); - checkRC(cudaMemcpyToSymbol(HIP_SYMBOL(cluster_sub_matrix), &host_helper.sub_matrix_, sizeof(int*))); + checkRC( + cudaMemcpyToSymbol(HIP_SYMBOL(cluster_add_matrix), &host_helper.add_matrix_, sizeof(int*))); + checkRC( + 
cudaMemcpyToSymbol(HIP_SYMBOL(cluster_sub_matrix), &host_helper.sub_matrix_, sizeof(int*))); checkRC(cudaMemcpyToSymbol(HIP_SYMBOL(cluster_real_helper), &host_helper, cluster_helper_size)); checkRC(cudaDeviceSynchronize()); diff --git a/test/integration/cluster_solver/ctint/ctint_double_update_comparison_test.cpp b/test/integration/cluster_solver/ctint/ctint_double_update_comparison_test.cpp index dd218f5f2..33104e949 100644 --- a/test/integration/cluster_solver/ctint/ctint_double_update_comparison_test.cpp +++ b/test/integration/cluster_solver/ctint/ctint_double_update_comparison_test.cpp @@ -46,8 +46,7 @@ using McOptions = MockMcOptions; #include "dca/profiling/null_profiler.hpp" #include "dca/util/git_version.hpp" #include "dca/util/modules.hpp" - -const std::string input_dir = DCA_SOURCE_DIR "/test/integration/cluster_solver/ctint/"; +#include "test/unit/phys/dca_step/cluster_solver/test_setup.hpp" template void initializeWalkerStatic(const G0& g0, const Parameters& parameters, const Data& data) { @@ -56,7 +55,23 @@ void initializeWalkerStatic(const G0& g0, const Parameters& parameters, const Da WalkerType::setInteractionVertices(data, parameters); } -TEST(CtintDoubleUpdateComparisonTest, Self_Energy) { +constexpr char input_name[] = + DCA_SOURCE_DIR "/test/integration/cluster_solver/ctint/double_insertion_comparison_input.json"; + +struct CtintDoubleUpdateComparisonTest : public ::testing::Test { + using G0Setup = dca::testing::G0SetupBare; + virtual void SetUp() { + host_setup.SetUp(); + host2_setup.SetUp(); + } + + virtual void TearDown() {} + G0Setup host_setup; + G0Setup host2_setup; +}; + +TEST_F(CtintDoubleUpdateComparisonTest, Self_Energy) { using RngType = dca::testing::StubRng; using RealRng = dca::math::random::StdRandomWrapper; using Lattice = dca::phys::models::FeAsLattice; @@ -65,44 +80,45 @@ TEST(CtintDoubleUpdateComparisonTest, Self_Energy) { using Concurrency = dca::parallel::NoConcurrency; using Parameters = dca::phys::params::Parameters, Scalar>>; + RngType, dca::ClusterSolverId::CT_INT, + dca::NumericalTraits, Scalar>>; using Data = dca::phys::DcaData; using Walker = testing::phys::solver::ctint::WalkerWrapper; using dca::DistType; - using WalkerSubmatrix = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; + using WalkerSubmatrix = testing::phys::solver::ctint::WalkerWrapperSubmatrix; + + auto& cpu_data = host_setup.data_; + auto& cpu2_data = host2_setup.data_; + auto& cpu_parameters = host_setup.parameters_; + auto& cpu2_parameters = host2_setup.parameters_; Concurrency concurrency(0, nullptr); dca::util::GitVersion::print(); dca::util::Modules::print(); - Parameters parameters(dca::util::GitVersion::string(), concurrency); - parameters.read_input_and_broadcast( - input_dir + "/double_insertion_comparison_input.json"); - parameters.update_model(); - parameters.update_domains(); - - Data data(parameters); - data.initialize(); - - dca::phys::solver::G0Interpolation g0( - dca::phys::solver::ctint::details::shrinkG0(data.G0_r_t)); + dca::phys::solver::G0Interpolation g0_cpu( + dca::phys::solver::ctint::details::shrinkG0(cpu_data->G0_r_t)); + dca::phys::solver::G0Interpolation g0_cpu2( + dca::phys::solver::ctint::details::shrinkG0(cpu2_data->G0_r_t)); RealRng rng(0, 1); std::vector rng_vals(10000); for (auto& x : rng_vals) x = rng(); - RngType rng1(rng_vals), rng2(rng_vals); + RngType rng1(rng_vals); + RngType rng2(rng_vals); using RDmn = typename Parameters::RClusterDmn; - - dca::phys::solver::ctint::DMatrixBuilder d_matrix_builder_(g0, 
Parameters::lattice_type::BANDS, RDmn()); - Walker walker1(parameters, rng1, d_matrix_builder_); - parameters.setMaxSubmatrixSize(16); - dca::phys::solver::ctint::DMatrixBuilder d_matrix_builder_2(g0, Parameters::lattice_type::BANDS, RDmn()); - WalkerSubmatrix walker2(parameters, rng2, d_matrix_builder_2); + dca::phys::solver::ctint::DMatrixBuilder d_matrix_builder_( + g0_cpu, Parameters::lattice_type::BANDS, RDmn()); + Walker walker1(cpu_parameters, *cpu_data, rng1, d_matrix_builder_); + + cpu2_parameters.setMaxSubmatrixSize(16); + dca::phys::solver::ctint::DMatrixBuilder d_matrix_builder_2( + g0_cpu2, Parameters::lattice_type::BANDS, RDmn()); + WalkerSubmatrix walker2(cpu2_parameters, *cpu2_data, rng2, d_matrix_builder_2); EXPECT_NEAR(walker1.get_MC_log_weight(), walker2.get_MC_log_weight(), 5e-7); @@ -116,7 +132,7 @@ TEST(CtintDoubleUpdateComparisonTest, Self_Energy) { EXPECT_NEAR(walker1.get_MC_log_weight(), walker2.get_MC_log_weight(), 5e-7); EXPECT_EQ(walker1.get_sign(), walker2.get_sign()); - auto check_direct_weight = [] (auto& walker) { + auto check_direct_weight = [](auto& walker) { const auto fast_weight = walker.get_MC_log_weight(); walker.setMFromConfig(); const auto direct_weight = walker.get_MC_log_weight(); diff --git a/test/unit/linalg/matrixop_cpu_gpu_test.cpp b/test/unit/linalg/matrixop_cpu_gpu_test.cpp index 8a1324fce..a96256f3d 100644 --- a/test/unit/linalg/matrixop_cpu_gpu_test.cpp +++ b/test/unit/linalg/matrixop_cpu_gpu_test.cpp @@ -39,7 +39,6 @@ TEST(MatrixopCPUGPUTest, difference) { dca::linalg::Matrix> b(a); b(ia, ja) += sg * diff; double err = std::abs(epsilon * b(ia, ja)); - EXPECT_NEAR(diff, dca::linalg::matrixop::difference(da, b, 2 * diff), err); EXPECT_NEAR(diff, dca::linalg::matrixop::difference(da, b, diff + err), err); auto diffcalc = dca::linalg::matrixop::difference(da, b, 2 * diff); diff --git a/test/unit/linalg/util/CMakeLists.txt b/test/unit/linalg/util/CMakeLists.txt index 4eef721ed..8a427007c 100644 --- a/test/unit/linalg/util/CMakeLists.txt +++ b/test/unit/linalg/util/CMakeLists.txt @@ -29,7 +29,7 @@ if(DCA_HAVE_CUDA) dca_add_gtest(complex_op_cuda_test GTEST_MAIN CUDA - LIBS ${DCA_GPU_LIBS} magma::magma BLAS::BLAS) + LIBS ${DCA_GPU_LIBS} magma::magma) dca_gpu_runtime_link(complex_op_cuda_test) endif() diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp index 0edaad83c..cb7f61571 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp @@ -30,8 +30,8 @@ using McOptions = MockMcOptions; #include "walker_wrapper_submatrix.hpp" #include "dca/linalg/matrixop.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp" - #include "dca/util/to_string.hpp" + using namespace dca::addt_str_oper; constexpr char input_name[] = @@ -62,12 +62,12 @@ using namespace dca::phys::solver; template using CtintWalkerSubmatrixGpuTest = CtINTWalkerSubmatrixGPUTestT; -template +template using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder; // Currently testing float isn't really possible due to the way the Scalar type is // carried through from mc_options. 
See test_setup.hpp PD -using ScalarTypes = ::testing::Types; //double, +using ScalarTypes = ::testing::Types; // double, TYPED_TEST_CASE(CtintWalkerSubmatrixGpuTest, ScalarTypes); // Compare the submatrix update with a direct computation of the M matrix, and compare the @@ -80,9 +80,9 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { using Matrix = typename Walker::Matrix; using MatrixPair = std::array; using SbmWalkerCpu = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; + testing::phys::solver::ctint::WalkerWrapperSubmatrix; using SbmWalkerGpu = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; + testing::phys::solver::ctint::WalkerWrapperSubmatrix; std::vector setup_rngs{0., 0.00, 0.9, 0.5, 0.01, 0, 0.75, 0.02, 0, 0.6, 0.03, 1, 0.99, 0.04, 0.99}; @@ -100,15 +100,14 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { G0Interpolation g0_gpu(g0_func_gpu); typename CtintWalkerSubmatrixGpuTest::G0Setup::LabelDomain label_dmn; - constexpr int bands = dca::testing::LatticeBilayer::BANDS; DMatrixBuilder d_matrix_cpu(g0_cpu, bands, RDmn()); + d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false); // cpu_parameters.adjustAlphaDd()); SbmWalkerCpu::setInteractionVertices(cpu_data, cpu_parameters); - d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false); //cpu_parameters.adjustAlphaDd()); DMatrixBuilder d_matrix_gpu(g0_gpu, bands, RDmn()); + d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false); // gpu_parameters.adjustAlphaDd()); SbmWalkerGpu::setInteractionVertices(cpu_data, gpu_parameters); - d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false); //gpu_parameters.adjustAlphaDd()); // ************************************ // Test vertex insertion / removal **** @@ -162,21 +161,27 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng, d_matrix_gpu); walker_gpu.setInteractionVertices(gpu_data, gpu_parameters); + // I don't think we can call these before steps are done. At least the CPU implementation has nan's in M + // MatrixPair old_M_cpu(walker_cpu.getM()); + // MatrixPair old_M_gpu(walker_gpu.getM()); + + // doSweep does this + walker_gpu.uploadConfiguration(); + + constexpr Scalar tolerance = std::numeric_limits::epsilon() * 100; + + // for (int s = 0; s < 2; ++s) + // EXPECT_TRUE(dca::linalg::matrixop::areNear(old_M_cpu[s], old_M_gpu[s], tolerance)); + cpu_rng.setNewValues(rng_vals); walker_cpu.doStep(steps); + gpu_rng.setNewValues(rng_vals); walker_gpu.doStep(steps); // doSweep does this walker_gpu.uploadConfiguration(); - constexpr Scalar tolerance = std::numeric_limits::epsilon() * 100; - - auto M_cpu = walker_cpu.getM(); - auto M_gpu = walker_gpu.getM(); - for (int s = 0; s < 2; ++s) - EXPECT_TRUE(dca::linalg::matrixop::areNear(M_cpu[s], M_gpu[s], tolerance)); - // The final configuration is the same. 
const auto& config1 = walker_cpu.getWalkerConfiguration(); const auto& config2 = walker_gpu.getWalkerConfiguration(); @@ -185,6 +190,9 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { EXPECT_EQ(config1[i], config2[i]); EXPECT_EQ(walker_cpu.get_sign(), walker_gpu.get_sign()); + auto M_cpu = walker_cpu.getM(); + auto M_gpu = walker_gpu.getM(); + auto fail = compareSubMatrix(M_cpu, M_gpu, "M"); EXPECT_FALSE(fail); } diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp index a768310f4..9a39bdaeb 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp @@ -34,17 +34,17 @@ constexpr char input_name[] = template using CtintWalkerSubmatrixTest = - typename dca::testing::G0Setup; - - using CDA = dca::phys::ClusterDomainAliases; - using RDmn = typename CDA::RClusterDmn; + typename dca::testing::G0Setup; +using CDA = dca::phys::ClusterDomainAliases; +using RDmn = typename CDA::RClusterDmn; using namespace dca::phys::solver; // Currently testing float isn't really possible due to the way the Scalar type is // carried through from mc_options. See test_setup.hpp PD -using ScalarTypes = ::testing::Types; //double, +using ScalarTypes = ::testing::Types; // double, TYPED_TEST_CASE(CtintWalkerSubmatrixTest, ScalarTypes); // Compare the submatrix update with a direct computation of the M matrix, and compare the @@ -58,7 +58,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) { using Matrix = typename Walker::Matrix; using MatrixPair = std::array; using SubmatrixWalker = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; + testing::phys::solver::ctint::WalkerWrapperSubmatrix; std::vector setup_rngs{0., 0.00, 0.9, 0.5, 0.01, 0, 0.75, 0.02, 0, 0.6, 0.03, 1, 0.99, 0.04, 0.99}; @@ -117,7 +117,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) { const auto tolerance = 1000.0 * std::numeric_limits>::epsilon(); for (int s = 0; s < 2; ++s) - EXPECT_TRUE(dca::linalg::matrixop::areNear(direct_M[s], new_M[s], tolerance)); + EXPECT_TRUE(dca::linalg::matrixop::areNear(direct_M[s], new_M[s], tolerance)); // Compare with non submatrix walker. 
rng.setNewValues(setup_rngs); @@ -128,8 +128,8 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) { walker_nosub.doStep(); // this needs to be std::abs because it could be a "complex" probability - EXPECT_NEAR(std::abs(walker.getAcceptanceProbability()), std::abs(walker_nosub.getAcceptanceProbability()), - tolerance); + EXPECT_NEAR(std::abs(walker.getAcceptanceProbability()), + std::abs(walker_nosub.getAcceptanceProbability()), tolerance); auto config_nosubm = walker_nosub.getWalkerConfiguration(); ASSERT_EQ(config.size(), config_nosubm.size()); diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json b/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json index a6a5a4b05..88a9aeed0 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json @@ -9,16 +9,18 @@ "physics" : { - "beta" : 2, + "beta" : 4, "chemical-potential" : 0 }, "bilayer-Hubbard-model": { - "t" : 0, - "U" : 0, - "V" : 0, - "V-prime" : 2 + "t" : 1.0, + "t_prime" : 0.1, + "t_perp" : 0.01, + "U" : 1.0, + "V" : 1.0, + "V-prime" : 2.0 }, "Hund-model": diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp index 638a35ac6..6393815cd 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp @@ -50,7 +50,7 @@ struct WalkerSelector { #endif // DCA_HAVE_GPU using namespace dca::phys::solver::ctint; - template +template struct WalkerWrapperSubmatrix : public WalkerSelector::type { using BaseClass = typename WalkerSelector::type; using Scalar = SCALAR; @@ -59,26 +59,26 @@ struct WalkerWrapperSubmatrix : public WalkerSelector& d_matrix_builder) - : BaseClass(parameters_ref, dca::phys::DcaData(parameters_ref), rng_ref, d_matrix_builder, 0), + WalkerWrapperSubmatrix(/*const*/ Parameters& parameters_ref, Rng& rng_ref, + DMatrixBuilder& d_matrix_builder) + : BaseClass(parameters_ref, dca::phys::DcaData(parameters_ref), rng_ref, + d_matrix_builder, 0), streams_(3) { BaseClass::initialize(0); + } + WalkerWrapperSubmatrix(/*const*/ Parameters& parameters_ref, + const dca::phys::DcaData& data, Rng& rng_ref, + DMatrixBuilder& d_matrix_builder) + : BaseClass(parameters_ref, data, rng_ref, d_matrix_builder, 0), streams_(3) { + BaseClass::initialize(0); } using Matrix = dca::linalg::Matrix; using MatrixPair = std::array; MatrixPair getM() { - std::array, 2> M; - - BaseClass::computeM(M); -#ifdef DCA_HAVE_GPU - checkRC(cudaDeviceSynchronize()); -#endif - - std::array, 2> M_copy{M[0], M[1]}; - return M_copy; + return BaseClass::getM(); } const auto& getWalkerConfiguration() const { diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json index 2d27da0f8..6b997974b 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json @@ -9,14 +9,18 @@ "physics" : { - "beta" : 2, + "beta" : 4, "chemical-potential" : 0 }, "bilayer-Hubbard-model": { - "t" : 1, - "U" : 2 + "t" : 1.0, + "t_prime" : 0.1, + "t_perp" : 0.01, + "U" : 1.0, + "V" : 1.0, + "V-prime" : 2.0 }, "Hund-model": diff --git 
a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_test.cpp b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_test.cpp index 53a198222..2521ff7f3 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_test.cpp @@ -46,10 +46,8 @@ using ConfigGenerator = dca::testing::AccumulationTest; using Configuration = ConfigGenerator::Configuration; using Sample = ConfigGenerator::Sample; -using Scalar = double; - using TpAccumulatorSinglebandTest = - dca::testing::G0Setup; + dca::testing::G0Setup; TEST_F(TpAccumulatorSinglebandTest, Accumulate) { const std::array n{17, 17}; @@ -77,9 +75,10 @@ TEST_F(TpAccumulatorSinglebandTest, Accumulate) { func_names[dca::phys::FourPointType::PARTICLE_HOLE_CHARGE] = "G4_ph_charge"; func_names[dca::phys::FourPointType::PARTICLE_PARTICLE_UP_DOWN] = "G4_pp_up_down"; - for (const dca::phys::FourPointType type : - {dca::phys::FourPointType::PARTICLE_HOLE_TRANSVERSE, dca::phys::FourPointType::PARTICLE_HOLE_MAGNETIC, - dca::phys::FourPointType::PARTICLE_HOLE_CHARGE, dca::phys::FourPointType::PARTICLE_PARTICLE_UP_DOWN}) { + for (const dca::phys::FourPointType type : {dca::phys::FourPointType::PARTICLE_HOLE_TRANSVERSE, + dca::phys::FourPointType::PARTICLE_HOLE_MAGNETIC, + dca::phys::FourPointType::PARTICLE_HOLE_CHARGE, + dca::phys::FourPointType::PARTICLE_PARTICLE_UP_DOWN}) { parameters_.set_four_point_channel(type); dca::phys::solver::accumulator::TpAccumulator accumulator( diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_test_baseline.hdf5 b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_test_baseline.hdf5 index dca3ef510..d3618ad70 100644 Binary files a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_test_baseline.hdf5 and b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_test_baseline.hdf5 differ diff --git a/test/unit/phys/dca_step/cluster_solver/test_setup.hpp b/test/unit/phys/dca_step/cluster_solver/test_setup.hpp index b37cc0837..a86fad3e8 100644 --- a/test/unit/phys/dca_step/cluster_solver/test_setup.hpp +++ b/test/unit/phys/dca_step/cluster_solver/test_setup.hpp @@ -30,6 +30,7 @@ #include "dca/phys/models/analytic_hamiltonians/hund_lattice.hpp" #include "dca/phys/models/analytic_hamiltonians/rashba_hubbard.hpp" #include "dca/phys/models/analytic_hamiltonians/Moire_Hubbard.hpp" +#include "dca/phys/models/analytic_hamiltonians/fe_as_lattice.hpp" #include "dca/parallel/no_concurrency/no_concurrency.hpp" #include "dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp" #include "dca/parallel/no_threading/no_threading.hpp" @@ -50,6 +51,7 @@ using LatticeHund = phys::models::HundLattice; using LatticeKagome = phys::models::KagomeHubbard>; using LatticeRashba = phys::models::RashbaHubbard>; using LatticeMoireHubbard = phys::models::moire_hubbard>; +using LatticeFeAs = phys::models::FeAsLattice; template @@ -196,10 +195,8 @@ struct G0SetupFromParam { } void TearDown() {} - }; - } // namespace testing } // namespace dca
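Several of the updated walker tests compare the CPU and GPU M matrices with a tolerance scaled from machine epsilon (e.g. std::numeric_limits<Scalar>::epsilon() * 100 passed to matrixop::areNear or compareSubMatrix). A minimal stand-alone version of that kind of elementwise check, independent of the DCA++ matrix types and assuming an absolute tolerance:

```cpp
#include <cassert>
#include <cmath>
#include <limits>
#include <vector>

// Returns true if every entry of a and b agrees to within `tol` (absolute).
// Plays the role of dca::linalg::matrixop::areNear in the tests above.
template <typename Scalar>
bool areNear(const std::vector<std::vector<Scalar>>& a,
             const std::vector<std::vector<Scalar>>& b, Scalar tol) {
  if (a.size() != b.size())
    return false;
  for (std::size_t i = 0; i < a.size(); ++i) {
    if (a[i].size() != b[i].size())
      return false;
    for (std::size_t j = 0; j < a[i].size(); ++j)
      if (std::abs(a[i][j] - b[i][j]) > tol)
        return false;
  }
  return true;
}

int main() {
  using Scalar = double;
  const Scalar tol = std::numeric_limits<Scalar>::epsilon() * 100;

  std::vector<std::vector<Scalar>> m_cpu{{1.0, 2.0}, {3.0, 4.0}};
  auto m_gpu = m_cpu;
  m_gpu[1][1] += std::numeric_limits<Scalar>::epsilon();  // tiny rounding difference
  assert(areNear(m_cpu, m_gpu, tol));

  m_gpu[0][0] += 1e-6;  // well above the tolerance
  assert(!areNear(m_cpu, m_gpu, tol));
}
```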