diff --git a/.github/workflows/ci-github-actions-self-hosted.yaml b/.github/workflows/ci-github-actions-self-hosted.yaml index 073880006..7508b20dd 100644 --- a/.github/workflows/ci-github-actions-self-hosted.yaml +++ b/.github/workflows/ci-github-actions-self-hosted.yaml @@ -18,7 +18,8 @@ jobs: strategy: fail-fast: false matrix: - jobname: [GCC12-MPI-CUDA-Real-Full] + jobname: [LLVM21-MPI-CUDA-Real-Full] + # GCC12-MPI-NoMPI-CUDA-Real-Full, # GCC12-MPI-NoMPI-CUDA-Real-Debug-Full, diff --git a/applications/analysis/CMakeLists.txt b/applications/analysis/CMakeLists.txt index dd3cdeb8e..aef6ba13d 100644 --- a/applications/analysis/CMakeLists.txt +++ b/applications/analysis/CMakeLists.txt @@ -4,7 +4,8 @@ if (DCA_BUILD_ANALYSIS) add_executable(main_analysis main_analysis.cpp) target_include_directories(main_analysis PRIVATE ${DCA_INCLUDE_DIRS}) if (DCA_HAVE_GPU) - target_link_libraries(main_analysis PRIVATE ${DCA_GPU_LIBS}) + target_link_libraries(main_analysis PRIVATE ${DCA_GPU_LIBS} + magma::magma) endif() target_link_libraries(main_analysis PUBLIC signals ${DCA_LIBS} simplex_gm_rule) diff --git a/build-aux/local_CPU.cmake b/build-aux/local_CPU.cmake index db2f81238..974a17ab5 100644 --- a/build-aux/local_CPU.cmake +++ b/build-aux/local_CPU.cmake @@ -16,14 +16,14 @@ set(gtest_DIR "${CMAKE_CURRENT_LIST_DIR}/../libs/gmock-1.7.0/gtest/" # HDF5 # CMake 3.6 does not properly find HDF5 if HDF5_ROOT is not set. 
-set(HDF5_ROOT "/opt/homebrew/Cellar/hdf5/1.14.4.3/" CACHE PATH "Path to HDF5 installation directory.") +set(HDF5_ROOT "/opt/homebrew/Cellar/hdf5/1.14.6/" CACHE PATH "Path to HDF5 installation directory.") # set(HDF5_ROOT "/usr/local/anaconda3/pkgs/hdf5-1.10.6-hdbbcd12_0/" CACHE PATH "Path to HDF5 installation directory.") # FFTW # set(FFTW_INCLUDE_DIR "/usr/local/Cellar/fftw/3.3.10/include" CACHE PATH "Path to fftw3.h.") # set(FFTW_LIBRARY "/usr/local/Cellar/fftw/3.3.10/lib/libfftw3.a" CACHE FILEPATH "Path to FFTW3 library.") set(FFTW_INCLUDE_DIR "/opt/homebrew/Cellar/fftw/3.3.10_1/include" CACHE PATH "Path to fftw3.h.") -set(FFTW_LIBRARY "/opt/homebrew/Cellar/fftw/3.3.10_1/lib/libfftw3.a" CACHE FILEPATH "Path to FFTW3 library.") +set(FFTW_LIBRARY "/opt/homebrew/Cellar/fftw/3.3.10_2/lib/libfftw3.a" CACHE FILEPATH "Path to FFTW3 library.") mark_as_advanced(gtest_DIR SPRNG_DIR HDF5_ROOT) @@ -37,4 +37,3 @@ set(MPIEXEC_NUMPROC_FLAG "-np" CACHE STRING "Flag used by TEST_RUNNER to specify the number of processes.") mark_as_advanced(TEST_RUNNER MPIEXEC_NUMPROC_FLAG) - diff --git a/cmake/dca_config.cmake b/cmake/dca_config.cmake index 8b854c209..976f24253 100644 --- a/cmake/dca_config.cmake +++ b/cmake/dca_config.cmake @@ -121,8 +121,8 @@ endif() # Lattice type set(DCA_LATTICE "square" CACHE STRING "Lattice type, options are: bilayer | square | triangular | - Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn | La3Ni2O7_bilayer") -set_property(CACHE DCA_LATTICE PROPERTY STRINGS bilayer square triangular Kagome hund twoband_Cu threeband + Kagome | Plaquette | hund | twoband_Cu | threeband | Rashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn | La3Ni2O7_bilayer") +set_property(CACHE DCA_LATTICE PROPERTY STRINGS bilayer square triangular Kagome Plaquette hund twoband_Cu threeband Rashba_Hubbard Moire_Hubbard FeAs material_NiO material_FeSn La3Ni2O7_bilayer) if (DCA_LATTICE STREQUAL "bilayer") @@ 
-149,48 +149,64 @@ elseif (DCA_LATTICE STREQUAL "triangular") set(DCA_LATTICE_TYPE dca::phys::models::triangular_lattice) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/triangular_lattice.hpp") + elseif (DCA_LATTICE STREQUAL "Kagome") set(DCA_LATTICE_TYPE dca::phys::models::KagomeHubbard) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp") + +elseif (DCA_LATTICE STREQUAL "Plaquette") + set(DCA_LATTICE_TYPE dca::phys::models::squarePlaquetteHubbard) + set(DCA_LATTICE_INCLUDE + "dca/phys/models/analytic_hamiltonians/square_plaquette_hubbard.hpp") + elseif (DCA_LATTICE STREQUAL "hund") set(DCA_LATTICE_TYPE dca::phys::models::HundLattice) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/hund_lattice.hpp") + elseif (DCA_LATTICE STREQUAL "threeband") set(DCA_LATTICE_TYPE dca::phys::models::ThreebandHubbard) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/threeband_hubbard.hpp") + elseif (DCA_LATTICE STREQUAL "Rashba_Hubbard") set(DCA_LATTICE_TYPE dca::phys::models::RashbaHubbard) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/rashba_hubbard.hpp") + elseif (DCA_LATTICE STREQUAL "Moire_Hubbard") set(DCA_LATTICE_TYPE dca::phys::models::moire_hubbard) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/Moire_Hubbard.hpp") + elseif (DCA_LATTICE STREQUAL "twoband_chain") set(DCA_LATTICE_TYPE dca::phys::models::twoband_chain>) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/hund_lattice.hpp") + elseif (DCA_LATTICE STREQUAL "FeAs") set(DCA_LATTICE_TYPE dca::phys::models::FeAsLattice) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/fe_as_lattice.hpp") + elseif (DCA_LATTICE STREQUAL "twoband_Cu") set(DCA_LATTICE_TYPE dca::phys::models::TwoBandCu) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/twoband_Cu.hpp") + elseif (DCA_LATTICE STREQUAL "material_NiO") set(DCA_LATTICE_TYPE "dca::phys::models::material_lattice") 
set(DCA_LATTICE_INCLUDE "dca/phys/models/material_hamiltonians/material_lattice.hpp") set(DCA_MODEL_IS_MATERIAL_LATTICE ON CACHE BOOL "is the model a material lattice") + elseif (DCA_LATTICE STREQUAL "material_FeSn") set(DCA_LATTICE_TYPE "dca::phys::models::material_lattice") set(DCA_LATTICE_INCLUDE "dca/phys/models/material_hamiltonians/material_lattice.hpp") set(DCA_MODEL_IS_MATERIAL_LATTICE ON CACHE BOOL "is the model a material lattice") + else() message(FATAL_ERROR "Please set DCA_LATTICE to a valid option: bilayer | La3Ni2O7_bilayer | square | triangular | Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn.") endif() diff --git a/cmake/dca_testing.cmake b/cmake/dca_testing.cmake index cc597e4e4..00d563bde 100644 --- a/cmake/dca_testing.cmake +++ b/cmake/dca_testing.cmake @@ -16,6 +16,7 @@ if(DCA_HAVE_HPX) set(test_thread_option HPX) endif() + # Adds a test written with Google Test. # # dca_add_gtest(name @@ -36,7 +37,7 @@ endif() function(dca_add_gtest name) set(options FAST EXTENSIVE STOCHASTIC PERFORMANCE GTEST_MAIN GTEST_MPI_MAIN THREADED MPI CUDA CUDA_MPI) set(oneValueArgs MPI_NUMPROC) - set(multiValueArgs INCLUDE_DIRS SOURCES LIBS) + set(multiValueArgs INCLUDE_DIRS SOURCES LIBS CUSTOM_SOURCE TEST_DEFINES) cmake_parse_arguments(DCA_ADD_GTEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) # FAST, EXTENSIVE and PERFORMANCE are mutually exclusive. 
@@ -47,7 +48,7 @@ function(dca_add_gtest name) (DCA_ADD_GTEST_STOCHASTIC AND DCA_ADD_GTEST_EXTENSIVE) OR (DCA_ADD_GTEST_STOCHASTIC AND DCA_ADD_GTEST_PERFORMANCE)) message(FATAL_ERROR "Incorrect use of dca_add_gtest.\n - dca_add_gtest(name\n + dca_add_gtest_impl (name src_file compile_definitions \n [FAST | EXTENSIVE | STOCHASTIC | PERFORMANCE]\n [GTEST_MAIN]\n [THREADED]\n @@ -97,7 +98,7 @@ function(dca_add_gtest name) IF (DCA_ADD_GTEST_CUDA_MPI AND NOT DCA_HAVE_GPU_AWARE_MPI) return() endif() - + # Right now we're only testing GPU distributed code on one node so its pointless # without more than one GPU per node. if (DCA_ADD_GTEST_CUDA_MPI AND DCA_HAVE_GPU_AWARE_MPI AND (DCA_TEST_GPU_COUNT LESS 2) ) @@ -112,9 +113,15 @@ function(dca_add_gtest name) set(DCA_ADD_GTEST_SOURCES ${PROJECT_SOURCE_DIR}/test/dca_gtest_main_mpi.cpp ${DCA_ADD_GTEST_SOURCES}) endif() - add_executable(${name} ${name}.cpp ${DCA_ADD_GTEST_SOURCES}) - + if (DCA_ADD_GTEST_CUSTOM_SOURCE) + add_executable(${name} ${DCA_ADD_GTEST_CUSTOM_SOURCE} ${DCA_ADD_GTEST_SOURCES}) + else () + add_executable(${name} ${name}.cpp ${DCA_ADD_GTEST_SOURCES}) + endif() target_compile_definitions(${name} PRIVATE DCA_SOURCE_DIR=\"${PROJECT_SOURCE_DIR}\") + if (DCA_ADD_GTEST_TEST_DEFINES) + target_compile_definitions(${name} PRIVATE THIS_TEST_DEFINES=${DCA_ADD_GTEST_TEST_DEFINES}) + endif() # this is hacky but allows continued use of DCA_THREADING_LIBS # if (DCA_ADD_GTEST_LIBS MATCHES "parallel_hpx") @@ -182,7 +189,7 @@ function(dca_add_gtest name) target_compile_definitions(${name} PUBLIC DCA_HAVE_HPX) target_link_libraries(${name} PUBLIC HPX::hpx HPX::wrap_main) endif() - + target_include_directories(${name} PRIVATE ${gtest_SOURCE_DIR}/include ${DCA_ADD_GTEST_INCLUDE_DIRS}) @@ -239,5 +246,3 @@ function(dca_add_perftest name) target_compile_definitions(${name} PRIVATE DCA_SOURCE_DIR=\"${PROJECT_SOURCE_DIR}\") endfunction() - - diff --git a/include/dca/function/util/difference.hpp 
b/include/dca/function/util/difference.hpp index 94c1d1d55..1c1f9e749 100644 --- a/include/dca/function/util/difference.hpp +++ b/include/dca/function/util/difference.hpp @@ -64,8 +64,6 @@ Difference difference(const function& f1, l2_error = std::sqrt(l2_error / l2); linf_error /= linf; - - return Difference{l1_error, l2_error, linf_error}; } @@ -73,7 +71,9 @@ Difference difference(const function& f1, template Difference difference(const function& f1, const function& f2) { if (f1.size() != f2.size()) - throw(std::logic_error("The two functions have different size.")); + throw(std::logic_error("The two functions have different sizes. f1.size() (" + + std::to_string(f1.size()) + ") != f2.size() (" + + std::to_string(f2.size()) + ")!\n")); double l1 = 0.; double l2 = 0.; diff --git a/include/dca/math/function_transform/special_transforms/space_transform_2D.hpp b/include/dca/math/function_transform/special_transforms/space_transform_2D.hpp index af7fe2ea1..45bf17f5f 100644 --- a/include/dca/math/function_transform/special_transforms/space_transform_2D.hpp +++ b/include/dca/math/function_transform/special_transforms/space_transform_2D.hpp @@ -30,20 +30,18 @@ namespace math { namespace transform { // dca::math::transform:: -template +template class SpaceTransform2D { protected: using Complex = dca::util::ComplexAlias; - using BDmn = func::dmn_0; - using SDmn = func::dmn_0; + using BDmn = BDMN; // func::dmn_0; + using SDmn = SPDMN; // func::dmn_0; public: // Apply the 2D Fourier transform defined as - // f(k1, b1, k2, b2) = \sum_{r1, r2} Exp[i (k1 (r1 + a[b1]) - k2 (r2 + a[b2])] f(r1, b1, r2, b2) / Nc, - // where a[b] is the displacement vector associated with each band, - // and rearrange the output domains. - // In/Out: f_input. The input is overwritten with a partial result.
- // Out: f_output + // f(k1, b1, k2, b2) = \sum_{r1, r2} Exp[i (k1 (r1 + a[b1]) - k2 (r2 + a[b2])] f(r1, b1, r2, b2) / + // Nc, where a[b] is the displacement vector associated with each band, and rearrange the output + // domains. In/Out: f_input. The input is overwritten with a partial result. Out: f_output template static void execute( func::function>& f_input, @@ -55,16 +53,15 @@ class SpaceTransform2D { return SpaceToMomentumTransform::hasPhaseFactor(); } static const auto& getPhaseFactors(); -protected: - +protected: }; -template +template template -void SpaceTransform2D::execute( - func::function>& f_input, - func::function>& f_output) { +void SpaceTransform2D::execute( + func::function>& f_input, + func::function>& f_output) { assert(SDmn::dmn_size() == 2); const int nc = RDmn::dmn_size(); linalg::Matrix tmp(nc); @@ -80,19 +77,22 @@ void SpaceTransform2D::execute( linalg::MatrixView f_r_r(&f_input(0, 0, b1, b2, s, w1, w2), nc); // f(k1,k2) = \sum_{r1, r2} exp(i(k1 * r1 - k2 *r2)) f(r1, r2) / Nc - linalg::matrixop::gemm(T, f_r_r,tmp); + linalg::matrixop::gemm(T, f_r_r, tmp); linalg::matrixop::gemm('N', 'C', norm, tmp, T, Complex(0), f_r_r); // f(k1, k2) *= Exp[i (k1 a[b1] - k2 a[b2])] for (int k2 = 0; k2 < nc; ++k2) - for (int k1 = 0; k1 < nc; ++k1) + for (int k1 = 0; k1 < nc; ++k1) { + assert(b1 < BDMN::dmn_size()); f_output(b1, b2, s, k1, k2, w1, w2) = f_r_r(k1, k2) * phase_factors(b1, k1) * std::conj(phase_factors(b2, k2)); + } } } -template -const linalg::Matrix, linalg::CPU>& SpaceTransform2D::get_T_matrix() { +template +const linalg::Matrix, linalg::CPU>& SpaceTransform2D< + RDmn, KDmn, BDMN, SPDMN, Scalar>::get_T_matrix() { auto initialize_T_matrix = []() { assert(RDmn::dmn_size() == KDmn::dmn_size()); linalg::Matrix T(RDmn::dmn_size()); @@ -100,8 +100,9 @@ const linalg::Matrix, linalg::CPU>& SpaceTransfo const auto& r = RDmn::parameter_type::get_elements()[j]; for (int i = 0; i < KDmn::dmn_size(); ++i) { const auto& k = 
KDmn::parameter_type::get_elements()[i]; - using Real = dca::util::RealAlias; - auto temp_exp = std::exp(dca::util::ComplexAlias{0, static_cast(util::innerProduct(k, r))}); + using Real = dca::util::RealAlias; + auto temp_exp = + std::exp(dca::util::ComplexAlias{0, static_cast(util::innerProduct(k, r))}); T(i, j) = typename decltype(T)::ValueType{temp_exp.real(), temp_exp.imag()}; } } @@ -112,9 +113,9 @@ const linalg::Matrix, linalg::CPU>& SpaceTransfo return T; } -template -const auto& SpaceTransform2D::getPhaseFactors() { - static func::function> phase_factors("Phase factors."); +template +const auto& SpaceTransform2D::getPhaseFactors() { + static func::function> phase_factors("Phase factors."); using Real = dca::util::RealAlias; static std::once_flag flag; @@ -127,10 +128,10 @@ const auto& SpaceTransform2D::getPhaseFactors() { for (int k = 0; k < KDmn::dmn_size(); ++k) { const auto& k_vec = KDmn::get_elements()[k]; for (int b = 0; b < BDmn::dmn_size(); ++b) { - // Scalar could be cuComplex or cuDoubleComplex so... - std::complex temp_phase{0., static_cast(util::innerProduct(k_vec, a_vecs[b]))}; - temp_phase = std::exp(temp_phase); - phase_factors(b,k) = Complex{temp_phase.real(), temp_phase.imag()}; + // Scalar could be cuComplex or cuDoubleComplex so... 
+ std::complex temp_phase{0., static_cast(util::innerProduct(k_vec, a_vecs[b]))}; + temp_phase = std::exp(temp_phase); + phase_factors(b, k) = Complex{temp_phase.real(), temp_phase.imag()}; } } }); diff --git a/include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp b/include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp index e9c2cb4af..56b2eb2c1 100644 --- a/include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp +++ b/include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp @@ -16,9 +16,9 @@ #include #include #include -//#include "dca/config/haves_defines.hpp" -// its expected that dca::config::McOptions will be provided in some manner before parameters.hpp is -// included +// #include "dca/config/haves_defines.hpp" +// its expected that dca::config::McOptions will be provided in some manner before parameters.hpp +// is included #include "space_transform_2D.hpp" #include "dca/util/type_help.hpp" @@ -32,11 +32,11 @@ namespace math { namespace transform { // dca::math::transform:: -template -class SpaceTransform2DGpu : public SpaceTransform2D { +template +class SpaceTransform2DGpu : public SpaceTransform2D { protected: // I guess the base type should use the host side scalar - using Base = SpaceTransform2D; + using Base = SpaceTransform2D; using Complex = dca::util::ComplexAlias; using MatrixDev = linalg::Matrix; using VectorDev = linalg::Vector; @@ -73,7 +73,7 @@ class SpaceTransform2DGpu : public SpaceTransform2D { } private: - using BDmn = func::dmn_0; + using BDmn = typename Base::BDmn; // func::dmn_0; const MatrixDev& get_T_matrix() { auto initialize_T_matrix = []() { @@ -101,9 +101,9 @@ class SpaceTransform2DGpu : public SpaceTransform2D { linalg::util::MagmaBatchedGemm plan2_; }; -template -SpaceTransform2DGpu::SpaceTransform2DGpu(const int nw, - const linalg::util::MagmaQueue& queue) +template +SpaceTransform2DGpu::SpaceTransform2DGpu( + const int nw, 
const linalg::util::MagmaQueue& queue) : n_bands_(BDmn::dmn_size()), nw_(nw), nc_(RDmn::dmn_size()), @@ -113,8 +113,8 @@ SpaceTransform2DGpu::SpaceTransform2DGpu(const int nw, workspace_ = std::make_shared(); } -template -float SpaceTransform2DGpu::execute(RMatrix& M) { +template +float SpaceTransform2DGpu::execute(RMatrix& M) { float flop = 0.; auto& T_times_M = *(workspace_); @@ -133,7 +133,8 @@ float SpaceTransform2DGpu::execute(RMatrix& M) { const int lda = T.leadingDimension(); const int ldb = M.leadingDimension(); const int ldc = T_times_M.leadingDimension(); - plan1_.execute('N', 'N', nc_, M.nrCols(), nc_, Complex{1.0, 0.0}, Complex{0.0,0.0}, lda, ldb, ldc); + plan1_.execute('N', 'N', nc_, M.nrCols(), nc_, Complex{1.0, 0.0}, Complex{0.0, 0.0}, lda, ldb, + ldc); flop += n_trafo * 8. * nc_ * M.nrCols() * nc_; } @@ -147,7 +148,7 @@ float SpaceTransform2DGpu::execute(RMatrix& M) { const int ldb = T.leadingDimension(); const int ldc = T_times_M_times_T.leadingDimension(); const Complex norm = dca::util::makeMaybe(1.0 / nc_); - plan2_.execute('N', 'C', M.nrRows(), nc_, nc_, norm, Complex{0.0,0.0}, lda, ldb, ldc); + plan2_.execute('N', 'C', M.nrRows(), nc_, nc_, norm, Complex{0.0, 0.0}, lda, ldb, ldc); flop += n_trafo * 8. * M.nrRows() * nc_ * nc_; } @@ -157,9 +158,9 @@ float SpaceTransform2DGpu::execute(RMatrix& M) { return flop; } -template -void SpaceTransform2DGpu::phaseFactorsAndRearrange(const RMatrix& in, - RMatrix& out) { +template +void SpaceTransform2DGpu::phaseFactorsAndRearrange(const RMatrix& in, + RMatrix& out) { out.resizeNoCopy(in.size()); const Complex* const phase_factors_ptr = Base::hasPhaseFactors() ? 
getPhaseFactors().ptr() : nullptr; @@ -169,13 +170,14 @@ void SpaceTransform2DGpu::phaseFactorsAndRearrange(const RMa queue_); } -template -const auto& SpaceTransform2DGpu::getPhaseFactors() { +template +const auto& SpaceTransform2DGpu::getPhaseFactors() { auto initialize = []() { using dca::util::ComplexAlias; const auto& phase_factors = Base::getPhaseFactors(); linalg::Vector host_vector(phase_factors.size()); - std::copy_n(dca::util::castHostType(phase_factors.values()), phase_factors.size(), dca::util::castHostType(host_vector.ptr())); + std::copy_n(dca::util::castHostType(phase_factors.values()), phase_factors.size(), + dca::util::castHostType(host_vector.ptr())); return VectorDev(host_vector); }; diff --git a/include/dca/phys/dca_analysis/bse_solver/bse_cluster_solver_ext.hpp b/include/dca/phys/dca_analysis/bse_solver/bse_cluster_solver_ext.hpp index 607694643..6496f57f7 100644 --- a/include/dca/phys/dca_analysis/bse_solver/bse_cluster_solver_ext.hpp +++ b/include/dca/phys/dca_analysis/bse_solver/bse_cluster_solver_ext.hpp @@ -84,9 +84,7 @@ class BseClusterSolverExt { diagrammatic_symmetries diagrammatic_symmetries_obj; - func::function, - func::dmn_variadic> - Gamma_cluster; + func::function, func::dmn_variadic> Gamma_cluster; func::function, func::dmn_variadic> G_II_0_function; }; @@ -121,7 +119,7 @@ void BseClusterSolverExt::compute_Gamma func::function, TotalG4Dmn> G_II("G_II"); func::function, TotalG4Dmn> G_II_0("G_II_0"); - //apply_symmetries_sp(); + // apply_symmetries_sp(); load_G_II(G_II); @@ -130,7 +128,7 @@ void BseClusterSolverExt::compute_Gamma // it used to need to be here because G_II_0 was renormalized now we only do that to a temp variable. 
load_G_II_0_function(G_II_0); - //apply_symmetries_tp(G_II, G_II_0); + // apply_symmetries_tp(G_II, G_II_0); // in original analysis code the G_II and G_II_0 come back renormalized // not so here @@ -160,7 +158,7 @@ void BseClusterSolverExt::apply_symmetr if (parameters.symmetrize_Gamma()) { std::vector subind(3); if (concurrency.id() == concurrency.first()) - std::cout << "symmetrize Gamma_lattice according to the symmetry-group \n" << std::endl; + std::cout << "symmetrize Gamma_lattice according to the symmetry-group \n" << std::endl; for (int wex_ind = 0; wex_ind < WExDmn::dmn_size(); wex_ind++) for (int kex_ind = 0; kex_ind < KExDmn::dmn_size(); kex_ind++) { func::function, DCA_matrix_dmn_t> G_II_indi; @@ -170,7 +168,8 @@ void BseClusterSolverExt::apply_symmetr G_II_0.slice(0, subind, static_cast*>(G_II_0_indi.values())); Symmetrize::execute(G_II_indi, parameters.get_four_point_momentum_transfer()); - Symmetrize::execute(G_II_0_indi, parameters.get_four_point_momentum_transfer()); + Symmetrize::execute(G_II_0_indi, + parameters.get_four_point_momentum_transfer()); G_II.distribute(0, subind, static_cast*>(G_II_indi.values())); G_II_0.distribute(0, subind, static_cast*>(G_II_0_indi.values())); } @@ -195,8 +194,8 @@ void BseClusterSolverExt::load_G_II( std::cout << "\t" << __FUNCTION__ << " -- over " << KExDmn::dmn_size() << " k exchanges and " << WExDmn::dmn_size() << " frequency exchanges.\n\n"; - //std::vector coor_1(G_II.signature(), 0.0); - // ideally a compile time leaf domain count would be avaialbe from function + // std::vector coor_1(G_II.signature(), 0.0); + // ideally a compile time leaf domain count would be available from function constexpr int num_indexes_G4 = 10; std::array coor_1; coor_1.fill(0); @@ -231,12 +230,11 @@ void BseClusterSolverExt::load_G_II( } } std::cout << "final linind " << linind << '\n'; - std::cout << "G_II(0,0,0,0,0,0,0,0,0,0): " << G_II(0,0,0,0,0,0,0,0,0,0) << '\n'; - std::cout << "G_II(0,0,0,0,0,0,0,0,0,1): " << 
G_II(0,0,0,0,0,0,0,0,0,1) << '\n'; - std::cout << "G_II(0,0,0,0,0,0,0,0,0,2): " << G_II(0,0,0,0,0,0,0,0,0,2) << '\n'; - std::cout << "G4(0,0,0,0,0,0,0,0,0,1): " << G4(0,0,0,0,0,0,0,0,0,1) << '\n'; - std::cout << "G4(0,0,0,0,0,0,0,0,0,2): " << G4(0,0,0,0,0,0,0,0,0,2) << '\n'; - + std::cout << "G_II(0,0,0,0,0,0,0,0,0,0): " << G_II(0, 0, 0, 0, 0, 0, 0, 0, 0, 0) << '\n'; + std::cout << "G_II(0,0,0,0,0,0,0,0,0,1): " << G_II(0, 0, 0, 0, 0, 0, 0, 0, 0, 1) << '\n'; + std::cout << "G_II(0,0,0,0,0,0,0,0,0,2): " << G_II(0, 0, 0, 0, 0, 0, 0, 0, 0, 2) << '\n'; + std::cout << "G4(0,0,0,0,0,0,0,0,0,1): " << G4(0, 0, 0, 0, 0, 0, 0, 0, 0, 1) << '\n'; + std::cout << "G4(0,0,0,0,0,0,0,0,0,2): " << G4(0, 0, 0, 0, 0, 0, 0, 0, 0, 2) << '\n'; } /** This calculates close to the Chi_0(Q,omega_m)K,K' that we want for S(q,omega) @@ -247,9 +245,10 @@ void BseClusterSolverExt::load_G_II_0( profiler_type prof(__FUNCTION__, "BseClusterSolverExt", __LINE__); if (concurrency.id() == concurrency.first()) { std::cout << "\t" << __FUNCTION__ << "\n\n"; - std::cout << "G_II_0 domain sizes: " << vectorToString(G_II_0.get_domain().get_leaf_domain_sizes()) << "\n\n"; + std::cout << "G_II_0 domain sizes: " + << vectorToString(G_II_0.get_domain().get_leaf_domain_sizes()) << "\n\n"; } - + func::dmn_variadic k_w_dmn; G_II_0 = 0.; @@ -354,7 +353,6 @@ void BseClusterSolverExt::solve_BSE_on_ for (int wex_ind = 0; wex_ind < WExDmn::dmn_size(); wex_ind++) for (int kex_ind = 0; kex_ind < KExDmn::dmn_size(); kex_ind++) { - func::function, DCA_matrix_dmn_t> G_II_indi; func::function, DCA_matrix_dmn_t> G_II_0_indi; subind = {0, kex_ind, wex_ind}; diff --git a/include/dca/phys/dca_step/cluster_mapping/coarsegraining/coarsegraining_tp.hpp b/include/dca/phys/dca_step/cluster_mapping/coarsegraining/coarsegraining_tp.hpp index a47c4ab53..5984a6d8a 100644 --- a/include/dca/phys/dca_step/cluster_mapping/coarsegraining/coarsegraining_tp.hpp +++ b/include/dca/phys/dca_step/cluster_mapping/coarsegraining/coarsegraining_tp.hpp 
@@ -332,8 +332,8 @@ void coarsegraining_tp::compute_tp( #ifndef NDEBUG if (k_DCA::get_elements() != K_dmn::get_elements()) { std::cout << "k_DCA::get_elements() != K_dmn::get_elements() "; - std::cout << vectorToString(k_DCA::get_elements()) << " != " - << vectorToString(K_dmn::get_elements()) << '\n'; + std::cout << vectorToString(k_DCA::get_elements()) + << " != " << vectorToString(K_dmn::get_elements()) << '\n'; } #endif chi = 0.; @@ -365,7 +365,9 @@ void coarsegraining_tp::compute_tp( find_w1_and_w2(w_dmn_t::get_elements(), w_ind, w_1, w_2); - { BaseClass::compute_G_q_w(k_ind, w_1, H_k, S_K_w, I_q, H_q, S_q, G_q); } + { + BaseClass::compute_G_q_w(k_ind, w_1, H_k, S_K_w, I_q, H_q, S_q, G_q); + } { BaseClass::compute_G_q_w(k_ind, w_2, H_k, S_K_plus_Q_w, I_q_plus_Q, H_q_plus_Q, S_q_plus_Q, @@ -509,7 +511,9 @@ void coarsegraining_tp::compute_phi( find_w1_and_w2(w_dmn_t::get_elements(), w_ind, w_1, w_2); - { BaseClass::compute_G_q_w(k_ind, w_1, H_k, S_K_w, I_q, H_q, S_q, G_q); } + { + BaseClass::compute_G_q_w(k_ind, w_1, H_k, S_K_w, I_q, H_q, S_q, G_q); + } { BaseClass::compute_G_q_w(k_ind, w_2, H_k, S_Q_min_K_w, I_Q_min_q, H_Q_min_q, S_Q_min_q, diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp index 04dd7ff4e..77e34f9ab 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp @@ -270,7 +270,7 @@ double TpAccumulator::computeM( Profiler prf_b("Space FT", "tp-accumulation", __LINE__, thread_id_); // TODO: add the gflops here. 
- math::transform::SpaceTransform2D::execute(M_r_r_w_w, G_); + math::transform::SpaceTransform2D::execute(M_r_r_w_w, G_); return flops; } @@ -344,7 +344,7 @@ void TpAccumulator::computeGMultiband(const int s, matrixOperationsGMultiband(G0_w1, G0_w2, M_matrix, G0_M_); - // G(w1, w2) += \delta(w1, w2) \delta(k1,k2) G0(w1) + // G(w1, w2) += \delta(w1, w2) \delta(k1,k2) G0(w1) if (G0_w1.ptr() == G0_w2.ptr()) { for (int b2 = 0; b2 < n_bands_; ++b2) for (int b1 = 0; b1 < n_bands_; ++b1) @@ -366,12 +366,12 @@ void TpAccumulator::computeGMultiband(const int s, } template -void TpAccumulator::matrixOperationsGMultiband(const BandBlockView& G0_kw1, const BandBlockView& G0_kw2, BandBlockView& G_kkww, Matrix& G0_M_) { +void TpAccumulator::matrixOperationsGMultiband( + const BandBlockView& G0_kw1, const BandBlockView& G0_kw2, BandBlockView& G_kkww, Matrix& G0_M_) { // G(w1, w2) <- -G0(w1) M(w1, w2) G0(w2) G0_M_.resize(G0_kw1.size()); linalg::matrixop::gemm(G0_kw1, G_kkww, G0_M_); linalg::matrixop::gemm(TpComplex(-1), G0_M_, G0_kw2, TpComplex(0), G_kkww); - } template @@ -395,11 +395,11 @@ void TpAccumulator::getGMultiband(int s, int k1, in const auto* const G_ptr = &G_(0, 0, s, k1, k2, w1_ext, w2_ext); for (int b2 = 0; b2 < n_bands_; ++b2) for (int b1 = 0; b1 < n_bands_; ++b1) { - G(b1, b2) = sign * G(b1, b2) + G_ptr[b1 + b2 * n_bands_]; + G(b1, b2) = sign * G(b1, b2) + G_ptr[b2 + b1 * n_bands_]; #ifndef NDEBUG - // if (std::abs(G(b1, b2).imag()) > 10) // std::isnan(imag(G_(b1, b2, s, k1, k2, w1_ext, w2_ext)))) - // std::cout << w1 << "," << w2 << "," << k1 << "," << k2 << "," << b1 << "," << b2 << "," - // << G(b1, b2) << "*G_ptr" << G_ptr->real() << " + " << G_ptr->imag() << "\n"; + // if (std::abs(G(b1, b2).imag()) > 10) // std::isnan(imag(G_(b1, b2, s, k1, k2, w1_ext, w2_ext)))) + // std::cout << w1 << "," << w2 << "," << k1 << "," << k2 << "," << b1 << "," << b2 << "," + // << G(b1, b2) << "*G_ptr" << G_ptr->real() << " + " << G_ptr->imag() << "\n"; #endif } } @@ -456,8 
+456,8 @@ double TpAccumulator::updateG4(const int channel_id // = -1/2 sum_s G(k2+k_ex, k1+k_ex, s) G(k1, k2, -s) for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; - if(w_ex != 0) - std::cout << "cpu w_ex: " << w_ex << '\n'; + if (w_ex != 0) + std::cout << "cpu w_ex: " << w_ex << '\n'; for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { const int k_ex = exchange_mom[k_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) @@ -465,10 +465,23 @@ double TpAccumulator::updateG4(const int channel_id for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); - for (int s = 0; s < 2; ++s) - updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex), - momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), - w_plus_w_ex(w1, w_ex), -sign_over_2, true); + // for (int s = 0; s < 2; ++s) + // updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex), + // momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), + // w_plus_w_ex(w1, w_ex), -sign_over_2, true); + // } + for (int s = 0; s < 2; ++s) { + getGMultiband(s, k1, k2, w1, w2, G_a_); + getGMultiband(!s, momentum_sum(k2, k_ex), momentum_sum(k1, k_ex), + w_plus_w_ex(w2, w_ex), w_plus_w_ex(w1, w_ex), G_b_); + for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) + for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) + for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) + for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { + G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) += + -sign_over_2 * G_a_(b2, b4) * G_b_(b3, b1); + } + } } } } @@ -483,43 +496,53 @@ double TpAccumulator::updateG4(const int channel_id // - (s1 == s2) G(k2+k_ex, k1+k_ex, s1) G(k1, k2, s1)] for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; - if(w_ex != 0) - std::cout << "cpu w_ex: " << w_ex << '\n'; + if (w_ex != 
0) + std::cout << "cpu w_ex: " << w_ex << '\n'; for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { const int k_ex = exchange_mom[k_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { - TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); -#ifndef NDEBUG - TpComplex G4_before = *G4_ptr; -#endif - updateG4SpinDifference(G4_ptr, -1, k1, momentum_sum(k1, k_ex), w1, - w_plus_w_ex(w1, w_ex), momentum_sum(k2, k_ex), k2, - w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); -#ifndef NDEBUG - G4_FromSpinDifference += std::abs(*G4_ptr - G4_before); -#endif + // TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, + // w_ex_idx); updateG4SpinDifference(G4_ptr, -1, k1, momentum_sum(k1, k_ex), w1, + // w_plus_w_ex(w1, w_ex), momentum_sum(k2, k_ex), k2, + // w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); + for (int s1 = 0; s1 < 2; ++s1) { + for (int s2 = 0; s2 < 2; ++s2) { + std::complex spin_factor = (s1 == s2) ? 
1 : -1; + getGMultiband(s1, k1, momentum_sum(k1, k_ex), w1, w_plus_w_ex(w1, w_ex), + G_a_); + getGMultiband(s2, momentum_sum(k2, k_ex), k2, w_plus_w_ex(w2, w_ex), w2, + G_b_); + for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) + for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) + for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) + for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { + G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) += + sign_over_2 * G_a_(b2, b1) * G_b_(b3, b4) * spin_factor; + } + } + } for (int s = 0; s < 2; ++s) { -#ifndef NDEBUG - G4_before = *G4_ptr; -#endif - updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, s, momentum_sum(k2, k_ex), - momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), - w_plus_w_ex(w1, w_ex), -sign_over_2, true); -#ifndef NDEBUG - G4_DirectDifference += std::abs(*G4_ptr - G4_before); -#endif + // updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, s, momentum_sum(k2, k_ex), + // momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), + // w_plus_w_ex(w1, w_ex), -sign_over_2, true); + getGMultiband(s, k1, k2, w1, w2, G_a_); + getGMultiband(s, momentum_sum(k2, k_ex), momentum_sum(k1, k_ex), + w_plus_w_ex(w2, w_ex), w_plus_w_ex(w1, w_ex), G_b_); + for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) + for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) + for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) + for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { + G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) += + -sign_over_2 * G_a_(b3, b1) * G_b_(b2, b4); + } } } } } flops += n_loops * (flops_update_spin_diff + 2 * flops_update_atomic); -#ifndef NDEBUG - std::cout << "G4 PHM from spin:" << G4_FromSpinDifference << '\n'; - std::cout << "G4 PHM from direct:" << G4_DirectDifference << '\n'; -#endif break; case FourPointType::PARTICLE_HOLE_CHARGE: @@ -551,56 +574,56 @@ double TpAccumulator::updateG4(const int channel_id flops += n_loops * (flops_update_spin_diff + 2 * flops_update_atomic); break; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: - // G4(k1, k2, k_ex) = 1/2 sum_s 
- // = 1/2 sum_s [G(k1, k1+k_ex, s) G(k2+k_ex, k2, s) - // - G(k2+k_ex, k1+k_ex, s) G(k1, k2, s)] - for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { - const int w_ex = exchange_frq[w_ex_idx]; - for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { - const int k_ex = exchange_mom[k_ex_idx]; - for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) - for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) - for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) - for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { - TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); - - for (int s = 0; s < 2; ++s) - updateG4Atomic(G4_ptr, s, k1, momentum_sum(k1, k_ex), w1, - w_plus_w_ex(w1, w_ex), s, momentum_sum(k2, k_ex), k2, - w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); - - for (int s = 0; s < 2; ++s) - updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, s, momentum_sum(k2, k_ex), - momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), - w_plus_w_ex(w1, w_ex), -sign_over_2, true); - } - } - } - flops += n_loops * 4 * flops_update_atomic; - break; - - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: - // G4(k1, k2, k_ex) = 1/2 sum_s - // = 1/2 sum_s G(k1, k1+k_ex, s) G(k2+k_ex, k2, -s) - for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { - const int w_ex = exchange_frq[w_ex_idx]; - for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { - const int k_ex = exchange_mom[k_ex_idx]; - for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) - for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) - for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) - for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { - TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); - for (int s = 0; s < 2; ++s) - updateG4Atomic(G4_ptr, s, k1, momentum_sum(k1, k_ex), w1, - w_plus_w_ex(w1, w_ex), !s, momentum_sum(k2, k_ex), k2, - w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); - } - } - } - flops += n_loops * 4 * flops_update_atomic; - break; + // case 
FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: + // // G4(k1, k2, k_ex) = 1/2 sum_s + // // = 1/2 sum_s [G(k1, k1+k_ex, s) G(k2+k_ex, k2, s) + // // - G(k2+k_ex, k1+k_ex, s) G(k1, k2, s)] + // for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { + // const int w_ex = exchange_frq[w_ex_idx]; + // for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { + // const int k_ex = exchange_mom[k_ex_idx]; + // for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) + // for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) + // for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) + // for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { + // TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); + + // for (int s = 0; s < 2; ++s) + // updateG4Atomic(G4_ptr, s, k1, momentum_sum(k1, k_ex), w1, + // w_plus_w_ex(w1, w_ex), s, momentum_sum(k2, k_ex), k2, + // w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); + + // for (int s = 0; s < 2; ++s) + // updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, s, momentum_sum(k2, k_ex), + // momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), + // w_plus_w_ex(w1, w_ex), -sign_over_2, true); + // } + // } + // } + // flops += n_loops * 4 * flops_update_atomic; + // break; + + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: + // // G4(k1, k2, k_ex) = 1/2 sum_s + // // = 1/2 sum_s G(k1, k1+k_ex, s) G(k2+k_ex, k2, -s) + // for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { + // const int w_ex = exchange_frq[w_ex_idx]; + // for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { + // const int k_ex = exchange_mom[k_ex_idx]; + // for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) + // for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) + // for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) + // for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { + // TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, + // w_ex_idx); for (int s = 0; s < 2; ++s) + // updateG4Atomic(G4_ptr, s, k1, 
momentum_sum(k1, k_ex), w1, + // w_plus_w_ex(w1, w_ex), !s, momentum_sum(k2, k_ex), k2, + // w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); + // } + // } + // } + // flops += n_loops * 4 * flops_update_atomic; + // break; case FourPointType::PARTICLE_PARTICLE_UP_DOWN: // G4(k1, k2, k_ex) = 1/2 sum_s @@ -613,11 +636,23 @@ double TpAccumulator::updateG4(const int channel_id for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { - TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); - for (int s = 0; s < 2; ++s) - updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, !s, q_minus_k(k1, k_ex), - q_minus_k(k2, k_ex), w_ex_minus_w(w1, w_ex), - w_ex_minus_w(w2, w_ex), sign_over_2, false); + // TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); + for (int s = 0; s < 2; ++s) { + // updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, !s, q_minus_k(k1, k_ex), + // q_minus_k(k2, k_ex), w_ex_minus_w(w1, w_ex), + // w_ex_minus_w(w2, w_ex), sign_over_2, false); + // contraction: G_{b1,b3}(k1, k2) * G_{b2,b4}(q-k1, q-k2). + getGMultiband(s, k1, k2, w1, w2, G_a_); + getGMultiband(!s, q_minus_k(k1, k_ex), q_minus_k(k2, k_ex), + w_ex_minus_w(w1, w_ex), w_ex_minus_w(w2, w_ex), G_b_); + for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) + for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) + for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) + for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { + G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) += + sign_over_2 * G_a_(b1, b3) * G_b_(b2, b4); + } + } } } } @@ -641,34 +676,26 @@ double TpAccumulator::updateG4(const int channel_id for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { - // contraction: G(k2, k1, s3, s2) * G(k_ex - k2, k_ex - k1, s4, s1). + // contraction: G(k2, k1, b1, b3) * G(k_ex - k2, k_ex - k1, b2, b4). 
getGMultiband(0, k1, k2, w1, w2, G_a_); getGMultiband(0, q_minus_k(k1, k_ex), q_minus_k(k2, k_ex), w_ex_minus_w(w1, w_ex), w_ex_minus_w(w2, w_ex), G_b_); - // getGMultiband(0, k2, k1, w2, w1, G_a_); - // getGMultiband(0, q_minus_k(k2, k_ex), q_minus_k(k1, k_ex), - // w_ex_minus_w(w2, w_ex), w_ex_minus_w(w1, w_ex), G_b_); - for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) + for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) += - // complex_factor * G_a_(b3, b2) * G_b_(b4, b1); complex_factor * G_a_(b1, b3) * G_b_(b2, b4); - // complex_factor * G_a_(b1, b3); } - // contraction: -G(k2, k_ex - k1, s3, s1) * G(k_ex - k2, k1, s4, s2). + // contraction: -G(k1, k_ex - k2, b1, b4) * G(k_ex - k1, k2, b2, b3). getGMultiband(0, k1, q_minus_k(k2, k_ex), w1, w_ex_minus_w(w2, w_ex), G_a_); getGMultiband(0, q_minus_k(k1, k_ex), k2, w_ex_minus_w(w1, w_ex), w2, G_b_); - // getGMultiband(0, k2, q_minus_k(k1, k_ex), w2, w_ex_minus_w(w1, w_ex), G_a_); - // getGMultiband(0, q_minus_k(k2, k_ex), k1, w_ex_minus_w(w2, w_ex), w1, G_b_); for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) -= - // complex_factor * G_a_(b3, b1) * G_b_(b4, b2); complex_factor * G_a_(b1, b4) * G_b_(b2, b3); } } @@ -750,8 +777,8 @@ void TpAccumulator::updateG4Atomic( getGMultiband(s_b, k1_b, k2_b, w1_b, w2_b, G_b_); // Assuming 0 = down 1 = up - //G_a = G[s_a] - //G_b = G[d_b] + // G_a = G[s_a] + // G_b = G[d_b] if (!cross_legs) for (int b4 = 0; b4 < n_bands_; ++b4) @@ -826,7 +853,8 @@ void TpAccumulator::updateG4SpinDifference( for (int b3 = 0; b3 < n_bands_; ++b3) for (int b2 = 0; b2 < n_bands_; ++b2) for (int b1 = 0; b1 < n_bands_; ++b1) { - 
*G4_ptr += alpha * G_a_(b1, b3) * G_b_(b2, b4); + // *G4_ptr += alpha * G_a_(b1, b3) * G_b_(b2, b4); + *G4_ptr += alpha * G_a_(b2, b1) * G_b_(b3, b4); ++G4_ptr; } else diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp index 1aa5661db..7e6357874 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp @@ -438,15 +438,15 @@ double TpAccumulator::updateG4(const std::size_t ch get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: - return details::updateG4( - get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), - G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: + // return details::updateG4( + // get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), + // G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: - return details::updateG4( - get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), - G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: + // return details::updateG4( + // get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), + // G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); case FourPointType::PARTICLE_PARTICLE_UP_DOWN: return details::updateG4( @@ -454,16 +454,18 @@ 
double TpAccumulator::updateG4(const std::size_t ch G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); default: - throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu."); + throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu." + + std::to_string(static_cast(channel))); } } else { - if (channel == FourPointType::PARTICLE_PARTICLE_UP_DOWN ) + if (channel == FourPointType::PARTICLE_PARTICLE_UP_DOWN) return details::updateG4NoSpin( get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); } - throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu."); + throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu." + + std::to_string(static_cast(channel))); } template diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_base.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_base.hpp index 293463832..d71728c64 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_base.hpp @@ -77,9 +77,8 @@ class TpAccumulatorGpuBase { using MatrixHost = linalg::Matrix; public: - TpAccumulatorGpuBase( - const func::function>& G0, - const Parameters& pars, int n_freq, int thread_id); + TpAccumulatorGpuBase(const func::function>& G0, + const Parameters& pars, int n_freq, int thread_id); protected: void initializeG4Helpers() const; @@ -88,15 +87,15 @@ class TpAccumulatorGpuBase { template double computeM(const std::array, 2>& M_pair, - const std::array& configs); + const std::array& configs); void sumTo_(TpAccumulatorGpuBase& other_acc); // \todo is this violation of single source of truth necessary. 
- const func::function>* const G0_ptr_ = + const func::function>* const G0_ptr_ = nullptr; - //const int n_pos_frqs_ = -1; + // const int n_pos_frqs_ = -1; std::array queues_; linalg::util::GpuEvent event_; @@ -105,12 +104,13 @@ class TpAccumulatorGpuBase { // this is how this is defined in the all tp_accumulator_gpu, suspect? constexpr static bool non_density_density_ = - models::HasInitializeNonDensityInteractionMethod::value; - //CachedNdft ndft_obj_; + models::HasInitializeNonDensityInteractionMethod::value; + // CachedNdft ndft_obj_; - using NdftType = CachedNdft; + using NdftType = + CachedNdft; std::array ndft_objs_; - using DftType = math::transform::SpaceTransform2DGpu; + using DftType = math::transform::SpaceTransform2DGpu; std::array space_trsf_objs_; constexpr static int n_ndft_queues_ = config::McOptions::memory_savings ? 1 : 2; @@ -134,7 +134,7 @@ TpAccumulatorGpuBase::TpAccumulatorGpuBase( const func::function>& G0, const Parameters& pars, const int n_freq, int thread_id) : G0_ptr_(&G0), - //n_pos_frqs_(n_pos_frqs), + // n_pos_frqs_(n_pos_frqs), queues_(), ndft_objs_{NdftType(queues_[0]), NdftType(queues_[1])}, space_trsf_objs_{DftType(n_freq, queues_[0]), DftType(n_freq, queues_[1])}, @@ -159,17 +159,17 @@ auto TpAccumulatorGpuBase::get_G0() -> G0DevType& { template void TpAccumulatorGpuBase::initializeG4Helpers() const { - const auto& add_mat = KDmn::parameter_type::get_add_matrix(); - const auto& sub_mat = KDmn::parameter_type::get_subtract_matrix(); - const auto& w_indices = domains::FrequencyExchangeDomain::get_elements(); - const auto& q_indices = domains::MomentumExchangeDomain::get_elements(); - const auto extension_offset = (WTpExtDmn::dmn_size() - WTpDmn::dmn_size()) / 2; - - // CurrentlyA WTpPosDmn should always be == WTpDmn - details::G4Helper::set(n_bands_, KDmn::dmn_size(), WTpDmn::dmn_size(), q_indices, w_indices, extension_offset, - add_mat.ptr(), add_mat.leadingDimension(), sub_mat.ptr(), - sub_mat.leadingDimension()); - 
assert(cudaPeekAtLastError() == cudaSuccess); + const auto& add_mat = KDmn::parameter_type::get_add_matrix(); + const auto& sub_mat = KDmn::parameter_type::get_subtract_matrix(); + const auto& w_indices = domains::FrequencyExchangeDomain::get_elements(); + const auto& q_indices = domains::MomentumExchangeDomain::get_elements(); + const auto extension_offset = (WTpExtDmn::dmn_size() - WTpDmn::dmn_size()) / 2; + + // CurrentlyA WTpPosDmn should always be == WTpDmn + details::G4Helper::set(n_bands_, KDmn::dmn_size(), WTpDmn::dmn_size(), q_indices, w_indices, + extension_offset, add_mat.ptr(), add_mat.leadingDimension(), sub_mat.ptr(), + sub_mat.leadingDimension()); + assert(cudaPeekAtLastError() == cudaSuccess); } template @@ -192,9 +192,10 @@ void TpAccumulatorGpuBase::initializeG0() { for (int k = 0; k < KDmn::dmn_size(); ++k) for (int b2 = 0; b2 < n_bands_; ++b2) for (int b1 = 0; b1 < n_bands_; ++b1) { - assert(std::abs(WTpExtDmn::get_elements()[w] - WDmn::get_elements()[w + sp_index_offset]) < 1e-3); - G0_host[s](bkw_dmn(b1, k, w),b2) = (*G0_ptr_)(b1, s, b2, s, k, w + sp_index_offset); - } + assert(std::abs(WTpExtDmn::get_elements()[w] - + WDmn::get_elements()[w + sp_index_offset]) < 1e-3); + G0_host[s](bkw_dmn(b1, k, w), b2) = (*G0_ptr_)(b1, s, b2, s, k, w + sp_index_offset); + } dca::linalg::util::GpuStream reset_stream(cudaStreamLegacy); G0[s].set(G0_host[s], reset_stream); } @@ -219,7 +220,7 @@ double TpAccumulatorGpuBase::computeM( for (int s = 0; s < 2; ++s) flop += space_trsf_objs_[stream_id(s)].execute(G_[s]); } - + return flop; } diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp index a987c6b91..9a5b2238e 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp +++ 
b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp @@ -77,8 +77,8 @@ class TpAccumulator : public TpAccumulatorBase : public TpAccumulatorBase>; - + std::vector& get_G4(); private: @@ -157,9 +157,9 @@ TpAccumulator::TpAccumulator( : Base(G0, pars, thread_id), BaseGpu(pars, Base::get_n_pos_frqs(), thread_id_) { // each mpi rank only allocates memory of size 1/total_G4_size for its small portion of G4 // static_assert(std::is_same>, decltype(G4_)>); - //start_ = G4_[0].get_start(); + // start_ = G4_[0].get_start(); // The sense here is one past the last index held - //end_ = G4_[0].get_end() + 1; + // end_ = G4_[0].get_end() + 1; // possible these can both go into the parent class constructor } @@ -339,7 +339,7 @@ float TpAccumulator::updateG4( uint64_t start = get_G4Dev().get_start(); uint64_t end = get_G4Dev().get_end() + 1; - + switch (channel) { case FourPointType::PARTICLE_HOLE_TRANSVERSE: return details::updateG4( @@ -372,7 +372,8 @@ float TpAccumulator::updateG4( G_[1].leadingDimension(), sign_, multiple_accumulators_, queues_[0], start, end); default: - throw std::logic_error("Specified four point type not implemented."); + throw std::logic_error("Specified four point type not implemented." 
+ + std::to_string(static_cast(channel))); } } @@ -424,15 +425,14 @@ void TpAccumulator::ringG(float& flop) { } template -auto TpAccumulator::get_G4Dev() - -> std::vector& { +auto TpAccumulator::get_G4Dev() -> std::vector& { static std::vector G4; return G4; } template -void TpAccumulator::send( - const std::array& data, int target, std::array& request) { +void TpAccumulator::send(const std::array& data, int target, + std::array& request) { using dca::parallel::MPITypeMap; const auto g_size = data[0].size().first * data[0].size().second; @@ -455,8 +455,8 @@ void TpAccumulator::send( } template -void TpAccumulator::receive( - std::array& data, int source, std::array& request) { +void TpAccumulator::receive(std::array& data, int source, + std::array& request) { using dca::parallel::MPITypeMap; const auto g_size = data[0].size().first * data[0].size().second; diff --git a/include/dca/phys/four_point_type.hpp b/include/dca/phys/four_point_type.hpp index 8a824d7c4..b5568fa14 100644 --- a/include/dca/phys/four_point_type.hpp +++ b/include/dca/phys/four_point_type.hpp @@ -25,15 +25,16 @@ namespace phys { * causes a bug. That is not a good code smell. 
*/ enum class FourPointType : int { - PARTICLE_HOLE_NONE = 0, + NONE = 0, PARTICLE_HOLE_TRANSVERSE, PARTICLE_HOLE_MAGNETIC, PARTICLE_HOLE_CHARGE, - PARTICLE_HOLE_LONGITUDINAL_UP_UP, - PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, PARTICLE_PARTICLE_UP_DOWN }; +// PARTICLE_HOLE_LONGITUDINAL_UP_UP, +// PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, + FourPointType stringToFourPointType(const std::string& name); std::string toString(const FourPointType type); diff --git a/include/dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp b/include/dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp index 95d908fe0..ffbe424b9 100644 --- a/include/dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp +++ b/include/dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp @@ -23,6 +23,7 @@ #include "dca/phys/domains/cluster/symmetries/point_groups/no_symmetry.hpp" #include "dca/phys/models/analytic_hamiltonians/cluster_shape_type.hpp" #include "dca/util/type_list.hpp" +#include "dca/phys/domains/cluster/cluster_operations.hpp" namespace dca { namespace phys { @@ -151,15 +152,94 @@ void KagomeHubbard::initializeHInteraction( const int origin = RDmn::parameter_type::origin_index(); const double U = parameters.get_U(); // on-site Hubbard interaction + const double V = parameters.get_V(); // Nearest-neighbor interaction + + // There are 6 different nearest neighbor (nn) pairs along a1, a2, a1+a2 and their negative vectors + const std::vector& basis = + RDmn::parameter_type::get_basis_vectors(); + + assert(basis.size() == 2); + + // Pre-compute basis vectors and orbital pairs + std::vector nn_vec; + nn_vec.reserve(3); + nn_vec.push_back(basis[0]); + nn_vec.push_back(basis[1]); + nn_vec.emplace_back(basis[0]); + nn_vec[2][0] += basis[1][0]; + nn_vec[2][1] += basis[1][1]; + + // Orbital pairs corresponding to each nearest neighbor direction + constexpr std::array, 3> orb_pairs = {{ + {{2, 0}}, {{1, 2}}, {{1, 0}} + }}; + + const std::vector& super_basis = + 
RDmn::parameter_type::get_super_basis_vectors(); + const std::vector& elements = + RDmn::parameter_type::get_elements(); + + std::vector nn_index; + std::vector> nn_orbs; + nn_index.reserve(6); + nn_orbs.reserve(6); + + // Process all nearest neighbor vectors in a loop + for (int vec_idx = 0; vec_idx < 3; ++vec_idx) { + const auto nn_vec_translated = + domains::cluster_operations::translate_inside_cluster(nn_vec[vec_idx], super_basis); + const int r = domains::cluster_operations::index(nn_vec_translated, elements, domains::BRILLOUIN_ZONE); + + if (r != origin) { + const int minus_r = RDmn::parameter_type::subtract(r, origin); + + nn_index.push_back(r); + nn_orbs.push_back({orb_pairs[vec_idx][0], orb_pairs[vec_idx][1]}); + + // Add negative direction if distinct + if (r != minus_r) { + nn_index.push_back(minus_r); + nn_orbs.push_back({orb_pairs[vec_idx][1], orb_pairs[vec_idx][0]}); + } + } + } H_interaction = 0.; +// Onsite U interaction for (int i = 0; i < BANDS; i++) { H_interaction(i, 0, i, 1, origin) = U; H_interaction(i, 1, i, 0, origin) = U; } + + // Nearest-neighbor V interaction inside unit cell + constexpr std::array, 3> intra_cell_pairs = {{ + {{0, 1}}, {{1, 2}}, {{0, 2}} + }}; + + for (const auto& pair : intra_cell_pairs) { + for (int s = 0; s < 2; ++s) { + H_interaction(pair[0], s, pair[1], s, origin) = V; + H_interaction(pair[0], s, pair[1], 1-s, origin) = V; + H_interaction(pair[1], s, pair[0], s, origin) = V; + H_interaction(pair[1], s, pair[0], 1-s, origin) = V; + } + } + + // Nearest-neighbor V interaction outside unit cell + const auto nn_size = nn_index.size(); + for (std::size_t i = 0; i < nn_size; ++i) { + for (int s = 0; s < 2; ++s) { + H_interaction(nn_orbs[i][0], s, nn_orbs[i][1], s, nn_index[i]) = V; + H_interaction(nn_orbs[i][0], s, nn_orbs[i][1], 1-s, nn_index[i]) = V; + } + } + } + + + template template void KagomeHubbard::initializeHSymmetry(func::function& H_symmetries) { diff --git 
a/include/dca/phys/models/analytic_hamiltonians/La3Ni2O7_bilayer.hpp b/include/dca/phys/models/analytic_hamiltonians/La3Ni2O7_bilayer.hpp index 29c98bc82..dced10253 100644 --- a/include/dca/phys/models/analytic_hamiltonians/La3Ni2O7_bilayer.hpp +++ b/include/dca/phys/models/analytic_hamiltonians/La3Ni2O7_bilayer.hpp @@ -190,10 +190,10 @@ void La3Ni2O7_bilayer::initializeHInteraction( H_interaction(2, s, 3, s, origin) = V - J; H_interaction(3, s, 2, s, origin) = V - J; - H_interaction(0, s, 1, 1-s, origin) = V + J; - H_interaction(1, s, 0, 1-s, origin) = V + J; - H_interaction(2, s, 3, 1-s, origin) = V + J; - H_interaction(3, s, 2, 1-s, origin) = V + J; + H_interaction(0, s, 1, 1-s, origin) = V; + H_interaction(1, s, 0, 1-s, origin) = V; + H_interaction(2, s, 3, 1-s, origin) = V; + H_interaction(3, s, 2, 1-s, origin) = V; } diff --git a/include/dca/phys/models/analytic_hamiltonians/square_plaquette_hubbard.hpp b/include/dca/phys/models/analytic_hamiltonians/square_plaquette_hubbard.hpp new file mode 100644 index 000000000..bf98d8f50 --- /dev/null +++ b/include/dca/phys/models/analytic_hamiltonians/square_plaquette_hubbard.hpp @@ -0,0 +1,196 @@ +// Copyright (C) 2019 ETH Zurich +// Copyright (C) 2019 UT-Battelle, LLC +// All rights reserved. +// +// See LICENSE for terms of usage. +// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. +// +// Author: Thomas A. 
Maier (maierta@ornl.gov) +// Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) +// +// Implementation of the Kagome Hubbard model + +#ifndef DCA_PHYS_MODELS_ANALYTIC_HAMILTONIANS_SQUARE_PLAQUETTE_HUBBARD_HPP +#define DCA_PHYS_MODELS_ANALYTIC_HAMILTONIANS_SQUARE_PLAQUETTE_HUBBARD_HPP + +#include +#include +#include +#include + +#include "dca/function/domains.hpp" +#include "dca/function/function.hpp" +#include "dca/phys/domains/cluster/symmetries/point_groups/no_symmetry.hpp" +#include "dca/phys/models/analytic_hamiltonians/cluster_shape_type.hpp" +#include "dca/util/type_list.hpp" +#include "dca/phys/domains/cluster/cluster_operations.hpp" + +namespace dca { +namespace phys { +namespace models { +// dca::phys::models:: + +// TODO: the symmetry of this system must be checked. +template +class squarePlaquetteHubbard { +public: + static constexpr bool complex_g0 = false; + static constexpr bool spin_symmetric = true; + + using LDA_point_group = domains::no_symmetry<2>; + using DCA_point_group = SymmetryGroup; + // typedef PointGroupType DCA_point_group; + + const static ClusterShapeType DCA_cluster_shape = BETT_CLUSTER; + const static ClusterShapeType LDA_cluster_shape = PARALLELEPIPED; + + const static int DIMENSION = 2; + const static int BANDS = 4; + + const static double* initializeRDCABasis(); + // static double* initializeKDCABasis(); + + const static double* initializeRLDABasis(); + // static double* initializeKLDABasis(); + + static std::vector flavors(); + static std::vector> aVectors(); + + static std::vector, std::pair>> orbitalPermutations(); + + // Rotations of pi/2 are an anti-symmetry on the band off-diagonal. + static int transformationSignOfR(int b1, int b2, int s); + static int transformationSignOfK(int b1, int b2, int s); + + // Initializes the interaction Hamiltonian in real space. 
+ template + static void initializeHInteraction( + func::function, + func::dmn_variadic, RDmn>>& H_interaction, + const parameters_type& parameters); + + template + static void initializeHSymmetry(func::function& H_symmetry); + + // Initializes the tight-binding (non-interacting) part of the momentum space Hamiltonian. + // Preconditions: The elements of KDmn are two-dimensional (access through index 0 and 1). + template + static void initializeH0( + const ParametersType& parameters, + func::function, + func::dmn_variadic, KDmn>>& H_0); +}; + +template +int squarePlaquetteHubbard::transformationSignOfR(int b1 [[maybe_unused]], + int b2 [[maybe_unused]], + int s [[maybe_unused]]) { + return 1; // TODO: FIXME +} + +template +int squarePlaquetteHubbard::transformationSignOfK(int b1 [[maybe_unused]], + int b2 [[maybe_unused]], + int s [[maybe_unused]]) { + return 1; // TODO: FIXME +} + +template +const double* squarePlaquetteHubbard::initializeRDCABasis() { + static std::array basis{2,0,0,2}; + return basis.data(); +} + +template +const double* squarePlaquetteHubbard::initializeRLDABasis() { + static std::array basis{2,0,0,2}; + return basis.data(); +} + + +template +std::vector squarePlaquetteHubbard::flavors() { + return {0, 1, 2,3}; +} + +template +std::vector> squarePlaquetteHubbard::aVectors() { + return {{0, 0}, {1, 0}, {0, 1}, {1, 1}}; +} + +template +std::vector, std::pair>> squarePlaquetteHubbard< + PointGroupType>::orbitalPermutations() { + return {}; +} + +template +template +void squarePlaquetteHubbard::initializeHInteraction( + func::function, + func::dmn_variadic, RDmn>>& H_interaction, + const parameters_type& parameters) { + if (BandDmn::dmn_size() != BANDS) + throw std::logic_error("Plaquette Hubbard model has four bands."); + if (SpinDmn::dmn_size() != 2) + throw std::logic_error("Spin domain size must be 2."); + + const int origin = RDmn::parameter_type::origin_index(); + + const double U = parameters.get_U(); // on-site Hubbard interaction + + 
H_interaction = 0.; + +// Onsite U interaction + for (int i = 0; i < BANDS; i++) { + H_interaction(i, 0, i, 1, origin) = U; + H_interaction(i, 1, i, 0, origin) = U; + } + +} + +template +template +void squarePlaquetteHubbard::initializeHSymmetry(func::function& H_symmetries) { + H_symmetries = -1; +} + +template +template +void squarePlaquetteHubbard::initializeH0( + const ParametersType& parameters, + func::function, + func::dmn_variadic, KDmn>>& H_0) { + if (BandDmn::dmn_size() != BANDS) + throw std::logic_error("Plaquette Hubbard model has four bands."); + if (SpinDmn::dmn_size() != 2) + throw std::logic_error("Spin domain size must be 2."); + + const auto& k_vecs = KDmn::get_elements(); + + const auto t = parameters.get_t(); + + H_0 = ScalarType(0); + + for (int k_ind = 0; k_ind < KDmn::dmn_size(); ++k_ind) { + const auto& k = k_vecs[k_ind]; + + for (int s = 0; s < 2; s++) { + H_0(0, s, 1, s, k_ind) = -2. * t * std::cos(k[0]); + H_0(0, s, 2, s, k_ind) = -2. * t * std::cos(k[1]); + H_0(2, s, 3, s, k_ind) = -2. * t * std::cos(k[0]); + H_0(1, s, 3, s, k_ind) = -2. * t * std::cos(k[1]); + + H_0(1, s, 0, s, k_ind) = -2. * t * std::cos(k[0]); + H_0(2, s, 0, s, k_ind) = -2. * t * std::cos(k[1]); + H_0(3, s, 2, s, k_ind) = -2. * t * std::cos(k[0]); + H_0(3, s, 1, s, k_ind) = -2. * t * std::cos(k[1]); + } + } +} + +} // namespace models +} // namespace phys +} // namespace dca + +#endif // DCA_PHYS_MODELS_ANALYTIC_HAMILTONIANS_SQUARE_PLAQUETTE_HUBBARD_HPP diff --git a/include/dca/phys/parameters/four_point_parameters.hpp b/include/dca/phys/parameters/four_point_parameters.hpp index 75c5541ba..d1840d0d4 100644 --- a/include/dca/phys/parameters/four_point_parameters.hpp +++ b/include/dca/phys/parameters/four_point_parameters.hpp @@ -60,8 +60,7 @@ class FourPointParameters { * when the head node has no four point channels. This works around this bug. \todo fix this. 
*/ bool isAccumulatingG4() const { - if (four_point_channels_.size() > 0 && - four_point_channels_[0] != FourPointType::PARTICLE_HOLE_NONE) + if (four_point_channels_.size() > 0 && four_point_channels_[0] != FourPointType::NONE) return true; else return false; @@ -73,7 +72,7 @@ class FourPointParameters { * PARTICLE_HOLE_NONE should never be added. */ void set_four_point_channel(FourPointType type) { - if (type != FourPointType::PARTICLE_HOLE_NONE) + if (type != FourPointType::NONE) four_point_channels_ = std::vector{type}; } void set_four_point_channels(const std::vector& channels) { diff --git a/include/dca/phys/parameters/model_parameters_Kagome_hubbard.inc b/include/dca/phys/parameters/model_parameters_Kagome_hubbard.inc index 66ae5e67b..eb3057af2 100644 --- a/include/dca/phys/parameters/model_parameters_Kagome_hubbard.inc +++ b/include/dca/phys/parameters/model_parameters_Kagome_hubbard.inc @@ -36,9 +36,17 @@ public: void set_U(double U) { U_ = U; } + + double get_V() const { + return V_; + } + void set_V(double V) { + V_ = V; + } private: double t_ = 0; double U_ = 0; + double V_ = 0; }; template @@ -49,6 +57,7 @@ int ModelParameters> buffer_size += concurrency.get_buffer_size(t_); buffer_size += concurrency.get_buffer_size(U_); + buffer_size += concurrency.get_buffer_size(V_); return buffer_size; } @@ -59,6 +68,7 @@ void ModelParameters const Concurrency& concurrency, char* buffer, int buffer_size, int& position) const { concurrency.pack(buffer, buffer_size, position, t_); concurrency.pack(buffer, buffer_size, position, U_); + concurrency.pack(buffer, buffer_size, position, V_); } template @@ -67,6 +77,7 @@ void ModelParameters const Concurrency& concurrency, char* buffer, int buffer_size, int& position) { concurrency.unpack(buffer, buffer_size, position, t_); concurrency.unpack(buffer, buffer_size, position, U_); + concurrency.unpack(buffer, buffer_size, position, V_); } template @@ -91,7 +102,11 @@ void ModelParameters } if (std::abs(U_) <= 1e-3) throw 
std::runtime_error("For Kagome model abs(U) must be greater than 1e-3"); + try { + reader_or_writer.execute("V", V_); + } + catch (const std::exception& r_e) { + } reader_or_writer.close_group(); } - diff --git a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu index 3b4241144..03b8d44a8 100644 --- a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu +++ b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu @@ -254,8 +254,7 @@ void computeGMultiband(std::complex* G, int ldg, const std::complex* template __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, - const GPUComplex>* __restrict__ G_up, - const int ldgu, + const GPUComplex>* __restrict__ G_up, const int ldgu, const GPUComplex>* __restrict__ G_down, const int ldgd, const SignType factor, const bool atomic, const uint64_t start, const uint64_t end) { @@ -306,7 +305,7 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, // the exchange momentum, implies the same operation is performed with the exchange frequency. // See tp_accumulator.hpp for more details. 
if constexpr (type == FourPointType::PARTICLE_HOLE_TRANSVERSE) { - // contribution <- -\sum_s G(k1, k2, s) * G(k2 + k_ex, k1 + k_ex, -s) + // contribution <- -\sum_s G_{b2,b4}(k1, k2, s) * G_{b3,b1}(k2 + k_ex, k1 + k_ex, -s) int w1_a(w1); int w2_a(w2); int k1_a(k1); @@ -316,9 +315,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b4, true); + int i_a = nb * k1_a + no * w1_a + b2; + int j_a = nb * k2_a + no * w2_a + b4; const GPUComplex> Ga_1 = G_up[i_a + ldgu * j_a]; const GPUComplex> Ga_2 = G_down[i_a + ldgd * j_a]; @@ -330,9 +328,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); else g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b3, true); + int i_b = nb * k1_b + no * w1_b + b3; + int j_b = nb * k2_b + no * w2_b + b1; const GPUComplex> Gb_1 = G_down[i_b + ldgd * j_b]; const GPUComplex> Gb_2 = G_up[i_b + ldgu * j_b]; @@ -340,23 +337,21 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, } else if constexpr (type == FourPointType::PARTICLE_HOLE_MAGNETIC) { // The PARTICLE_HOLE_MAGNETIC contribution is computed in two parts: - // Spin Difference Contribution + // Spin Difference Contribution (disconnected diagram) // new scope to reuse local index variables { - // contribution += (\sum_s s * G(k1, k1 + k_ex)) * (\sum_s s * G(k2 + k_ex, k2)) + // contribution += (\sum_s s * G_{b2, b1}(k1, k1 + k_ex)) * (\sum_s s * G_{b3,b4}(k2 + k_ex, k2)) int k1_a = k1; int k2_a = g4_helper.addKex(k1, k_ex); int w1_a(w1); int w2_a(g4_helper.addWex(w1, w_ex)); - // conj_a in this case just tells us whether to swap the band axes additions or not if (g4_helper.get_bands() == 1) g4_helper.extendGIndices(k1_a, k2_a, w1_a, 
w2_a); else g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b3, true); + int i_a = nb * k1_a + no * w1_a + b2; + int j_a = nb * k2_a + no * w2_a + b1; GPUComplex> Ga = G_up[i_a + ldgd * j_a] - G_down[i_a + ldgu * j_a]; int k1_b = g4_helper.addKex(k2, k_ex); @@ -364,21 +359,17 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, int w1_b(g4_helper.addWex(w2, w_ex)); int w2_b(w2); - // if (i_a == j_a) - // Ga += (G_up[i_a + ldgu * j_a] - G_down[i_a + ldgd * j_a]) * - if (g4_helper.get_bands() == 1) g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); else g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b4, true); + int i_b = nb * k1_b + no * w1_b + b3; + int j_b = nb * k2_b + no * w2_b + b4; GPUComplex> Gb = G_up[i_b + ldgd * j_b] - G_down[i_b + ldgu * j_b]; contribution = sign_over_2 * (Ga * Gb); } - // direct contribution <- -\sum_s G(k1, k2, s) * G(k2 + k_ex, k1 + k_ex, s) + // direct contribution <- -\sum_s G_{b2,b4}(k1, k2, s) * G_{b3, b1}(k2 + k_ex, k1 + k_ex, s) { int w1_a(w1); int w2_a(w2); @@ -389,10 +380,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); else g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - i_a += b4; - j_a += b1; + int i_a = nb * k1_a + no * w1_a + b3; + int j_a = nb * k2_a + no * w2_a + b1; GPUComplex> Ga_1 = G_up[i_a + ldgu * j_a]; GPUComplex> Ga_2 = G_down[i_a + ldgd * j_a]; @@ -407,10 +396,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - i_b += b3; - j_b += b2; + int i_b = nb * k1_b + no * w1_b + b2; + int j_b = nb * 
k2_b + no * w2_b + b4; GPUComplex> Gb_1 = G_up[i_b + ldgu * j_b]; GPUComplex> Gb_2 = G_down[i_b + ldgd * j_b]; @@ -430,10 +417,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); else conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - // b1 , b4 - condSwapAdd(i_a, j_a, b1, b4, true); + int i_a = nb * k1_a + no * w1_a + b1; + int j_a = nb * k2_a + no * w2_a + b4; const GPUComplex> Ga_1 = G_up[i_a + ldgu * j_a]; const GPUComplex> Ga_2 = G_down[i_a + ldgd * j_a]; @@ -448,10 +433,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - // b2, b3 - condSwapAdd(i_b, j_b, b2, b3, true); + int i_b = nb * k1_b + no * w1_b + b2; + int j_b = nb * k2_b + no * w2_b + b3; const GPUComplex> Gb_1 = G_up[i_b + ldgu * j_b]; const GPUComplex> Gb_2 = G_down[i_b + ldgd * j_b]; @@ -474,9 +457,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b3, true); + int i_a = nb * k1_a + no * w1_a + b2; + int j_a = nb * k2_a + no * w2_a + b1; const GPUComplex> Ga = G_up[i_a + ldgu * j_a] + G_down[i_a + ldgd * j_a]; @@ -490,174 +472,173 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b4, true); + int i_b = nb * k1_b + no * w1_b + b3; + int j_b = nb * k2_b + no * w2_b + b4; const GPUComplex> Gb = G_up[i_b + ldgu * j_b] + G_down[i_b + ldgd * j_b]; contribution += sign_over_2 * (Ga * Gb); } } - else if constexpr (type == 
FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP) { - // The PARTICLE_HOLE_LONGITUDINAL_UP_UP contribution is computed in two parts: - { - // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, s) - int w1_a(w1); - int w2_a(g4_helper.addWex(w1, w_ex)); - int k1_a = k1; - int k2_a = g4_helper.addKex(k1, k_ex); - bool conj_a = false; - if (g4_helper.get_bands() == 1) - conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); - else - conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - if (conj_a) { - i_a += b4; - j_a += b2; - } - else { - i_a += b2; - j_a += b4; - } - - const GPUComplex> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); - const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); - - int w1_b(g4_helper.addWex(w2, w_ex)); - int w2_b(w2); - int k1_b = g4_helper.addKex(k2, k_ex); - int k2_b = k2; - bool conj_b = false; - if (g4_helper.get_bands() == 1) - conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); - else - conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - if (conj_b) { - i_b += b1; - j_b += b3; - } - else { - i_b += b3; - j_b += b1; - } - - const GPUComplex> Gb_1 = cond_conj(G_up[i_b + ldgd * j_b], conj_b); - const GPUComplex> Gb_2 = cond_conj(G_down[i_b + ldgu * j_b], conj_b); - - contribution = sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); - } - { - // contribution <- -\sum_s G(k1, k2, s) * G(k2 + k_ex, k1 + k_ex, s) - int w1_a(w1); - int w2_a(w2); - int k1_a(k1); - int k2_a(k2); - - bool conj_a = false; - if (g4_helper.get_bands() == 1) - conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); - else - conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - if (conj_a) { - i_a += b4; - j_a += b1; - } - else { - i_a += b1; - j_a += b4; - } - const GPUComplex> 
Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); - const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); - - int w1_b(g4_helper.addWex(w2, w_ex)); - int w2_b(g4_helper.addWex(w1, w_ex)); - int k1_b = g4_helper.addKex(k2, k_ex); - int k2_b = g4_helper.addKex(k1, k_ex); - - bool conj_b = false; - if (g4_helper.get_bands() == 1) - conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); - else - conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - if (conj_b) { - i_b += b3; - j_b += b2; - } - else { - i_b += b2; - j_b += b3; - } - - const GPUComplex> Gb_1 = cond_conj(G_up[i_b + ldgd * j_b], conj_b); - const GPUComplex> Gb_2 = cond_conj(G_down[i_b + ldgu * j_b], conj_b); - - contribution += -sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); - } - } - else if constexpr (type == FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN) { - // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, -s) - int w1_a(w1); - int w2_a(g4_helper.addWex(w1, w_ex)); - int k1_a = k1; - int k2_a = g4_helper.addKex(k1, k_ex); - bool conj_a = false; - if (g4_helper.get_bands() == 1) - conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); - else - conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - if (conj_a) { - i_a += b4; - j_a += b2; - } - else { - i_a += b2; - j_a += b4; - } - - const GPUComplex> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); - const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); - - int w1_b(g4_helper.addWex(w2, w_ex)); - int w2_b(w2); - int k1_b = g4_helper.addKex(k2, k_ex); - int k2_b = k2; - bool conj_b = false; - if (g4_helper.get_bands() == 1) - conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); - else - conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - 
if (conj_b) { - i_b += b1; - j_b += b3; - } - else { - i_b += b3; - j_b += b1; - } - - const GPUComplex> Gb_1 = cond_conj(G_down[i_b + ldgd * j_b], conj_b); - const GPUComplex> Gb_2 = cond_conj(G_up[i_b + ldgu * j_b], conj_b); - - contribution = sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); - } + // else if constexpr (type == FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP) { + // // The PARTICLE_HOLE_LONGITUDINAL_UP_UP contribution is computed in two parts: + // { + // // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, s) + // int w1_a(w1); + // int w2_a(g4_helper.addWex(w1, w_ex)); + // int k1_a = k1; + // int k2_a = g4_helper.addKex(k1, k_ex); + // bool conj_a = false; + // if (g4_helper.get_bands() == 1) + // conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); + // else + // conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); + // int i_a = nb * k1_a + no * w1_a; + // int j_a = nb * k2_a + no * w2_a; + // if (conj_a) { + // i_a += b4; + // j_a += b2; + // } + // else { + // i_a += b2; + // j_a += b4; + // } + + // const GPUComplex> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); + // const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); + + // int w1_b(g4_helper.addWex(w2, w_ex)); + // int w2_b(w2); + // int k1_b = g4_helper.addKex(k2, k_ex); + // int k2_b = k2; + // bool conj_b = false; + // if (g4_helper.get_bands() == 1) + // conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); + // else + // conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); + + // int i_b = nb * k1_b + no * w1_b; + // int j_b = nb * k2_b + no * w2_b; + // if (conj_b) { + // i_b += b1; + // j_b += b3; + // } + // else { + // i_b += b3; + // j_b += b1; + // } + + // const GPUComplex> Gb_1 = cond_conj(G_up[i_b + ldgd * j_b], conj_b); + // const GPUComplex> Gb_2 = cond_conj(G_down[i_b + ldgu * j_b], conj_b); + + // contribution = sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); + // } + // { + // // contribution <- -\sum_s G(k1, 
k2, s) * G(k2 + k_ex, k1 + k_ex, s) + // int w1_a(w1); + // int w2_a(w2); + // int k1_a(k1); + // int k2_a(k2); + + // bool conj_a = false; + // if (g4_helper.get_bands() == 1) + // conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); + // else + // conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); + // int i_a = nb * k1_a + no * w1_a; + // int j_a = nb * k2_a + no * w2_a; + // if (conj_a) { + // i_a += b4; + // j_a += b1; + // } + // else { + // i_a += b1; + // j_a += b4; + // } + // const GPUComplex> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); + // const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); + + // int w1_b(g4_helper.addWex(w2, w_ex)); + // int w2_b(g4_helper.addWex(w1, w_ex)); + // int k1_b = g4_helper.addKex(k2, k_ex); + // int k2_b = g4_helper.addKex(k1, k_ex); + + // bool conj_b = false; + // if (g4_helper.get_bands() == 1) + // conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); + // else + // conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); + + // int i_b = nb * k1_b + no * w1_b; + // int j_b = nb * k2_b + no * w2_b; + // if (conj_b) { + // i_b += b3; + // j_b += b2; + // } + // else { + // i_b += b2; + // j_b += b3; + // } + + // const GPUComplex> Gb_1 = cond_conj(G_up[i_b + ldgd * j_b], conj_b); + // const GPUComplex> Gb_2 = cond_conj(G_down[i_b + ldgu * j_b], conj_b); + + // contribution += -sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); + // } + // } + // else if constexpr (type == FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN) { + // // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, -s) + // int w1_a(w1); + // int w2_a(g4_helper.addWex(w1, w_ex)); + // int k1_a = k1; + // int k2_a = g4_helper.addKex(k1, k_ex); + // bool conj_a = false; + // if (g4_helper.get_bands() == 1) + // conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); + // else + // conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); + // int i_a = nb * k1_a + no * w1_a; + // 
int j_a = nb * k2_a + no * w2_a; + // if (conj_a) { + // i_a += b4; + // j_a += b2; + // } + // else { + // i_a += b2; + // j_a += b4; + // } + + // const GPUComplex> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); + // const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); + + // int w1_b(g4_helper.addWex(w2, w_ex)); + // int w2_b(w2); + // int k1_b = g4_helper.addKex(k2, k_ex); + // int k2_b = k2; + // bool conj_b = false; + // if (g4_helper.get_bands() == 1) + // conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); + // else + // conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); + + // int i_b = nb * k1_b + no * w1_b; + // int j_b = nb * k2_b + no * w2_b; + // if (conj_b) { + // i_b += b1; + // j_b += b3; + // } + // else { + // i_b += b3; + // j_b += b1; + // } + + // const GPUComplex> Gb_1 = cond_conj(G_down[i_b + ldgd * j_b], conj_b); + // const GPUComplex> Gb_2 = cond_conj(G_up[i_b + ldgu * j_b], conj_b); + + // contribution = sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); + // } else if constexpr (type == FourPointType::PARTICLE_PARTICLE_UP_DOWN) { - // contribution <- -\sum_s G(k_ex - k2, k_ex - k1, s) * G(k2, k1, -s). + // contribution <- \sum_s G_{b1,b3}(k1, k2, s) * G_{b2,b4}(kex - k1, kex - k2, -s). 
int w1_a(w1); int w2_a(w2); int k1_a(k1); @@ -667,9 +648,9 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); else g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b3, true); + int i_a = nb * k1_a + no * w1_a + b1; + int j_a = nb * k2_a + no * w2_a + b3; + const GPUComplex> Ga_1 = G_up[i_a + ldgu * j_a]; const GPUComplex> Ga_2 = G_down[i_a + ldgd * j_a]; @@ -683,9 +664,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b4, true); + int i_b = nb * k1_b + no * w1_b + b2; + int j_b = nb * k2_b + no * w2_b + b4; const GPUComplex> Gb_1 = G_down[i_b + ldgd * j_b]; const GPUComplex> Gb_2 = G_up[i_b + ldgu * j_b]; @@ -732,12 +712,12 @@ double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu, const Scalar* G_ case FourPointType::PARTICLE_HOLE_CHARGE: // Each update of a G4 entry involves 3 complex additions and 3 complex multiplications. return 26. * n_updates; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: - // Each update of a G4 entry involves 3 complex additions and 4 complex multiplications. - return 32 * n_updates; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: - // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. - return 18. * n_updates; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: + // // Each update of a G4 entry involves 3 complex additions and 4 complex multiplications. + // return 32 * n_updates; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: + // // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. + // return 18. 
* n_updates; case FourPointType::PARTICLE_PARTICLE_UP_DOWN: // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. return 18. * n_updates; @@ -749,8 +729,7 @@ double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu, const Scalar* G_ template __global__ void updateG4KernelNoSpin(GPUComplex>* __restrict__ G4, const GPUComplex>* __restrict__ G_dn, - const int ldgd, - const SignType factor, const bool atomic, + const int ldgd, const SignType factor, const bool atomic, const uint64_t start, const uint64_t end) { // TODO: reduce code duplication. // TODO: decrease, if possible, register pressure. E.g. a single thread computes all bands. @@ -809,13 +788,11 @@ __global__ void updateG4KernelNoSpin(GPUComplex>* __restrict__ int k2_b = g4_helper.kexMinus(k2, k_ex); g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b3, true); + int i_a = nb * k1_a + no * w1_a + b1; + int j_a = nb * k2_a + no * w2_a + b3; - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b4, true); + int i_b = nb * k1_b + no * w1_b + b2; + int j_b = nb * k2_b + no * w2_b + b4; const GPUComplex> Ga_1 = G_dn[i_a + ldgd * j_a]; const GPUComplex> Gb_1 = G_dn[i_b + ldgd * j_b]; @@ -835,13 +812,11 @@ __global__ void updateG4KernelNoSpin(GPUComplex>* __restrict__ int k2_b(k2); g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b4, true); + int i_a = nb * k1_a + no * w1_a + b1; + int j_a = nb * k2_a + no * w2_a + b4; - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b3, true); + int i_b = nb * k1_b + no * w1_b + b2; + int j_b = nb * k2_b + no * w2_b + b3; const GPUComplex> Ga_1 = G_dn[i_a + ldgd * j_a]; const GPUComplex> Gb_1 = G_dn[i_b + ldgd * j_b]; @@ -857,15 +832,14 @@ 
__global__ void updateG4KernelNoSpin(GPUComplex>* __restrict__ } template -double updateG4NoSpin(Scalar* G4, const Scalar* G_dn, const int ldgd, const SignType factor, bool atomic, cudaStream_t stream, - std::size_t start, std::size_t end) { +double updateG4NoSpin(Scalar* G4, const Scalar* G_dn, const int ldgd, const SignType factor, + bool atomic, cudaStream_t stream, std::size_t start, std::size_t end) { constexpr const std::size_t n_threads = 256; const unsigned n_blocks = dca::util::ceilDiv(end - start, n_threads); using dca::util::GPUTypeConversion; updateG4KernelNoSpin, type><<>>( - castGPUType(G4), castGPUType(G_dn), ldgd, - GPUTypeConversion(factor), atomic, start, end); + castGPUType(G4), castGPUType(G_dn), ldgd, GPUTypeConversion(factor), atomic, start, end); // Check for errors. auto err = cudaPeekAtLastError(); @@ -887,12 +861,12 @@ double updateG4NoSpin(Scalar* G4, const Scalar* G_dn, const int ldgd, const Sign case FourPointType::PARTICLE_HOLE_CHARGE: // Each update of a G4 entry involves 3 complex additions and 3 complex multiplications. return 26. * n_updates; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: - // Each update of a G4 entry involves 3 complex additions and 4 complex multiplications. - return 32 * n_updates; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: - // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. - return 18. * n_updates; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: + // // Each update of a G4 entry involves 3 complex additions and 4 complex multiplications. + // return 32 * n_updates; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: + // // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. + // return 18. * n_updates; case FourPointType::PARTICLE_PARTICLE_UP_DOWN: // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. return 18. 
* n_updates; @@ -901,7 +875,6 @@ double updateG4NoSpin(Scalar* G4, const Scalar* G_dn, const int ldgd, const Sign } } - // Explicit instantiation. template void computeGSingleband(std::complex* G, int ldg, const std::complex* G0, int nk, int nw, @@ -932,17 +905,17 @@ template double updateG4, FourPointType::PARTICLE_HOLE_CHARG const std::complex* G_down, const int ldgd, const std::int8_t factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - std::int8_t>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::int8_t factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, +// std::int8_t>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* G_down, +// const int ldgd, const std::int8_t factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); -template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - std::int8_t>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::int8_t factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, +// std::int8_t>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* G_down, +// const int ldgd, const std::int8_t factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); template double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::int8_t>( std::complex* G4, const std::complex* G_up, const int ldgu, @@ -964,17 +937,17 @@ template double updateG4, FourPointType::PARTICLE_HOLE_CHAR const std::complex* G_down, const int ldgd, const std::int8_t factor, bool atomic, 
cudaStream_t stream, std::size_t start, std::size_t end); -template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - std::int8_t>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::int8_t factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, +// std::int8_t>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* G_down, +// const int ldgd, const std::int8_t factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); -template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - std::int8_t>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::int8_t factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, +// std::int8_t>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* G_down, +// const int ldgd, const std::int8_t factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); template double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::int8_t>( std::complex* G4, const std::complex* G_up, const int ldgu, @@ -983,98 +956,109 @@ template double updateG4, FourPointType::PARTICLE_PARTICLE_ // complex g0 -template double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex* G_down, const int ldgd, const std::complex factor, - bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); +template double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex* G_down, const int ldgd, + 
const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); -template -double updateG4, FourPointType::PARTICLE_HOLE_MAGNETIC, std::complex>( +template double updateG4, FourPointType::PARTICLE_HOLE_MAGNETIC, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template -double updateG4, FourPointType::PARTICLE_HOLE_CHARGE, std::complex>( +template double updateG4, FourPointType::PARTICLE_HOLE_CHARGE, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template -double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - std::complex>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - std::complex>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex* G_down, const int ldgd, const std::complex factor, - bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, std::complex>( +// template +// double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, +// std::complex>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* 
G_down, +// const int ldgd, const std::complex factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); + +// template +// double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, +// std::complex>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* G_down, +// const int ldgd, const std::complex factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); + +template double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex* G_down, const int ldgd, + const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); + +template double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex* G_down, const int ldgd, + const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); + +template double updateG4, FourPointType::PARTICLE_HOLE_MAGNETIC, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex* G_down, const int ldgd, + const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); + +template double updateG4, FourPointType::PARTICLE_HOLE_CHARGE, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template -double updateG4, FourPointType::PARTICLE_HOLE_MAGNETIC, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex* G_down, const int ldgd, const std::complex factor, - bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_HOLE_CHARGE, std::complex>( - 
std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex* G_down, const int ldgd, const std::complex factor, - bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - std::complex>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - std::complex>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex* G_down, const int ldgd, const std::complex factor, - bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, +// std::complex>(std::complex* G4, +// const std::complex* G_up, const int ldgu, +// const std::complex* G_down, const int ldgd, +// const std::complex factor, bool atomic, +// cudaStream_t stream, std::size_t start, +// std::size_t end); + +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, +// std::complex>(std::complex* G4, +// const std::complex* G_up, const int ldgu, +// const std::complex* G_down, const int ldgd, +// const std::complex factor, bool atomic, +// cudaStream_t stream, std::size_t start, +// std::size_t end); + +template double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex* G_down, const int ldgd, + const std::complex factor, bool atomic, + 
cudaStream_t stream, std::size_t start, + std::size_t end); // Non spin symmetric -template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - -template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); +template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); + +template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::int8_t>( std::complex* G4, const std::complex* G_up, const int ldgu, - const std::int8_t factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - - + const std::int8_t factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); // template<> double updateG4< FourPointType::PARTICLE_HOLE_TRANSVERSE>( // std::complex* G4, const std::complex* G_up, const int ldgu, diff --git a/src/phys/four_point_type.cpp b/src/phys/four_point_type.cpp index d9e30e652..ec08a3cf2 100644 --- a/src/phys/four_point_type.cpp +++ b/src/phys/four_point_type.cpp @@ -26,18 +26,18 @@ FourPointType stringToFourPointType(const std::string& name) { return FourPointType::PARTICLE_HOLE_MAGNETIC; else if (name == "PARTICLE_HOLE_CHARGE") return FourPointType::PARTICLE_HOLE_CHARGE; - else if 
(name == "PARTICLE_HOLE_LONGITUDINAL_UP_UP") - return FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP; - else if (name == "PARTICLE_HOLE_LONGITUDINAL_UP_DOWN") - return FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN; + // else if (name == "PARTICLE_HOLE_LONGITUDINAL_UP_UP") + // return FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP; + // else if (name == "PARTICLE_HOLE_LONGITUDINAL_UP_DOWN") + // return FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN; else - return FourPointType::PARTICLE_HOLE_NONE; + return FourPointType::NONE; } std::string toString(const FourPointType type) { switch (type) { - case FourPointType::PARTICLE_HOLE_NONE: - return "PARTICLE_HOLE_NONE"; + case FourPointType::NONE: + return "NONE"; case FourPointType::PARTICLE_PARTICLE_UP_DOWN: return "PARTICLE_PARTICLE_UP_DOWN"; case FourPointType::PARTICLE_HOLE_TRANSVERSE: @@ -46,10 +46,10 @@ std::string toString(const FourPointType type) { return "PARTICLE_HOLE_MAGNETIC"; case FourPointType::PARTICLE_HOLE_CHARGE: return "PARTICLE_HOLE_CHARGE"; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: - return "PARTICLE_HOLE_LONGITUDINAL_UP_UP"; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: - return "PARTICLE_HOLE_LONGITUDINAL_UP_DOWN"; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: + // return "PARTICLE_HOLE_LONGITUDINAL_UP_UP"; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: + // return "PARTICLE_HOLE_LONGITUDINAL_UP_DOWN"; default: throw std::logic_error("Invalid four point mode."); } diff --git a/test/test_automation/ci/run_step.sh b/test/test_automation/ci/run_step.sh index 68b406eb5..d1c615a5b 100755 --- a/test/test_automation/ci/run_step.sh +++ b/test/test_automation/ci/run_step.sh @@ -3,22 +3,22 @@ set -x HOST_NAME=$(hostname -s) -case "$1" in +case "$1" in - # Configure DCA++ using cmake out-of-source builds + # Configure DCA++ using cmake out-of-source builds configure) - + if [ -d ${GITHUB_WORKSPACE}/../dca-build ] then echo "Found existing 
out-of-source build directory ${GITHUB_WORKSPACE}/../dca-build, removing" rm -fr ${GITHUB_WORKSPACE}/../dca-build fi - + echo "Creating new out-of-source build directory ${GITHUB_WORKSPACE}/../dca-build" cd ${GITHUB_WORKSPACE}/.. mkdir dca-build cd dca-build - + # Real or Complex configuration case "${GH_JOBNAME}" in *"Real"*) @@ -28,66 +28,63 @@ case "$1" in *"Complex"*) echo 'Configure for complex build' IS_COMPLEX=1 - ;; + ;; esac if [[ "${GH_JOBNAME}" =~ (-CUDA) ]] then - echo "Set PATH to cuda-11.8 to be associated with the C and C++ compilers" - export PATH=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/cuda-11.8.0-pkdxb42p6no3tfimntdbw37rvn5p6fiw:$PATH + echo "Set PATH to cuda-12.9 to be associated with the C and C++ compilers" + export PATH=/usr/local/cuda-12.9/bin:$PATH echo "Set CUDACXX CMake environment variable to nvcc cuda 11.8" - export CUDACXX=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/cuda-11.8.0-pkdxb42p6no3tfimntdbw37rvn5p6fiw/bin/nvcc + export CUDACXX=/usr/local/cuda-12.9/bin/nvcc # Make current environment variables available to subsequent steps echo "PATH=$PATH" >> $GITHUB_ENV echo "CUDACXX=$CUDACXX" >> $GITHUB_ENV - fi + fi # Mixed or Non-Mixed (default, full) precision, used with GPU code case "${GH_JOBNAME}" in *"Mixed"*) echo 'Configure for mixed precision build' IS_MIXED_PRECISION=1 - ;; + ;; *) IS_MIXED_PRECISION=0 ;; esac # Path to QMC_DATA in self-hosted CI system and point at minimum gcc-9 - if [[ "$HOST_NAME" =~ (v100-again) ]] + if [[ "$HOST_NAME" =~ (a30four) ]] then # use gcc-12 - export PATH=/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/gcc-12.2.0-cx7pjxgmemcce4tohlmsekuo5qvgjqbl/bin:/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/ninja-1.11.1-plzpokehn3kdbcviteppqntkqun5752f/bin:/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/cmake-3.25.0-xlxorwhfz5jxpyx65ypsh2horyo7n3ef/bin:$PATH - export 
LD_LIBRARY_PATH=/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/gcc-12.2.0-cx7pjxgmemcce4tohlmsekuo5qvgjqbl/lib64:$LD_LIBRARY_PATH - export CUDAHOSTCXX=g++ - export MAGMA_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/magma-2.7.1-nrkrnn42vb2ifpkpoutktrdsnxusrdql - export HDF5_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/hdf5-1.14.0-szvumwunzwinu4lhc3kcbbnripfkfcnp - export OPENBLAS_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/openblas-0.3.21-e3gckkrsmcrewtzn6lhkdttiwe5ikp4z - export ADIOS2_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/adios2-2.8.3-ba3mpxuh435rvisxhorzotb53m3dgfuv - export MPI_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/openmpi-4.1.4-mbv4sqfzxaqfwcyhjaaammhsd56dtvl2 - export FFTW_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/fftw-3.3.10-hdyb4eim5ngsgkhqyiyl5ggispxqmzo5 + export PATH=$(spack find -lp llvm | awk '/llvm/{print $3}')/bin:${PATH} + export LD_LIBRARY_PATH=$(spack find -lp gcc/yel5m4n | awk '/gcc/{print $3}')/lib64:$(spack find -lp hdf5 | awk '/hdf5/{print $3}')/lib64:$(spack find -lp llvm/navafcm | awk '/llvm/{print $3}')/lib:${LD_LIBRARY_PATH}:$(spack find -lp tree-sitter-cmake | awk '/tree-sitter-cmake/{print $3}')/lib:${LD_LIBRARY_PATH} + export CUDAHOSTCXX=clang++ + export MAGMA_ROOT=/home/epd/opt_a30/magma + export HDF5_ROOT=$(spack find --loaded -lp hdf5 | awk '/hdf5/{print $3}') + export OPENBLAS_ROOT=$(spack find --loaded -lp openblas | awk '/openblas/{print $3}') + export MPI_ROOT=$(spack find --loaded -lp openmpi | awk '/openmpi/{print $3}') + export FFTW_ROOT=$(spack find --loaded -lp fftw | awk '/fftw/{print $3}') # Make current environment variables available to subsequent steps echo "PATH=$PATH" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV fi - + case "${GH_JOBNAME}" in - *"GCC12-MPI-CUDA-"*) + *"LLVM21-MPI-CUDA-"*) 
echo 'Configure for debug mode to capture asserts with gcc' cmake -GNinja \ - -DCMAKE_C_COMPILER=gcc \ - -DCMAKE_CXX_COMPILER=g++ \ - -DCMAKE_CUDA_COMPILER=${CUDA_HOME}/bin/nvcc \ + -DCMAKE_C_COMPILER=mpicc \ + -DCMAKE_CXX_COMPILER=mpic++ \ + -DDCA_WITH_CUDA=1 \ -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler \ - -DCMAKE_CUDA_ARCHITECTURES=70 \ - -DCMAKE_FORTRAN_COMPILER=gfortran -DCMAKE_EXE_LINKER_FLAGS_INIT="-lgfortran" -DCMAKE_SHARED_LINKER_FLAGS_INIT="-lgfortran" -DCMAKE_MODULE_LINKER_FLAGS_INIT="-lgfortran" \ + -DCMAKE_CUDA_ARCHITECTURES=80 \ -DDCA_WITH_MPI=1 \ -DCMAKE_PREFIX_PATH=${MPI_ROOT}\;${CUDA_ROOT}\;${MAGMA_ROOT}\;${HDF5_ROOT}\;${OPENBLAS_ROOT}\;${ADIOS2_ROOT}\;${FFTW_ROOT} \ -DCMAKE_BUILD_TYPE=Release \ -DTEST_RUNNER="mpiexec" \ -DMPIEXEC_NUMPROC_FLAG="-n" -DMPIEXEC_PREFLAGS="-mca btl self,tcp" -DDCA_WITH_CUDA=1 -DDCA_WITH_ADIOS2=1 \ -DDCA_WITH_TESTS_FAST=1 \ - -DLAPACK_LIBRARIES=${OPENBLAS_ROOT}/lib/libopenblas.a \ ${GITHUB_WORKSPACE} ;; esac @@ -107,17 +104,17 @@ case "$1" in # Run deterministic tests test) - + # Run only deterministic tests (reasonable for CI) by default TEST_LABEL="" - export MPI_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/openmpi-4.1.5-wzdq6mz5bbln7noxsytymadsh6lw3rag + export MPI_ROOT=$(spack find --loaded -lp openmpi | awk '/openmpi/{print $3}') export PATH=${MPI_ROOT}/bin:${PATH} cd ${GITHUB_WORKSPACE}/../dca-build - - # Add ctest concurrent parallel jobs + + # Add ctest concurrent parallel jobs # Default for Linux GitHub Action runners CTEST_JOBS="1" - + ctest --output-on-failure $TEST_LABEL -j $CTEST_JOBS ;; diff --git a/test/unit/io/CMakeLists.txt b/test/unit/io/CMakeLists.txt index 8783cdb58..a6a159f4b 100644 --- a/test/unit/io/CMakeLists.txt +++ b/test/unit/io/CMakeLists.txt @@ -15,8 +15,14 @@ dca_add_gtest(json_reader_test GTEST_MAIN LIBS json) +if (DCA_HAVE_GPU) + set(DCA_FORTRAN_LIBS magma::magma) +endif() + dca_add_gtest(reader_writer_test - LIBS dca_io parallel_mpi_concurrency function 
${DCA_THREADING_LIBS}) + LIBS dca_io parallel_mpi_concurrency function ${DCA_THREADING_LIBS} + ${DCA_FORTRAN_LIBS} +) dca_add_gtest(io_types_test GTEST_MAIN @@ -34,4 +40,3 @@ if (DCA_HAVE_ADIOS2) LIBS parallel_no_concurrency dca_io function) endif() endif() - diff --git a/test/unit/math/function_transform/space_transform_2D_gpu_test.cpp b/test/unit/math/function_transform/space_transform_2D_gpu_test.cpp index 319d0099c..b131cf3a9 100644 --- a/test/unit/math/function_transform/space_transform_2D_gpu_test.cpp +++ b/test/unit/math/function_transform/space_transform_2D_gpu_test.cpp @@ -55,13 +55,12 @@ using Model = dca::phys::models::TightBindingModel< template using NumTraits = dca::NumericalTraits, SCALAR>; - const std::string input_dir = DCA_SOURCE_DIR "/test/unit/math/function_transform/"; using BDmn = dca::func::dmn_0; using SDmn = dca::func::dmn_0; using WPosDmn = - dca::func::dmn_0>; + dca::func::dmn_0>; using WDmn = dca::func::dmn_0>; @@ -77,13 +76,12 @@ template using SpaceTransform2DGpuTest = ::testing::Test; using TestTypes = ::testing::Types; //, std::complex>; - TYPED_TEST_CASE(SpaceTransform2DGpuTest, TestTypes); TYPED_TEST(SpaceTransform2DGpuTest, Execute) { using Scalar = TypeParam; using dca::util::castGPUType; - + Parameters pars(dca::util::GitVersion::string(), *concurrency_ptr); using KDmn = typename Parameters::KClusterDmn; @@ -100,14 +98,14 @@ TYPED_TEST(SpaceTransform2DGpuTest, Execute) { function> f_in; RMatrix M_in; - // Initialize the input function. 
const int nb = BDmn::dmn_size(); const int nr = RDmn::dmn_size(); const int nw = WPosDmn::dmn_size(); - std::cout << "nBDmn:" << nb << " nRDmn:" << nr << " nw:" << nw << '|' << WDmn::dmn_size() << " ns:" << SDmn::dmn_size() << '\n'; - + std::cout << "nBDmn:" << nb << " nRDmn:" << nr << " nw:" << nw << '|' << WDmn::dmn_size() + << " ns:" << SDmn::dmn_size() << '\n'; + M_in.resizeNoCopy(std::make_pair(nb * nr * nw, nb * nr * nw)); for (int w2 = 0; w2 < nw; ++w2) for (int w1 = 0; w1 < nw; ++w1) @@ -120,20 +118,20 @@ TYPED_TEST(SpaceTransform2DGpuTest, Execute) { auto index = [=](int r, int b, int w) { return r + nr * b + nb * nr * w; }; f_in(r1, r2, b1, b2, 0, w1, w2) = val; - M_in(index(r1, b1, w1), index(r2, b2, w2)) = val; + M_in(index(r1, b1, w1), index(r2, b2, w2)) = val; } // Transform on the CPU. function> f_out; - dca::math::transform::SpaceTransform2D::execute(f_in, f_out); + dca::math::transform::SpaceTransform2D::execute(f_in, f_out); // Transform on the GPU. dca::linalg::ReshapableMatrix, dca::linalg::GPU> M_dev(M_in); dca::linalg::util::MagmaQueue queue; - dca::math::transform::SpaceTransform2DGpu> transform_obj( - nw, queue); + dca::math::transform::SpaceTransform2DGpu> + transform_obj(nw, queue); transform_obj.execute(M_dev); queue.getStream().sync(); @@ -145,10 +143,9 @@ TYPED_TEST(SpaceTransform2DGpuTest, Execute) { dca::io::Writer writer(*concurrency_ptr, "ADIOS2", true); dca::io::Writer writer_h5(*concurrency_ptr, "HDF5", true); - //writer.open_file("tp_single_band_gpu_test_G4.bp"); + // writer.open_file("tp_single_band_gpu_test_G4.bp"); writer_h5.open_file("space_transform_2D_gpu_test.hdf5"); - // writer.execute("m_outDevice", M_out); // writer.execute("m_outHost", f_out); writer_h5.execute("m_outDevice", M_out); @@ -168,9 +165,8 @@ TYPED_TEST(SpaceTransform2DGpuTest, Execute) { for (int b2 = 0; b2 < nb; ++b2) for (int b1 = 0; b1 < nb; ++b1) { const Complex val1 = f_out(b1, b2, 0, r1, r2, w1, w2); - auto index = [=](int k, int b, int w) { return b 
+ nb * k + nb * nr * w; }; - const Complex val2 = - M_out(index(r1, b1, w1), index(r2, b2, w2)); + auto index = [=](int k, int b, int w) { return b + nb * k + nb * nr * w; }; + const Complex val2 = M_out(index(r1, b1, w1), index(r2, b2, w2)); EXPECT_LE(std::abs(val1 - val2), tolerance); } } diff --git a/test/unit/math/function_transform/space_transform_2D_test.cpp b/test/unit/math/function_transform/space_transform_2D_test.cpp index 1dee64bb6..e72a62e85 100644 --- a/test/unit/math/function_transform/space_transform_2D_test.cpp +++ b/test/unit/math/function_transform/space_transform_2D_test.cpp @@ -47,16 +47,15 @@ using WPosDmn = using WDmn = dca::func::dmn_0>; -template - using Parameters = - dca::phys::params::Parameters>; - +template +using Parameters = + dca::phys::params::Parameters>; // Perform the test in double and single precision. template class SpaceTransform2DTest : public ::testing::Test {}; -using TestTypes = ::testing::Types; //, std::complex>; +using TestTypes = ::testing::Types; //, std::complex>; TYPED_TEST_CASE(SpaceTransform2DTest, TestTypes); TYPED_TEST(SpaceTransform2DTest, Execute) { @@ -84,7 +83,7 @@ TYPED_TEST(SpaceTransform2DTest, Execute) { auto f_in_cpy = f_in; dca::func::function> f_out; - dca::math::transform::SpaceTransform2D::execute(f_in_cpy, f_out); + dca::math::transform::SpaceTransform2D::execute(f_in_cpy, f_out); const auto im = std::complex(0, 1); diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt index 7f99ed90f..6d35d6df5 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt @@ -44,20 +44,68 @@ set(TPACC_PARTICLE_HOLE_LIBS gpu_utils cluster_domains time_and_frequency_domain if (DCA_HAVE_ADIOS2) set(TPACC_PARTICLE_HOLE_LIBS dca_adios2 ${TPACC_PARTICLE_HOLE_LIBS}) endif() 
- -dca_add_gtest(tp_accumulator_particle_hole_test + +# dca_add_gtest(tp_accumulator_particle_hole_test +# FAST +# CUDA +# GTEST_MPI_MAIN +# INCLUDE_DIRS ${PROJECT_SOURCE_DIR} ${FFTW_INCLUDE_DIR} ${HDF5_INCLUDE_DIR} +# LIBS function FFTW::Double cluster_domains time_and_frequency_domains quantum_domains timer json random +# enumerations dca_algorithms ${DCA_THREADING_LIBS} dca_io parallel_util ${LAPACK_LIBRARIES} gpu_utils parallel_no_concurrency) + +dca_add_gtest(tp_accumulator_gpu_test FAST CUDA - GTEST_MPI_MAIN - INCLUDE_DIRS ${PROJECT_SOURCE_DIR} ${FFTW_INCLUDE_DIR} ${HDF5_INCLUDE_DIR} - LIBS function FFTW::Double cluster_domains time_and_frequency_domains quantum_domains timer json random - enumerations dca_algorithms ${DCA_THREADING_LIBS} dca_io parallel_util ${LAPACK_LIBRARIES} gpu_utils parallel_no_concurrency) + INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} + LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES DoubleSquare -dca_add_gtest(tp_accumulator_gpu_test +) + +dca_add_gtest(tp_accumulator_rashba_gpu_test + FAST + CUDA + INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} + LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES ComplexDoubleRashba + ) + +dca_add_gtest(tp_accumulator_moire_hubbard_gpu_test + FAST + CUDA + INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} + LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES ComplexDoubleMoireHubbard + ) + +dca_add_gtest(tp_accumulator_hund_gpu_test + FAST + CUDA + INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} + LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES DoubleHund + ) + +dca_add_gtest(tp_accumulator_DoubleKagome_gpu_test + FAST + CUDA + INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} + LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES DoubleKagome + 
) + +dca_add_gtest(tp_accumulator_FeAs_gpu_test FAST CUDA INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES DoubleFeAs ) dca_add_gtest(tp_accumulator_complex_g0_gpu_test diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json index 6b997974b..f0a4d4b9b 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json @@ -1,46 +1,48 @@ { - "output" : - { - "output-format" : "HDF5", + "output": { + "output-format": "HDF5", - "output-ED" : "ed_results.hdf5", - "output-QMC" : "output_QMC.hdf5" + "output-ED": "ed_results.hdf5", + "output-QMC": "output_QMC.hdf5" }, - "physics" : - { - "beta" : 4, - "chemical-potential" : 0 + "physics": { + "beta": 4, + "chemical-potential": 0 }, - "bilayer-Hubbard-model": - { - "t" : 1.0, - "t_prime" : 0.1, - "t_perp" : 0.01, - "U" : 1.0, - "V" : 1.0, - "V-prime" : 2.0 + "Kagome-Hubbard-model": { + "t": 1, + "U": 2 }, - "Hund-model": - { - "t" : 1, - "U" : 1, - "V" : 1, - "Jh" : 2 + "bilayer-Hubbard-model": { + "t": 1.0, + "t_prime": 0.1, + "t_perp": 0.01, + "U": 1.0, + "V": 1.0, + "V-prime": 2.0 }, - "single-band-Hubbard-model": - { - "t" : 1, - "U" : 2 + "Hund-model": { + "t": 1, + "U": 1, + "V": 1, + "Jh": 2 + }, + + "single-band-Hubbard-model": { + "t": 1, + "U": 2 }, "domains": { "real-space-grids": { - "cluster": [[4, 0], - [0, 4]] + "cluster": [ + [4, 0], + [0, 4] + ] }, "imaginary-time": { @@ -49,35 +51,31 @@ "imaginary-frequency": { "sp-fermionic-frequencies": 512, - "four-point-fermionic-frequencies" : 8 + "four-point-fermionic-frequencies": 8 } }, "four-point": { "type": "NONE", - "momentum-transfer": [0., 3.1415], + "momentum-transfer": [0, 3.1415], "frequency-transfer": 0 }, + 
"Monte-Carlo-integration": { + "Sigma-file": "zero", - "Monte-Carlo-integration" : - { - "Sigma-file" : "zero", + "warm-up-sweeps": 100, + "sweeps-per-measurement": 1, + "measurements-per-process-and-accumulator": 500, + "seed": 0, - "warm-up-sweeps" : 100, - "sweeps-per-measurement" : 1, - "measurements-per-process-and-accumulator" : 500, - "seed" : 0, - - "threaded-solver" : { - "accumulators" : 3 + "threaded-solver": { + "accumulators": 3 } }, - - "CT-INT" : - { - "initial-configuration-size" :5, - "alpha-dd-pos" : 0.51 + "CT-INT": { + "initial-configuration-size": 5, + "alpha-dd-pos": 0.51 } } diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_2_transfer.json b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_2_transfer.json new file mode 100644 index 000000000..c69c8c13b --- /dev/null +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_2_transfer.json @@ -0,0 +1,81 @@ +{ + "output": { + "output-format": "HDF5", + + "output-ED": "ed_results.hdf5", + "output-QMC": "output_QMC.hdf5" + }, + + "physics": { + "beta": 4, + "chemical-potential": 0 + }, + + "Kagome-Hubbard-model": { + "t": 1, + "U": 2 + }, + + "bilayer-Hubbard-model": { + "t": 1.0, + "t_prime": 0.1, + "t_perp": 0.01, + "U": 1.0, + "V": 1.0, + "V-prime": 2.0 + }, + + "Hund-model": { + "t": 1, + "U": 1, + "V": 1, + "Jh": 2 + }, + + "single-band-Hubbard-model": { + "t": 1, + "U": 2 + }, + + "domains": { + "real-space-grids": { + "cluster": [ + [4, 0], + [0, 4] + ] + }, + + "imaginary-time": { + "sp-time-intervals": 512 + }, + + "imaginary-frequency": { + "sp-fermionic-frequencies": 512, + "four-point-fermionic-frequencies": 8 + } + }, + + "four-point": { + "type": "NONE", + "momentum-transfer": [0, 3.1415], + "frequency-transfer": 2 + }, + + "Monte-Carlo-integration": { + "Sigma-file": "zero", + + "warm-up-sweeps": 100, + "sweeps-per-measurement": 1, + "measurements-per-process-and-accumulator": 500, + 
"seed": 0, + + "threaded-solver": { + "accumulators": 3 + } + }, + + "CT-INT": { + "initial-configuration-size": 5, + "alpha-dd-pos": 0.51 + } +} diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_complex.json b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_complex.json index 85b1e5175..e5e5bf705 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_complex.json +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_complex.json @@ -1,42 +1,46 @@ { - "output" : - { - "output-format" : "HDF5", + "output": { + "output-format": "HDF5", - "output-ED" : "ed_results.hdf5", - "output-QMC" : "output_QMC.hdf5" + "output-ED": "ed_results.hdf5", + "output-QMC": "output_QMC.hdf5" }, - "physics" : - { - "beta" : 2, - "chemical-potential" : 0 + "physics": { + "beta": 2, + "chemical-potential": 0 }, - "bilayer-Hubbard-model": - { - "t" : 1, - "U" : 2 + "bilayer-Hubbard-model": { + "t": 1, + "U": 2 }, - "Hund-model": - { - "t" : 1, - "U" : 1, - "V" : 1, - "Jh" : 2 + "Hund-model": { + "t": 1, + "U": 1, + "V": 1, + "Jh": 2 }, - "single-band-Hubbard-model": - { - "t" : 1, - "U" : 2 + "Moire-Hubbard-model": { + "t": 1, + "h": 0, + "U": 1, + "phi": 0 + }, + + "single-band-Hubbard-model": { + "t": 1, + "U": 2 }, "domains": { "real-space-grids": { - "cluster": [[4, 0], - [0, 4]] + "cluster": [ + [4, 0], + [0, 4] + ] }, "imaginary-time": { @@ -45,35 +49,31 @@ "imaginary-frequency": { "sp-fermionic-frequencies": 512, - "four-point-fermionic-frequencies" : 8 + "four-point-fermionic-frequencies": 8 } }, "four-point": { - "type": "NONE", - "momentum-transfer": [3.1415, 0.000], + "channels": ["PARTICLE_PARTICLE_UP_DOWN"], + "momentum-transfer": [3.1415, 0.0], "frequency-transfer": 2 }, + "Monte-Carlo-integration": { + "Sigma-file": "zero", - "Monte-Carlo-integration" : - { - "Sigma-file" : "zero", + "warm-up-sweeps": 100, + "sweeps-per-measurement": 1, + 
"measurements-per-process-and-accumulator": 500, + "seed": 0, - "warm-up-sweeps" : 100, - "sweeps-per-measurement" : 1, - "measurements-per-process-and-accumulator" : 500, - "seed" : 0, - - "threaded-solver" : { - "accumulators" : 3 + "threaded-solver": { + "accumulators": 3 } }, - - "CT-INT" : - { - "initial-configuration-size" :5, - "alpha-dd-pos" : 0.51 + "CT-INT": { + "initial-configuration-size": 5, + "alpha-dd-pos": 0.51 } } diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_test.cpp index 1eb09a529..c5d04944f 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_test.cpp @@ -39,7 +39,9 @@ using McOptions = MockMcOptions; #define INPUT_DIR \ DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/" -constexpr char input_file[] = INPUT_DIR "input_4x4.json"; +#define INPUT_FILE_4_4 INPUT_DIR "input_4x4_2_transfer.json" +#define INPUT_FILE_4_4_COMPLEX INPUT_DIR "input_4x4_complex.json"; +#define INPUT_FILE_4_4_KAGOME INPUT_DIR "input_4x4_multitransfer_kagome.json"; #ifdef DCA_HAVE_ADIOS2 adios2::ADIOS* adios_ptr; @@ -57,9 +59,53 @@ using ConfigGenerator = dca::testing::AccumulationTest; using Configuration = ConfigGenerator::Configuration; using Sample = ConfigGenerator::Sample; -template +struct DoubleSquare { + using Scalar = double; + using Lattice = dca::testing::LatticeSquare; + static constexpr char Input[] = INPUT_FILE_4_4; +}; + +struct DoubleBilayer { + using Scalar = double; + using Lattice = dca::testing::LatticeBilayer; + static constexpr char Input[] = INPUT_FILE_4_4; +}; + +struct DoubleHund { + using Scalar = double; + using Lattice = dca::testing::LatticeHund; + static constexpr char Input[] = INPUT_FILE_4_4; +}; + +struct 
DoubleKagome { + using Scalar = double; + using Lattice = dca::testing::LatticeKagome; + static constexpr char Input[] = INPUT_FILE_4_4_KAGOME; +}; + +struct ComplexDoubleRashba { + using Scalar = std::complex; + using Lattice = dca::testing::LatticeRashba; + static constexpr char Input[] = INPUT_FILE_4_4_COMPLEX; +}; + +struct ComplexDoubleMoireHubbard { + using Scalar = std::complex; + using Lattice = dca::testing::LatticeMoireHubbard; + static constexpr char Input[] = INPUT_FILE_4_4_COMPLEX; +}; + +struct DoubleFeAs { + using Scalar = double; + using Lattice = dca::testing::LatticeFeAs; + static constexpr char Input[] = INPUT_FILE_4_4; +}; + +template struct TpAccumulatorGpuTest : public ::testing::Test { - using G0Setup = dca::testing::G0SetupBare; + using G0Setup = + dca::testing::G0SetupBare; virtual void SetUp() { host_setup.SetUp(); gpu_setup.SetUp(); @@ -74,30 +120,50 @@ uint loop_counter = 0; constexpr bool write_G4s = true; -using TestTypes = ::testing::Types; +// Running more than one of these we run into the issue with Space +// transform reinitialization and there will be failures aftrer the +// first. The work around is to build this test several times until +// fix the space transform global and possibly others. This is +// clearly something that has helped make many areas of the code less +// testable. 
+ +using TestTypes = + ::testing::Types; // ComplexDoubleMoireHubbard,, + // DoubleSquare, DoubleKagome, , ComplexDoubleRashba, + // DoubleFeAs>; // DoubleBilayer, DoubleHund, TYPED_TEST_CASE(TpAccumulatorGpuTest, TestTypes); TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { const std::array n{22, 22}; Sample M; Configuration config; - using Scalar = TypeParam; - ConfigGenerator::prepareConfiguration(config, M, TpAccumulatorGpuTest::G0Setup::BDmn::dmn_size(), - TpAccumulatorGpuTest::G0Setup::RDmn::dmn_size(), + using TestParam = TypeParam; + using TestType = TpAccumulatorGpuTest; + ConfigGenerator::prepareConfiguration(config, M, + TpAccumulatorGpuTest::G0Setup::BDmn::dmn_size(), + TpAccumulatorGpuTest::G0Setup::RDmn::dmn_size(), this->host_setup.parameters_.get_beta(), n); using namespace dca::phys; - std::vector four_point_channels{ - FourPointType::PARTICLE_HOLE_TRANSVERSE, FourPointType::PARTICLE_HOLE_MAGNETIC, - FourPointType::PARTICLE_HOLE_CHARGE, FourPointType::PARTICLE_PARTICLE_UP_DOWN}; - //FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, + + std::vector four_point_channels; + if (TestType::G0Setup::LatticeType::spin_symmetric) + four_point_channels = { + FourPointType::PARTICLE_HOLE_TRANSVERSE, FourPointType::PARTICLE_HOLE_MAGNETIC, + FourPointType::PARTICLE_HOLE_CHARGE, FourPointType::PARTICLE_PARTICLE_UP_DOWN}; + else + four_point_channels = {FourPointType::PARTICLE_PARTICLE_UP_DOWN}; + + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, this->host_setup.parameters_.set_four_point_channels(four_point_channels); this->gpu_setup.parameters_.set_four_point_channels(four_point_channels); - dca::phys::solver::accumulator::TpAccumulatorhost_setup.parameters_), dca::DistType::NONE, dca::linalg::CPU> + dca::phys::solver::accumulator::TpAccumulatorhost_setup.parameters_), + dca::DistType::NONE, dca::linalg::CPU> 
accumulatorHost(this->host_setup.data_->G0_k_w_cluster_excluded, this->host_setup.parameters_); - dca::phys::solver::accumulator::TpAccumulatorhost_setup.parameters_), dca::DistType::NONE, dca::linalg::GPU> + dca::phys::solver::accumulator::TpAccumulatorhost_setup.parameters_), + dca::DistType::NONE, dca::linalg::GPU> accumulatorDevice(this->gpu_setup.data_->G0_k_w_cluster_excluded, this->gpu_setup.parameters_); const int8_t sign = 1; @@ -126,7 +192,8 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { this->host_setup.data_->write(writer_h5); for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) { - std::string channel_str = dca::phys::toString(this->host_setup.parameters_.get_four_point_channels()[channel]); + std::string channel_str = + dca::phys::toString(this->host_setup.parameters_.get_four_point_channels()[channel]); writer.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]); writer.execute("accumulatorDevice_" + channel_str, accumulatorDevice.get_G4()[channel]); writer_h5.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]); @@ -139,11 +206,12 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { #endif std::cout << "blocks: " << dca::util::ceilDiv(int(accumulatorHost.get_G4()[0].size()), 256) << '\n'; - + for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) { auto diff = dca::func::util::difference(accumulatorHost.get_G4()[channel], accumulatorDevice.get_G4()[channel]); - EXPECT_GT(5e-7, diff.l_inf) << "channel: " << dca::phys::toString(four_point_channels[channel]); + EXPECT_GT(5e-7, diff.l_inf) << "channel: " << dca::phys::toString(four_point_channels[channel]) + << " diff.l_inf == " << diff.l_inf << "\n"; } } @@ -165,7 +233,6 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { // Accumulator accumulator3(data_->G0_k_w_cluster_excluded, parameters_, 3); // //accumulator3.set_multiple_accumulators(true); - // auto prepare_configuration = [&](auto& M, 
auto& configuration, const auto& n) { // ConfigGenerator::prepareConfiguration(M, configuration, TpAccumulatorGpuTest::BDmn::dmn_size(), // TpAccumulatorGpuTest::RDmn::dmn_size(), @@ -188,7 +255,7 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { // accumulator_sum.resetAccumulation(loop_id, flag); // // This is a bandaid for CI. -// std::chrono::microseconds sleep_time(10000); +// std::chrono::microseconds sleep_time(10000); // std::this_thread::sleep_for(sleep_time); // // there is a data race here between the resets above @@ -197,8 +264,8 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { // accumulator1.accumulate(M1, config1, sign); // accumulator2.accumulate(M2, config2, sign); // accumulator1.sumTo(accumulator_sum); -// accumulator2.sumTo(accumulator_sum); - +// accumulator2.sumTo(accumulator_sum); + // accumulator_sum.finalize(); // // Reset the G4 on the GPU to zero. @@ -212,14 +279,15 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { // accumulator3.accumulate(M1, config1, sign); // accumulator3.accumulate(M2, config2, sign); // accumulator3.finalize(); - + // // auto acc3_it = accumulator3.get_G4()[0].begin(); // // auto acc_sum_it = accumulator_sum.get_G4()[0].begin(); // // auto acc3_end = accumulator3.get_G4()[0].end(); // // int index = 0; // // while(acc3_it != acc3_end) { -// // EXPECT_NEAR(acc_sum_it->real(), acc3_it->real(), 1E-4) << "index = " << dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)); +// // EXPECT_NEAR(acc_sum_it->real(), acc3_it->real(), 1E-4) << "index = " << +// dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)); // // ++acc3_it; // // ++acc_sum_it; // // ++index; @@ -232,13 +300,17 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { // if (diff.l_inf < 5e-7) { // std::cout << "succeed with diff.l_inf = " << diff.l_inf << '\n'; // int index = 10; -// std::cout << "values at " << dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)) -// << " sum: " << 
accumulator_sum.get_G4()[0](index) << " just_acc: " << accumulator3.get_G4()[0](index) << '\n'; +// std::cout << "values at " << +// dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)) +// << " sum: " << accumulator_sum.get_G4()[0](index) << " just_acc: " << +// accumulator3.get_G4()[0](index) << '\n'; // } // else { // int index = 10; -// std::cout << "values at " << dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)) -// << " sum: " << accumulator_sum.get_G4()[0](index) << " just_acc: " << accumulator3.get_G4()[0](index) << '\n'; +// std::cout << "values at " << +// dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)) +// << " sum: " << accumulator_sum.get_G4()[0](index) << " just_acc: " << +// accumulator3.get_G4()[0](index) << '\n'; // } // } diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_particle_hole_test.cpp b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_particle_hole_test.cpp index 3209f1fa5..8749158c2 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_particle_hole_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_particle_hole_test.cpp @@ -12,7 +12,6 @@ // channels based on the relation between the particle-hole longitudinal up-up and up-down channels, // and the particle-hole magnetic and charge channels. 
- #include "dca/platform/dca_gpu.h" using Scalar = double; @@ -45,7 +44,7 @@ constexpr char input_file[] = INPUT_DIR "input_tp_accumulator_particle_hole_test using Scalar = double; using TpAccumulatorTest = - dca::testing::G0Setup; + dca::testing::G0Setup; TEST_F(TpAccumulatorTest, ParticleHoleChannels) { using TpAccumulatorType = dca::phys::solver::accumulator::TpAccumulator; @@ -60,9 +59,10 @@ TEST_F(TpAccumulatorTest, ParticleHoleChannels) { TpAccumulatorTest::RDmn::dmn_size(), parameters_.get_beta(), n); - parameters_.set_four_point_channels(std::vector{ - dca::phys::FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, dca::phys::FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - dca::phys::FourPointType::PARTICLE_HOLE_MAGNETIC, dca::phys::FourPointType::PARTICLE_HOLE_CHARGE}); + parameters_.set_four_point_channels( + std::vector{dca::phys::FourPointType::PARTICLE_HOLE_TRANSVERSE, + dca::phys::FourPointType::PARTICLE_HOLE_MAGNETIC, + dca::phys::FourPointType::PARTICLE_HOLE_CHARGE}); TpAccumulatorType accumulator_ph(data_->G0_k_w_cluster_excluded, parameters_); @@ -103,7 +103,7 @@ TEST_F(TpAccumulatorTest, ParticleHoleChannels) { } const auto diff_up_up = dca::func::util::difference(G4_ph_long_up_up, G4_ph_long_up_up_check); - EXPECT_LT(diff_up_up.l_inf, 100 * std::numeric_limits::epsilon()); + EXPECT_LT(diff_up_up.l_inf, 100 * std::numeric_limits::epsilon()) << "G4_p4_long_up_up.getDomainSizes() " << dca::vectorToString(G4_ph_long_up_up.getDomainSizes()); const auto diff_up_down = dca::func::util::difference(G4_ph_long_up_down, G4_ph_long_up_down_check); EXPECT_LT(diff_up_down.l_inf, 100 * std::numeric_limits::epsilon()); diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_gpu_test.cpp index 3123f8404..7a3c8f033 100644 --- 
a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_gpu_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_gpu_test.cpp @@ -78,8 +78,8 @@ TEST_F(TpAccumulatorGpuSinglebandTest, Accumulate) { std::vector four_point_channels{FourPointType::PARTICLE_HOLE_TRANSVERSE, FourPointType::PARTICLE_HOLE_MAGNETIC, FourPointType::PARTICLE_HOLE_CHARGE, - FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, FourPointType::PARTICLE_PARTICLE_UP_DOWN}; parameters_.set_four_point_channels(four_point_channels); diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex.cpp b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex.cpp index 63211cf2a..2ee9f674b 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex.cpp +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex.cpp @@ -37,7 +37,6 @@ using McOptions = MockMcOptions; #include "dca/phys/four_point_type.hpp" #include "test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/accumulation_test.hpp" - constexpr bool update_baseline = false; #define INPUT_DIR \ @@ -51,8 +50,8 @@ using Sample = ConfigGenerator::Sample; using Scalar = std::complex; -using TpAccumulatorTest = - dca::testing::G0Setup; +using TpAccumulatorTest = dca::testing::G0Setup; TEST_F(TpAccumulatorTest, Accumulate) { const std::array n{18, 22}; @@ -80,8 +79,7 @@ TEST_F(TpAccumulatorTest, Accumulate) { func_names[dca::phys::FourPointType::PARTICLE_HOLE_CHARGE] = "G4_ph_charge"; func_names[dca::phys::FourPointType::PARTICLE_PARTICLE_UP_DOWN] = "G4_pp_up_down"; - for (const dca::phys::FourPointType type : - 
{dca::phys::FourPointType::PARTICLE_PARTICLE_UP_DOWN}) { + for (const dca::phys::FourPointType type : {dca::phys::FourPointType::PARTICLE_PARTICLE_UP_DOWN}) { parameters_.set_four_point_channel(type); dca::phys::solver::accumulator::TpAccumulator accumulator( diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex_baseline.hdf5 b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex_baseline.hdf5 index b1cb4e33a..99e7f9a7d 100644 Binary files a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex_baseline.hdf5 and b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex_baseline.hdf5 differ diff --git a/test/unit/phys/dca_step/cluster_solver/test_setup.hpp b/test/unit/phys/dca_step/cluster_solver/test_setup.hpp index a86fad3e8..2cf8821e3 100644 --- a/test/unit/phys/dca_step/cluster_solver/test_setup.hpp +++ b/test/unit/phys/dca_step/cluster_solver/test_setup.hpp @@ -92,11 +92,12 @@ struct G0Setup : public ::testing::Test { throw std::runtime_error("Input parsing failed!"); } parameters_.update_model(); - static bool domain_initialized = false; - if (!domain_initialized) { - parameters_.update_domains(); - domain_initialized = true; - } + parameters_.update_domains(); + // static bool domain_initialized = false; + // if (!domain_initialized) { + // parameters_.update_domains(); + // domain_initialized = true; + // } data_ = std::make_unique(parameters_); data_->initialize(); } diff --git a/test/unit/phys/parameters/four_point_parameters/four_point_parameters_test.cpp b/test/unit/phys/parameters/four_point_parameters/four_point_parameters_test.cpp index 988f7048c..61fde05e7 100644 --- a/test/unit/phys/parameters/four_point_parameters/four_point_parameters_test.cpp +++ b/test/unit/phys/parameters/four_point_parameters/four_point_parameters_test.cpp @@ -30,12 +30,12 @@ 
TEST(FourPointParametersTest, DefaultValues) { } using namespace dca::phys; -const std::vector all_channels{FourPointType::PARTICLE_HOLE_TRANSVERSE, - FourPointType::PARTICLE_HOLE_MAGNETIC, - FourPointType::PARTICLE_HOLE_CHARGE, - FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - FourPointType::PARTICLE_PARTICLE_UP_DOWN}; +const std::vector all_channels{ + FourPointType::NONE, FourPointType::PARTICLE_HOLE_TRANSVERSE, + FourPointType::PARTICLE_HOLE_MAGNETIC, FourPointType::PARTICLE_HOLE_CHARGE, + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, + FourPointType::PARTICLE_PARTICLE_UP_DOWN}; TEST(FourPointParametersTest, ReadAll) { dca::io::JSONReader reader;