diff --git a/.github/workflows/ci-github-actions-self-hosted.yaml b/.github/workflows/ci-github-actions-self-hosted.yaml index 073880006..7508b20dd 100644 --- a/.github/workflows/ci-github-actions-self-hosted.yaml +++ b/.github/workflows/ci-github-actions-self-hosted.yaml @@ -18,7 +18,8 @@ jobs: strategy: fail-fast: false matrix: - jobname: [GCC12-MPI-CUDA-Real-Full] + jobname: [LLVM21-MPI-CUDA-Real-Full] + # GCC12-MPI-NoMPI-CUDA-Real-Full, # GCC12-MPI-NoMPI-CUDA-Real-Debug-Full, diff --git a/applications/analysis/CMakeLists.txt b/applications/analysis/CMakeLists.txt index dd3cdeb8e..aef6ba13d 100644 --- a/applications/analysis/CMakeLists.txt +++ b/applications/analysis/CMakeLists.txt @@ -4,7 +4,8 @@ if (DCA_BUILD_ANALYSIS) add_executable(main_analysis main_analysis.cpp) target_include_directories(main_analysis PRIVATE ${DCA_INCLUDE_DIRS}) if (DCA_HAVE_GPU) - target_link_libraries(main_analysis PRIVATE ${DCA_GPU_LIBS}) + target_link_libraries(main_analysis PRIVATE ${DCA_GPU_LIBS} + magma::magma) endif() target_link_libraries(main_analysis PUBLIC signals ${DCA_LIBS} simplex_gm_rule) diff --git a/build-aux/local_CPU.cmake b/build-aux/local_CPU.cmake index db2f81238..974a17ab5 100644 --- a/build-aux/local_CPU.cmake +++ b/build-aux/local_CPU.cmake @@ -16,14 +16,14 @@ set(gtest_DIR "${CMAKE_CURRENT_LIST_DIR}/../libs/gmock-1.7.0/gtest/" # HDF5 # CMake 3.6 does not properly find HDF5 if HDF5_ROOT is not set. 
-set(HDF5_ROOT "/opt/homebrew/Cellar/hdf5/1.14.4.3/" CACHE PATH "Path to HDF5 installation directory.") +set(HDF5_ROOT "/opt/homebrew/Cellar/hdf5/1.14.6/" CACHE PATH "Path to HDF5 installation directory.") # set(HDF5_ROOT "/usr/local/anaconda3/pkgs/hdf5-1.10.6-hdbbcd12_0/" CACHE PATH "Path to HDF5 installation directory.") # FFTW # set(FFTW_INCLUDE_DIR "/usr/local/Cellar/fftw/3.3.10/include" CACHE PATH "Path to fftw3.h.") # set(FFTW_LIBRARY "/usr/local/Cellar/fftw/3.3.10/lib/libfftw3.a" CACHE FILEPATH "Path to FFTW3 library.") set(FFTW_INCLUDE_DIR "/opt/homebrew/Cellar/fftw/3.3.10_1/include" CACHE PATH "Path to fftw3.h.") -set(FFTW_LIBRARY "/opt/homebrew/Cellar/fftw/3.3.10_1/lib/libfftw3.a" CACHE FILEPATH "Path to FFTW3 library.") +set(FFTW_LIBRARY "/opt/homebrew/Cellar/fftw/3.3.10_2/lib/libfftw3.a" CACHE FILEPATH "Path to FFTW3 library.") mark_as_advanced(gtest_DIR SPRNG_DIR HDF5_ROOT) @@ -37,4 +37,3 @@ set(MPIEXEC_NUMPROC_FLAG "-np" CACHE STRING "Flag used by TEST_RUNNER to specify the number of processes.") mark_as_advanced(TEST_RUNNER MPIEXEC_NUMPROC_FLAG) - diff --git a/cmake/dca_config.cmake b/cmake/dca_config.cmake index 8b854c209..976f24253 100644 --- a/cmake/dca_config.cmake +++ b/cmake/dca_config.cmake @@ -121,8 +121,8 @@ endif() # Lattice type set(DCA_LATTICE "square" CACHE STRING "Lattice type, options are: bilayer | square | triangular | - Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn | La3Ni2O7_bilayer") -set_property(CACHE DCA_LATTICE PROPERTY STRINGS bilayer square triangular Kagome hund twoband_Cu threeband + Kagome | Plaquette | hund | twoband_Cu | threeband | Rashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn | La3Ni2O7_bilayer") +set_property(CACHE DCA_LATTICE PROPERTY STRINGS bilayer square triangular Kagome Plaquette hund twoband_Cu threeband Rashba_Hubbard Moire_Hubbard FeAs material_NiO material_FeSn La3Ni2O7_bilayer) if (DCA_LATTICE STREQUAL "bilayer") @@ 
-149,48 +149,64 @@ elseif (DCA_LATTICE STREQUAL "triangular") set(DCA_LATTICE_TYPE dca::phys::models::triangular_lattice) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/triangular_lattice.hpp") + elseif (DCA_LATTICE STREQUAL "Kagome") set(DCA_LATTICE_TYPE dca::phys::models::KagomeHubbard) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp") + +elseif (DCA_LATTICE STREQUAL "Plaquette") + set(DCA_LATTICE_TYPE dca::phys::models::squarePlaquetteHubbard) + set(DCA_LATTICE_INCLUDE + "dca/phys/models/analytic_hamiltonians/square_plaquette_hubbard.hpp") + elseif (DCA_LATTICE STREQUAL "hund") set(DCA_LATTICE_TYPE dca::phys::models::HundLattice) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/hund_lattice.hpp") + elseif (DCA_LATTICE STREQUAL "threeband") set(DCA_LATTICE_TYPE dca::phys::models::ThreebandHubbard) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/threeband_hubbard.hpp") + elseif (DCA_LATTICE STREQUAL "Rashba_Hubbard") set(DCA_LATTICE_TYPE dca::phys::models::RashbaHubbard) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/rashba_hubbard.hpp") + elseif (DCA_LATTICE STREQUAL "Moire_Hubbard") set(DCA_LATTICE_TYPE dca::phys::models::moire_hubbard) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/Moire_Hubbard.hpp") + elseif (DCA_LATTICE STREQUAL "twoband_chain") set(DCA_LATTICE_TYPE dca::phys::models::twoband_chain>) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/hund_lattice.hpp") + elseif (DCA_LATTICE STREQUAL "FeAs") set(DCA_LATTICE_TYPE dca::phys::models::FeAsLattice) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/fe_as_lattice.hpp") + elseif (DCA_LATTICE STREQUAL "twoband_Cu") set(DCA_LATTICE_TYPE dca::phys::models::TwoBandCu) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/twoband_Cu.hpp") + elseif (DCA_LATTICE STREQUAL "material_NiO") set(DCA_LATTICE_TYPE "dca::phys::models::material_lattice") 
set(DCA_LATTICE_INCLUDE "dca/phys/models/material_hamiltonians/material_lattice.hpp") set(DCA_MODEL_IS_MATERIAL_LATTICE ON CACHE BOOL "is the model a material lattice") + elseif (DCA_LATTICE STREQUAL "material_FeSn") set(DCA_LATTICE_TYPE "dca::phys::models::material_lattice") set(DCA_LATTICE_INCLUDE "dca/phys/models/material_hamiltonians/material_lattice.hpp") set(DCA_MODEL_IS_MATERIAL_LATTICE ON CACHE BOOL "is the model a material lattice") + else() message(FATAL_ERROR "Please set DCA_LATTICE to a valid option: bilayer | La3Ni2O7_bilayer | square | triangular | Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn.") endif() diff --git a/cmake/dca_testing.cmake b/cmake/dca_testing.cmake index cc597e4e4..00d563bde 100644 --- a/cmake/dca_testing.cmake +++ b/cmake/dca_testing.cmake @@ -16,6 +16,7 @@ if(DCA_HAVE_HPX) set(test_thread_option HPX) endif() + # Adds a test written with Google Test. # # dca_add_gtest(name @@ -36,7 +37,7 @@ endif() function(dca_add_gtest name) set(options FAST EXTENSIVE STOCHASTIC PERFORMANCE GTEST_MAIN GTEST_MPI_MAIN THREADED MPI CUDA CUDA_MPI) set(oneValueArgs MPI_NUMPROC) - set(multiValueArgs INCLUDE_DIRS SOURCES LIBS) + set(multiValueArgs INCLUDE_DIRS SOURCES LIBS CUSTOM_SOURCE TEST_DEFINES) cmake_parse_arguments(DCA_ADD_GTEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) # FAST, EXTENSIVE and PERFORMANCE are mutually exclusive. 
@@ -47,7 +48,7 @@ function(dca_add_gtest name) (DCA_ADD_GTEST_STOCHASTIC AND DCA_ADD_GTEST_EXTENSIVE) OR (DCA_ADD_GTEST_STOCHASTIC AND DCA_ADD_GTEST_PERFORMANCE)) message(FATAL_ERROR "Incorrect use of dca_add_gtest.\n - dca_add_gtest(name\n + dca_add_gtest_impl (name src_file compile_definitions \n [FAST | EXTENSIVE | STOCHASTIC | PERFORMANCE]\n [GTEST_MAIN]\n [THREADED]\n @@ -97,7 +98,7 @@ function(dca_add_gtest name) IF (DCA_ADD_GTEST_CUDA_MPI AND NOT DCA_HAVE_GPU_AWARE_MPI) return() endif() - + # Right now we're only testing GPU distributed code on one node so its pointless # without more than one GPU per node. if (DCA_ADD_GTEST_CUDA_MPI AND DCA_HAVE_GPU_AWARE_MPI AND (DCA_TEST_GPU_COUNT LESS 2) ) @@ -112,9 +113,15 @@ function(dca_add_gtest name) set(DCA_ADD_GTEST_SOURCES ${PROJECT_SOURCE_DIR}/test/dca_gtest_main_mpi.cpp ${DCA_ADD_GTEST_SOURCES}) endif() - add_executable(${name} ${name}.cpp ${DCA_ADD_GTEST_SOURCES}) - + if (DCA_ADD_GTEST_CUSTOM_SOURCE) + add_executable(${name} ${DCA_ADD_GTEST_CUSTOM_SOURCE} ${DCA_ADD_GTEST_SOURCES}) + else () + add_executable(${name} ${name}.cpp ${DCA_ADD_GTEST_SOURCES}) + endif() target_compile_definitions(${name} PRIVATE DCA_SOURCE_DIR=\"${PROJECT_SOURCE_DIR}\") + if (DCA_ADD_GTEST_TEST_DEFINES) + target_compile_definitions(${name} PRIVATE THIS_TEST_DEFINES=${DCA_ADD_GTEST_TEST_DEFINES}) + endif() # this is hacky but allows continued use of DCA_THREADING_LIBS # if (DCA_ADD_GTEST_LIBS MATCHES "parallel_hpx") @@ -182,7 +189,7 @@ function(dca_add_gtest name) target_compile_definitions(${name} PUBLIC DCA_HAVE_HPX) target_link_libraries(${name} PUBLIC HPX::hpx HPX::wrap_main) endif() - + target_include_directories(${name} PRIVATE ${gtest_SOURCE_DIR}/include ${DCA_ADD_GTEST_INCLUDE_DIRS}) @@ -239,5 +246,3 @@ function(dca_add_perftest name) target_compile_definitions(${name} PRIVATE DCA_SOURCE_DIR=\"${PROJECT_SOURCE_DIR}\") endfunction() - - diff --git a/include/dca/function/util/difference.hpp 
b/include/dca/function/util/difference.hpp index 94c1d1d55..1c1f9e749 100644 --- a/include/dca/function/util/difference.hpp +++ b/include/dca/function/util/difference.hpp @@ -64,8 +64,6 @@ Difference difference(const function& f1, l2_error = std::sqrt(l2_error / l2); linf_error /= linf; - - return Difference{l1_error, l2_error, linf_error}; } @@ -73,7 +71,9 @@ Difference difference(const function& f1, template Difference difference(const function& f1, const function& f2) { if (f1.size() != f2.size()) - throw(std::logic_error("The two functions have different size.")); + throw(std::logic_error("The two functions have different sizes. f1.size() (" + + std::to_string(f1.size()) + ") != f2.size() (" + + std::to_string(f2.size()) + ")!\n")); double l1 = 0.; double l2 = 0.; diff --git a/include/dca/math/function_transform/special_transforms/space_transform_2D.hpp b/include/dca/math/function_transform/special_transforms/space_transform_2D.hpp index af7fe2ea1..45bf17f5f 100644 --- a/include/dca/math/function_transform/special_transforms/space_transform_2D.hpp +++ b/include/dca/math/function_transform/special_transforms/space_transform_2D.hpp @@ -30,20 +30,18 @@ namespace math { namespace transform { // dca::math::transform:: -template +template class SpaceTransform2D { protected: using Complex = dca::util::ComplexAlias; - using BDmn = func::dmn_0; - using SDmn = func::dmn_0; + using BDmn = BDMN; // func::dmn_0; + using SDmn = SPDMN; // func::dmn_0; public: // Apply the 2D Fourier transform defined as - // f(k1, b1, k2, b2) = \sum_{r1, r2} Exp[i (k1 (r1 + a[b1]) - k2 (r2 + a[b2])] f(r1, b1, r2, b2) / Nc, - // where a[b] is the displacement vector associated with each band, - // and rearrange the output domains. - // In/Out: f_input. The input is overwritten with a partial result.
- // Out: f_output + // f(k1, b1, k2, b2) = \sum_{r1, r2} Exp[i (k1 (r1 + a[b1]) - k2 (r2 + a[b2])] f(r1, b1, r2, b2) / + // Nc, where a[b] is the displacement vector associated with each band, and rearrange the output + // domains. In/Out: f_input. The input is overwritten with a partial result. Out: f_output template static void execute( func::function>& f_input, @@ -55,16 +53,15 @@ class SpaceTransform2D { return SpaceToMomentumTransform::hasPhaseFactor(); } static const auto& getPhaseFactors(); -protected: - +protected: }; -template +template template -void SpaceTransform2D::execute( - func::function>& f_input, - func::function>& f_output) { +void SpaceTransform2D::execute( + func::function>& f_input, + func::function>& f_output) { assert(SDmn::dmn_size() == 2); const int nc = RDmn::dmn_size(); linalg::Matrix tmp(nc); @@ -80,19 +77,22 @@ void SpaceTransform2D::execute( linalg::MatrixView f_r_r(&f_input(0, 0, b1, b2, s, w1, w2), nc); // f(k1,k2) = \sum_{r1, r2} exp(i(k1 * r1 - k2 *r2)) f(r1, r2) / Nc - linalg::matrixop::gemm(T, f_r_r,tmp); + linalg::matrixop::gemm(T, f_r_r, tmp); linalg::matrixop::gemm('N', 'C', norm, tmp, T, Complex(0), f_r_r); // f(k1, k2) *= Exp[i (k1 a[b1] - k2 a[b2])] for (int k2 = 0; k2 < nc; ++k2) - for (int k1 = 0; k1 < nc; ++k1) + for (int k1 = 0; k1 < nc; ++k1) { + assert(b1 < BDMN::dmn_size()); f_output(b1, b2, s, k1, k2, w1, w2) = f_r_r(k1, k2) * phase_factors(b1, k1) * std::conj(phase_factors(b2, k2)); + } } } -template -const linalg::Matrix, linalg::CPU>& SpaceTransform2D::get_T_matrix() { +template +const linalg::Matrix, linalg::CPU>& SpaceTransform2D< + RDmn, KDmn, BDMN, SPDMN, Scalar>::get_T_matrix() { auto initialize_T_matrix = []() { assert(RDmn::dmn_size() == KDmn::dmn_size()); linalg::Matrix T(RDmn::dmn_size()); @@ -100,8 +100,9 @@ const linalg::Matrix, linalg::CPU>& SpaceTransfo const auto& r = RDmn::parameter_type::get_elements()[j]; for (int i = 0; i < KDmn::dmn_size(); ++i) { const auto& k = 
KDmn::parameter_type::get_elements()[i]; - using Real = dca::util::RealAlias; - auto temp_exp = std::exp(dca::util::ComplexAlias{0, static_cast(util::innerProduct(k, r))}); + using Real = dca::util::RealAlias; + auto temp_exp = + std::exp(dca::util::ComplexAlias{0, static_cast(util::innerProduct(k, r))}); T(i, j) = typename decltype(T)::ValueType{temp_exp.real(), temp_exp.imag()}; } } @@ -112,9 +113,9 @@ const linalg::Matrix, linalg::CPU>& SpaceTransfo return T; } -template -const auto& SpaceTransform2D::getPhaseFactors() { - static func::function> phase_factors("Phase factors."); +template +const auto& SpaceTransform2D::getPhaseFactors() { + static func::function> phase_factors("Phase factors."); using Real = dca::util::RealAlias; static std::once_flag flag; @@ -127,10 +128,10 @@ const auto& SpaceTransform2D::getPhaseFactors() { for (int k = 0; k < KDmn::dmn_size(); ++k) { const auto& k_vec = KDmn::get_elements()[k]; for (int b = 0; b < BDmn::dmn_size(); ++b) { - // Scalar could be cuComplex or cuDoubleComplex so... - std::complex temp_phase{0., static_cast(util::innerProduct(k_vec, a_vecs[b]))}; - temp_phase = std::exp(temp_phase); - phase_factors(b,k) = Complex{temp_phase.real(), temp_phase.imag()}; + // Scalar could be cuComplex or cuDoubleComplex so... 
+ std::complex temp_phase{0., static_cast(util::innerProduct(k_vec, a_vecs[b]))}; + temp_phase = std::exp(temp_phase); + phase_factors(b, k) = Complex{temp_phase.real(), temp_phase.imag()}; } } }); diff --git a/include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp b/include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp index e9c2cb4af..56b2eb2c1 100644 --- a/include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp +++ b/include/dca/math/function_transform/special_transforms/space_transform_2D_gpu.hpp @@ -16,9 +16,9 @@ #include #include #include -//#include "dca/config/haves_defines.hpp" -// its expected that dca::config::McOptions will be provided in some manner before parameters.hpp is -// included +// #include "dca/config/haves_defines.hpp" +// its expected that dca::config::McOptions will be provided in some manner before parameters.hpp +// is included #include "space_transform_2D.hpp" #include "dca/util/type_help.hpp" @@ -32,11 +32,11 @@ namespace math { namespace transform { // dca::math::transform:: -template -class SpaceTransform2DGpu : public SpaceTransform2D { +template +class SpaceTransform2DGpu : public SpaceTransform2D { protected: // I guess the base type should use the host side scalar - using Base = SpaceTransform2D; + using Base = SpaceTransform2D; using Complex = dca::util::ComplexAlias; using MatrixDev = linalg::Matrix; using VectorDev = linalg::Vector; @@ -73,7 +73,7 @@ class SpaceTransform2DGpu : public SpaceTransform2D { } private: - using BDmn = func::dmn_0; + using BDmn = typename Base::BDmn; // func::dmn_0; const MatrixDev& get_T_matrix() { auto initialize_T_matrix = []() { @@ -101,9 +101,9 @@ class SpaceTransform2DGpu : public SpaceTransform2D { linalg::util::MagmaBatchedGemm plan2_; }; -template -SpaceTransform2DGpu::SpaceTransform2DGpu(const int nw, - const linalg::util::MagmaQueue& queue) +template +SpaceTransform2DGpu::SpaceTransform2DGpu( + const int nw, 
const linalg::util::MagmaQueue& queue) : n_bands_(BDmn::dmn_size()), nw_(nw), nc_(RDmn::dmn_size()), @@ -113,8 +113,8 @@ SpaceTransform2DGpu::SpaceTransform2DGpu(const int nw, workspace_ = std::make_shared(); } -template -float SpaceTransform2DGpu::execute(RMatrix& M) { +template +float SpaceTransform2DGpu::execute(RMatrix& M) { float flop = 0.; auto& T_times_M = *(workspace_); @@ -133,7 +133,8 @@ float SpaceTransform2DGpu::execute(RMatrix& M) { const int lda = T.leadingDimension(); const int ldb = M.leadingDimension(); const int ldc = T_times_M.leadingDimension(); - plan1_.execute('N', 'N', nc_, M.nrCols(), nc_, Complex{1.0, 0.0}, Complex{0.0,0.0}, lda, ldb, ldc); + plan1_.execute('N', 'N', nc_, M.nrCols(), nc_, Complex{1.0, 0.0}, Complex{0.0, 0.0}, lda, ldb, + ldc); flop += n_trafo * 8. * nc_ * M.nrCols() * nc_; } @@ -147,7 +148,7 @@ float SpaceTransform2DGpu::execute(RMatrix& M) { const int ldb = T.leadingDimension(); const int ldc = T_times_M_times_T.leadingDimension(); const Complex norm = dca::util::makeMaybe(1.0 / nc_); - plan2_.execute('N', 'C', M.nrRows(), nc_, nc_, norm, Complex{0.0,0.0}, lda, ldb, ldc); + plan2_.execute('N', 'C', M.nrRows(), nc_, nc_, norm, Complex{0.0, 0.0}, lda, ldb, ldc); flop += n_trafo * 8. * M.nrRows() * nc_ * nc_; } @@ -157,9 +158,9 @@ float SpaceTransform2DGpu::execute(RMatrix& M) { return flop; } -template -void SpaceTransform2DGpu::phaseFactorsAndRearrange(const RMatrix& in, - RMatrix& out) { +template +void SpaceTransform2DGpu::phaseFactorsAndRearrange(const RMatrix& in, + RMatrix& out) { out.resizeNoCopy(in.size()); const Complex* const phase_factors_ptr = Base::hasPhaseFactors() ? 
getPhaseFactors().ptr() : nullptr; @@ -169,13 +170,14 @@ void SpaceTransform2DGpu::phaseFactorsAndRearrange(const RMa queue_); } -template -const auto& SpaceTransform2DGpu::getPhaseFactors() { +template +const auto& SpaceTransform2DGpu::getPhaseFactors() { auto initialize = []() { using dca::util::ComplexAlias; const auto& phase_factors = Base::getPhaseFactors(); linalg::Vector host_vector(phase_factors.size()); - std::copy_n(dca::util::castHostType(phase_factors.values()), phase_factors.size(), dca::util::castHostType(host_vector.ptr())); + std::copy_n(dca::util::castHostType(phase_factors.values()), phase_factors.size(), + dca::util::castHostType(host_vector.ptr())); return VectorDev(host_vector); }; diff --git a/include/dca/phys/dca_analysis/bse_solver/bse_cluster_solver_ext.hpp b/include/dca/phys/dca_analysis/bse_solver/bse_cluster_solver_ext.hpp index 607694643..6496f57f7 100644 --- a/include/dca/phys/dca_analysis/bse_solver/bse_cluster_solver_ext.hpp +++ b/include/dca/phys/dca_analysis/bse_solver/bse_cluster_solver_ext.hpp @@ -84,9 +84,7 @@ class BseClusterSolverExt { diagrammatic_symmetries diagrammatic_symmetries_obj; - func::function, - func::dmn_variadic> - Gamma_cluster; + func::function, func::dmn_variadic> Gamma_cluster; func::function, func::dmn_variadic> G_II_0_function; }; @@ -121,7 +119,7 @@ void BseClusterSolverExt::compute_Gamma func::function, TotalG4Dmn> G_II("G_II"); func::function, TotalG4Dmn> G_II_0("G_II_0"); - //apply_symmetries_sp(); + // apply_symmetries_sp(); load_G_II(G_II); @@ -130,7 +128,7 @@ void BseClusterSolverExt::compute_Gamma // it used to need to be here because G_II_0 was renormalized now we only do that to a temp variable. 
load_G_II_0_function(G_II_0); - //apply_symmetries_tp(G_II, G_II_0); + // apply_symmetries_tp(G_II, G_II_0); // in original analysis code the G_II and G_II_0 come back renormalized // not so here @@ -160,7 +158,7 @@ void BseClusterSolverExt::apply_symmetr if (parameters.symmetrize_Gamma()) { std::vector subind(3); if (concurrency.id() == concurrency.first()) - std::cout << "symmetrize Gamma_lattice according to the symmetry-group \n" << std::endl; + std::cout << "symmetrize Gamma_lattice according to the symmetry-group \n" << std::endl; for (int wex_ind = 0; wex_ind < WExDmn::dmn_size(); wex_ind++) for (int kex_ind = 0; kex_ind < KExDmn::dmn_size(); kex_ind++) { func::function, DCA_matrix_dmn_t> G_II_indi; @@ -170,7 +168,8 @@ void BseClusterSolverExt::apply_symmetr G_II_0.slice(0, subind, static_cast*>(G_II_0_indi.values())); Symmetrize::execute(G_II_indi, parameters.get_four_point_momentum_transfer()); - Symmetrize::execute(G_II_0_indi, parameters.get_four_point_momentum_transfer()); + Symmetrize::execute(G_II_0_indi, + parameters.get_four_point_momentum_transfer()); G_II.distribute(0, subind, static_cast*>(G_II_indi.values())); G_II_0.distribute(0, subind, static_cast*>(G_II_0_indi.values())); } @@ -195,8 +194,8 @@ void BseClusterSolverExt::load_G_II( std::cout << "\t" << __FUNCTION__ << " -- over " << KExDmn::dmn_size() << " k exchanges and " << WExDmn::dmn_size() << " frequency exchanges.\n\n"; - //std::vector coor_1(G_II.signature(), 0.0); - // ideally a compile time leaf domain count would be avaialbe from function + // std::vector coor_1(G_II.signature(), 0.0); + // ideally a compile time leaf domain count would be available from function constexpr int num_indexes_G4 = 10; std::array coor_1; coor_1.fill(0); @@ -231,12 +230,11 @@ void BseClusterSolverExt::load_G_II( } } std::cout << "final linind " << linind << '\n'; - std::cout << "G_II(0,0,0,0,0,0,0,0,0,0): " << G_II(0,0,0,0,0,0,0,0,0,0) << '\n'; - std::cout << "G_II(0,0,0,0,0,0,0,0,0,1): " << 
G_II(0,0,0,0,0,0,0,0,0,1) << '\n'; - std::cout << "G_II(0,0,0,0,0,0,0,0,0,2): " << G_II(0,0,0,0,0,0,0,0,0,2) << '\n'; - std::cout << "G4(0,0,0,0,0,0,0,0,0,1): " << G4(0,0,0,0,0,0,0,0,0,1) << '\n'; - std::cout << "G4(0,0,0,0,0,0,0,0,0,2): " << G4(0,0,0,0,0,0,0,0,0,2) << '\n'; - + std::cout << "G_II(0,0,0,0,0,0,0,0,0,0): " << G_II(0, 0, 0, 0, 0, 0, 0, 0, 0, 0) << '\n'; + std::cout << "G_II(0,0,0,0,0,0,0,0,0,1): " << G_II(0, 0, 0, 0, 0, 0, 0, 0, 0, 1) << '\n'; + std::cout << "G_II(0,0,0,0,0,0,0,0,0,2): " << G_II(0, 0, 0, 0, 0, 0, 0, 0, 0, 2) << '\n'; + std::cout << "G4(0,0,0,0,0,0,0,0,0,1): " << G4(0, 0, 0, 0, 0, 0, 0, 0, 0, 1) << '\n'; + std::cout << "G4(0,0,0,0,0,0,0,0,0,2): " << G4(0, 0, 0, 0, 0, 0, 0, 0, 0, 2) << '\n'; } /** This calculates close to the Chi_0(Q,omega_m)K,K' that we want for S(q,omega) @@ -247,9 +245,10 @@ void BseClusterSolverExt::load_G_II_0( profiler_type prof(__FUNCTION__, "BseClusterSolverExt", __LINE__); if (concurrency.id() == concurrency.first()) { std::cout << "\t" << __FUNCTION__ << "\n\n"; - std::cout << "G_II_0 domain sizes: " << vectorToString(G_II_0.get_domain().get_leaf_domain_sizes()) << "\n\n"; + std::cout << "G_II_0 domain sizes: " + << vectorToString(G_II_0.get_domain().get_leaf_domain_sizes()) << "\n\n"; } - + func::dmn_variadic k_w_dmn; G_II_0 = 0.; @@ -354,7 +353,6 @@ void BseClusterSolverExt::solve_BSE_on_ for (int wex_ind = 0; wex_ind < WExDmn::dmn_size(); wex_ind++) for (int kex_ind = 0; kex_ind < KExDmn::dmn_size(); kex_ind++) { - func::function, DCA_matrix_dmn_t> G_II_indi; func::function, DCA_matrix_dmn_t> G_II_0_indi; subind = {0, kex_ind, wex_ind}; diff --git a/include/dca/phys/dca_step/cluster_mapping/coarsegraining/coarsegraining_tp.hpp b/include/dca/phys/dca_step/cluster_mapping/coarsegraining/coarsegraining_tp.hpp index a47c4ab53..5984a6d8a 100644 --- a/include/dca/phys/dca_step/cluster_mapping/coarsegraining/coarsegraining_tp.hpp +++ b/include/dca/phys/dca_step/cluster_mapping/coarsegraining/coarsegraining_tp.hpp 
@@ -332,8 +332,8 @@ void coarsegraining_tp::compute_tp( #ifndef NDEBUG if (k_DCA::get_elements() != K_dmn::get_elements()) { std::cout << "k_DCA::get_elements() != K_dmn::get_elements() "; - std::cout << vectorToString(k_DCA::get_elements()) << " != " - << vectorToString(K_dmn::get_elements()) << '\n'; + std::cout << vectorToString(k_DCA::get_elements()) + << " != " << vectorToString(K_dmn::get_elements()) << '\n'; } #endif chi = 0.; @@ -365,7 +365,9 @@ void coarsegraining_tp::compute_tp( find_w1_and_w2(w_dmn_t::get_elements(), w_ind, w_1, w_2); - { BaseClass::compute_G_q_w(k_ind, w_1, H_k, S_K_w, I_q, H_q, S_q, G_q); } + { + BaseClass::compute_G_q_w(k_ind, w_1, H_k, S_K_w, I_q, H_q, S_q, G_q); + } { BaseClass::compute_G_q_w(k_ind, w_2, H_k, S_K_plus_Q_w, I_q_plus_Q, H_q_plus_Q, S_q_plus_Q, @@ -509,7 +511,9 @@ void coarsegraining_tp::compute_phi( find_w1_and_w2(w_dmn_t::get_elements(), w_ind, w_1, w_2); - { BaseClass::compute_G_q_w(k_ind, w_1, H_k, S_K_w, I_q, H_q, S_q, G_q); } + { + BaseClass::compute_G_q_w(k_ind, w_1, H_k, S_K_w, I_q, H_q, S_q, G_q); + } { BaseClass::compute_G_q_w(k_ind, w_2, H_k, S_Q_min_K_w, I_Q_min_q, H_Q_min_q, S_Q_min_q, diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp index 04dd7ff4e..77e34f9ab 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_cpu.hpp @@ -270,7 +270,7 @@ double TpAccumulator::computeM( Profiler prf_b("Space FT", "tp-accumulation", __LINE__, thread_id_); // TODO: add the gflops here. 
- math::transform::SpaceTransform2D::execute(M_r_r_w_w, G_); + math::transform::SpaceTransform2D::execute(M_r_r_w_w, G_); return flops; } @@ -344,7 +344,7 @@ void TpAccumulator::computeGMultiband(const int s, matrixOperationsGMultiband(G0_w1, G0_w2, M_matrix, G0_M_); - // G(w1, w2) += \delta(w1, w2) \delta(k1,k2) G0(w1) + // G(w1, w2) += \delta(w1, w2) \delta(k1,k2) G0(w1) if (G0_w1.ptr() == G0_w2.ptr()) { for (int b2 = 0; b2 < n_bands_; ++b2) for (int b1 = 0; b1 < n_bands_; ++b1) @@ -366,12 +366,12 @@ void TpAccumulator::computeGMultiband(const int s, } template -void TpAccumulator::matrixOperationsGMultiband(const BandBlockView& G0_kw1, const BandBlockView& G0_kw2, BandBlockView& G_kkww, Matrix& G0_M_) { +void TpAccumulator::matrixOperationsGMultiband( + const BandBlockView& G0_kw1, const BandBlockView& G0_kw2, BandBlockView& G_kkww, Matrix& G0_M_) { // G(w1, w2) <- -G0(w1) M(w1, w2) G0(w2) G0_M_.resize(G0_kw1.size()); linalg::matrixop::gemm(G0_kw1, G_kkww, G0_M_); linalg::matrixop::gemm(TpComplex(-1), G0_M_, G0_kw2, TpComplex(0), G_kkww); - } template @@ -395,11 +395,11 @@ void TpAccumulator::getGMultiband(int s, int k1, in const auto* const G_ptr = &G_(0, 0, s, k1, k2, w1_ext, w2_ext); for (int b2 = 0; b2 < n_bands_; ++b2) for (int b1 = 0; b1 < n_bands_; ++b1) { - G(b1, b2) = sign * G(b1, b2) + G_ptr[b1 + b2 * n_bands_]; + G(b1, b2) = sign * G(b1, b2) + G_ptr[b2 + b1 * n_bands_]; #ifndef NDEBUG - // if (std::abs(G(b1, b2).imag()) > 10) // std::isnan(imag(G_(b1, b2, s, k1, k2, w1_ext, w2_ext)))) - // std::cout << w1 << "," << w2 << "," << k1 << "," << k2 << "," << b1 << "," << b2 << "," - // << G(b1, b2) << "*G_ptr" << G_ptr->real() << " + " << G_ptr->imag() << "\n"; + // if (std::abs(G(b1, b2).imag()) > 10) // std::isnan(imag(G_(b1, b2, s, k1, k2, w1_ext, w2_ext)))) + // std::cout << w1 << "," << w2 << "," << k1 << "," << k2 << "," << b1 << "," << b2 << "," + // << G(b1, b2) << "*G_ptr" << G_ptr->real() << " + " << G_ptr->imag() << "\n"; #endif } } @@ -456,8 
+456,8 @@ double TpAccumulator::updateG4(const int channel_id // = -1/2 sum_s G(k2+k_ex, k1+k_ex, s) G(k1, k2, -s) for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; - if(w_ex != 0) - std::cout << "cpu w_ex: " << w_ex << '\n'; + if (w_ex != 0) + std::cout << "cpu w_ex: " << w_ex << '\n'; for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { const int k_ex = exchange_mom[k_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) @@ -465,10 +465,23 @@ double TpAccumulator::updateG4(const int channel_id for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); - for (int s = 0; s < 2; ++s) - updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex), - momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), - w_plus_w_ex(w1, w_ex), -sign_over_2, true); + // for (int s = 0; s < 2; ++s) + // updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, not s, momentum_sum(k2, k_ex), + // momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), + // w_plus_w_ex(w1, w_ex), -sign_over_2, true); + // } + for (int s = 0; s < 2; ++s) { + getGMultiband(s, k1, k2, w1, w2, G_a_); + getGMultiband(!s, momentum_sum(k2, k_ex), momentum_sum(k1, k_ex), + w_plus_w_ex(w2, w_ex), w_plus_w_ex(w1, w_ex), G_b_); + for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) + for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) + for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) + for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { + G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) += + -sign_over_2 * G_a_(b2, b4) * G_b_(b3, b1); + } + } } } } @@ -483,43 +496,53 @@ double TpAccumulator::updateG4(const int channel_id // - (s1 == s2) G(k2+k_ex, k1+k_ex, s1) G(k1, k2, s1)] for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { const int w_ex = exchange_frq[w_ex_idx]; - if(w_ex != 0) - std::cout << "cpu w_ex: " << w_ex << '\n'; + if (w_ex != 
0) + std::cout << "cpu w_ex: " << w_ex << '\n'; for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { const int k_ex = exchange_mom[k_ex_idx]; for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { - TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); -#ifndef NDEBUG - TpComplex G4_before = *G4_ptr; -#endif - updateG4SpinDifference(G4_ptr, -1, k1, momentum_sum(k1, k_ex), w1, - w_plus_w_ex(w1, w_ex), momentum_sum(k2, k_ex), k2, - w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); -#ifndef NDEBUG - G4_FromSpinDifference += std::abs(*G4_ptr - G4_before); -#endif + // TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, + // w_ex_idx); updateG4SpinDifference(G4_ptr, -1, k1, momentum_sum(k1, k_ex), w1, + // w_plus_w_ex(w1, w_ex), momentum_sum(k2, k_ex), k2, + // w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); + for (int s1 = 0; s1 < 2; ++s1) { + for (int s2 = 0; s2 < 2; ++s2) { + std::complex spin_factor = (s1 == s2) ? 
1 : -1; + getGMultiband(s1, k1, momentum_sum(k1, k_ex), w1, w_plus_w_ex(w1, w_ex), + G_a_); + getGMultiband(s2, momentum_sum(k2, k_ex), k2, w_plus_w_ex(w2, w_ex), w2, + G_b_); + for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) + for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) + for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) + for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { + G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) += + sign_over_2 * G_a_(b2, b1) * G_b_(b3, b4) * spin_factor; + } + } + } for (int s = 0; s < 2; ++s) { -#ifndef NDEBUG - G4_before = *G4_ptr; -#endif - updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, s, momentum_sum(k2, k_ex), - momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), - w_plus_w_ex(w1, w_ex), -sign_over_2, true); -#ifndef NDEBUG - G4_DirectDifference += std::abs(*G4_ptr - G4_before); -#endif + // updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, s, momentum_sum(k2, k_ex), + // momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), + // w_plus_w_ex(w1, w_ex), -sign_over_2, true); + getGMultiband(s, k1, k2, w1, w2, G_a_); + getGMultiband(s, momentum_sum(k2, k_ex), momentum_sum(k1, k_ex), + w_plus_w_ex(w2, w_ex), w_plus_w_ex(w1, w_ex), G_b_); + for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) + for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) + for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) + for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { + G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) += + -sign_over_2 * G_a_(b3, b1) * G_b_(b2, b4); + } } } } } flops += n_loops * (flops_update_spin_diff + 2 * flops_update_atomic); -#ifndef NDEBUG - std::cout << "G4 PHM from spin:" << G4_FromSpinDifference << '\n'; - std::cout << "G4 PHM from direct:" << G4_DirectDifference << '\n'; -#endif break; case FourPointType::PARTICLE_HOLE_CHARGE: @@ -551,56 +574,56 @@ double TpAccumulator::updateG4(const int channel_id flops += n_loops * (flops_update_spin_diff + 2 * flops_update_atomic); break; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: - // G4(k1, k2, k_ex) = 1/2 sum_s 
- // = 1/2 sum_s [G(k1, k1+k_ex, s) G(k2+k_ex, k2, s) - // - G(k2+k_ex, k1+k_ex, s) G(k1, k2, s)] - for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { - const int w_ex = exchange_frq[w_ex_idx]; - for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { - const int k_ex = exchange_mom[k_ex_idx]; - for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) - for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) - for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) - for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { - TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); - - for (int s = 0; s < 2; ++s) - updateG4Atomic(G4_ptr, s, k1, momentum_sum(k1, k_ex), w1, - w_plus_w_ex(w1, w_ex), s, momentum_sum(k2, k_ex), k2, - w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); - - for (int s = 0; s < 2; ++s) - updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, s, momentum_sum(k2, k_ex), - momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), - w_plus_w_ex(w1, w_ex), -sign_over_2, true); - } - } - } - flops += n_loops * 4 * flops_update_atomic; - break; - - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: - // G4(k1, k2, k_ex) = 1/2 sum_s - // = 1/2 sum_s G(k1, k1+k_ex, s) G(k2+k_ex, k2, -s) - for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { - const int w_ex = exchange_frq[w_ex_idx]; - for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { - const int k_ex = exchange_mom[k_ex_idx]; - for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) - for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) - for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) - for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { - TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); - for (int s = 0; s < 2; ++s) - updateG4Atomic(G4_ptr, s, k1, momentum_sum(k1, k_ex), w1, - w_plus_w_ex(w1, w_ex), !s, momentum_sum(k2, k_ex), k2, - w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); - } - } - } - flops += n_loops * 4 * flops_update_atomic; - break; + // case 
FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: + // // G4(k1, k2, k_ex) = 1/2 sum_s + // // = 1/2 sum_s [G(k1, k1+k_ex, s) G(k2+k_ex, k2, s) + // // - G(k2+k_ex, k1+k_ex, s) G(k1, k2, s)] + // for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { + // const int w_ex = exchange_frq[w_ex_idx]; + // for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { + // const int k_ex = exchange_mom[k_ex_idx]; + // for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) + // for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) + // for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) + // for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { + // TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); + + // for (int s = 0; s < 2; ++s) + // updateG4Atomic(G4_ptr, s, k1, momentum_sum(k1, k_ex), w1, + // w_plus_w_ex(w1, w_ex), s, momentum_sum(k2, k_ex), k2, + // w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); + + // for (int s = 0; s < 2; ++s) + // updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, s, momentum_sum(k2, k_ex), + // momentum_sum(k1, k_ex), w_plus_w_ex(w2, w_ex), + // w_plus_w_ex(w1, w_ex), -sign_over_2, true); + // } + // } + // } + // flops += n_loops * 4 * flops_update_atomic; + // break; + + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: + // // G4(k1, k2, k_ex) = 1/2 sum_s + // // = 1/2 sum_s G(k1, k1+k_ex, s) G(k2+k_ex, k2, -s) + // for (int w_ex_idx = 0; w_ex_idx < exchange_frq.size(); ++w_ex_idx) { + // const int w_ex = exchange_frq[w_ex_idx]; + // for (int k_ex_idx = 0; k_ex_idx < exchange_mom.size(); ++k_ex_idx) { + // const int k_ex = exchange_mom[k_ex_idx]; + // for (int w2 = 0; w2 < WTpDmn::dmn_size(); ++w2) + // for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) + // for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) + // for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { + // TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, + // w_ex_idx); for (int s = 0; s < 2; ++s) + // updateG4Atomic(G4_ptr, s, k1, 
momentum_sum(k1, k_ex), w1, + // w_plus_w_ex(w1, w_ex), !s, momentum_sum(k2, k_ex), k2, + // w_plus_w_ex(w2, w_ex), w2, sign_over_2, false); + // } + // } + // } + // flops += n_loops * 4 * flops_update_atomic; + // break; case FourPointType::PARTICLE_PARTICLE_UP_DOWN: // G4(k1, k2, k_ex) = 1/2 sum_s @@ -613,11 +636,23 @@ double TpAccumulator::updateG4(const int channel_id for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { - TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); - for (int s = 0; s < 2; ++s) - updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, !s, q_minus_k(k1, k_ex), - q_minus_k(k2, k_ex), w_ex_minus_w(w1, w_ex), - w_ex_minus_w(w2, w_ex), sign_over_2, false); + // TpComplex* const G4_ptr = &G4(0, 0, 0, 0, k1, w1, k2, w2, k_ex_idx, w_ex_idx); + for (int s = 0; s < 2; ++s) { + // updateG4Atomic(G4_ptr, s, k1, k2, w1, w2, !s, q_minus_k(k1, k_ex), + // q_minus_k(k2, k_ex), w_ex_minus_w(w1, w_ex), + // w_ex_minus_w(w2, w_ex), sign_over_2, false); + // contraction: G_{b1,b3}(k1, k2) * G_{b2,b4}(q-k1, q-k2). + getGMultiband(s, k1, k2, w1, w2, G_a_); + getGMultiband(!s, q_minus_k(k1, k_ex), q_minus_k(k2, k_ex), + w_ex_minus_w(w1, w_ex), w_ex_minus_w(w2, w_ex), G_b_); + for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) + for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) + for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) + for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { + G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) += + sign_over_2 * G_a_(b1, b3) * G_b_(b2, b4); + } + } } } } @@ -641,34 +676,26 @@ double TpAccumulator::updateG4(const int channel_id for (int k2 = 0; k2 < KDmn::dmn_size(); ++k2) for (int w1 = 0; w1 < WTpDmn::dmn_size(); ++w1) for (int k1 = 0; k1 < KDmn::dmn_size(); ++k1) { - // contraction: G(k2, k1, s3, s2) * G(k_ex - k2, k_ex - k1, s4, s1). + // contraction: G(k2, k1, b1, b3) * G(k_ex - k2, k_ex - k1, b2, b4). 
getGMultiband(0, k1, k2, w1, w2, G_a_); getGMultiband(0, q_minus_k(k1, k_ex), q_minus_k(k2, k_ex), w_ex_minus_w(w1, w_ex), w_ex_minus_w(w2, w_ex), G_b_); - // getGMultiband(0, k2, k1, w2, w1, G_a_); - // getGMultiband(0, q_minus_k(k2, k_ex), q_minus_k(k1, k_ex), - // w_ex_minus_w(w2, w_ex), w_ex_minus_w(w1, w_ex), G_b_); - for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) + for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) += - // complex_factor * G_a_(b3, b2) * G_b_(b4, b1); complex_factor * G_a_(b1, b3) * G_b_(b2, b4); - // complex_factor * G_a_(b1, b3); } - // contraction: -G(k2, k_ex - k1, s3, s1) * G(k_ex - k2, k1, s4, s2). + // contraction: -G(k1, k_ex - k2, b1, b4) * G(k_ex - k1, k2, b2, b3). getGMultiband(0, k1, q_minus_k(k2, k_ex), w1, w_ex_minus_w(w2, w_ex), G_a_); getGMultiband(0, q_minus_k(k1, k_ex), k2, w_ex_minus_w(w1, w_ex), w2, G_b_); - // getGMultiband(0, k2, q_minus_k(k1, k_ex), w2, w_ex_minus_w(w1, w_ex), G_a_); - // getGMultiband(0, q_minus_k(k2, k_ex), k1, w_ex_minus_w(w2, w_ex), w1, G_b_); for (int b4 = 0; b4 < BDmn::dmn_size(); ++b4) for (int b3 = 0; b3 < BDmn::dmn_size(); ++b3) for (int b2 = 0; b2 < BDmn::dmn_size(); ++b2) for (int b1 = 0; b1 < BDmn::dmn_size(); ++b1) { G4(b1, b2, b3, b4, k1, w1, k2, w2, k_ex_idx, w_ex_idx) -= - // complex_factor * G_a_(b3, b1) * G_b_(b4, b2); complex_factor * G_a_(b1, b4) * G_b_(b2, b3); } } @@ -750,8 +777,8 @@ void TpAccumulator::updateG4Atomic( getGMultiband(s_b, k1_b, k2_b, w1_b, w2_b, G_b_); // Assuming 0 = down 1 = up - //G_a = G[s_a] - //G_b = G[d_b] + // G_a = G[s_a] + // G_b = G[d_b] if (!cross_legs) for (int b4 = 0; b4 < n_bands_; ++b4) @@ -826,7 +853,8 @@ void TpAccumulator::updateG4SpinDifference( for (int b3 = 0; b3 < n_bands_; ++b3) for (int b2 = 0; b2 < n_bands_; ++b2) for (int b1 = 0; b1 < n_bands_; ++b1) { - 
*G4_ptr += alpha * G_a_(b1, b3) * G_b_(b2, b4); + // *G4_ptr += alpha * G_a_(b1, b3) * G_b_(b2, b4); + *G4_ptr += alpha * G_a_(b2, b1) * G_b_(b3, b4); ++G4_ptr; } else diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp index 1aa5661db..7e6357874 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu.hpp @@ -438,15 +438,15 @@ double TpAccumulator::updateG4(const std::size_t ch get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: - return details::updateG4( - get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), - G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: + // return details::updateG4( + // get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), + // G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: - return details::updateG4( - get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), - G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: + // return details::updateG4( + // get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), G_[1].ptr(), + // G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); case FourPointType::PARTICLE_PARTICLE_UP_DOWN: return details::updateG4( @@ -454,16 +454,18 @@ 
double TpAccumulator::updateG4(const std::size_t ch G_[1].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); default: - throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu."); + throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu." + + std::to_string(static_cast(channel))); } } else { - if (channel == FourPointType::PARTICLE_PARTICLE_UP_DOWN ) + if (channel == FourPointType::PARTICLE_PARTICLE_UP_DOWN) return details::updateG4NoSpin( get_G4Dev()[channel_index].ptr(), G_[0].ptr(), G_[0].leadingDimension(), factor, multiple_accumulators_, queues_[0], start, end); } - throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu."); + throw std::logic_error("Specified four point type not implemented by tp_accumulator_gpu." + + std::to_string(static_cast(channel))); } template diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_base.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_base.hpp index 293463832..d71728c64 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_base.hpp @@ -77,9 +77,8 @@ class TpAccumulatorGpuBase { using MatrixHost = linalg::Matrix; public: - TpAccumulatorGpuBase( - const func::function>& G0, - const Parameters& pars, int n_freq, int thread_id); + TpAccumulatorGpuBase(const func::function>& G0, + const Parameters& pars, int n_freq, int thread_id); protected: void initializeG4Helpers() const; @@ -88,15 +87,15 @@ class TpAccumulatorGpuBase { template double computeM(const std::array, 2>& M_pair, - const std::array& configs); + const std::array& configs); void sumTo_(TpAccumulatorGpuBase& other_acc); // \todo is this violation of single source of truth necessary. 
- const func::function>* const G0_ptr_ = + const func::function>* const G0_ptr_ = nullptr; - //const int n_pos_frqs_ = -1; + // const int n_pos_frqs_ = -1; std::array queues_; linalg::util::GpuEvent event_; @@ -105,12 +104,13 @@ class TpAccumulatorGpuBase { // this is how this is defined in the all tp_accumulator_gpu, suspect? constexpr static bool non_density_density_ = - models::HasInitializeNonDensityInteractionMethod::value; - //CachedNdft ndft_obj_; + models::HasInitializeNonDensityInteractionMethod::value; + // CachedNdft ndft_obj_; - using NdftType = CachedNdft; + using NdftType = + CachedNdft; std::array ndft_objs_; - using DftType = math::transform::SpaceTransform2DGpu; + using DftType = math::transform::SpaceTransform2DGpu; std::array space_trsf_objs_; constexpr static int n_ndft_queues_ = config::McOptions::memory_savings ? 1 : 2; @@ -134,7 +134,7 @@ TpAccumulatorGpuBase::TpAccumulatorGpuBase( const func::function>& G0, const Parameters& pars, const int n_freq, int thread_id) : G0_ptr_(&G0), - //n_pos_frqs_(n_pos_frqs), + // n_pos_frqs_(n_pos_frqs), queues_(), ndft_objs_{NdftType(queues_[0]), NdftType(queues_[1])}, space_trsf_objs_{DftType(n_freq, queues_[0]), DftType(n_freq, queues_[1])}, @@ -159,17 +159,17 @@ auto TpAccumulatorGpuBase::get_G0() -> G0DevType& { template void TpAccumulatorGpuBase::initializeG4Helpers() const { - const auto& add_mat = KDmn::parameter_type::get_add_matrix(); - const auto& sub_mat = KDmn::parameter_type::get_subtract_matrix(); - const auto& w_indices = domains::FrequencyExchangeDomain::get_elements(); - const auto& q_indices = domains::MomentumExchangeDomain::get_elements(); - const auto extension_offset = (WTpExtDmn::dmn_size() - WTpDmn::dmn_size()) / 2; - - // CurrentlyA WTpPosDmn should always be == WTpDmn - details::G4Helper::set(n_bands_, KDmn::dmn_size(), WTpDmn::dmn_size(), q_indices, w_indices, extension_offset, - add_mat.ptr(), add_mat.leadingDimension(), sub_mat.ptr(), - sub_mat.leadingDimension()); - 
assert(cudaPeekAtLastError() == cudaSuccess); + const auto& add_mat = KDmn::parameter_type::get_add_matrix(); + const auto& sub_mat = KDmn::parameter_type::get_subtract_matrix(); + const auto& w_indices = domains::FrequencyExchangeDomain::get_elements(); + const auto& q_indices = domains::MomentumExchangeDomain::get_elements(); + const auto extension_offset = (WTpExtDmn::dmn_size() - WTpDmn::dmn_size()) / 2; + + // CurrentlyA WTpPosDmn should always be == WTpDmn + details::G4Helper::set(n_bands_, KDmn::dmn_size(), WTpDmn::dmn_size(), q_indices, w_indices, + extension_offset, add_mat.ptr(), add_mat.leadingDimension(), sub_mat.ptr(), + sub_mat.leadingDimension()); + assert(cudaPeekAtLastError() == cudaSuccess); } template @@ -192,9 +192,10 @@ void TpAccumulatorGpuBase::initializeG0() { for (int k = 0; k < KDmn::dmn_size(); ++k) for (int b2 = 0; b2 < n_bands_; ++b2) for (int b1 = 0; b1 < n_bands_; ++b1) { - assert(std::abs(WTpExtDmn::get_elements()[w] - WDmn::get_elements()[w + sp_index_offset]) < 1e-3); - G0_host[s](bkw_dmn(b1, k, w),b2) = (*G0_ptr_)(b1, s, b2, s, k, w + sp_index_offset); - } + assert(std::abs(WTpExtDmn::get_elements()[w] - + WDmn::get_elements()[w + sp_index_offset]) < 1e-3); + G0_host[s](bkw_dmn(b1, k, w), b2) = (*G0_ptr_)(b1, s, b2, s, k, w + sp_index_offset); + } dca::linalg::util::GpuStream reset_stream(cudaStreamLegacy); G0[s].set(G0_host[s], reset_stream); } @@ -219,7 +220,7 @@ double TpAccumulatorGpuBase::computeM( for (int s = 0; s < 2; ++s) flop += space_trsf_objs_[stream_id(s)].execute(G_[s]); } - + return flop; } diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp index a987c6b91..9a5b2238e 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp +++ 
b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_mpi_gpu.hpp @@ -77,8 +77,8 @@ class TpAccumulator : public TpAccumulatorBase : public TpAccumulatorBase>; - + std::vector& get_G4(); private: @@ -157,9 +157,9 @@ TpAccumulator::TpAccumulator( : Base(G0, pars, thread_id), BaseGpu(pars, Base::get_n_pos_frqs(), thread_id_) { // each mpi rank only allocates memory of size 1/total_G4_size for its small portion of G4 // static_assert(std::is_same>, decltype(G4_)>); - //start_ = G4_[0].get_start(); + // start_ = G4_[0].get_start(); // The sense here is one past the last index held - //end_ = G4_[0].get_end() + 1; + // end_ = G4_[0].get_end() + 1; // possible these can both go into the parent class constructor } @@ -339,7 +339,7 @@ float TpAccumulator::updateG4( uint64_t start = get_G4Dev().get_start(); uint64_t end = get_G4Dev().get_end() + 1; - + switch (channel) { case FourPointType::PARTICLE_HOLE_TRANSVERSE: return details::updateG4( @@ -372,7 +372,8 @@ float TpAccumulator::updateG4( G_[1].leadingDimension(), sign_, multiple_accumulators_, queues_[0], start, end); default: - throw std::logic_error("Specified four point type not implemented."); + throw std::logic_error("Specified four point type not implemented." 
+ + std::to_string(static_cast(channel))); } } @@ -424,15 +425,14 @@ void TpAccumulator::ringG(float& flop) { } template -auto TpAccumulator::get_G4Dev() - -> std::vector& { +auto TpAccumulator::get_G4Dev() -> std::vector& { static std::vector G4; return G4; } template -void TpAccumulator::send( - const std::array& data, int target, std::array& request) { +void TpAccumulator::send(const std::array& data, int target, + std::array& request) { using dca::parallel::MPITypeMap; const auto g_size = data[0].size().first * data[0].size().second; @@ -455,8 +455,8 @@ void TpAccumulator::send( } template -void TpAccumulator::receive( - std::array& data, int source, std::array& request) { +void TpAccumulator::receive(std::array& data, int source, + std::array& request) { using dca::parallel::MPITypeMap; const auto g_size = data[0].size().first * data[0].size().second; diff --git a/include/dca/phys/four_point_type.hpp b/include/dca/phys/four_point_type.hpp index 8a824d7c4..b5568fa14 100644 --- a/include/dca/phys/four_point_type.hpp +++ b/include/dca/phys/four_point_type.hpp @@ -25,15 +25,16 @@ namespace phys { * causes a bug. That is not a good code smell. 
*/ enum class FourPointType : int { - PARTICLE_HOLE_NONE = 0, + NONE = 0, PARTICLE_HOLE_TRANSVERSE, PARTICLE_HOLE_MAGNETIC, PARTICLE_HOLE_CHARGE, - PARTICLE_HOLE_LONGITUDINAL_UP_UP, - PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, PARTICLE_PARTICLE_UP_DOWN }; +// PARTICLE_HOLE_LONGITUDINAL_UP_UP, +// PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, + FourPointType stringToFourPointType(const std::string& name); std::string toString(const FourPointType type); diff --git a/include/dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp b/include/dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp index 95d908fe0..ffbe424b9 100644 --- a/include/dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp +++ b/include/dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp @@ -23,6 +23,7 @@ #include "dca/phys/domains/cluster/symmetries/point_groups/no_symmetry.hpp" #include "dca/phys/models/analytic_hamiltonians/cluster_shape_type.hpp" #include "dca/util/type_list.hpp" +#include "dca/phys/domains/cluster/cluster_operations.hpp" namespace dca { namespace phys { @@ -151,15 +152,94 @@ void KagomeHubbard::initializeHInteraction( const int origin = RDmn::parameter_type::origin_index(); const double U = parameters.get_U(); // on-site Hubbard interaction + const double V = parameters.get_V(); // Nearest-neighbor interaction + + // There are 6 different nearest neighbor (nn) pairs along a1, a2, a1+a2 and their negative vectors + const std::vector& basis = + RDmn::parameter_type::get_basis_vectors(); + + assert(basis.size() == 2); + + // Pre-compute basis vectors and orbital pairs + std::vector nn_vec; + nn_vec.reserve(3); + nn_vec.push_back(basis[0]); + nn_vec.push_back(basis[1]); + nn_vec.emplace_back(basis[0]); + nn_vec[2][0] += basis[1][0]; + nn_vec[2][1] += basis[1][1]; + + // Orbital pairs corresponding to each nearest neighbor direction + constexpr std::array, 3> orb_pairs = {{ + {{2, 0}}, {{1, 2}}, {{1, 0}} + }}; + + const std::vector& super_basis = + 
RDmn::parameter_type::get_super_basis_vectors(); + const std::vector& elements = + RDmn::parameter_type::get_elements(); + + std::vector nn_index; + std::vector> nn_orbs; + nn_index.reserve(6); + nn_orbs.reserve(6); + + // Process all nearest neighbor vectors in a loop + for (int vec_idx = 0; vec_idx < 3; ++vec_idx) { + const auto nn_vec_translated = + domains::cluster_operations::translate_inside_cluster(nn_vec[vec_idx], super_basis); + const int r = domains::cluster_operations::index(nn_vec_translated, elements, domains::BRILLOUIN_ZONE); + + if (r != origin) { + const int minus_r = RDmn::parameter_type::subtract(r, origin); + + nn_index.push_back(r); + nn_orbs.push_back({orb_pairs[vec_idx][0], orb_pairs[vec_idx][1]}); + + // Add negative direction if distinct + if (r != minus_r) { + nn_index.push_back(minus_r); + nn_orbs.push_back({orb_pairs[vec_idx][1], orb_pairs[vec_idx][0]}); + } + } + } H_interaction = 0.; +// Onsite U interaction for (int i = 0; i < BANDS; i++) { H_interaction(i, 0, i, 1, origin) = U; H_interaction(i, 1, i, 0, origin) = U; } + + // Nearest-neighbor V interaction inside unit cell + constexpr std::array, 3> intra_cell_pairs = {{ + {{0, 1}}, {{1, 2}}, {{0, 2}} + }}; + + for (const auto& pair : intra_cell_pairs) { + for (int s = 0; s < 2; ++s) { + H_interaction(pair[0], s, pair[1], s, origin) = V; + H_interaction(pair[0], s, pair[1], 1-s, origin) = V; + H_interaction(pair[1], s, pair[0], s, origin) = V; + H_interaction(pair[1], s, pair[0], 1-s, origin) = V; + } + } + + // Nearest-neighbor V interaction outside unit cell + const auto nn_size = nn_index.size(); + for (std::size_t i = 0; i < nn_size; ++i) { + for (int s = 0; s < 2; ++s) { + H_interaction(nn_orbs[i][0], s, nn_orbs[i][1], s, nn_index[i]) = V; + H_interaction(nn_orbs[i][0], s, nn_orbs[i][1], 1-s, nn_index[i]) = V; + } + } + } + + + template template void KagomeHubbard::initializeHSymmetry(func::function& H_symmetries) { diff --git 
a/include/dca/phys/models/analytic_hamiltonians/La3Ni2O7_bilayer.hpp b/include/dca/phys/models/analytic_hamiltonians/La3Ni2O7_bilayer.hpp index 29c98bc82..dced10253 100644 --- a/include/dca/phys/models/analytic_hamiltonians/La3Ni2O7_bilayer.hpp +++ b/include/dca/phys/models/analytic_hamiltonians/La3Ni2O7_bilayer.hpp @@ -190,10 +190,10 @@ void La3Ni2O7_bilayer::initializeHInteraction( H_interaction(2, s, 3, s, origin) = V - J; H_interaction(3, s, 2, s, origin) = V - J; - H_interaction(0, s, 1, 1-s, origin) = V + J; - H_interaction(1, s, 0, 1-s, origin) = V + J; - H_interaction(2, s, 3, 1-s, origin) = V + J; - H_interaction(3, s, 2, 1-s, origin) = V + J; + H_interaction(0, s, 1, 1-s, origin) = V; + H_interaction(1, s, 0, 1-s, origin) = V; + H_interaction(2, s, 3, 1-s, origin) = V; + H_interaction(3, s, 2, 1-s, origin) = V; } diff --git a/include/dca/phys/models/analytic_hamiltonians/square_plaquette_hubbard.hpp b/include/dca/phys/models/analytic_hamiltonians/square_plaquette_hubbard.hpp new file mode 100644 index 000000000..bf98d8f50 --- /dev/null +++ b/include/dca/phys/models/analytic_hamiltonians/square_plaquette_hubbard.hpp @@ -0,0 +1,196 @@ +// Copyright (C) 2019 ETH Zurich +// Copyright (C) 2019 UT-Battelle, LLC +// All rights reserved. +// +// See LICENSE for terms of usage. +// See CITATION.md for citation guidelines, if DCA++ is used for scientific publications. +// +// Author: Thomas A. 
Maier (maierta@ornl.gov) +// Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch) +// +// Implementation of the Kagome Hubbard model + +#ifndef DCA_PHYS_MODELS_ANALYTIC_HAMILTONIANS_SQUARE_PLAQUETTE_HUBBARD_HPP +#define DCA_PHYS_MODELS_ANALYTIC_HAMILTONIANS_SQUARE_PLAQUETTE_HUBBARD_HPP + +#include +#include +#include +#include + +#include "dca/function/domains.hpp" +#include "dca/function/function.hpp" +#include "dca/phys/domains/cluster/symmetries/point_groups/no_symmetry.hpp" +#include "dca/phys/models/analytic_hamiltonians/cluster_shape_type.hpp" +#include "dca/util/type_list.hpp" +#include "dca/phys/domains/cluster/cluster_operations.hpp" + +namespace dca { +namespace phys { +namespace models { +// dca::phys::models:: + +// TODO: the symmetry of this system must be checked. +template +class squarePlaquetteHubbard { +public: + static constexpr bool complex_g0 = false; + static constexpr bool spin_symmetric = true; + + using LDA_point_group = domains::no_symmetry<2>; + using DCA_point_group = SymmetryGroup; + // typedef PointGroupType DCA_point_group; + + const static ClusterShapeType DCA_cluster_shape = BETT_CLUSTER; + const static ClusterShapeType LDA_cluster_shape = PARALLELEPIPED; + + const static int DIMENSION = 2; + const static int BANDS = 4; + + const static double* initializeRDCABasis(); + // static double* initializeKDCABasis(); + + const static double* initializeRLDABasis(); + // static double* initializeKLDABasis(); + + static std::vector flavors(); + static std::vector> aVectors(); + + static std::vector, std::pair>> orbitalPermutations(); + + // Rotations of pi/2 are an anti-symmetry on the band off-diagonal. + static int transformationSignOfR(int b1, int b2, int s); + static int transformationSignOfK(int b1, int b2, int s); + + // Initializes the interaction Hamiltonian in real space. 
+ template + static void initializeHInteraction( + func::function, + func::dmn_variadic, RDmn>>& H_interaction, + const parameters_type& parameters); + + template + static void initializeHSymmetry(func::function& H_symmetry); + + // Initializes the tight-binding (non-interacting) part of the momentum space Hamiltonian. + // Preconditions: The elements of KDmn are two-dimensional (access through index 0 and 1). + template + static void initializeH0( + const ParametersType& parameters, + func::function, + func::dmn_variadic, KDmn>>& H_0); +}; + +template +int squarePlaquetteHubbard::transformationSignOfR(int b1 [[maybe_unused]], + int b2 [[maybe_unused]], + int s [[maybe_unused]]) { + return 1; // TODO: FIXME +} + +template +int squarePlaquetteHubbard::transformationSignOfK(int b1 [[maybe_unused]], + int b2 [[maybe_unused]], + int s [[maybe_unused]]) { + return 1; // TODO: FIXME +} + +template +const double* squarePlaquetteHubbard::initializeRDCABasis() { + static std::array basis{2,0,0,2}; + return basis.data(); +} + +template +const double* squarePlaquetteHubbard::initializeRLDABasis() { + static std::array basis{2,0,0,2}; + return basis.data(); +} + + +template +std::vector squarePlaquetteHubbard::flavors() { + return {0, 1, 2,3}; +} + +template +std::vector> squarePlaquetteHubbard::aVectors() { + return {{0, 0}, {1, 0}, {0, 1}, {1, 1}}; +} + +template +std::vector, std::pair>> squarePlaquetteHubbard< + PointGroupType>::orbitalPermutations() { + return {}; +} + +template +template +void squarePlaquetteHubbard::initializeHInteraction( + func::function, + func::dmn_variadic, RDmn>>& H_interaction, + const parameters_type& parameters) { + if (BandDmn::dmn_size() != BANDS) + throw std::logic_error("Plaquette Hubbard model has four bands."); + if (SpinDmn::dmn_size() != 2) + throw std::logic_error("Spin domain size must be 2."); + + const int origin = RDmn::parameter_type::origin_index(); + + const double U = parameters.get_U(); // on-site Hubbard interaction + + 
H_interaction = 0.; + +// Onsite U interaction + for (int i = 0; i < BANDS; i++) { + H_interaction(i, 0, i, 1, origin) = U; + H_interaction(i, 1, i, 0, origin) = U; + } + +} + +template +template +void squarePlaquetteHubbard::initializeHSymmetry(func::function& H_symmetries) { + H_symmetries = -1; +} + +template +template +void squarePlaquetteHubbard::initializeH0( + const ParametersType& parameters, + func::function, + func::dmn_variadic, KDmn>>& H_0) { + if (BandDmn::dmn_size() != BANDS) + throw std::logic_error("Plaquette Hubbard model has four bands."); + if (SpinDmn::dmn_size() != 2) + throw std::logic_error("Spin domain size must be 2."); + + const auto& k_vecs = KDmn::get_elements(); + + const auto t = parameters.get_t(); + + H_0 = ScalarType(0); + + for (int k_ind = 0; k_ind < KDmn::dmn_size(); ++k_ind) { + const auto& k = k_vecs[k_ind]; + + for (int s = 0; s < 2; s++) { + H_0(0, s, 1, s, k_ind) = -2. * t * std::cos(k[0]); + H_0(0, s, 2, s, k_ind) = -2. * t * std::cos(k[1]); + H_0(2, s, 3, s, k_ind) = -2. * t * std::cos(k[0]); + H_0(1, s, 3, s, k_ind) = -2. * t * std::cos(k[1]); + + H_0(1, s, 0, s, k_ind) = -2. * t * std::cos(k[0]); + H_0(2, s, 0, s, k_ind) = -2. * t * std::cos(k[1]); + H_0(3, s, 2, s, k_ind) = -2. * t * std::cos(k[0]); + H_0(3, s, 1, s, k_ind) = -2. * t * std::cos(k[1]); + } + } +} + +} // namespace models +} // namespace phys +} // namespace dca + +#endif // DCA_PHYS_MODELS_ANALYTIC_HAMILTONIANS_SQUARE_PLAQUETTE_HUBBARD_HPP diff --git a/include/dca/phys/parameters/four_point_parameters.hpp b/include/dca/phys/parameters/four_point_parameters.hpp index 75c5541ba..d1840d0d4 100644 --- a/include/dca/phys/parameters/four_point_parameters.hpp +++ b/include/dca/phys/parameters/four_point_parameters.hpp @@ -60,8 +60,7 @@ class FourPointParameters { * when the head node has no four point channels. This works around this bug. \todo fix this. 
*/ bool isAccumulatingG4() const { - if (four_point_channels_.size() > 0 && - four_point_channels_[0] != FourPointType::PARTICLE_HOLE_NONE) + if (four_point_channels_.size() > 0 && four_point_channels_[0] != FourPointType::NONE) return true; else return false; @@ -73,7 +72,7 @@ class FourPointParameters { * PARTICLE_HOLE_NONE should never be added. */ void set_four_point_channel(FourPointType type) { - if (type != FourPointType::PARTICLE_HOLE_NONE) + if (type != FourPointType::NONE) four_point_channels_ = std::vector{type}; } void set_four_point_channels(const std::vector& channels) { diff --git a/include/dca/phys/parameters/model_parameters_Kagome_hubbard.inc b/include/dca/phys/parameters/model_parameters_Kagome_hubbard.inc index 66ae5e67b..eb3057af2 100644 --- a/include/dca/phys/parameters/model_parameters_Kagome_hubbard.inc +++ b/include/dca/phys/parameters/model_parameters_Kagome_hubbard.inc @@ -36,9 +36,17 @@ public: void set_U(double U) { U_ = U; } + + double get_V() const { + return V_; + } + void set_V(double V) { + V_ = V; + } private: double t_ = 0; double U_ = 0; + double V_ = 0; }; template @@ -49,6 +57,7 @@ int ModelParameters> buffer_size += concurrency.get_buffer_size(t_); buffer_size += concurrency.get_buffer_size(U_); + buffer_size += concurrency.get_buffer_size(V_); return buffer_size; } @@ -59,6 +68,7 @@ void ModelParameters const Concurrency& concurrency, char* buffer, int buffer_size, int& position) const { concurrency.pack(buffer, buffer_size, position, t_); concurrency.pack(buffer, buffer_size, position, U_); + concurrency.pack(buffer, buffer_size, position, V_); } template @@ -67,6 +77,7 @@ void ModelParameters const Concurrency& concurrency, char* buffer, int buffer_size, int& position) { concurrency.unpack(buffer, buffer_size, position, t_); concurrency.unpack(buffer, buffer_size, position, U_); + concurrency.unpack(buffer, buffer_size, position, V_); } template @@ -91,7 +102,11 @@ void ModelParameters } if (std::abs(U_) <= 1e-3) throw 
std::runtime_error("For Kagome model abs(U) must be greater than 1e-3"); + try { + reader_or_writer.execute("V", V_); + } + catch (const std::exception& r_e) { + } reader_or_writer.close_group(); } - diff --git a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu index 3b4241144..03b8d44a8 100644 --- a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu +++ b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu @@ -254,8 +254,7 @@ void computeGMultiband(std::complex* G, int ldg, const std::complex* template __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, - const GPUComplex>* __restrict__ G_up, - const int ldgu, + const GPUComplex>* __restrict__ G_up, const int ldgu, const GPUComplex>* __restrict__ G_down, const int ldgd, const SignType factor, const bool atomic, const uint64_t start, const uint64_t end) { @@ -306,7 +305,7 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, // the exchange momentum, implies the same operation is performed with the exchange frequency. // See tp_accumulator.hpp for more details. 
if constexpr (type == FourPointType::PARTICLE_HOLE_TRANSVERSE) { - // contribution <- -\sum_s G(k1, k2, s) * G(k2 + k_ex, k1 + k_ex, -s) + // contribution <- -\sum_s G_{b2,b4}(k1, k2, s) * G_{b3,b1}(k2 + k_ex, k1 + k_ex, -s) int w1_a(w1); int w2_a(w2); int k1_a(k1); @@ -316,9 +315,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b4, true); + int i_a = nb * k1_a + no * w1_a + b2; + int j_a = nb * k2_a + no * w2_a + b4; const GPUComplex> Ga_1 = G_up[i_a + ldgu * j_a]; const GPUComplex> Ga_2 = G_down[i_a + ldgd * j_a]; @@ -330,9 +328,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); else g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b3, true); + int i_b = nb * k1_b + no * w1_b + b3; + int j_b = nb * k2_b + no * w2_b + b1; const GPUComplex> Gb_1 = G_down[i_b + ldgd * j_b]; const GPUComplex> Gb_2 = G_up[i_b + ldgu * j_b]; @@ -340,23 +337,21 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, } else if constexpr (type == FourPointType::PARTICLE_HOLE_MAGNETIC) { // The PARTICLE_HOLE_MAGNETIC contribution is computed in two parts: - // Spin Difference Contribution + // Spin Difference Contribution (disconnected diagram) // new scope to reuse local index variables { - // contribution += (\sum_s s * G(k1, k1 + k_ex)) * (\sum_s s * G(k2 + k_ex, k2)) + // contribution += (\sum_s s * G_{b2, b1}(k1, k1 + k_ex)) * (\sum_s s * G_{b3,b4}(k2 + k_ex, k2)) int k1_a = k1; int k2_a = g4_helper.addKex(k1, k_ex); int w1_a(w1); int w2_a(g4_helper.addWex(w1, w_ex)); - // conj_a in this case just tells us whether to swap the band axes additions or not if (g4_helper.get_bands() == 1) g4_helper.extendGIndices(k1_a, k2_a, w1_a, 
w2_a); else g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b3, true); + int i_a = nb * k1_a + no * w1_a + b2; + int j_a = nb * k2_a + no * w2_a + b1; GPUComplex> Ga = G_up[i_a + ldgd * j_a] - G_down[i_a + ldgu * j_a]; int k1_b = g4_helper.addKex(k2, k_ex); @@ -364,21 +359,17 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, int w1_b(g4_helper.addWex(w2, w_ex)); int w2_b(w2); - // if (i_a == j_a) - // Ga += (G_up[i_a + ldgu * j_a] - G_down[i_a + ldgd * j_a]) * - if (g4_helper.get_bands() == 1) g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); else g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b4, true); + int i_b = nb * k1_b + no * w1_b + b3; + int j_b = nb * k2_b + no * w2_b + b4; GPUComplex> Gb = G_up[i_b + ldgd * j_b] - G_down[i_b + ldgu * j_b]; contribution = sign_over_2 * (Ga * Gb); } - // direct contribution <- -\sum_s G(k1, k2, s) * G(k2 + k_ex, k1 + k_ex, s) + // direct contribution <- -\sum_s G_{b2,b4}(k1, k2, s) * G_{b3, b1}(k2 + k_ex, k1 + k_ex, s) { int w1_a(w1); int w2_a(w2); @@ -389,10 +380,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); else g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - i_a += b4; - j_a += b1; + int i_a = nb * k1_a + no * w1_a + b3; + int j_a = nb * k2_a + no * w2_a + b1; GPUComplex> Ga_1 = G_up[i_a + ldgu * j_a]; GPUComplex> Ga_2 = G_down[i_a + ldgd * j_a]; @@ -407,10 +396,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - i_b += b3; - j_b += b2; + int i_b = nb * k1_b + no * w1_b + b2; + int j_b = nb * 
k2_b + no * w2_b + b4; GPUComplex> Gb_1 = G_up[i_b + ldgu * j_b]; GPUComplex> Gb_2 = G_down[i_b + ldgd * j_b]; @@ -430,10 +417,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); else conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - // b1 , b4 - condSwapAdd(i_a, j_a, b1, b4, true); + int i_a = nb * k1_a + no * w1_a + b1; + int j_a = nb * k2_a + no * w2_a + b4; const GPUComplex> Ga_1 = G_up[i_a + ldgu * j_a]; const GPUComplex> Ga_2 = G_down[i_a + ldgd * j_a]; @@ -448,10 +433,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - // b2, b3 - condSwapAdd(i_b, j_b, b2, b3, true); + int i_b = nb * k1_b + no * w1_b + b2; + int j_b = nb * k2_b + no * w2_b + b3; const GPUComplex> Gb_1 = G_up[i_b + ldgu * j_b]; const GPUComplex> Gb_2 = G_down[i_b + ldgd * j_b]; @@ -474,9 +457,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b3, true); + int i_a = nb * k1_a + no * w1_a + b2; + int j_a = nb * k2_a + no * w2_a + b1; const GPUComplex> Ga = G_up[i_a + ldgu * j_a] + G_down[i_a + ldgd * j_a]; @@ -490,174 +472,173 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b4, true); + int i_b = nb * k1_b + no * w1_b + b3; + int j_b = nb * k2_b + no * w2_b + b4; const GPUComplex> Gb = G_up[i_b + ldgu * j_b] + G_down[i_b + ldgd * j_b]; contribution += sign_over_2 * (Ga * Gb); } } - else if constexpr (type == 
FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP) { - // The PARTICLE_HOLE_LONGITUDINAL_UP_UP contribution is computed in two parts: - { - // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, s) - int w1_a(w1); - int w2_a(g4_helper.addWex(w1, w_ex)); - int k1_a = k1; - int k2_a = g4_helper.addKex(k1, k_ex); - bool conj_a = false; - if (g4_helper.get_bands() == 1) - conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); - else - conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - if (conj_a) { - i_a += b4; - j_a += b2; - } - else { - i_a += b2; - j_a += b4; - } - - const GPUComplex> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); - const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); - - int w1_b(g4_helper.addWex(w2, w_ex)); - int w2_b(w2); - int k1_b = g4_helper.addKex(k2, k_ex); - int k2_b = k2; - bool conj_b = false; - if (g4_helper.get_bands() == 1) - conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); - else - conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - if (conj_b) { - i_b += b1; - j_b += b3; - } - else { - i_b += b3; - j_b += b1; - } - - const GPUComplex> Gb_1 = cond_conj(G_up[i_b + ldgd * j_b], conj_b); - const GPUComplex> Gb_2 = cond_conj(G_down[i_b + ldgu * j_b], conj_b); - - contribution = sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); - } - { - // contribution <- -\sum_s G(k1, k2, s) * G(k2 + k_ex, k1 + k_ex, s) - int w1_a(w1); - int w2_a(w2); - int k1_a(k1); - int k2_a(k2); - - bool conj_a = false; - if (g4_helper.get_bands() == 1) - conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); - else - conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - if (conj_a) { - i_a += b4; - j_a += b1; - } - else { - i_a += b1; - j_a += b4; - } - const GPUComplex> 
Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); - const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); - - int w1_b(g4_helper.addWex(w2, w_ex)); - int w2_b(g4_helper.addWex(w1, w_ex)); - int k1_b = g4_helper.addKex(k2, k_ex); - int k2_b = g4_helper.addKex(k1, k_ex); - - bool conj_b = false; - if (g4_helper.get_bands() == 1) - conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); - else - conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - if (conj_b) { - i_b += b3; - j_b += b2; - } - else { - i_b += b2; - j_b += b3; - } - - const GPUComplex> Gb_1 = cond_conj(G_up[i_b + ldgd * j_b], conj_b); - const GPUComplex> Gb_2 = cond_conj(G_down[i_b + ldgu * j_b], conj_b); - - contribution += -sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); - } - } - else if constexpr (type == FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN) { - // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, -s) - int w1_a(w1); - int w2_a(g4_helper.addWex(w1, w_ex)); - int k1_a = k1; - int k2_a = g4_helper.addKex(k1, k_ex); - bool conj_a = false; - if (g4_helper.get_bands() == 1) - conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); - else - conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - if (conj_a) { - i_a += b4; - j_a += b2; - } - else { - i_a += b2; - j_a += b4; - } - - const GPUComplex> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); - const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); - - int w1_b(g4_helper.addWex(w2, w_ex)); - int w2_b(w2); - int k1_b = g4_helper.addKex(k2, k_ex); - int k2_b = k2; - bool conj_b = false; - if (g4_helper.get_bands() == 1) - conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); - else - conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - 
if (conj_b) { - i_b += b1; - j_b += b3; - } - else { - i_b += b3; - j_b += b1; - } - - const GPUComplex> Gb_1 = cond_conj(G_down[i_b + ldgd * j_b], conj_b); - const GPUComplex> Gb_2 = cond_conj(G_up[i_b + ldgu * j_b], conj_b); - - contribution = sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); - } + // else if constexpr (type == FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP) { + // // The PARTICLE_HOLE_LONGITUDINAL_UP_UP contribution is computed in two parts: + // { + // // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, s) + // int w1_a(w1); + // int w2_a(g4_helper.addWex(w1, w_ex)); + // int k1_a = k1; + // int k2_a = g4_helper.addKex(k1, k_ex); + // bool conj_a = false; + // if (g4_helper.get_bands() == 1) + // conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); + // else + // conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); + // int i_a = nb * k1_a + no * w1_a; + // int j_a = nb * k2_a + no * w2_a; + // if (conj_a) { + // i_a += b4; + // j_a += b2; + // } + // else { + // i_a += b2; + // j_a += b4; + // } + + // const GPUComplex> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); + // const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); + + // int w1_b(g4_helper.addWex(w2, w_ex)); + // int w2_b(w2); + // int k1_b = g4_helper.addKex(k2, k_ex); + // int k2_b = k2; + // bool conj_b = false; + // if (g4_helper.get_bands() == 1) + // conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); + // else + // conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); + + // int i_b = nb * k1_b + no * w1_b; + // int j_b = nb * k2_b + no * w2_b; + // if (conj_b) { + // i_b += b1; + // j_b += b3; + // } + // else { + // i_b += b3; + // j_b += b1; + // } + + // const GPUComplex> Gb_1 = cond_conj(G_up[i_b + ldgd * j_b], conj_b); + // const GPUComplex> Gb_2 = cond_conj(G_down[i_b + ldgu * j_b], conj_b); + + // contribution = sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); + // } + // { + // // contribution <- -\sum_s G(k1, 
k2, s) * G(k2 + k_ex, k1 + k_ex, s) + // int w1_a(w1); + // int w2_a(w2); + // int k1_a(k1); + // int k2_a(k2); + + // bool conj_a = false; + // if (g4_helper.get_bands() == 1) + // conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); + // else + // conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); + // int i_a = nb * k1_a + no * w1_a; + // int j_a = nb * k2_a + no * w2_a; + // if (conj_a) { + // i_a += b4; + // j_a += b1; + // } + // else { + // i_a += b1; + // j_a += b4; + // } + // const GPUComplex> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); + // const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); + + // int w1_b(g4_helper.addWex(w2, w_ex)); + // int w2_b(g4_helper.addWex(w1, w_ex)); + // int k1_b = g4_helper.addKex(k2, k_ex); + // int k2_b = g4_helper.addKex(k1, k_ex); + + // bool conj_b = false; + // if (g4_helper.get_bands() == 1) + // conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); + // else + // conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); + + // int i_b = nb * k1_b + no * w1_b; + // int j_b = nb * k2_b + no * w2_b; + // if (conj_b) { + // i_b += b3; + // j_b += b2; + // } + // else { + // i_b += b2; + // j_b += b3; + // } + + // const GPUComplex> Gb_1 = cond_conj(G_up[i_b + ldgd * j_b], conj_b); + // const GPUComplex> Gb_2 = cond_conj(G_down[i_b + ldgu * j_b], conj_b); + + // contribution += -sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); + // } + // } + // else if constexpr (type == FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN) { + // // contribution <- \sum_s G(k1, k1+k_ex, s) * G(k2+k_ex, k2, -s) + // int w1_a(w1); + // int w2_a(g4_helper.addWex(w1, w_ex)); + // int k1_a = k1; + // int k2_a = g4_helper.addKex(k1, k_ex); + // bool conj_a = false; + // if (g4_helper.get_bands() == 1) + // conj_a = g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); + // else + // conj_a = g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); + // int i_a = nb * k1_a + no * w1_a; + // 
int j_a = nb * k2_a + no * w2_a; + // if (conj_a) { + // i_a += b4; + // j_a += b2; + // } + // else { + // i_a += b2; + // j_a += b4; + // } + + // const GPUComplex> Ga_1 = cond_conj(G_up[i_a + ldgu * j_a], conj_a); + // const GPUComplex> Ga_2 = cond_conj(G_down[i_a + ldgd * j_a], conj_a); + + // int w1_b(g4_helper.addWex(w2, w_ex)); + // int w2_b(w2); + // int k1_b = g4_helper.addKex(k2, k_ex); + // int k2_b = k2; + // bool conj_b = false; + // if (g4_helper.get_bands() == 1) + // conj_b = g4_helper.extendGIndices(k1_b, k2_b, w1_b, w2_b); + // else + // conj_b = g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); + + // int i_b = nb * k1_b + no * w1_b; + // int j_b = nb * k2_b + no * w2_b; + // if (conj_b) { + // i_b += b1; + // j_b += b3; + // } + // else { + // i_b += b3; + // j_b += b1; + // } + + // const GPUComplex> Gb_1 = cond_conj(G_down[i_b + ldgd * j_b], conj_b); + // const GPUComplex> Gb_2 = cond_conj(G_up[i_b + ldgu * j_b], conj_b); + + // contribution = sign_over_2 * (Ga_1 * Gb_1 + Ga_2 * Gb_2); + // } else if constexpr (type == FourPointType::PARTICLE_PARTICLE_UP_DOWN) { - // contribution <- -\sum_s G(k_ex - k2, k_ex - k1, s) * G(k2, k1, -s). + // contribution <- \sum_s G_{b1,b3}(k1, k2, s) * G_{b2,b4}(kex - k1, kex - k2, -s). 
int w1_a(w1); int w2_a(w2); int k1_a(k1); @@ -667,9 +648,9 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, g4_helper.extendGIndices(k1_a, k2_a, w1_a, w2_a); else g4_helper.extendGIndicesMultiBand(k1_a, k2_a, w1_a, w2_a); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b3, true); + int i_a = nb * k1_a + no * w1_a + b1; + int j_a = nb * k2_a + no * w2_a + b3; + const GPUComplex> Ga_1 = G_up[i_a + ldgu * j_a]; const GPUComplex> Ga_2 = G_down[i_a + ldgd * j_a]; @@ -683,9 +664,8 @@ __global__ void updateG4Kernel(GPUComplex>* __restrict__ G4, else g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b4, true); + int i_b = nb * k1_b + no * w1_b + b2; + int j_b = nb * k2_b + no * w2_b + b4; const GPUComplex> Gb_1 = G_down[i_b + ldgd * j_b]; const GPUComplex> Gb_2 = G_up[i_b + ldgu * j_b]; @@ -732,12 +712,12 @@ double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu, const Scalar* G_ case FourPointType::PARTICLE_HOLE_CHARGE: // Each update of a G4 entry involves 3 complex additions and 3 complex multiplications. return 26. * n_updates; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: - // Each update of a G4 entry involves 3 complex additions and 4 complex multiplications. - return 32 * n_updates; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: - // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. - return 18. * n_updates; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: + // // Each update of a G4 entry involves 3 complex additions and 4 complex multiplications. + // return 32 * n_updates; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: + // // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. + // return 18. 
* n_updates; case FourPointType::PARTICLE_PARTICLE_UP_DOWN: // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. return 18. * n_updates; @@ -749,8 +729,7 @@ double updateG4(Scalar* G4, const Scalar* G_up, const int ldgu, const Scalar* G_ template __global__ void updateG4KernelNoSpin(GPUComplex>* __restrict__ G4, const GPUComplex>* __restrict__ G_dn, - const int ldgd, - const SignType factor, const bool atomic, + const int ldgd, const SignType factor, const bool atomic, const uint64_t start, const uint64_t end) { // TODO: reduce code duplication. // TODO: decrease, if possible, register pressure. E.g. a single thread computes all bands. @@ -809,13 +788,11 @@ __global__ void updateG4KernelNoSpin(GPUComplex>* __restrict__ int k2_b = g4_helper.kexMinus(k2, k_ex); g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b3, true); + int i_a = nb * k1_a + no * w1_a + b1; + int j_a = nb * k2_a + no * w2_a + b3; - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b4, true); + int i_b = nb * k1_b + no * w1_b + b2; + int j_b = nb * k2_b + no * w2_b + b4; const GPUComplex> Ga_1 = G_dn[i_a + ldgd * j_a]; const GPUComplex> Gb_1 = G_dn[i_b + ldgd * j_b]; @@ -835,13 +812,11 @@ __global__ void updateG4KernelNoSpin(GPUComplex>* __restrict__ int k2_b(k2); g4_helper.extendGIndicesMultiBand(k1_b, k2_b, w1_b, w2_b); - int i_a = nb * k1_a + no * w1_a; - int j_a = nb * k2_a + no * w2_a; - condSwapAdd(i_a, j_a, b1, b4, true); + int i_a = nb * k1_a + no * w1_a + b1; + int j_a = nb * k2_a + no * w2_a + b4; - int i_b = nb * k1_b + no * w1_b; - int j_b = nb * k2_b + no * w2_b; - condSwapAdd(i_b, j_b, b2, b3, true); + int i_b = nb * k1_b + no * w1_b + b2; + int j_b = nb * k2_b + no * w2_b + b3; const GPUComplex> Ga_1 = G_dn[i_a + ldgd * j_a]; const GPUComplex> Gb_1 = G_dn[i_b + ldgd * j_b]; @@ -857,15 +832,14 @@ 
__global__ void updateG4KernelNoSpin(GPUComplex>* __restrict__ } template -double updateG4NoSpin(Scalar* G4, const Scalar* G_dn, const int ldgd, const SignType factor, bool atomic, cudaStream_t stream, - std::size_t start, std::size_t end) { +double updateG4NoSpin(Scalar* G4, const Scalar* G_dn, const int ldgd, const SignType factor, + bool atomic, cudaStream_t stream, std::size_t start, std::size_t end) { constexpr const std::size_t n_threads = 256; const unsigned n_blocks = dca::util::ceilDiv(end - start, n_threads); using dca::util::GPUTypeConversion; updateG4KernelNoSpin, type><<>>( - castGPUType(G4), castGPUType(G_dn), ldgd, - GPUTypeConversion(factor), atomic, start, end); + castGPUType(G4), castGPUType(G_dn), ldgd, GPUTypeConversion(factor), atomic, start, end); // Check for errors. auto err = cudaPeekAtLastError(); @@ -887,12 +861,12 @@ double updateG4NoSpin(Scalar* G4, const Scalar* G_dn, const int ldgd, const Sign case FourPointType::PARTICLE_HOLE_CHARGE: // Each update of a G4 entry involves 3 complex additions and 3 complex multiplications. return 26. * n_updates; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: - // Each update of a G4 entry involves 3 complex additions and 4 complex multiplications. - return 32 * n_updates; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: - // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. - return 18. * n_updates; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: + // // Each update of a G4 entry involves 3 complex additions and 4 complex multiplications. + // return 32 * n_updates; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: + // // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. + // return 18. * n_updates; case FourPointType::PARTICLE_PARTICLE_UP_DOWN: // Each update of a G4 entry involves 2 complex additions and 2 complex multiplications. return 18. 
* n_updates; @@ -901,7 +875,6 @@ double updateG4NoSpin(Scalar* G4, const Scalar* G_dn, const int ldgd, const Sign } } - // Explicit instantiation. template void computeGSingleband(std::complex* G, int ldg, const std::complex* G0, int nk, int nw, @@ -932,17 +905,17 @@ template double updateG4, FourPointType::PARTICLE_HOLE_CHARG const std::complex* G_down, const int ldgd, const std::int8_t factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - std::int8_t>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::int8_t factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, +// std::int8_t>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* G_down, +// const int ldgd, const std::int8_t factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); -template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - std::int8_t>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::int8_t factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, +// std::int8_t>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* G_down, +// const int ldgd, const std::int8_t factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); template double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::int8_t>( std::complex* G4, const std::complex* G_up, const int ldgu, @@ -964,17 +937,17 @@ template double updateG4, FourPointType::PARTICLE_HOLE_CHAR const std::complex* G_down, const int ldgd, const std::int8_t factor, bool atomic, 
cudaStream_t stream, std::size_t start, std::size_t end); -template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - std::int8_t>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::int8_t factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, +// std::int8_t>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* G_down, +// const int ldgd, const std::int8_t factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); -template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - std::int8_t>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::int8_t factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, +// std::int8_t>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* G_down, +// const int ldgd, const std::int8_t factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); template double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::int8_t>( std::complex* G4, const std::complex* G_up, const int ldgu, @@ -983,98 +956,109 @@ template double updateG4, FourPointType::PARTICLE_PARTICLE_ // complex g0 -template double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex* G_down, const int ldgd, const std::complex factor, - bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); +template double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex* G_down, const int ldgd, + 
const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); -template -double updateG4, FourPointType::PARTICLE_HOLE_MAGNETIC, std::complex>( +template double updateG4, FourPointType::PARTICLE_HOLE_MAGNETIC, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template -double updateG4, FourPointType::PARTICLE_HOLE_CHARGE, std::complex>( +template double updateG4, FourPointType::PARTICLE_HOLE_CHARGE, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template -double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - std::complex>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - std::complex>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex* G_down, const int ldgd, const std::complex factor, - bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, std::complex>( +// template +// double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, +// std::complex>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* 
G_down, +// const int ldgd, const std::complex factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); + +// template +// double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, +// std::complex>(std::complex* G4, const std::complex* G_up, +// const int ldgu, const std::complex* G_down, +// const int ldgd, const std::complex factor, bool atomic, +// cudaStream_t stream, std::size_t start, std::size_t end); + +template double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex* G_down, const int ldgd, + const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); + +template double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex* G_down, const int ldgd, + const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); + +template double updateG4, FourPointType::PARTICLE_HOLE_MAGNETIC, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex* G_down, const int ldgd, + const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); + +template double updateG4, FourPointType::PARTICLE_HOLE_CHARGE, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template -double updateG4, FourPointType::PARTICLE_HOLE_MAGNETIC, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex* G_down, const int ldgd, const std::complex factor, - bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_HOLE_CHARGE, std::complex>( - 
std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex* G_down, const int ldgd, const std::complex factor, - bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - std::complex>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - std::complex>(std::complex* G4, const std::complex* G_up, - const int ldgu, const std::complex* G_down, - const int ldgd, const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - -template -double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex* G_down, const int ldgd, const std::complex factor, - bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, +// std::complex>(std::complex* G4, +// const std::complex* G_up, const int ldgu, +// const std::complex* G_down, const int ldgd, +// const std::complex factor, bool atomic, +// cudaStream_t stream, std::size_t start, +// std::size_t end); + +// template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, +// std::complex>(std::complex* G4, +// const std::complex* G_up, const int ldgu, +// const std::complex* G_down, const int ldgd, +// const std::complex factor, bool atomic, +// cudaStream_t stream, std::size_t start, +// std::size_t end); + +template double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex* G_down, const int ldgd, + const std::complex factor, bool atomic, + 
cudaStream_t stream, std::size_t start, + std::size_t end); // Non spin symmetric -template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - -template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex>( - std::complex* G4, const std::complex* G_up, const int ldgu, - const std::complex factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); +template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); + +template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, + std::complex>(std::complex* G4, + const std::complex* G_up, const int ldgu, + const std::complex factor, bool atomic, + cudaStream_t stream, std::size_t start, + std::size_t end); template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::int8_t>( std::complex* G4, const std::complex* G_up, const int ldgu, - const std::int8_t factor, bool atomic, - cudaStream_t stream, std::size_t start, std::size_t end); - - + const std::int8_t factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); // template<> double updateG4< FourPointType::PARTICLE_HOLE_TRANSVERSE>( // std::complex* G4, const std::complex* G_up, const int ldgu, diff --git a/src/phys/four_point_type.cpp b/src/phys/four_point_type.cpp index d9e30e652..ec08a3cf2 100644 --- a/src/phys/four_point_type.cpp +++ b/src/phys/four_point_type.cpp @@ -26,18 +26,18 @@ FourPointType stringToFourPointType(const std::string& name) { return FourPointType::PARTICLE_HOLE_MAGNETIC; else if (name == "PARTICLE_HOLE_CHARGE") return FourPointType::PARTICLE_HOLE_CHARGE; - else if 
(name == "PARTICLE_HOLE_LONGITUDINAL_UP_UP") - return FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP; - else if (name == "PARTICLE_HOLE_LONGITUDINAL_UP_DOWN") - return FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN; + // else if (name == "PARTICLE_HOLE_LONGITUDINAL_UP_UP") + // return FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP; + // else if (name == "PARTICLE_HOLE_LONGITUDINAL_UP_DOWN") + // return FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN; else - return FourPointType::PARTICLE_HOLE_NONE; + return FourPointType::NONE; } std::string toString(const FourPointType type) { switch (type) { - case FourPointType::PARTICLE_HOLE_NONE: - return "PARTICLE_HOLE_NONE"; + case FourPointType::NONE: + return "NONE"; case FourPointType::PARTICLE_PARTICLE_UP_DOWN: return "PARTICLE_PARTICLE_UP_DOWN"; case FourPointType::PARTICLE_HOLE_TRANSVERSE: @@ -46,10 +46,10 @@ std::string toString(const FourPointType type) { return "PARTICLE_HOLE_MAGNETIC"; case FourPointType::PARTICLE_HOLE_CHARGE: return "PARTICLE_HOLE_CHARGE"; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: - return "PARTICLE_HOLE_LONGITUDINAL_UP_UP"; - case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: - return "PARTICLE_HOLE_LONGITUDINAL_UP_DOWN"; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP: + // return "PARTICLE_HOLE_LONGITUDINAL_UP_UP"; + // case FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN: + // return "PARTICLE_HOLE_LONGITUDINAL_UP_DOWN"; default: throw std::logic_error("Invalid four point mode."); } diff --git a/test/test_automation/ci/run_step.sh b/test/test_automation/ci/run_step.sh index 68b406eb5..d1c615a5b 100755 --- a/test/test_automation/ci/run_step.sh +++ b/test/test_automation/ci/run_step.sh @@ -3,22 +3,22 @@ set -x HOST_NAME=$(hostname -s) -case "$1" in +case "$1" in - # Configure DCA++ using cmake out-of-source builds + # Configure DCA++ using cmake out-of-source builds configure) - + if [ -d ${GITHUB_WORKSPACE}/../dca-build ] then echo "Found existing 
out-of-source build directory ${GITHUB_WORKSPACE}/../dca-build, removing" rm -fr ${GITHUB_WORKSPACE}/../dca-build fi - + echo "Creating new out-of-source build directory ${GITHUB_WORKSPACE}/../dca-build" cd ${GITHUB_WORKSPACE}/.. mkdir dca-build cd dca-build - + # Real or Complex configuration case "${GH_JOBNAME}" in *"Real"*) @@ -28,66 +28,63 @@ case "$1" in *"Complex"*) echo 'Configure for complex build' IS_COMPLEX=1 - ;; + ;; esac if [[ "${GH_JOBNAME}" =~ (-CUDA) ]] then - echo "Set PATH to cuda-11.8 to be associated with the C and C++ compilers" - export PATH=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/cuda-11.8.0-pkdxb42p6no3tfimntdbw37rvn5p6fiw:$PATH + echo "Set PATH to cuda-12.9 to be associated with the C and C++ compilers" + export PATH=/usr/local/cuda-12.9/bin:$PATH echo "Set CUDACXX CMake environment variable to nvcc cuda 11.8" - export CUDACXX=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/cuda-11.8.0-pkdxb42p6no3tfimntdbw37rvn5p6fiw/bin/nvcc + export CUDACXX=/usr/local/cuda-12.9/bin/nvcc # Make current environment variables available to subsequent steps echo "PATH=$PATH" >> $GITHUB_ENV echo "CUDACXX=$CUDACXX" >> $GITHUB_ENV - fi + fi # Mixed or Non-Mixed (default, full) precision, used with GPU code case "${GH_JOBNAME}" in *"Mixed"*) echo 'Configure for mixed precision build' IS_MIXED_PRECISION=1 - ;; + ;; *) IS_MIXED_PRECISION=0 ;; esac # Path to QMC_DATA in self-hosted CI system and point at minimum gcc-9 - if [[ "$HOST_NAME" =~ (v100-again) ]] + if [[ "$HOST_NAME" =~ (a30four) ]] then # use gcc-12 - export PATH=/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/gcc-12.2.0-cx7pjxgmemcce4tohlmsekuo5qvgjqbl/bin:/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/ninja-1.11.1-plzpokehn3kdbcviteppqntkqun5752f/bin:/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/cmake-3.25.0-xlxorwhfz5jxpyx65ypsh2horyo7n3ef/bin:$PATH - export 
LD_LIBRARY_PATH=/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/gcc-12.2.0-cx7pjxgmemcce4tohlmsekuo5qvgjqbl/lib64:$LD_LIBRARY_PATH - export CUDAHOSTCXX=g++ - export MAGMA_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/magma-2.7.1-nrkrnn42vb2ifpkpoutktrdsnxusrdql - export HDF5_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/hdf5-1.14.0-szvumwunzwinu4lhc3kcbbnripfkfcnp - export OPENBLAS_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/openblas-0.3.21-e3gckkrsmcrewtzn6lhkdttiwe5ikp4z - export ADIOS2_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/adios2-2.8.3-ba3mpxuh435rvisxhorzotb53m3dgfuv - export MPI_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/openmpi-4.1.4-mbv4sqfzxaqfwcyhjaaammhsd56dtvl2 - export FFTW_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-cascadelake/gcc-12.2.0/fftw-3.3.10-hdyb4eim5ngsgkhqyiyl5ggispxqmzo5 + export PATH=$(spack find -lp llvm | awk '/llvm/{print $3}')/bin:${PATH} + export LD_LIBRARY_PATH=$(spack find -lp gcc/yel5m4n | awk '/gcc/{print $3}')/lib64:$(spack find -lp hdf5 | awk '/hdf5/{print $3}')/lib64:$(spack find -lp llvm/navafcm | awk '/llvm/{print $3}')/lib:${LD_LIBRARY_PATH}:$(spack find -lp tree-sitter-cmake | awk '/tree-sitter-cmake/{print $3}')/lib:${LD_LIBRARY_PATH} + export CUDAHOSTCXX=clang++ + export MAGMA_ROOT=/home/epd/opt_a30/magma + export HDF5_ROOT=$(spack find --loaded -lp hdf5 | awk '/hdf5/{print $3}') + export OPENBLAS_ROOT=$(spack find --loaded -lp openblas | awk '/openblas/{print $3}') + export MPI_ROOT=$(spack find --loaded -lp openmpi | awk '/openmpi/{print $3}') + export FFTW_ROOT=$(spack find --loaded -lp fftw | awk '/fftw/{print $3}') # Make current environment variables available to subsequent steps echo "PATH=$PATH" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV fi - + case "${GH_JOBNAME}" in - *"GCC12-MPI-CUDA-"*) + *"LLVM21-MPI-CUDA-"*) 
echo 'Configure for debug mode to capture asserts with gcc' cmake -GNinja \ - -DCMAKE_C_COMPILER=gcc \ - -DCMAKE_CXX_COMPILER=g++ \ - -DCMAKE_CUDA_COMPILER=${CUDA_HOME}/bin/nvcc \ + -DCMAKE_C_COMPILER=mpicc \ + -DCMAKE_CXX_COMPILER=mpic++ \ + -DDCA_WITH_CUDA=1 \ -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler \ - -DCMAKE_CUDA_ARCHITECTURES=70 \ - -DCMAKE_FORTRAN_COMPILER=gfortran -DCMAKE_EXE_LINKER_FLAGS_INIT="-lgfortran" -DCMAKE_SHARED_LINKER_FLAGS_INIT="-lgfortran" -DCMAKE_MODULE_LINKER_FLAGS_INIT="-lgfortran" \ + -DCMAKE_CUDA_ARCHITECTURES=80 \ -DDCA_WITH_MPI=1 \ -DCMAKE_PREFIX_PATH=${MPI_ROOT}\;${CUDA_ROOT}\;${MAGMA_ROOT}\;${HDF5_ROOT}\;${OPENBLAS_ROOT}\;${ADIOS2_ROOT}\;${FFTW_ROOT} \ -DCMAKE_BUILD_TYPE=Release \ -DTEST_RUNNER="mpiexec" \ -DMPIEXEC_NUMPROC_FLAG="-n" -DMPIEXEC_PREFLAGS="-mca btl self,tcp" -DDCA_WITH_CUDA=1 -DDCA_WITH_ADIOS2=1 \ -DDCA_WITH_TESTS_FAST=1 \ - -DLAPACK_LIBRARIES=${OPENBLAS_ROOT}/lib/libopenblas.a \ ${GITHUB_WORKSPACE} ;; esac @@ -107,17 +104,17 @@ case "$1" in # Run deterministic tests test) - + # Run only deterministic tests (reasonable for CI) by default TEST_LABEL="" - export MPI_ROOT=/home/epd/spack/opt/spack/linux-ubuntu20.04-skylake_avx512/gcc-12.2.0/openmpi-4.1.5-wzdq6mz5bbln7noxsytymadsh6lw3rag + export MPI_ROOT=$(spack find --loaded -lp openmpi | awk '/openmpi/{print $3}') export PATH=${MPI_ROOT}/bin:${PATH} cd ${GITHUB_WORKSPACE}/../dca-build - - # Add ctest concurrent parallel jobs + + # Add ctest concurrent parallel jobs # Default for Linux GitHub Action runners CTEST_JOBS="1" - + ctest --output-on-failure $TEST_LABEL -j $CTEST_JOBS ;; diff --git a/test/unit/io/CMakeLists.txt b/test/unit/io/CMakeLists.txt index 8783cdb58..a6a159f4b 100644 --- a/test/unit/io/CMakeLists.txt +++ b/test/unit/io/CMakeLists.txt @@ -15,8 +15,14 @@ dca_add_gtest(json_reader_test GTEST_MAIN LIBS json) +if (DCA_HAVE_GPU) + set(DCA_FORTRAN_LIBS magma::magma) +endif() + dca_add_gtest(reader_writer_test - LIBS dca_io parallel_mpi_concurrency function 
${DCA_THREADING_LIBS}) + LIBS dca_io parallel_mpi_concurrency function ${DCA_THREADING_LIBS} + ${DCA_FORTRAN_LIBS} +) dca_add_gtest(io_types_test GTEST_MAIN @@ -34,4 +40,3 @@ if (DCA_HAVE_ADIOS2) LIBS parallel_no_concurrency dca_io function) endif() endif() - diff --git a/test/unit/math/function_transform/space_transform_2D_gpu_test.cpp b/test/unit/math/function_transform/space_transform_2D_gpu_test.cpp index 319d0099c..b131cf3a9 100644 --- a/test/unit/math/function_transform/space_transform_2D_gpu_test.cpp +++ b/test/unit/math/function_transform/space_transform_2D_gpu_test.cpp @@ -55,13 +55,12 @@ using Model = dca::phys::models::TightBindingModel< template using NumTraits = dca::NumericalTraits, SCALAR>; - const std::string input_dir = DCA_SOURCE_DIR "/test/unit/math/function_transform/"; using BDmn = dca::func::dmn_0; using SDmn = dca::func::dmn_0; using WPosDmn = - dca::func::dmn_0>; + dca::func::dmn_0>; using WDmn = dca::func::dmn_0>; @@ -77,13 +76,12 @@ template using SpaceTransform2DGpuTest = ::testing::Test; using TestTypes = ::testing::Types; //, std::complex>; - TYPED_TEST_CASE(SpaceTransform2DGpuTest, TestTypes); TYPED_TEST(SpaceTransform2DGpuTest, Execute) { using Scalar = TypeParam; using dca::util::castGPUType; - + Parameters pars(dca::util::GitVersion::string(), *concurrency_ptr); using KDmn = typename Parameters::KClusterDmn; @@ -100,14 +98,14 @@ TYPED_TEST(SpaceTransform2DGpuTest, Execute) { function> f_in; RMatrix M_in; - // Initialize the input function. 
const int nb = BDmn::dmn_size(); const int nr = RDmn::dmn_size(); const int nw = WPosDmn::dmn_size(); - std::cout << "nBDmn:" << nb << " nRDmn:" << nr << " nw:" << nw << '|' << WDmn::dmn_size() << " ns:" << SDmn::dmn_size() << '\n'; - + std::cout << "nBDmn:" << nb << " nRDmn:" << nr << " nw:" << nw << '|' << WDmn::dmn_size() + << " ns:" << SDmn::dmn_size() << '\n'; + M_in.resizeNoCopy(std::make_pair(nb * nr * nw, nb * nr * nw)); for (int w2 = 0; w2 < nw; ++w2) for (int w1 = 0; w1 < nw; ++w1) @@ -120,20 +118,20 @@ TYPED_TEST(SpaceTransform2DGpuTest, Execute) { auto index = [=](int r, int b, int w) { return r + nr * b + nb * nr * w; }; f_in(r1, r2, b1, b2, 0, w1, w2) = val; - M_in(index(r1, b1, w1), index(r2, b2, w2)) = val; + M_in(index(r1, b1, w1), index(r2, b2, w2)) = val; } // Transform on the CPU. function> f_out; - dca::math::transform::SpaceTransform2D::execute(f_in, f_out); + dca::math::transform::SpaceTransform2D::execute(f_in, f_out); // Transform on the GPU. dca::linalg::ReshapableMatrix, dca::linalg::GPU> M_dev(M_in); dca::linalg::util::MagmaQueue queue; - dca::math::transform::SpaceTransform2DGpu> transform_obj( - nw, queue); + dca::math::transform::SpaceTransform2DGpu> + transform_obj(nw, queue); transform_obj.execute(M_dev); queue.getStream().sync(); @@ -145,10 +143,9 @@ TYPED_TEST(SpaceTransform2DGpuTest, Execute) { dca::io::Writer writer(*concurrency_ptr, "ADIOS2", true); dca::io::Writer writer_h5(*concurrency_ptr, "HDF5", true); - //writer.open_file("tp_single_band_gpu_test_G4.bp"); + // writer.open_file("tp_single_band_gpu_test_G4.bp"); writer_h5.open_file("space_transform_2D_gpu_test.hdf5"); - // writer.execute("m_outDevice", M_out); // writer.execute("m_outHost", f_out); writer_h5.execute("m_outDevice", M_out); @@ -168,9 +165,8 @@ TYPED_TEST(SpaceTransform2DGpuTest, Execute) { for (int b2 = 0; b2 < nb; ++b2) for (int b1 = 0; b1 < nb; ++b1) { const Complex val1 = f_out(b1, b2, 0, r1, r2, w1, w2); - auto index = [=](int k, int b, int w) { return b 
+ nb * k + nb * nr * w; }; - const Complex val2 = - M_out(index(r1, b1, w1), index(r2, b2, w2)); + auto index = [=](int k, int b, int w) { return b + nb * k + nb * nr * w; }; + const Complex val2 = M_out(index(r1, b1, w1), index(r2, b2, w2)); EXPECT_LE(std::abs(val1 - val2), tolerance); } } diff --git a/test/unit/math/function_transform/space_transform_2D_test.cpp b/test/unit/math/function_transform/space_transform_2D_test.cpp index 1dee64bb6..e72a62e85 100644 --- a/test/unit/math/function_transform/space_transform_2D_test.cpp +++ b/test/unit/math/function_transform/space_transform_2D_test.cpp @@ -47,16 +47,15 @@ using WPosDmn = using WDmn = dca::func::dmn_0>; -template - using Parameters = - dca::phys::params::Parameters>; - +template +using Parameters = + dca::phys::params::Parameters>; // Perform the test in double and single precision. template class SpaceTransform2DTest : public ::testing::Test {}; -using TestTypes = ::testing::Types; //, std::complex>; +using TestTypes = ::testing::Types; //, std::complex>; TYPED_TEST_CASE(SpaceTransform2DTest, TestTypes); TYPED_TEST(SpaceTransform2DTest, Execute) { @@ -84,7 +83,7 @@ TYPED_TEST(SpaceTransform2DTest, Execute) { auto f_in_cpy = f_in; dca::func::function> f_out; - dca::math::transform::SpaceTransform2D::execute(f_in_cpy, f_out); + dca::math::transform::SpaceTransform2D::execute(f_in_cpy, f_out); const auto im = std::complex(0, 1); diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt index 7f99ed90f..6d35d6df5 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/CMakeLists.txt @@ -44,20 +44,68 @@ set(TPACC_PARTICLE_HOLE_LIBS gpu_utils cluster_domains time_and_frequency_domain if (DCA_HAVE_ADIOS2) set(TPACC_PARTICLE_HOLE_LIBS dca_adios2 ${TPACC_PARTICLE_HOLE_LIBS}) endif() 
- -dca_add_gtest(tp_accumulator_particle_hole_test + +# dca_add_gtest(tp_accumulator_particle_hole_test +# FAST +# CUDA +# GTEST_MPI_MAIN +# INCLUDE_DIRS ${PROJECT_SOURCE_DIR} ${FFTW_INCLUDE_DIR} ${HDF5_INCLUDE_DIR} +# LIBS function FFTW::Double cluster_domains time_and_frequency_domains quantum_domains timer json random +# enumerations dca_algorithms ${DCA_THREADING_LIBS} dca_io parallel_util ${LAPACK_LIBRARIES} gpu_utils parallel_no_concurrency) + +dca_add_gtest(tp_accumulator_gpu_test FAST CUDA - GTEST_MPI_MAIN - INCLUDE_DIRS ${PROJECT_SOURCE_DIR} ${FFTW_INCLUDE_DIR} ${HDF5_INCLUDE_DIR} - LIBS function FFTW::Double cluster_domains time_and_frequency_domains quantum_domains timer json random - enumerations dca_algorithms ${DCA_THREADING_LIBS} dca_io parallel_util ${LAPACK_LIBRARIES} gpu_utils parallel_no_concurrency) + INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} + LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES DoubleSquare -dca_add_gtest(tp_accumulator_gpu_test +) + +dca_add_gtest(tp_accumulator_rashba_gpu_test + FAST + CUDA + INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} + LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES ComplexDoubleRashba + ) + +dca_add_gtest(tp_accumulator_moire_hubbard_gpu_test + FAST + CUDA + INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} + LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES ComplexDoubleMoireHubbard + ) + +dca_add_gtest(tp_accumulator_hund_gpu_test + FAST + CUDA + INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} + LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES DoubleHund + ) + +dca_add_gtest(tp_accumulator_DoubleKagome_gpu_test + FAST + CUDA + INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} + LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES DoubleKagome + 
) + +dca_add_gtest(tp_accumulator_FeAs_gpu_test FAST CUDA INCLUDE_DIRS ${DCA_INCLUDE_DIRS};${PROJECT_SOURCE_DIR} LIBS ${DCA_LIBS} ${DCA_KERNEL_LIBS} + CUSTOM_SOURCE tp_accumulator_gpu_test.cpp + TEST_DEFINES DoubleFeAs ) dca_add_gtest(tp_accumulator_complex_g0_gpu_test diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json index 6b997974b..f0a4d4b9b 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json @@ -1,46 +1,48 @@ { - "output" : - { - "output-format" : "HDF5", + "output": { + "output-format": "HDF5", - "output-ED" : "ed_results.hdf5", - "output-QMC" : "output_QMC.hdf5" + "output-ED": "ed_results.hdf5", + "output-QMC": "output_QMC.hdf5" }, - "physics" : - { - "beta" : 4, - "chemical-potential" : 0 + "physics": { + "beta": 4, + "chemical-potential": 0 }, - "bilayer-Hubbard-model": - { - "t" : 1.0, - "t_prime" : 0.1, - "t_perp" : 0.01, - "U" : 1.0, - "V" : 1.0, - "V-prime" : 2.0 + "Kagome-Hubbard-model": { + "t": 1, + "U": 2 }, - "Hund-model": - { - "t" : 1, - "U" : 1, - "V" : 1, - "Jh" : 2 + "bilayer-Hubbard-model": { + "t": 1.0, + "t_prime": 0.1, + "t_perp": 0.01, + "U": 1.0, + "V": 1.0, + "V-prime": 2.0 }, - "single-band-Hubbard-model": - { - "t" : 1, - "U" : 2 + "Hund-model": { + "t": 1, + "U": 1, + "V": 1, + "Jh": 2 + }, + + "single-band-Hubbard-model": { + "t": 1, + "U": 2 }, "domains": { "real-space-grids": { - "cluster": [[4, 0], - [0, 4]] + "cluster": [ + [4, 0], + [0, 4] + ] }, "imaginary-time": { @@ -49,35 +51,31 @@ "imaginary-frequency": { "sp-fermionic-frequencies": 512, - "four-point-fermionic-frequencies" : 8 + "four-point-fermionic-frequencies": 8 } }, "four-point": { "type": "NONE", - "momentum-transfer": [0., 3.1415], + "momentum-transfer": [0, 3.1415], "frequency-transfer": 0 }, + 
"Monte-Carlo-integration": { + "Sigma-file": "zero", - "Monte-Carlo-integration" : - { - "Sigma-file" : "zero", + "warm-up-sweeps": 100, + "sweeps-per-measurement": 1, + "measurements-per-process-and-accumulator": 500, + "seed": 0, - "warm-up-sweeps" : 100, - "sweeps-per-measurement" : 1, - "measurements-per-process-and-accumulator" : 500, - "seed" : 0, - - "threaded-solver" : { - "accumulators" : 3 + "threaded-solver": { + "accumulators": 3 } }, - - "CT-INT" : - { - "initial-configuration-size" :5, - "alpha-dd-pos" : 0.51 + "CT-INT": { + "initial-configuration-size": 5, + "alpha-dd-pos": 0.51 } } diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_2_transfer.json b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_2_transfer.json new file mode 100644 index 000000000..c69c8c13b --- /dev/null +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_2_transfer.json @@ -0,0 +1,81 @@ +{ + "output": { + "output-format": "HDF5", + + "output-ED": "ed_results.hdf5", + "output-QMC": "output_QMC.hdf5" + }, + + "physics": { + "beta": 4, + "chemical-potential": 0 + }, + + "Kagome-Hubbard-model": { + "t": 1, + "U": 2 + }, + + "bilayer-Hubbard-model": { + "t": 1.0, + "t_prime": 0.1, + "t_perp": 0.01, + "U": 1.0, + "V": 1.0, + "V-prime": 2.0 + }, + + "Hund-model": { + "t": 1, + "U": 1, + "V": 1, + "Jh": 2 + }, + + "single-band-Hubbard-model": { + "t": 1, + "U": 2 + }, + + "domains": { + "real-space-grids": { + "cluster": [ + [4, 0], + [0, 4] + ] + }, + + "imaginary-time": { + "sp-time-intervals": 512 + }, + + "imaginary-frequency": { + "sp-fermionic-frequencies": 512, + "four-point-fermionic-frequencies": 8 + } + }, + + "four-point": { + "type": "NONE", + "momentum-transfer": [0, 3.1415], + "frequency-transfer": 2 + }, + + "Monte-Carlo-integration": { + "Sigma-file": "zero", + + "warm-up-sweeps": 100, + "sweeps-per-measurement": 1, + "measurements-per-process-and-accumulator": 500, + 
"seed": 0, + + "threaded-solver": { + "accumulators": 3 + } + }, + + "CT-INT": { + "initial-configuration-size": 5, + "alpha-dd-pos": 0.51 + } +} diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_complex.json b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_complex.json index 85b1e5175..e5e5bf705 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_complex.json +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4_complex.json @@ -1,42 +1,46 @@ { - "output" : - { - "output-format" : "HDF5", + "output": { + "output-format": "HDF5", - "output-ED" : "ed_results.hdf5", - "output-QMC" : "output_QMC.hdf5" + "output-ED": "ed_results.hdf5", + "output-QMC": "output_QMC.hdf5" }, - "physics" : - { - "beta" : 2, - "chemical-potential" : 0 + "physics": { + "beta": 2, + "chemical-potential": 0 }, - "bilayer-Hubbard-model": - { - "t" : 1, - "U" : 2 + "bilayer-Hubbard-model": { + "t": 1, + "U": 2 }, - "Hund-model": - { - "t" : 1, - "U" : 1, - "V" : 1, - "Jh" : 2 + "Hund-model": { + "t": 1, + "U": 1, + "V": 1, + "Jh": 2 }, - "single-band-Hubbard-model": - { - "t" : 1, - "U" : 2 + "Moire-Hubbard-model": { + "t": 1, + "h": 0, + "U": 1, + "phi": 0 + }, + + "single-band-Hubbard-model": { + "t": 1, + "U": 2 }, "domains": { "real-space-grids": { - "cluster": [[4, 0], - [0, 4]] + "cluster": [ + [4, 0], + [0, 4] + ] }, "imaginary-time": { @@ -45,35 +49,31 @@ "imaginary-frequency": { "sp-fermionic-frequencies": 512, - "four-point-fermionic-frequencies" : 8 + "four-point-fermionic-frequencies": 8 } }, "four-point": { - "type": "NONE", - "momentum-transfer": [3.1415, 0.000], + "channels": ["PARTICLE_PARTICLE_UP_DOWN"], + "momentum-transfer": [3.1415, 0.0], "frequency-transfer": 2 }, + "Monte-Carlo-integration": { + "Sigma-file": "zero", - "Monte-Carlo-integration" : - { - "Sigma-file" : "zero", + "warm-up-sweeps": 100, + "sweeps-per-measurement": 1, + 
"measurements-per-process-and-accumulator": 500, + "seed": 0, - "warm-up-sweeps" : 100, - "sweeps-per-measurement" : 1, - "measurements-per-process-and-accumulator" : 500, - "seed" : 0, - - "threaded-solver" : { - "accumulators" : 3 + "threaded-solver": { + "accumulators": 3 } }, - - "CT-INT" : - { - "initial-configuration-size" :5, - "alpha-dd-pos" : 0.51 + "CT-INT": { + "initial-configuration-size": 5, + "alpha-dd-pos": 0.51 } } diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_test.cpp index 1eb09a529..c5d04944f 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_gpu_test.cpp @@ -39,7 +39,9 @@ using McOptions = MockMcOptions; #define INPUT_DIR \ DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/" -constexpr char input_file[] = INPUT_DIR "input_4x4.json"; +#define INPUT_FILE_4_4 INPUT_DIR "input_4x4_2_transfer.json" +#define INPUT_FILE_4_4_COMPLEX INPUT_DIR "input_4x4_complex.json"; +#define INPUT_FILE_4_4_KAGOME INPUT_DIR "input_4x4_multitransfer_kagome.json"; #ifdef DCA_HAVE_ADIOS2 adios2::ADIOS* adios_ptr; @@ -57,9 +59,53 @@ using ConfigGenerator = dca::testing::AccumulationTest; using Configuration = ConfigGenerator::Configuration; using Sample = ConfigGenerator::Sample; -template +struct DoubleSquare { + using Scalar = double; + using Lattice = dca::testing::LatticeSquare; + static constexpr char Input[] = INPUT_FILE_4_4; +}; + +struct DoubleBilayer { + using Scalar = double; + using Lattice = dca::testing::LatticeBilayer; + static constexpr char Input[] = INPUT_FILE_4_4; +}; + +struct DoubleHund { + using Scalar = double; + using Lattice = dca::testing::LatticeHund; + static constexpr char Input[] = INPUT_FILE_4_4; +}; + +struct 
DoubleKagome { + using Scalar = double; + using Lattice = dca::testing::LatticeKagome; + static constexpr char Input[] = INPUT_FILE_4_4_KAGOME; +}; + +struct ComplexDoubleRashba { + using Scalar = std::complex; + using Lattice = dca::testing::LatticeRashba; + static constexpr char Input[] = INPUT_FILE_4_4_COMPLEX; +}; + +struct ComplexDoubleMoireHubbard { + using Scalar = std::complex; + using Lattice = dca::testing::LatticeMoireHubbard; + static constexpr char Input[] = INPUT_FILE_4_4_COMPLEX; +}; + +struct DoubleFeAs { + using Scalar = double; + using Lattice = dca::testing::LatticeFeAs; + static constexpr char Input[] = INPUT_FILE_4_4; +}; + +template struct TpAccumulatorGpuTest : public ::testing::Test { - using G0Setup = dca::testing::G0SetupBare; + using G0Setup = + dca::testing::G0SetupBare; virtual void SetUp() { host_setup.SetUp(); gpu_setup.SetUp(); @@ -74,30 +120,50 @@ uint loop_counter = 0; constexpr bool write_G4s = true; -using TestTypes = ::testing::Types; +// Running more than one of these we run into the issue with Space +// transform reinitialization and there will be failures aftrer the +// first. The work around is to build this test several times until +// fix the space transform global and possibly others. This is +// clearly something that has helped make many areas of the code less +// testable. 
+ +using TestTypes = + ::testing::Types; // ComplexDoubleMoireHubbard,, + // DoubleSquare, DoubleKagome, , ComplexDoubleRashba, + // DoubleFeAs>; // DoubleBilayer, DoubleHund, TYPED_TEST_CASE(TpAccumulatorGpuTest, TestTypes); TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { const std::array n{22, 22}; Sample M; Configuration config; - using Scalar = TypeParam; - ConfigGenerator::prepareConfiguration(config, M, TpAccumulatorGpuTest::G0Setup::BDmn::dmn_size(), - TpAccumulatorGpuTest::G0Setup::RDmn::dmn_size(), + using TestParam = TypeParam; + using TestType = TpAccumulatorGpuTest; + ConfigGenerator::prepareConfiguration(config, M, + TpAccumulatorGpuTest::G0Setup::BDmn::dmn_size(), + TpAccumulatorGpuTest::G0Setup::RDmn::dmn_size(), this->host_setup.parameters_.get_beta(), n); using namespace dca::phys; - std::vector four_point_channels{ - FourPointType::PARTICLE_HOLE_TRANSVERSE, FourPointType::PARTICLE_HOLE_MAGNETIC, - FourPointType::PARTICLE_HOLE_CHARGE, FourPointType::PARTICLE_PARTICLE_UP_DOWN}; - //FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, + + std::vector four_point_channels; + if (TestType::G0Setup::LatticeType::spin_symmetric) + four_point_channels = { + FourPointType::PARTICLE_HOLE_TRANSVERSE, FourPointType::PARTICLE_HOLE_MAGNETIC, + FourPointType::PARTICLE_HOLE_CHARGE, FourPointType::PARTICLE_PARTICLE_UP_DOWN}; + else + four_point_channels = {FourPointType::PARTICLE_PARTICLE_UP_DOWN}; + + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, this->host_setup.parameters_.set_four_point_channels(four_point_channels); this->gpu_setup.parameters_.set_four_point_channels(four_point_channels); - dca::phys::solver::accumulator::TpAccumulatorhost_setup.parameters_), dca::DistType::NONE, dca::linalg::CPU> + dca::phys::solver::accumulator::TpAccumulatorhost_setup.parameters_), + dca::DistType::NONE, dca::linalg::CPU> 
accumulatorHost(this->host_setup.data_->G0_k_w_cluster_excluded, this->host_setup.parameters_); - dca::phys::solver::accumulator::TpAccumulatorhost_setup.parameters_), dca::DistType::NONE, dca::linalg::GPU> + dca::phys::solver::accumulator::TpAccumulatorhost_setup.parameters_), + dca::DistType::NONE, dca::linalg::GPU> accumulatorDevice(this->gpu_setup.data_->G0_k_w_cluster_excluded, this->gpu_setup.parameters_); const int8_t sign = 1; @@ -126,7 +192,8 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { this->host_setup.data_->write(writer_h5); for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) { - std::string channel_str = dca::phys::toString(this->host_setup.parameters_.get_four_point_channels()[channel]); + std::string channel_str = + dca::phys::toString(this->host_setup.parameters_.get_four_point_channels()[channel]); writer.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]); writer.execute("accumulatorDevice_" + channel_str, accumulatorDevice.get_G4()[channel]); writer_h5.execute("accumulatorHOST_" + channel_str, accumulatorHost.get_G4()[channel]); @@ -139,11 +206,12 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { #endif std::cout << "blocks: " << dca::util::ceilDiv(int(accumulatorHost.get_G4()[0].size()), 256) << '\n'; - + for (std::size_t channel = 0; channel < accumulatorHost.get_G4().size(); ++channel) { auto diff = dca::func::util::difference(accumulatorHost.get_G4()[channel], accumulatorDevice.get_G4()[channel]); - EXPECT_GT(5e-7, diff.l_inf) << "channel: " << dca::phys::toString(four_point_channels[channel]); + EXPECT_GT(5e-7, diff.l_inf) << "channel: " << dca::phys::toString(four_point_channels[channel]) + << " diff.l_inf == " << diff.l_inf << "\n"; } } @@ -165,7 +233,6 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { // Accumulator accumulator3(data_->G0_k_w_cluster_excluded, parameters_, 3); // //accumulator3.set_multiple_accumulators(true); - // auto prepare_configuration = [&](auto& M, 
auto& configuration, const auto& n) { // ConfigGenerator::prepareConfiguration(M, configuration, TpAccumulatorGpuTest::BDmn::dmn_size(), // TpAccumulatorGpuTest::RDmn::dmn_size(), @@ -188,7 +255,7 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { // accumulator_sum.resetAccumulation(loop_id, flag); // // This is a bandaid for CI. -// std::chrono::microseconds sleep_time(10000); +// std::chrono::microseconds sleep_time(10000); // std::this_thread::sleep_for(sleep_time); // // there is a data race here between the resets above @@ -197,8 +264,8 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { // accumulator1.accumulate(M1, config1, sign); // accumulator2.accumulate(M2, config2, sign); // accumulator1.sumTo(accumulator_sum); -// accumulator2.sumTo(accumulator_sum); - +// accumulator2.sumTo(accumulator_sum); + // accumulator_sum.finalize(); // // Reset the G4 on the GPU to zero. @@ -212,14 +279,15 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { // accumulator3.accumulate(M1, config1, sign); // accumulator3.accumulate(M2, config2, sign); // accumulator3.finalize(); - + // // auto acc3_it = accumulator3.get_G4()[0].begin(); // // auto acc_sum_it = accumulator_sum.get_G4()[0].begin(); // // auto acc3_end = accumulator3.get_G4()[0].end(); // // int index = 0; // // while(acc3_it != acc3_end) { -// // EXPECT_NEAR(acc_sum_it->real(), acc3_it->real(), 1E-4) << "index = " << dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)); +// // EXPECT_NEAR(acc_sum_it->real(), acc3_it->real(), 1E-4) << "index = " << +// dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)); // // ++acc3_it; // // ++acc_sum_it; // // ++index; @@ -232,13 +300,17 @@ TYPED_TEST(TpAccumulatorGpuTest, Accumulate) { // if (diff.l_inf < 5e-7) { // std::cout << "succeed with diff.l_inf = " << diff.l_inf << '\n'; // int index = 10; -// std::cout << "values at " << dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)) -// << " sum: " << 
accumulator_sum.get_G4()[0](index) << " just_acc: " << accumulator3.get_G4()[0](index) << '\n'; +// std::cout << "values at " << +// dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)) +// << " sum: " << accumulator_sum.get_G4()[0](index) << " just_acc: " << +// accumulator3.get_G4()[0](index) << '\n'; // } // else { // int index = 10; -// std::cout << "values at " << dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)) -// << " sum: " << accumulator_sum.get_G4()[0](index) << " just_acc: " << accumulator3.get_G4()[0](index) << '\n'; +// std::cout << "values at " << +// dca::vectorToString(accumulator3.get_G4()[0].linind_2_subind(index)) +// << " sum: " << accumulator_sum.get_G4()[0](index) << " just_acc: " << +// accumulator3.get_G4()[0](index) << '\n'; // } // } diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_particle_hole_test.cpp b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_particle_hole_test.cpp index 3209f1fa5..8749158c2 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_particle_hole_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_particle_hole_test.cpp @@ -12,7 +12,6 @@ // channels based on the relation between the particle-hole longitudinal up-up and up-down channels, // and the particle-hole magnetic and charge channels. 
- #include "dca/platform/dca_gpu.h" using Scalar = double; @@ -45,7 +44,7 @@ constexpr char input_file[] = INPUT_DIR "input_tp_accumulator_particle_hole_test using Scalar = double; using TpAccumulatorTest = - dca::testing::G0Setup; + dca::testing::G0Setup; TEST_F(TpAccumulatorTest, ParticleHoleChannels) { using TpAccumulatorType = dca::phys::solver::accumulator::TpAccumulator; @@ -60,9 +59,10 @@ TEST_F(TpAccumulatorTest, ParticleHoleChannels) { TpAccumulatorTest::RDmn::dmn_size(), parameters_.get_beta(), n); - parameters_.set_four_point_channels(std::vector{ - dca::phys::FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, dca::phys::FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - dca::phys::FourPointType::PARTICLE_HOLE_MAGNETIC, dca::phys::FourPointType::PARTICLE_HOLE_CHARGE}); + parameters_.set_four_point_channels( + std::vector{dca::phys::FourPointType::PARTICLE_HOLE_TRANSVERSE, + dca::phys::FourPointType::PARTICLE_HOLE_MAGNETIC, + dca::phys::FourPointType::PARTICLE_HOLE_CHARGE}); TpAccumulatorType accumulator_ph(data_->G0_k_w_cluster_excluded, parameters_); @@ -103,7 +103,7 @@ TEST_F(TpAccumulatorTest, ParticleHoleChannels) { } const auto diff_up_up = dca::func::util::difference(G4_ph_long_up_up, G4_ph_long_up_up_check); - EXPECT_LT(diff_up_up.l_inf, 100 * std::numeric_limits::epsilon()); + EXPECT_LT(diff_up_up.l_inf, 100 * std::numeric_limits::epsilon()) << "G4_p4_long_up_up.getDomainSizes() " << dca::vectorToString(G4_ph_long_up_up.getDomainSizes()); const auto diff_up_down = dca::func::util::difference(G4_ph_long_up_down, G4_ph_long_up_down_check); EXPECT_LT(diff_up_down.l_inf, 100 * std::numeric_limits::epsilon()); diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_gpu_test.cpp index 3123f8404..7a3c8f033 100644 --- 
a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_gpu_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_singleband_gpu_test.cpp @@ -78,8 +78,8 @@ TEST_F(TpAccumulatorGpuSinglebandTest, Accumulate) { std::vector four_point_channels{FourPointType::PARTICLE_HOLE_TRANSVERSE, FourPointType::PARTICLE_HOLE_MAGNETIC, FourPointType::PARTICLE_HOLE_CHARGE, - FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, FourPointType::PARTICLE_PARTICLE_UP_DOWN}; parameters_.set_four_point_channels(four_point_channels); diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex.cpp b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex.cpp index 63211cf2a..2ee9f674b 100644 --- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex.cpp +++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex.cpp @@ -37,7 +37,6 @@ using McOptions = MockMcOptions; #include "dca/phys/four_point_type.hpp" #include "test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/accumulation_test.hpp" - constexpr bool update_baseline = false; #define INPUT_DIR \ @@ -51,8 +50,8 @@ using Sample = ConfigGenerator::Sample; using Scalar = std::complex; -using TpAccumulatorTest = - dca::testing::G0Setup; +using TpAccumulatorTest = dca::testing::G0Setup; TEST_F(TpAccumulatorTest, Accumulate) { const std::array n{18, 22}; @@ -80,8 +79,7 @@ TEST_F(TpAccumulatorTest, Accumulate) { func_names[dca::phys::FourPointType::PARTICLE_HOLE_CHARGE] = "G4_ph_charge"; func_names[dca::phys::FourPointType::PARTICLE_PARTICLE_UP_DOWN] = "G4_pp_up_down"; - for (const dca::phys::FourPointType type : - 
{dca::phys::FourPointType::PARTICLE_PARTICLE_UP_DOWN}) { + for (const dca::phys::FourPointType type : {dca::phys::FourPointType::PARTICLE_PARTICLE_UP_DOWN}) { parameters_.set_four_point_channel(type); dca::phys::solver::accumulator::TpAccumulator accumulator( diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex_baseline.hdf5 b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex_baseline.hdf5 index b1cb4e33a..99e7f9a7d 100644 Binary files a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex_baseline.hdf5 and b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_test_complex_baseline.hdf5 differ diff --git a/test/unit/phys/dca_step/cluster_solver/test_setup.hpp b/test/unit/phys/dca_step/cluster_solver/test_setup.hpp index a86fad3e8..2cf8821e3 100644 --- a/test/unit/phys/dca_step/cluster_solver/test_setup.hpp +++ b/test/unit/phys/dca_step/cluster_solver/test_setup.hpp @@ -92,11 +92,12 @@ struct G0Setup : public ::testing::Test { throw std::runtime_error("Input parsing failed!"); } parameters_.update_model(); - static bool domain_initialized = false; - if (!domain_initialized) { - parameters_.update_domains(); - domain_initialized = true; - } + parameters_.update_domains(); + // static bool domain_initialized = false; + // if (!domain_initialized) { + // parameters_.update_domains(); + // domain_initialized = true; + // } data_ = std::make_unique(parameters_); data_->initialize(); } diff --git a/test/unit/phys/parameters/four_point_parameters/four_point_parameters_test.cpp b/test/unit/phys/parameters/four_point_parameters/four_point_parameters_test.cpp index 988f7048c..61fde05e7 100644 --- a/test/unit/phys/parameters/four_point_parameters/four_point_parameters_test.cpp +++ b/test/unit/phys/parameters/four_point_parameters/four_point_parameters_test.cpp @@ -30,12 +30,12 @@ 
TEST(FourPointParametersTest, DefaultValues) { } using namespace dca::phys; -const std::vector all_channels{FourPointType::PARTICLE_HOLE_TRANSVERSE, - FourPointType::PARTICLE_HOLE_MAGNETIC, - FourPointType::PARTICLE_HOLE_CHARGE, - FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, - FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, - FourPointType::PARTICLE_PARTICLE_UP_DOWN}; +const std::vector all_channels{ + FourPointType::NONE, FourPointType::PARTICLE_HOLE_TRANSVERSE, + FourPointType::PARTICLE_HOLE_MAGNETIC, FourPointType::PARTICLE_HOLE_CHARGE, + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, + // FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, + FourPointType::PARTICLE_PARTICLE_UP_DOWN}; TEST(FourPointParametersTest, ReadAll) { dca::io::JSONReader reader;