From 34ee434deabcfd460efa18dca7236a96219cb96a Mon Sep 17 00:00:00 2001 From: Peter Doak Date: Mon, 11 Aug 2025 10:13:55 -0400 Subject: [PATCH 1/3] removing assumption spin parallelism from blas --- include/dca/linalg/blas/use_device.hpp | 6 ++++++ include/dca/linalg/matrixop.hpp | 2 ++ include/dca/linalg/util/stream_container.hpp | 2 +- .../walker/tools/g0_interpolation/g0_interpolation_gpu.hpp | 5 +---- .../ctint/structs/device_configuration_manager.hpp | 2 +- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/dca/linalg/blas/use_device.hpp b/include/dca/linalg/blas/use_device.hpp index 72438de5c..e375d3b8a 100644 --- a/include/dca/linalg/blas/use_device.hpp +++ b/include/dca/linalg/blas/use_device.hpp @@ -85,24 +85,28 @@ struct UseDevice { template inline static void axpy(int n, ScalarType alpha, const ScalarType* x, int incx, ScalarType* y, int incy, int thread_id, int stream_id) { + assert(stream_id == 0); cublas::axpy(util::getHandle(thread_id, stream_id), n, alpha, x, incx, y, incy); } template inline static void copy(int n, const ScalarType* x, int incx, ScalarType* y, int incy, int thread_id, int stream_id) { + assert(stream_id == 0); cublas::copy(util::getHandle(thread_id, stream_id), n, x, incx, y, incy); } template inline static void scal(int n, ScalarType alpha, ScalarType* x, int incx, int thread_id, int stream_id) { + assert(stream_id == 0); cublas::scal(util::getHandle(thread_id, stream_id), n, alpha, x, incx); } template inline static void swap(int n, ScalarType* x, int incx, ScalarType* y, int incy, int thread_id, int stream_id) { + assert(stream_id == 0); cublas::swap(util::getHandle(thread_id, stream_id), n, x, incx, y, incy); } @@ -112,6 +116,7 @@ struct UseDevice { ScalarType alpha, const ScalarType* a, int lda, const ScalarType* b, int ldb, ScalarType beta, ScalarType* c, int ldc, int thread_id, int stream_id) { + assert(stream_id == 0); cublas::gemm(util::getHandle(thread_id, stream_id), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } @@ -120,6 +125,7 @@ struct UseDevice { inline static void trsm(const char* side, const char* uplo, const char* transa, const char* diag, int m, int n, ScalarType alpha, const ScalarType* a, int lda, ScalarType* b, int ldb, int thread_id, int stream_id) { + assert(stream_id == 0); cublas::trsm(util::getHandle(thread_id, stream_id), side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb); } diff --git a/include/dca/linalg/matrixop.hpp b/include/dca/linalg/matrixop.hpp index 87c650d25..cc5d8c32e 100644 --- a/include/dca/linalg/matrixop.hpp +++ b/include/dca/linalg/matrixop.hpp @@ -679,6 +679,7 @@ template & a, const MatrixB& b, Scalar beta, MatrixC& c, int thread_id = 0, int stream_id = 0) { + assert(stream_id == 0); int m = c.nrRows(); int n = c.nrCols(); int k; @@ -719,6 +720,7 @@ template & a, const MatrixB& b, MatrixC& c, int thread_id = 0, int stream_id = 0) { + assert(stream_id == 0); gemm('N', 'N', 1., a, b, 0., c, thread_id, stream_id); } diff --git a/include/dca/linalg/util/stream_container.hpp b/include/dca/linalg/util/stream_container.hpp index 6d97871d5..7b59796fb 100644 --- a/include/dca/linalg/util/stream_container.hpp +++ b/include/dca/linalg/util/stream_container.hpp @@ -61,7 +61,7 @@ class StreamContainer { } private: - constexpr static std::size_t streams_per_thread_ = 2; + constexpr static std::size_t streams_per_thread_ = 1; std::vector> streams_; }; diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.hpp index ef2d379fe..add1f3b0f 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/g0_interpolation/g0_interpolation_gpu.hpp @@ -101,8 +101,6 @@ class G0Interpolation : public G0InterpolationBase linalg::MultiVector g0_labels_gpu_; using Base::beta; - - linalg::util::GpuEvent config_copied_; }; template @@ -239,7 +237,7 @@ void G0Interpolation::uploadConfiguration( const Configuration& configuration) { const int configuration_size = configuration.size(); - config_copied_.block(); + linalg::util::getStream(thread_id, 0).sync(); g0_labels_cpu_.resizeNoCopy(configuration_size); g0_labels_gpu_.resizeNoCopy(configuration_size); @@ -255,7 +253,6 @@ void G0Interpolation::uploadConfiguration( const auto& stream = linalg::util::getStream(thread_id, stream_id); g0_labels_gpu_.setAsync(g0_labels_cpu_, stream); - config_copied_.record(stream); } } // namespace ctaux diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp index 5cb320da2..5bd9de995 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp @@ -50,7 +50,7 @@ class DeviceConfigurationManager { void DeviceConfigurationManager::upload(const SolverConfiguration& config, int thread_id, int spin) { assert(spin >= 0 and spin < 2); const auto& entries = config.getSector(spin).entries_; - device_entries_[spin].setAsync(entries, linalg::util::getStream(thread_id, spin)); + device_entries_[spin].setAsync(entries, linalg::util::getStream(thread_id, 0)); device_pointers_[spin].data = device_entries_[spin].ptr(); } From 0cf2d69416c2aedde21ff5b77855c3adcc816a30 Mon Sep 17 00:00:00 2001 From: Peter Doak Date: Thu, 14 Aug 2025 18:47:25 -0400 Subject: [PATCH 2/3] following through removing spin concurrency --- build-aux/frontier_build.sh | 2 + .../ctint/walker/ctint_walker_base.hpp | 55 ++-- .../walker/ctint_walker_cpu_submatrix.hpp | 142 ++++++++--- .../walker/ctint_walker_gpu_submatrix.hpp | 205 +++++++++++---- .../walker/ctint_walker_submatrix_base.hpp | 39 +-- .../interpolation/g0_interpolation_gpu.hpp | 5 +- include/dca/phys/parameters/parameters.hpp | 22 +- .../ctint/walker/walker_kernels.cu | 8 +- test/unit/linalg/vector_cpu_gpu_test.cpp | 8 +- test/unit/linalg/vector_cpu_test.cpp | 4 +- test/unit/linalg/vector_gpu_test.cpp | 4 +- .../ct_int_walker_multinsertion_test.cpp | 207 +++++++++------ ..._int_walker_submatrix_gpu_complex_test.cpp | 4 +- .../ct_int_walker_submatrix_gpu_test.cpp | 235 +++++++++++++++++- .../walker/ct_int_walker_submatrix_test.cpp | 8 +- .../ctint/walker/ct_int_walker_test.cpp | 2 +- .../ctint/walker/walker_wrapper.hpp | 4 +- .../ctint/walker/walker_wrapper_submatrix.hpp | 40 ++- .../dca_step/cluster_solver/test_setup.hpp | 43 ++++ 19 files changed, 793 insertions(+), 244 deletions(-) diff --git a/build-aux/frontier_build.sh b/build-aux/frontier_build.sh index 81baf4666..206668793 100644 --- a/build-aux/frontier_build.sh +++ b/build-aux/frontier_build.sh @@ -10,11 +10,13 @@ cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \ -DTEST_RUNNER="srun" \ -DGPU_TARGETS=gfx90a \ -DAMDGPU_TARGETS=gfx90a \ + -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER=mpicc \ -DCMAKE_CXX_COMPILER=mpic++ \ -DCMAKE_HIP_COMPILER=/opt/rocm-6.3.1/llvm/bin/clang++ \ -DCMAKE_INSTALL_PREFIX=$INST \ -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DCMAKE_HIP_LINK_FLAGS=--hip-link \ + -DCMAKE_EXPORT_COMPILE_COMMANDSS=1 \ -GNinja \ .. diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp index 70cfb39c4..9ed7e52eb 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp @@ -92,10 +92,10 @@ class CtintWalkerBase { virtual void doStep() = 0; virtual void doSweep() = 0; - - template + + template void setMFromConfigImpl(DMatrixBuilder& d_matrix_builder); - + bool is_thermalized() const { return thermalized_; } @@ -165,17 +165,15 @@ class CtintWalkerBase { return flop; } - const auto& get_stream(int s) const { - assert(s >= 0 && s < 2); - return *streams_[s]; + const auto& get_stream() const { + return stream_; } static void sumConcurrency(const Concurrency&) {} void writeAlphas() const; - static void setInteractionVertices(const Data& data, - const Parameters& parameters); + static void setInteractionVertices(const Data& data, const Parameters& parameters); protected: // typedefs @@ -191,7 +189,7 @@ class CtintWalkerBase { const Concurrency& concurrency_; const int thread_id_; - std::array streams_; + linalg::util::GpuStream* stream_; Rng& rng_; SolverConfiguration configuration_; @@ -230,26 +228,18 @@ class CtintWalkerBase { }; template -CtintWalkerBase::CtintWalkerBase(const Parameters& parameters_ref, - Rng& rng_ref, int id) +CtintWalkerBase::CtintWalkerBase(const Parameters& parameters_ref, Rng& rng_ref, + int id) : parameters_(parameters_ref), concurrency_(parameters_.get_concurrency()), - thread_id_(id), - - streams_{&linalg::util::getStreamContainer()(thread_id_, 0), - &linalg::util::getStreamContainer()(thread_id_, 1)}, - + stream_{&linalg::util::getStreamContainer()(thread_id_, 0)}, rng_(rng_ref), - - configuration_(parameters_.get_beta(), Bdmn::dmn_size(), vertices_, - parameters_.getDoubleUpdateProbability()), - beta_(parameters_.get_beta()), total_interaction_(vertices_.integratedInteraction()) {} template -void CtintWalkerBase::initialize(int iteration) { +void CtintWalkerBase::initialize(int iteration) { assert(total_interaction_); phase_.reset(); @@ -269,7 +259,7 @@ void CtintWalkerBase::initialize(int iteration) { } template -void CtintWalkerBase::updateSweepAverages() { +void CtintWalkerBase::updateSweepAverages() { order_avg_.addSample(order()); sign_avg_.addSample(phase_.getSign()); // Track avg order for the final number of steps / sweep. @@ -293,9 +283,8 @@ void CtintWalkerBase::updateSweepAverages() { // } // } - template -void CtintWalkerBase::updateShell(int meas_id, int meas_to_do) const { +void CtintWalkerBase::updateShell(int meas_id, int meas_to_do) const { if (concurrency_.id() == concurrency_.first() && meas_id > 1 && (meas_id % dca::util::ceilDiv(meas_to_do, 20)) == 0) { std::cout << "\t\t\t" << int(double(meas_id) / double(meas_to_do) * 100) << " % completed \t "; @@ -312,7 +301,7 @@ void CtintWalkerBase::updateShell(int meas_id, int meas_to_do) } template -void CtintWalkerBase::printSummary() const { +void CtintWalkerBase::printSummary() const { std::cout << "\n" << "Walker: process ID = " << concurrency_.id() << ", thread ID = " << thread_id_ << "\n" << "-------------------------------------------\n"; @@ -329,8 +318,8 @@ void CtintWalkerBase::printSummary() const { } template -void CtintWalkerBase::setInteractionVertices(const Data& data, - const Parameters& parameters) { +void CtintWalkerBase::setInteractionVertices(const Data& data, + const Parameters& parameters) { vertices_.reset(); vertices_.initialize(parameters.getDoubleUpdateProbability(), parameters.getAllSitesPartnership()); vertices_.initializeFromHamiltonian(data.H_interactions); @@ -341,11 +330,11 @@ void CtintWalkerBase::setInteractionVertices(const Data& data, } template -void CtintWalkerBase::computeM(MatrixPair& m_accum) const { +void CtintWalkerBase::computeM(MatrixPair& m_accum) const { m_accum = M_; } -// template +// template // void setMFromConfigHelper(WALKER& walker, DMatrixBuilder& d_matrix_builder) { // walker.mc_log_weight_ = 0.; // walker.phase_.reset(); @@ -379,11 +368,11 @@ void CtintWalkerBase::computeM(MatrixPair& m_accum) const { // walker.phase_.multiply(term); // } // } - - + template template -void CtintWalkerBase::setMFromConfigImpl(DMatrixBuilder& d_matrix_builder) { +void CtintWalkerBase::setMFromConfigImpl( + DMatrixBuilder& d_matrix_builder) { mc_log_weight_ = 0.; phase_.reset(); @@ -416,7 +405,7 @@ void CtintWalkerBase::setMFromConfigImpl(DMatrixBuilder; - CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, DMatrixBuilder& d_matrix_builder, int id = 0); + CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, + DMatrixBuilder& d_matrix_builder, int id = 0); virtual ~CtintWalkerSubmatrixCpu() = default; @@ -69,6 +70,23 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase& d_matrix_builder_; + + BaseClass::MatrixPair getM(); + + /** The following methods are really only here to get decent unit testing + they shouldn't really be called outside of the base implementations + */ + void computeMInit() override; + void computeGInit() override; + BaseClass::MatrixPair getRawM(); + BaseClass::MatrixPair getRawG(); + +private: /** This does a bunch of things, this is the majority of a step * For each delayed_move it: * * computes the acceptance prob @@ -83,15 +101,6 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase& d_matrix_builder_; -private: - void doSubmatrixUpdate(); /** returns [acceptance_probability , mc_weight_ratio ] @@ -102,10 +111,7 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase::CtintWalkerSubmatrixCpu( const Parameters& parameters_ref, const Data& data, Rng& rng_ref, DMatrixBuilder& d_matrix_builder, int id) : SubmatrixBase(parameters_ref, data, rng_ref, id), d_matrix_builder_(d_matrix_builder) { - - for (int b = 0; b < n_bands_; ++b) { + for (int b = 0; b < n_bands_; ++b) { for (int i = 1; i <= 3; ++i) { f_[i][b] = d_matrix_builder_.computeF(i, b); f_[-i][b] = d_matrix_builder_.computeF(-i, b); @@ -200,13 +206,37 @@ CtintWalkerSubmatrixCpu::CtintWalkerSubmatrixCpu( } f_[0][b] = 1; } - } template void CtintWalkerSubmatrixCpu::setMFromConfig() { BaseClass::setMFromConfigImpl(d_matrix_builder_); - transformM(); + SubmatrixBase::transformM(); +#ifdef DEBUG_SUBMATRIX + std::cout << "cpu M post setMFromConfigImpl: \n"; + M_[0].set_name("cpu_M_0"); + M_[1].set_name("cpu_M_1"); + M_[0].print(); + M_[1].print(); +#endif +} + +template +void CtintWalkerSubmatrixCpu::markThermalized() { + thermalized_ = true; + + nb_steps_per_sweep_ = std::max(1., std::ceil(sweeps_per_meas_ * partial_order_avg_.mean())); + thermalization_steps_ = n_steps_; + + order_avg_.reset(); + sign_avg_.reset(); + n_accepted_ = 0; + + // Recompute the Monte Carlo weight. + setMFromConfig(); +#ifndef NDEBUG + // writeAlphas(); +#endif } // Extend M by adding non-interacting vertices. @@ -222,26 +252,55 @@ void CtintWalkerSubmatrixCpu::computeMInit() { Scalar f_j; D_.resize(std::make_pair(delta, n_init_[s])); + if (delta == 0 || n_init_[s] == 0) + throw std::runtime_error( + "expansion factor dropped to 0 or below use a higher beta or larger interaction!"); + d_matrix_builder_.computeG0(D_, configuration_.getSector(s), n_init_[s], n_max_[s], 0); +#ifdef DEBUG_SUBMATRIX + D_.print(); +#endif + + std::array, 2> f_values; + f_values[s].resize(n_init_[s]); for (int j = 0; j < n_init_[s]; ++j) { const auto field_type = configuration_.getSector(s).getAuxFieldType(j); const auto b = configuration_.getSector(s).getRightB(j); f_j = f_[field_type][b] - 1; - + f_values[s][j] = f_[field_type][b]; for (int i = 0; i < delta; ++i) { D_(i, j) *= f_j; } } +#ifdef DEBUG_SUBMATRIX + f_values[0].set_name("cpu_f_values_0"); + f_values[1].set_name("cpu_f_values_1"); + f_values[0].print(); + f_values[1].print(); + std::cout << "cpu D post f factor mult\n"; + D_.print(); + using namespace dca::addt_str_oper; + std::cout << "M_[" << s << "] size: " << M_[s].size() << '\n'; +#endif + M_[s].resize(n_max_[s]); MatrixView M(M_[s], 0, 0, n_init_[s], n_init_[s]); MatrixView D_M(M_[s], n_init_[s], 0, delta, n_init_[s]); +#ifdef DEBUG_SUBMATRIX + std::cout << "cpu M pre gemm\n"; + M_[s].print(); +#endif + linalg::matrixop::gemm(D_, M, D_M); flop_ += 2 * D_.nrRows() * D_.nrCols() * M.nrCols(); +#ifdef DEBUG_SUBMATRIX + D_M.print(); +#endif for (int i = 0; i < n_max_[s]; ++i) { for (int j = n_init_[s]; j < n_max_[s]; ++j) { M_[s](i, j) = 0; @@ -250,6 +309,9 @@ void CtintWalkerSubmatrixCpu::computeMInit() { for (int i = n_init_[s]; i < n_max_[s]; ++i) M_[s](i, i) = 1; +#ifdef DEBUG_SUBMATRIX + M_[s].print(); +#endif } } } @@ -473,20 +535,6 @@ void CtintWalkerSubmatrixCpu::recomputeGammaInv() { } } -template -void CtintWalkerSubmatrixCpu::transformM() { - for (int s = 0; s < 2; ++s) { - for (int j = 0; j < M_[s].size().second; ++j) { - for (int i = 0; i < M_[s].size().first; ++i) { - const auto field_type = configuration_.getSector(s).getAuxFieldType(i); - const auto b = configuration_.getSector(s).getLeftB(i); - const Scalar f_i = -(f_[field_type][b] - 1); - M_[s](i, j) /= f_i; - } - } - } -} - template void CtintWalkerSubmatrixCpu::computeM(typename BaseClass::MatrixPair& m_accum) { for (int s = 0; s < 2; ++s) { @@ -571,6 +619,34 @@ void CtintWalkerSubmatrixCpu::computeMixedInsertionAndRemoval( computeRemovalMatrix(s); } +template +CtintWalkerSubmatrixCpu::BaseClass::MatrixPair CtintWalkerSubmatrixCpu< + Parameters, DIST>::getRawM() { + typename BaseClass::MatrixPair M; + M = M_; + M[0].set_name("subMatrixCPU::M[0]"); + M[1].set_name("subMatrixCPU::M[1]"); + return M; +} + +template +CtintWalkerSubmatrixCpu::BaseClass::MatrixPair CtintWalkerSubmatrixCpu< + Parameters, DIST>::getRawG() { + typename BaseClass::MatrixPair G; + G = G_; + G[0].set_name("subMatrixCPU::G[0]"); + G[1].set_name("subMatrixCPU::G[1]"); + return G; +} + +template +CtintWalkerSubmatrixCpu::BaseClass::MatrixPair CtintWalkerSubmatrixCpu::getM() { + typename BaseClass::MatrixPair M; + computeM(M); + return M; +} + } // namespace ctint } // namespace solver } // namespace phys diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp index e098599e6..17cb337c3 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp @@ -29,6 +29,7 @@ #include "dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/walker/kernels_interface.hpp" +#include "dca/util/to_string.hpp" namespace dca { namespace phys { @@ -65,6 +66,8 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase& m_accum); + MatrixPair getM(); + void doSweep() override; void synchronize() const override; @@ -75,19 +78,29 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase getRawG(); + MatrixPair getRawM(); + + void updateM() override; private: void doStep() override; - void computeMInit(); - void computeGInit(); - void updateM(); - - void uploadConfiguration(); - protected: using BaseClass::configuration_; using BaseClass::M_; @@ -100,8 +113,6 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase>, 2> gamma_index_; std::array, linalg::GPU>, 2> gamma_index_dev_; - std::array config_copied_; - DMatrixBuilder& d_matrix_builder_; }; @@ -169,21 +178,28 @@ CtintWalkerSubmatrixGpu::CtintWalkerSubmatrixGpu( if (concurrency_.id() == concurrency_.first() && thread_id_ == 0) std::cout << "\nCT-INT submatrix walker extended to GPU." << std::endl; } - + template void CtintWalkerSubmatrixGpu::setMFromConfig() { BaseClass::setMFromConfigImpl(d_matrix_builder_); + SubmatrixBase::transformM(); +#ifdef DEBUG_SUBMATRIX + M_[0].set_name("gpu_M_0"); + M_[1].set_name("gpu_M_0"); +#endif for (int s = 0; s < 2; ++s) { - M_dev_[s].setAsync(M_[s], get_stream(s)); + std::cout << "GPU pre set M: \n"; +#ifdef DEBUG_SUBMATRIX + M_[s].print(); +#endif + M_dev_[s].setAsync(M_[s], *get_stream()); } } template void CtintWalkerSubmatrixGpu::synchronize() const { Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_); - - checkRC(cudaStreamSynchronize(get_stream(0))); - checkRC(cudaStreamSynchronize(get_stream(1))); + checkRC(cudaStreamSynchronize(*get_stream())); } template @@ -197,8 +213,9 @@ void CtintWalkerSubmatrixGpu::doSweep() { template void CtintWalkerSubmatrixGpu::doStep(const int n_moves_to_delay) { SubmatrixBase::nbr_of_moves_to_delay_ = n_moves_to_delay; - doStep(); uploadConfiguration(); + synchronize(); + doStep(); } template @@ -215,9 +232,7 @@ void CtintWalkerSubmatrixGpu::doStep() { template void CtintWalkerSubmatrixGpu::uploadConfiguration() { - for (int s = 0; s < 2; ++s) - config_copied_[s].block(); - + get_stream()->sync(); // Upload configuration and f values. device_config_.upload(configuration_, thread_id_); for (int s = 0; s < 2; ++s) { @@ -229,17 +244,21 @@ void CtintWalkerSubmatrixGpu::uploadConfiguration() { const auto b = sector.getLeftB(i); values[i] = f_[field_type][b]; } - f_dev_[s].setAsync(values, get_stream(s)); + f_dev_[s].setAsync(values, *get_stream()); } +#ifdef DEBUG_SUBMATRIX + std::cout << "GPU f_values: \n"; + f_values_[0].print(); + f_values_[1].print(); +#endif - for (int s = 0; s < 2; ++s) - config_copied_[s].record(get_stream(s)); + get_stream()->sync(); } template void CtintWalkerSubmatrixGpu::computeMInit() { // Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_); - + get_stream()->sync(); for (int s = 0; s < 2; ++s) M_dev_[s].resize(n_max_[s]); @@ -247,21 +266,52 @@ void CtintWalkerSubmatrixGpu::computeMInit() { const int delta = n_max_[s] - n_init_[s]; if (delta > 0) { D_dev_[s].resizeNoCopy(std::make_pair(delta, n_init_[s])); + + if (delta == 0 || n_init_[s] == 0) + throw std::runtime_error( + "expansion factor dropped to 0 or below use a higher beta or larger interaction!"); + d_matrix_builder_.computeG0(D_dev_[s], device_config_.getDeviceData(s), n_init_[s], false, - get_stream(s)); + *get_stream()); flop_ += D_dev_[s].nrCols() * D_dev_[s].nrRows() * 10; +#ifdef DEBUG_SUBMATRIX + // debugging + SubmatrixBase::D_ = D_dev_[s]; + SubmatrixBase::D_.print(); +#endif MatrixView D_view(D_dev_[s]); - details::multiplyByFColFactor(D_view, f_dev_[s].ptr(), get_stream(s)); + details::multiplyByFColFactor(D_view, f_dev_[s].ptr(), *get_stream()); +#ifdef DEBUG_SUBMATRIX + std::cout << "GPU D multiplied by FCol Factor\n"; + SubmatrixBase::D_ = D_dev_[s]; + SubmatrixBase::D_.print(); +#endif MatrixView M(M_dev_[s], 0, 0, n_init_[s], n_init_[s]); MatrixView D_M(M_dev_[s], n_init_[s], 0, delta, n_init_[s]); - linalg::matrixop::gemm(D_dev_[s], M, D_M, thread_id_, s); +#ifdef DEBUG_SUBMATRIX + SubmatrixBase::M_[s] = M_dev_[s]; + std::cout << "GPU M pre gemm M\n"; + SubmatrixBase::M_[s].print(); +#endif + + linalg::matrixop::gemm(D_dev_[s], M, D_M, thread_id_, 0); + +#ifdef DEBUG_SUBMATRIX + SubmatrixBase::M_[s] = M_dev_[s]; + std::cout << "GPU M post gemm M\n"; + SubmatrixBase::M_[s].print(); + MatrixView D_M_host(SubmatrixBase::M_[s], n_init_[s], 0, delta, n_init_[s]); + std::cout << "gpu D_M_ print\n"; + D_M_host.print(); +#endif + flop_ += 2 * D_dev_[s].nrRows() * D_dev_[s].nrCols() * M.nrCols(); details::setRightSectorToId(M_dev_[s].ptr(), M_dev_[s].leadingDimension(), n_init_[s], - n_max_[s], get_stream(s)); + n_max_[s], *get_stream()); } } } @@ -269,30 +319,33 @@ void CtintWalkerSubmatrixGpu::computeMInit() { template void CtintWalkerSubmatrixGpu::computeGInit() { // Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_); - + get_stream()->sync(); for (int s = 0; s < 2; ++s) { const int delta = n_max_[s] - n_init_[s]; + + // In cpu we only do all this if delta > 0 auto& f_dev = f_dev_[s]; G_dev_[s].resizeNoCopy(n_max_[s]); - MatrixView G(G_dev_[s]); + MatrixView G_view_dev(G_dev_[s]); const MatrixView M(M_dev_[s]); - details::computeGLeft(G, M, f_dev.ptr(), n_init_[s], get_stream(s)); + details::computeGLeft(G_view_dev, M, f_dev.ptr(), n_init_[s], get_stream()->streamActually()); flop_ += n_init_[s] * n_max_[s] * 2; if (delta > 0) { G0_dev_[s].resizeNoCopy(std::make_pair(n_max_[s], delta)); // This doesn't do flops but the g0 interp data it uses does somewhere. d_matrix_builder_.computeG0(G0_dev_[s], device_config_.getDeviceData(s), n_init_[s], true, - get_stream(s)); + *get_stream()); flop_ += G0_dev_[s].nrCols() * G0_dev_[s].nrRows() * 10; - MatrixView G(G_dev_[s], 0, n_init_[s], n_max_[s], delta); + MatrixView G_view_reduced(G_dev_[s], 0, n_init_[s], n_max_[s], delta); // compute G right. - linalg::matrixop::gemm(M_dev_[s], G0_dev_[s], G, thread_id_, s); + linalg::matrixop::gemm(M_dev_[s], G0_dev_[s], G_view_reduced, thread_id_, 0); flop_ += 2 * M_dev_[s].nrRows() * M_dev_[s].nrCols() * G0_dev_[s].nrCols(); + + G_[s].setAsync(G_dev_[s], *get_stream()); } - G_[s].setAsync(G_dev_[s], get_stream(s)); } } @@ -301,7 +354,7 @@ void CtintWalkerSubmatrixGpu::updateM() { // Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_); for (int s = 0; s < 2; ++s) - Gamma_inv_dev_[s].setAsync(Gamma_inv_[s], get_stream(s)); + Gamma_inv_dev_[s].setAsync(Gamma_inv_[s], *get_stream()); // Copy gamma factors. for (int s = 0; s < 2; ++s) { @@ -310,7 +363,7 @@ void CtintWalkerSubmatrixGpu::updateM() { gamma_index.resize(n); for (int i = 0; i < n; ++i) gamma_index[i] = std::make_pair(move_indices_[s][i], gamma_[s][i]); - gamma_index_dev_[s].setAsync(gamma_index, get_stream(s)); + gamma_index_dev_[s].setAsync(gamma_index, *get_stream()); } for (int s = 0; s < 2; ++s) { @@ -325,35 +378,37 @@ void CtintWalkerSubmatrixGpu::updateM() { old_G.resizeNoCopy(std::make_pair(n_max_[s], gamma_size)); old_M.resizeNoCopy(std::make_pair(gamma_size, n_max_[s])); - assert(dca::linalg::util::getStream(thread_id_, s) == get_stream(s)); - move_indices_dev_[s].setAsync(move_indices_[s], get_stream(s)); + // We no longer use the 1 stream per spin + // This isn't portable and i.e. AMD don't really benefit from multiple streams per thread + // and it adds much complexity. + // assert(dca::linalg::util::getStream(thread_id_, s) == get_stream(s)); + move_indices_dev_[s].setAsync(move_indices_[s], *get_stream()); // Note: an event synchronization might be necessary if the order of operation is changed. - linalg::matrixop::copyCols(G_dev_[s], move_indices_dev_[s], old_G, thread_id_, s); - linalg::matrixop::copyRows(M_dev_[s], move_indices_dev_[s], old_M, thread_id_, s); + linalg::matrixop::copyCols(G_dev_[s], move_indices_dev_[s], old_G, thread_id_, 0); + linalg::matrixop::copyRows(M_dev_[s], move_indices_dev_[s], old_M, thread_id_, 0); auto& tmp = G_dev_[s]; // Note: the following resize is safe as it does not deallocate. tmp.resizeNoCopy(std::make_pair(gamma_size, n_max_[s])); - linalg::matrixop::gemm(Gamma_inv_dev_[s], old_M, tmp, thread_id_, s); - linalg::matrixop::gemm(Scalar(-1.), old_G, tmp, Scalar(1.), M_dev_[s], thread_id_, s); + linalg::matrixop::gemm(Gamma_inv_dev_[s], old_M, tmp, thread_id_, 0); + linalg::matrixop::gemm(Scalar(-1.), old_G, tmp, Scalar(1.), M_dev_[s], thread_id_, 0); flop_ += 2 * Gamma_inv_dev_[s].nrRows() * Gamma_inv_dev_[s].nrCols() * old_M.nrCols(); flop_ += 2 * old_G.nrRows() * old_G.nrCols() * tmp.nrCols(); details::divideByGammaFactor(MatrixView(M_dev_[s]), gamma_index_dev_[s].ptr(), - gamma_size, get_stream(s)); + gamma_size, *get_stream()); flop_ += gamma_size * M_dev_[s].nrCols() * 2; } // Remove non-interacting rows and columns. configuration_.moveAndShrink(source_list_, removal_list_, conf_removal_list_); for (int s = 0; s < 2; ++s) { - removal_list_dev_[s].setAsync(removal_list_[s], get_stream(s)); - source_list_dev_[s].setAsync(source_list_[s], get_stream(s)); - config_copied_[s].record(get_stream(s)); + removal_list_dev_[s].setAsync(removal_list_[s], *get_stream()); + source_list_dev_[s].setAsync(source_list_[s], *get_stream()); linalg::matrixop::copyRows(M_dev_[s], source_list_dev_[s], M_dev_[s], removal_list_dev_[s], - thread_id_, s); + thread_id_, 0); linalg::matrixop::copyCols(M_dev_[s], source_list_dev_[s], M_dev_[s], removal_list_dev_[s], - thread_id_, s); + thread_id_, 0); M_dev_[s].resize(configuration_.getSector(s).size()); } @@ -370,13 +425,11 @@ void CtintWalkerSubmatrixGpu::computeM( for (int s = 0; s < 2; ++s) { MatrixView m_in(M_dev_[s]); MatrixView m_out(m_accum[s]); - details::multiplyByInverseFFactor(m_in, m_out, f_dev_[s].ptr(), get_stream(s)); + details::multiplyByInverseFFactor(m_in, m_out, f_dev_[s].ptr(), *get_stream()); flop_ += m_in.nrRows() * m_out.nrCols(); } - // TODO: understand why this is necessary. - config_copied_[0].block(); - config_copied_[1].block(); + get_stream()->sync(); } template @@ -392,6 +445,52 @@ std::size_t CtintWalkerSubmatrixGpu::deviceFingerprint() const return res; } +template +CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSubmatrixGpu< + Parameters, DIST>::getRawM() { + std::array, 2> M_copy{M_dev_[0].size(), M_dev_[1].size()}; + // Synchronous copies for testing + linalg::util::memoryCopyD2H(M_copy[0].ptr(), M_copy[0].leadingDimension(), M_dev_[0].ptr(), + M_dev_[0].leadingDimension(), M_dev_[0].size()); + linalg::util::memoryCopyD2H(M_copy[1].ptr(), M_copy[1].leadingDimension(), M_dev_[1].ptr(), + M_dev_[1].leadingDimension(), M_dev_[1].size()); + M_copy[0].set_name("subMatrixGPU::M[0]"); + M_copy[1].set_name("subMatrixGPU::M[1]"); + + return M_copy; +} + +template +CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSubmatrixGpu< + Parameters, DIST>::getRawG() { + std::array, 2> G_copy{G_dev_[0].size(), G_dev_[1].size()}; + // Synchronous copies for testing + linalg::util::memoryCopyD2H(G_copy[0].ptr(), G_copy[0].leadingDimension(), G_dev_[0].ptr(), + G_dev_[0].leadingDimension(), G_dev_[0].size()); + linalg::util::memoryCopyD2H(G_copy[1].ptr(), G_copy[1].leadingDimension(), G_dev_[1].ptr(), + G_dev_[1].leadingDimension(), G_dev_[1].size()); + G_copy[0].set_name("subMatrixGPU::G[0]"); + G_copy[1].set_name("subMatrixGPU::G[1]"); + + return G_copy; +} + +template +CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSubmatrixGpu< + Parameters, DIST>::getM() { + std::array, 2> M; + synchronize(); + computeM(M); + checkRC(cudaDeviceSynchronize()); + + std::array, 2> M_copy{M[0].size(), M[1].size()}; + linalg::util::memoryCopyD2H(M_copy[0].ptr(), M_copy[0].leadingDimension(), M[0].ptr(), + M[0].leadingDimension(), M[0].size()); + linalg::util::memoryCopyD2H(M_copy[1].ptr(), M_copy[1].leadingDimension(), M[1].ptr(), + M[1].leadingDimension(), M[1].size()); + return M_copy; +} + } // namespace ctint } // namespace solver } // namespace phys diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp index 2a7bb44f6..1906f5a74 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp @@ -50,8 +50,7 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { using BaseClass::order; virtual void setMFromConfig() = 0; - - void markThermalized() override; + auto getF() const { return f_; } protected: virtual void doStep() override; void doSteps(); @@ -73,6 +72,8 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { */ void mainSubmatrixProcess(); + void markThermalized() override; + void transformM(); // For testing purposes. @@ -80,6 +81,8 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { virtual void updateM() = 0; + virtual void computeMInit() = 0; + private: void doSubmatrixUpdate(); @@ -92,8 +95,6 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { void removeRowAndColOfGammaInv(); - virtual void computeMInit() = 0; - // void computeG0Init(); virtual void computeGInit() = 0; @@ -209,7 +210,7 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { std::array Gamma_q_; Matrix workspace_; - Matrix D_; + Matrix D_{"SubMatrixHostD"}; using BaseClass::flop_; }; @@ -346,6 +347,20 @@ void CtintWalkerSubmatrixBase::doSubmatrixUpdate() { updateM(); } +template +void CtintWalkerSubmatrixBase::transformM() { + for (int s = 0; s < 2; ++s) { + for (int j = 0; j < M_[s].size().second; ++j) { + for (int i = 0; i < M_[s].size().first; ++i) { + const auto field_type = configuration_.getSector(s).getAuxFieldType(i); + const auto b = configuration_.getSector(s).getLeftB(i); + const Scalar f_i = -(f_[field_type][b] - 1); + M_[s](i, j) /= f_i; + } + } + } +} + template void CtintWalkerSubmatrixBase::mainSubmatrixProcess() { Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_); @@ -778,20 +793,6 @@ void CtintWalkerSubmatrixBase::recomputeGammaInv() { } } -template -void CtintWalkerSubmatrixBase::transformM() { - for (int s = 0; s < 2; ++s) { - for (int j = 0; j < M_[s].size().second; ++j) { - for (int i = 0; i < M_[s].size().first; ++i) { - const auto field_type = configuration_.getSector(s).getAuxFieldType(i); - const auto b = configuration_.getSector(s).getLeftB(i); - const Scalar f_i = -(f_[field_type][b] - 1); - M_[s](i, j) /= f_i; - } - } - } -} - template void CtintWalkerSubmatrixBase::computeInsertionMatrices( const std::vector& insertion_indices, const int s) { diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation_gpu.hpp index 9ae23ad1f..e3c998999 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation_gpu.hpp +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation_gpu.hpp @@ -92,7 +92,10 @@ void G0Interpolation::initialize(const FunctionProxy>::values_ = G0_coeff_.ptr(); diff --git a/include/dca/phys/parameters/parameters.hpp b/include/dca/phys/parameters/parameters.hpp index 142e6a9ce..56c4be6ff 100644 --- a/include/dca/phys/parameters/parameters.hpp +++ b/include/dca/phys/parameters/parameters.hpp @@ -127,6 +127,9 @@ class Parameters : public AnalysisParameters, void write(Writer& writer); template void read_input_and_broadcast(const std::string& file_name); + template + void readInput(const std::string& file_name); + void broadcast(); void update_model(); void update_domains(); @@ -256,14 +259,29 @@ template void Parameters::read_input_and_broadcast(const std::string& filename) { + readInput(filename); + broadcast(); +} + +template +void Parameters::broadcast() +{ + concurrency_.broadcast_object(*this); +} + +template +template +void Parameters::readInput(const std::string& filename) { if (concurrency_.id() == concurrency_.first()) { Reader read_obj; read_obj.open_file(filename); this->readWrite(read_obj); read_obj.close_file(); } - - concurrency_.broadcast_object(*this); } template G, const MatrixView -void computeGLeft(MatrixView& G, const MatrixView& M, const Real* f, +void computeGLeft(MatrixView& G_dev, const MatrixView& M_dev, const Real* f_dev, int n_init, cudaStream_t stream) { if (n_init == 0) return; - const int n = G.nrRows(); + const int n = G_dev.nrRows(); constexpr int thread_j = 4; constexpr int thread_i = 64; dim3 threads(thread_i, thread_j); dim3 blocks(std::max(n / (10 * thread_i), 1), dca::util::ceilDiv(n_init, thread_j)); - computeGLeftKernel<<>>(G, M, f, n_init); + computeGLeftKernel<<>>(G_dev, M_dev, f_dev, n_init); } template void computeGLeft(MatrixView&, const MatrixView&, const float*, diff --git a/test/unit/linalg/vector_cpu_gpu_test.cpp b/test/unit/linalg/vector_cpu_gpu_test.cpp index 4558c4754..98083b7d5 100644 --- a/test/unit/linalg/vector_cpu_gpu_test.cpp +++ b/test/unit/linalg/vector_cpu_gpu_test.cpp @@ -151,13 +151,13 @@ TEST(VectorCPUGPUTest, Set) { vec[i] = el; } - vec_copy.set(vec, 0, 1); + vec_copy.set(vec, 0, 0); ASSERT_EQ(vec.size(), vec_copy.size()); ASSERT_EQ(capacity, vec_copy.capacity()); ASSERT_EQ(old_ptr, vec_copy.ptr()); ASSERT_TRUE(testing::isDevicePointer(vec_copy.ptr())); - vec_copy_copy.set(vec_copy, 0, 1); + vec_copy_copy.set(vec_copy, 0, 0); EXPECT_EQ(vec.size(), vec_copy_copy.size()); EXPECT_EQ(capacity_2, vec_copy_copy.capacity()); EXPECT_EQ(old_ptr_2, vec_copy_copy.ptr()); @@ -181,12 +181,12 @@ TEST(VectorCPUGPUTest, Set) { vec[i] = el; } - vec_copy.set(vec, 0, 1); + vec_copy.set(vec, 0, 0); ASSERT_EQ(vec.size(), vec_copy.size()); ASSERT_LE(vec.size(), vec_copy.capacity()); ASSERT_FALSE(testing::isHostPointer(vec_copy.ptr())); - vec_copy_copy.set(vec_copy, 0, 1); + vec_copy_copy.set(vec_copy, 0, 0); EXPECT_EQ(vec.size(), vec_copy_copy.size()); EXPECT_LE(vec.size(), vec_copy_copy.capacity()); EXPECT_TRUE(testing::isHostPointer(vec_copy_copy.ptr())); diff --git a/test/unit/linalg/vector_cpu_test.cpp b/test/unit/linalg/vector_cpu_test.cpp index 5ae7b7959..2d849ae78 100644 --- a/test/unit/linalg/vector_cpu_test.cpp +++ b/test/unit/linalg/vector_cpu_test.cpp @@ -179,7 +179,7 @@ TEST(VectorCPUTest, Set) { vec[i] = el; } - vec_copy.set(vec, 0, 1); + vec_copy.set(vec, 0, 0); EXPECT_EQ(vec.size(), vec_copy.size()); EXPECT_EQ(capacity, vec_copy.capacity()); EXPECT_EQ(old_ptr, vec_copy.ptr()); @@ -203,7 +203,7 @@ TEST(VectorCPUTest, Set) { vec[i] = el; } - vec_copy.set(vec, 0, 1); + vec_copy.set(vec, 0, 0); EXPECT_EQ(vec.size(), vec_copy.size()); EXPECT_LE(vec.size(), vec_copy.capacity()); diff --git a/test/unit/linalg/vector_gpu_test.cpp b/test/unit/linalg/vector_gpu_test.cpp index 8fc1d0d16..1e769a477 100644 --- a/test/unit/linalg/vector_gpu_test.cpp +++ b/test/unit/linalg/vector_gpu_test.cpp @@ -165,7 +165,7 @@ TEST(VectorGPUTest, Set) { testing::setOnDevice(vec.ptr(i), el); } - vec_copy.set(vec, 0, 1); + vec_copy.set(vec, 0, 0); EXPECT_EQ(vec.size(), vec_copy.size()); EXPECT_EQ(capacity, vec_copy.capacity()); EXPECT_EQ(old_ptr, vec_copy.ptr()); @@ -189,7 +189,7 @@ TEST(VectorGPUTest, Set) { testing::setOnDevice(vec.ptr(i), el); } - vec_copy.set(vec, 0, 1); + vec_copy.set(vec, 0, 0); EXPECT_EQ(vec.size(), vec_copy.size()); EXPECT_LE(vec.size(), vec_copy.capacity()); diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp index b8669830a..f79a2f182 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp @@ -11,6 +11,7 @@ // are compared with their direct computation. #include "dca/platform/dca_gpu.h" +#include using Scalar = double; #include "test/mock_mcconfig.hpp" namespace dca { @@ -31,20 +32,44 @@ using McOptions = MockMcOptions; #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp" // This file MUST have + // -"double-update-probability": 0 // -"initial-configuration-size": 0 // or walker wrapper will crash the test on construction constexpr char input_name[] = DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/ctint/walker/multinsert_input.json"; +template +struct CtintWalkerMultiInsertTestT : public ::testing::Test { + using G0Setup = dca::testing::G0SetupFromParam; + using Parameters = typename G0Setup::Parameters; + + virtual void SetUp() {} + + void setParameters(Parameters& param_single, Parameters& param_double) + { + single_setup = std::make_unique(param_single); + single_setup->setUp(); + double_setup = std::make_unique(param_double); + double_setup->setUp(); + } + + virtual void TearDown() { + single_setup.reset(); + double_setup.reset(); + } + + std::unique_ptr single_setup; + std::unique_ptr double_setup; +}; + +template +using CtintWalkerMultiInsertTest = CtintWalkerMultiInsertTestT; + using Scalar = double; -using G0Setup = typename dca::testing::G0Setup; using namespace dca::phys::solver; -using Walker = testing::phys::solver::ctint::WalkerWrapper; -using Matrix = Walker::Matrix; -using MatrixPair = std::array; constexpr int bands = dca::testing::LatticeHund::BANDS; @@ -56,21 +81,47 @@ auto getVertexRng = [](int id) { using CDA = dca::phys::ClusterDomainAliases; using RDmn = typename CDA::RClusterDmn; +using ScalarTypes = ::testing::Types; +TYPED_TEST_CASE(CtintWalkerMultiInsertTest, ScalarTypes); // Compare single versus double update without a submatrix update. -TEST_F(G0Setup, NoSubmatrix) { - G0Setup::RngType rng(std::vector{}); - G0Interpolation g0( - dca::phys::solver::ctint::details::shrinkG0(data_->G0_r_t)); - G0Setup::LabelDomain label_dmn; +TYPED_TEST(CtintWalkerMultiInsertTest, NoSubmatrix) { + using Scalar = TypeParam; + using G0Setup = CtintWalkerMultiInsertTest::G0Setup; + using Concurrency = typename G0Setup::Concurrency; + using Parameters = typename G0Setup::Parameters; + using Walker = typename testing::phys::solver::ctint::WalkerWrapper; + using Matrix = typename Walker::Matrix; + using MatrixPair = std::array; + + Concurrency concurrency(0,nullptr); + + Parameters param_single("", concurrency); + param_single.template readInput(input_name); + param_single.setDoubleUpdateProbability(0); + param_single.broadcast(); + + Parameters param_double("", concurrency); + param_double.template readInput(input_name); + param_double.setDoubleUpdateProbability(1); + param_double.broadcast(); + + this->setParameters(param_single, param_double); + + typename G0Setup::RngType rng(std::vector{}); + G0Interpolation g0( + dca::phys::solver::ctint::details::shrinkG0(this->single_setup->data_->G0_r_t)); + typename G0Setup::LabelDomain label_dmn; + using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder; - DMatrixBuilder d_matrix_builder(g0, bands, RDmn()); - d_matrix_builder.setAlphas(parameters_.getAlphas(), false); - Walker::setInteractionVertices(*data_, parameters_); + DMatrixBuilder d_matrix_builder_single(g0, bands, RDmn()); + d_matrix_builder_single.setAlphas(param_single.getAlphas(), false); + + auto& data_single = this->single_setup->data_; - parameters_.setDoubleUpdateProbability(0); - Walker walker_single(parameters_, rng, d_matrix_builder); - walker_single.setInteractionVertices(*data_, parameters_); + Walker::setInteractionVertices(*data_single, param_single); + + Walker walker_single(param_single, *data_single, rng, d_matrix_builder_single); // interaction, tau, aux, accept rng.setNewValues(std::vector{getVertexRng(6), 0.41, 0, 0}); @@ -104,12 +155,15 @@ TEST_F(G0Setup, NoSubmatrix) { ASSERT_EQ(M1[1].nrCols(), final_size); const double prob_single = walker_single.getAcceptanceProbability(); - - parameters_.setDoubleUpdateProbability(1); - Walker::setInteractionVertices(*data_, parameters_); - Walker walker_double(parameters_, rng, d_matrix_builder); + auto& data_double = this->double_setup->data_; + + Walker::setInteractionVertices(*data_double, param_double); + DMatrixBuilder d_matrix_builder_double(g0, bands, RDmn()); + d_matrix_builder_double.setAlphas(param_double.getAlphas(), false); + + Walker walker_double(param_double, *data_double, rng, d_matrix_builder_double); //////////////////////////////// // interaction, partner, tau, aux, tau, aux, accept rng.setNewValues(std::vector{getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0}); @@ -153,58 +207,61 @@ TEST_F(G0Setup, NoSubmatrix) { } // Compare the double insertion probability of a submatrix walker with a non-submatrix one. -using WalkerSubmatrix = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; -TEST_F(G0Setup, Submatrix) { - G0Setup::RngType rng(std::vector{}); - G0Interpolation g0( - dca::phys::solver::ctint::details::shrinkG0(data_->G0_r_t)); - G0Setup::LabelDomain label_dmn; - using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder; - DMatrixBuilder d_matrix_builder(g0, bands, RDmn()); - d_matrix_builder.setAlphas(parameters_.getAlphas(), false); - parameters_.setDoubleUpdateProbability(1); - Walker::setInteractionVertices(*data_, parameters_); - - Walker walker(parameters_, rng, d_matrix_builder); - - // interaction, partner, tau, aux, tau, aux, accept - rng.setNewValues(std::vector{getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0}); - walker.tryVertexInsert(); - rng.setNewValues(std::vector{getVertexRng(7), 0.9, 0.34, 0, 0.36, 0, 0}); - walker.tryVertexInsert(); - // first idx, double rem, second partner, accept - rng.setNewValues(std::vector{0, 0, 0, 0}); - walker.tryVertexRemoval(); - - auto M1 = walker.getM(); - const double prob1 = walker.getAcceptanceProbability(); - - ///////////////////////////// - using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder; - DMatrixBuilder d_matrix_cpu(g0, bands, RDmn()); - d_matrix_builder.setAlphas(parameters_.getAlphas(), false); - WalkerSubmatrix::setInteractionVertices(*data_, parameters_); - - WalkerSubmatrix walker_subm(parameters_, rng, d_matrix_builder); - - std::vector random_vals{// (insert, first_id, partner_id, tau, aux, tau, aux, accept) x 2 - 0, getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0, 0, getVertexRng(7), - 0.9, 0.34, 0, 0.36, 0, 0, - // with remov, first_id, double_removal, second_id, accept - 1, 0, 0, 0, 0}; - - rng.setNewValues(random_vals); - walker_subm.doStep(3); - - auto M2 = walker_subm.getM(); - const double prob2 = walker_subm.getAcceptanceProbability(); - - EXPECT_NEAR(prob1, prob2, 1e-7); - for (int s = 0; s < 2; ++s) { - EXPECT_EQ(M1[s].size(), M2[s].size()); - EXPECT_TRUE(dca::linalg::matrixop::areNear(M1[s], M2[s], 1e-7)); - M1[s].print(); - M2[s].print(); - } -} +// TEST_F(G0Setup, Submatrix) { +// using Scalar = TypeParam; +// using G0Setup = decltype(*this)::G0Setup; +// using WalkerSubmatrix = +// testing::phys::solver::ctint::WalkerWrapperSubmatrix; +// typename G0Setup::RngType rng(std::vector{}); +// G0Interpolation g0( +// dca::phys::solver::ctint::details::shrinkG0(this->single_setup->data_->G0_r_t); +// typename G0Setup::LabelDomain label_dmn; + +// using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder; +// DMatrixBuilder d_matrix_builder(g0, bands, RDmn()); +// d_matrix_builder.setAlphas(parameters_.getAlphas(), false); +// parameters_.setDoubleUpdateProbability(1); +// Walker::setInteractionVertices(*data_, parameters_); + +// Walker walker(parameters_, rng, d_matrix_builder); + +// // interaction, partner, tau, aux, tau, aux, accept +// rng.setNewValues(std::vector{getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0}); +// walker.tryVertexInsert(); +// rng.setNewValues(std::vector{getVertexRng(7), 0.9, 0.34, 0, 0.36, 0, 0}); +// walker.tryVertexInsert(); +// // first idx, double rem, second partner, accept +// rng.setNewValues(std::vector{0, 0, 0, 0}); +// walker.tryVertexRemoval(); + +// auto M1 = walker.getM(); +// const double prob1 = walker.getAcceptanceProbability(); + +// ///////////////////////////// +// using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder; +// DMatrixBuilder d_matrix_cpu(g0, bands, RDmn()); +// d_matrix_builder.setAlphas(parameters_.getAlphas(), false); +// WalkerSubmatrix::setInteractionVertices(*data_, parameters_); + +// WalkerSubmatrix walker_subm(parameters_, rng, d_matrix_builder); + +// std::vector random_vals{// (insert, first_id, partner_id, tau, aux, tau, aux, accept) x 2 +// 0, getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0, 0, getVertexRng(7), +// 0.9, 0.34, 0, 0.36, 0, 0, +// // with remov, first_id, double_removal, second_id, accept +// 1, 0, 0, 0, 0}; + +// rng.setNewValues(random_vals); +// walker_subm.doStep(3); + +// auto M2 = walker_subm.getM(); +// const double prob2 = walker_subm.getAcceptanceProbability(); + +// EXPECT_NEAR(prob1, prob2, 1e-7); +// for (int s = 0; s < 2; ++s) { +// EXPECT_EQ(M1[s].size(), M2[s].size()); +// EXPECT_TRUE(dca::linalg::matrixop::areNear(M1[s], M2[s], 1e-7)); +// M1[s].print(); +// M2[s].print(); +// } +// } diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp index 157ceee38..ac37a5fd0 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp @@ -82,7 +82,7 @@ TYPED_TEST(CtintWalkerSubmatrixGpuComplexTest, doSteps) { d_matrix_cpu.setAlphas(parameters.getAlphas(), parameters.adjustAlphaDd()); DMatrixBuilder d_matrix_gpu(g0_gpu, bands, RDmn()); d_matrix_gpu.setAlphas(parameters.getAlphas(), parameters.adjustAlphaDd()); - + // ************************************ // Test vertex insertion / removal **** // ************************************ @@ -104,7 +104,7 @@ TYPED_TEST(CtintWalkerSubmatrixGpuComplexTest, doSteps) { 0, 0.99, 0.4, 0.2, -1, // Insertion }; - for (const int initial_size : std::array{0, 5}) { + for (const int initial_size : std::array{2, 5}) { parameters.setInitialConfigurationSize(initial_size); for (int steps = 1; steps <= 8; ++steps) { diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp index 1f727cd4e..0edaad83c 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp @@ -31,6 +31,9 @@ using McOptions = MockMcOptions; #include "dca/linalg/matrixop.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp" +#include "dca/util/to_string.hpp" +using namespace dca::addt_str_oper; + constexpr char input_name[] = DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json"; @@ -128,20 +131,45 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { 0, 0.99, 0.4, 0.2, -1, // Insertion }; - for (const int initial_size : std::array{0, 5}) { + constexpr Scalar tolerance = 10e-12; + auto compareSubMatrix = [tolerance](auto& mat_cpu, auto& mat_gpu, const std::string& mat_tag) { + int fail = 0; + for (int s = 0; s < 2; ++s) { + std::cout << mat_tag << "_cpu[" << s << "] size: " << mat_cpu[s].size() << '\n'; + std::cout << mat_tag << "_gpu[" << s << "] size: " << mat_gpu[s].size() << '\n'; + for (int j = 0; j < mat_cpu[s].size().second; ++j) + for (int i = 0; i < mat_cpu[s].size().first; ++i) { + if (std::abs(mat_cpu[s](i, j) - mat_gpu[s](i, j)) > tolerance) { + std::cout << mat_tag << " element: " << s << "," << i << "," << j << " cpu(" + << mat_cpu[s](i, j) << ") - gpu(" << mat_gpu[s](i, j) << ") > " << tolerance + << '\n'; + ++fail; + } + } + } + return fail; + }; + + for (const int initial_size : std::array{2}) { cpu_parameters.setInitialConfigurationSize(initial_size); gpu_parameters.setInitialConfigurationSize(initial_size); - for (int steps = 1; steps <= 8; ++steps) { + for (int steps = 2; steps <= 8; ++steps) { cpu_rng.setNewValues(setup_rngs); SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng, d_matrix_cpu); + walker_cpu.setInteractionVertices(cpu_data, cpu_parameters); + gpu_rng.setNewValues(setup_rngs); SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng, d_matrix_gpu); + walker_gpu.setInteractionVertices(gpu_data, gpu_parameters); cpu_rng.setNewValues(rng_vals); walker_cpu.doStep(steps); gpu_rng.setNewValues(rng_vals); walker_gpu.doStep(steps); + // doSweep does this + walker_gpu.uploadConfiguration(); + constexpr Scalar tolerance = std::numeric_limits::epsilon() * 100; auto M_cpu = walker_cpu.getM(); @@ -156,6 +184,209 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { for (int i = 0; i < config1.size(); ++i) EXPECT_EQ(config1[i], config2[i]); EXPECT_EQ(walker_cpu.get_sign(), walker_gpu.get_sign()); + + auto fail = compareSubMatrix(M_cpu, M_gpu, "M"); + EXPECT_FALSE(fail); } } } + +// This test still fails if its run second in the batch. + +TYPED_TEST(CtintWalkerSubmatrixGpuTest, fineGrained) { + using Scalar = TypeParam; + using Parameters = typename CtintWalkerSubmatrixGpuTest::G0Setup::Parameters; + using Walker = testing::phys::solver::ctint::WalkerWrapper; + using Matrix = typename Walker::Matrix; + using MatrixPair = std::array; + using SbmWalkerCpu = + testing::phys::solver::ctint::WalkerWrapperSubmatrix; + using SbmWalkerGpu = + testing::phys::solver::ctint::WalkerWrapperSubmatrix; + + std::vector setup_rngs{0., 0.00, 0.9, 0.5, 0.01, 0, 0.75, 0.02, + 0, 0.6, 0.03, 1, 0.99, 0.04, 0.99}; + typename CtintWalkerSubmatrixGpuTest::G0Setup::RngType cpu_rng(setup_rngs); + typename CtintWalkerSubmatrixGpuTest::G0Setup::RngType gpu_rng(setup_rngs); + + auto& cpu_data = *this->host_setup.data_; + auto& gpu_data = *this->gpu_setup.data_; + auto& cpu_parameters = this->host_setup.parameters_; + auto& gpu_parameters = this->gpu_setup.parameters_; + + const auto g0_func_cpu = dca::phys::solver::ctint::details::shrinkG0(cpu_data.G0_r_t); + G0Interpolation g0_cpu(g0_func_cpu); + const auto g0_func_gpu = dca::phys::solver::ctint::details::shrinkG0(gpu_data.G0_r_t); + G0Interpolation g0_gpu(g0_func_gpu); + typename CtintWalkerSubmatrixGpuTest::G0Setup::LabelDomain label_dmn; + + constexpr int bands = dca::testing::LatticeBilayer::BANDS; + + DMatrixBuilder d_matrix_cpu(g0_cpu, bands, RDmn()); + d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false); // cpu_parameters.adjustAlphaDd()); + DMatrixBuilder d_matrix_gpu(g0_gpu, bands, RDmn()); + d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false); // gpu_parameters.adjustAlphaDd()); + + // ************************************ + // Test vertex insertion / removal **** + // ************************************ + // Set rng values. + // + // Insertion, vertex_id, tau, aux_spin, acceptance_rng + // Removal, vertex_id, acceptance_rng + // ... + // Note: if acceptance_rng <= 0 the move is always accepted, if it is > 1 the move is always + // rejected. + const std::vector rng_vals{ + 0, 0, 0.1, 0.8, -1, // Insertion. + 0, 0.99, 0.2, 0.8, -1, // Insertion. + 0, 0, 0.3, 0.8, 2, // Insertion. Rejected. + 1, 0, -1, // Remove pre-existing. + 1, 0.99, -1, // Remove recently inserted. + 1, 0.99, 2, // Remove recently inserted. Rejected + 1, 0, 2, // Remove . Rejected + 0, 0.99, 0.4, 0.2, -1, // Insertion + }; + + int initial_size = 2; + + cpu_parameters.setInitialConfigurationSize(initial_size); + gpu_parameters.setInitialConfigurationSize(initial_size); + int steps = 4; + + cpu_rng.setNewValues(setup_rngs); + SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng, d_matrix_cpu); + walker_cpu.setInteractionVertices(cpu_data, cpu_parameters); + + gpu_rng.setNewValues(setup_rngs); + SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng, d_matrix_gpu); + walker_gpu.setInteractionVertices(gpu_data, gpu_parameters); + + constexpr Scalar tolerance = 10e-12; + + cpu_rng.setNewValues(rng_vals); + walker_cpu.generateDelayedMoves(steps); + walker_cpu.computeMInit(); + + walker_gpu.uploadConfiguration(); + gpu_rng.setNewValues(rng_vals); + walker_gpu.generateDelayedMoves(steps); + walker_gpu.uploadConfiguration(); + walker_gpu.computeMInit(); + + // walker_gpu.computeGInit(); + auto M_cpu = walker_cpu.getRawM(); + auto M_gpu = walker_gpu.getRawM(); + + std::cout << "Minit init cpu size " << M_cpu[0].size().first << "," << M_cpu[0].size().second + << " : " << M_cpu[1].size().first << "," << M_cpu[1].size().second << '\n'; + std::cout << "Minit init gpu size " << M_gpu[0].size().first << "," << M_gpu[0].size().second + << " : " << M_gpu[1].size().first << "," << M_gpu[1].size().second << '\n'; + + std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n'; + std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n'; + std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n'; + std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n'; + std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n'; + std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n'; + + M_cpu[0].print(); + M_cpu[1].print(); + M_gpu[0].print(); + M_gpu[1].print(); + + auto compareSubMatrix = [tolerance](auto& mat_cpu, auto& mat_gpu, const std::string& mat_tag) { + int fail = 0; + for (int s = 0; s < 2; ++s) { + std::cout << mat_tag << "_cpu[" << s << "] size: " << mat_cpu[s].size() << '\n'; + std::cout << mat_tag << "_gpu[" << s << "] size: " << mat_gpu[s].size() << '\n'; + for (int j = 0; j < mat_cpu[s].size().second; ++j) + for (int i = 0; i < mat_cpu[s].size().first; ++i) { + if (std::abs(mat_cpu[s](i, j) - mat_gpu[s](i, j)) > tolerance) { + std::cout << mat_tag << " element: " << s << "," << i << "," << j << " cpu(" + << mat_cpu[s](i, j) << ") - gpu(" << mat_gpu[s](i, j) << ") > " << tolerance + << '\n'; + ++fail; + } + } + } + return fail; + }; + + auto fail = compareSubMatrix(M_cpu, M_gpu, "M"); + EXPECT_FALSE(fail); + + walker_cpu.computeGInit(); + walker_gpu.computeGInit(); + walker_gpu.synchronize(); + + auto G_cpu = walker_cpu.getRawG(); + auto G_gpu = walker_gpu.getRawG(); + + fail = compareSubMatrix(G_cpu, G_gpu, "G"); + EXPECT_FALSE(fail); + + walker_cpu.mainSubmatrixProcess(); + walker_gpu.mainSubmatrixProcess(); + + walker_cpu.updateM(); + walker_gpu.updateM(); + + M_cpu = walker_cpu.getRawM(); + M_gpu = walker_gpu.getRawM(); + + std::cout << "M post update cpu size " << M_cpu[0].size().first << "," << M_cpu[0].size().second + << " : " << M_cpu[1].size().first << "," << M_cpu[1].size().second << '\n'; + std::cout << "M post update gpu size " << M_gpu[0].size().first << "," << M_gpu[0].size().second + << " : " << M_gpu[1].size().first << "," << M_gpu[1].size().second << '\n'; + + std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n'; + std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n'; + std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n'; + std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n'; + std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n'; + std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n'; + + M_cpu[0].print(); + M_cpu[1].print(); + M_gpu[0].print(); + M_gpu[1].print(); + + fail = compareSubMatrix(M_cpu, M_gpu, "updated_M"); + EXPECT_FALSE(fail); + + // The final configuration is the same. + const auto& config1 = walker_cpu.getWalkerConfiguration(); + const auto& config2 = walker_gpu.getWalkerConfiguration(); + ASSERT_EQ(config1.size(), config2.size()); + for (int i = 0; i < config1.size(); ++i) + EXPECT_EQ(config1[i], config2[i]); + EXPECT_EQ(walker_cpu.get_sign(), walker_gpu.get_sign()); + + walker_gpu.uploadConfiguration(); + + auto comp_M_cpu = walker_cpu.getM(); + auto comp_M_gpu = walker_gpu.getM(); + + std::cout << "computed M cpu size " << comp_M_cpu[0].size().first << "," + << comp_M_cpu[0].size().second << " : " << comp_M_cpu[1].size().first << "," + << comp_M_cpu[1].size().second << '\n'; + std::cout << "computed M gpu size " << comp_M_gpu[0].size().first << "," + << comp_M_gpu[0].size().second << " : " << comp_M_gpu[1].size().first << "," + << comp_M_gpu[1].size().second << '\n'; + + std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n'; + std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n'; + std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n'; + std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n'; + std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n'; + std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n'; + + comp_M_cpu[0].print(); + comp_M_cpu[1].print(); + comp_M_gpu[0].print(); + comp_M_gpu[1].print(); + + fail = compareSubMatrix(comp_M_cpu, comp_M_gpu, "comp_M"); + EXPECT_FALSE(fail); +} diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp index a35bf3ce7..a768310f4 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp @@ -44,7 +44,7 @@ using namespace dca::phys::solver; // Currently testing float isn't really possible due to the way the Scalar type is // carried through from mc_options. See test_setup.hpp PD -using ScalarTypes = ::testing::Types; //double, +using ScalarTypes = ::testing::Types; //double, TYPED_TEST_CASE(CtintWalkerSubmatrixTest, ScalarTypes); // Compare the submatrix update with a direct computation of the M matrix, and compare the @@ -59,7 +59,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) { using MatrixPair = std::array; using SubmatrixWalker = testing::phys::solver::ctint::WalkerWrapperSubmatrix; - + std::vector setup_rngs{0., 0.00, 0.9, 0.5, 0.01, 0, 0.75, 0.02, 0, 0.6, 0.03, 1, 0.99, 0.04, 0.99}; typename TestFixture::RngType rng(setup_rngs); @@ -111,7 +111,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) { MatrixPair direct_M(walker.getM()); using dca::util::RealAlias; - + // This should just be the RealAliases of the scalar since in the // complex case we are checking the Euclidean norm. const auto tolerance = 1000.0 * std::numeric_limits>::epsilon(); @@ -121,7 +121,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) { // Compare with non submatrix walker. rng.setNewValues(setup_rngs); - Walker walker_nosub(parameters, rng, d_matrix_builder); + Walker walker_nosub(parameters, data, rng, d_matrix_builder); rng.setNewValues(rng_vals); for (int i = 0; i < steps; ++i) diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp index 065f651bb..80d555942 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp @@ -79,7 +79,7 @@ TYPED_TEST(CtintWalkerTest, InsertAndRemoveVertex) { Walker::setInteractionVertices(data, parameters); - Walker walker(parameters, rng, d_matrix_builder); + Walker walker(parameters, data, rng, d_matrix_builder); // ******************************* // Test vertex removal *********** diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp index bdeb041ef..9a0ebcca3 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp @@ -33,8 +33,8 @@ class WalkerWrapper : public CtintWalker { using Real = dca::util::RealAlias; using Rng = typename Base::Rng; - WalkerWrapper(Parameters& parameters_ref, Rng& rng_ref, DMatrixBuilder& d_matrix_builder) - : Base(parameters_ref, dca::phys::DcaData(parameters_ref), rng_ref, d_matrix_builder, 0) { + WalkerWrapper(Parameters& parameters_ref, const dca::phys::DcaData& data, Rng& rng_ref, DMatrixBuilder& d_matrix_builder) + : Base(parameters_ref, data, rng_ref, d_matrix_builder, 0) { Base::initialize(0); } diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp index 2d5dc66a8..638a35ac6 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp @@ -66,11 +66,6 @@ struct WalkerWrapperSubmatrix : public WalkerSelector; using MatrixPair = std::array; @@ -94,6 +89,41 @@ struct WalkerWrapperSubmatrix : public WalkerSelector streams_; }; diff --git a/test/unit/phys/dca_step/cluster_solver/test_setup.hpp b/test/unit/phys/dca_step/cluster_solver/test_setup.hpp index d032772f1..b37cc0837 100644 --- a/test/unit/phys/dca_step/cluster_solver/test_setup.hpp +++ b/test/unit/phys/dca_step/cluster_solver/test_setup.hpp @@ -157,6 +157,49 @@ struct G0SetupBare { }; + +template +struct G0SetupFromParam { + using LatticeType = Lattice; + using Model = phys::models::TightBindingModel; + using RngType = testing::StubRng; + using Concurrency = parallel::NoConcurrency; + using Parameters = + phys::params::Parameters, Scalar>>; + using Data = phys::DcaData; + + // Commonly used domains. + using RDmn = typename Parameters::RClusterDmn; + using KDmn = typename Parameters::KClusterDmn; + using BDmn = func::dmn_0; + using SDmn = func::dmn_0; + using NuDmn = func::dmn_variadic; + using WDmn = func::dmn_0; + using LabelDomain = func::dmn_variadic; + + Concurrency concurrency_; + Parameters parameters_; + std::unique_ptr data_; + + G0SetupFromParam(const Parameters& params) : concurrency_(0, nullptr), parameters_(params) {} + + void setUp() { + parameters_.update_model(); + + parameters_.update_domains(); + + data_ = std::make_unique(parameters_); + data_->initialize(); + } + + void TearDown() {} + +}; + + } // namespace testing } // namespace dca From ad83b27427c118dbc2b4bd52ce24f4935f39d618 Mon Sep 17 00:00:00 2001 From: Peter Doak Date: Thu, 14 Aug 2025 20:13:06 -0400 Subject: [PATCH 3/3] reverting dropped configuration initialization --- .../ctint/structs/interaction_vertices.hpp | 2 +- .../ctint/walker/ctint_walker_base.hpp | 4 ++++ .../cluster_solver/ctint/walker/CMakeLists.txt | 12 ++++++------ .../walker/ct_int_walker_multinsertion_test.cpp | 12 ++++++------ test/unit/phys/dca_step/cluster_solver/stub_rng.hpp | 11 +++++++++-- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/structs/interaction_vertices.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/structs/interaction_vertices.hpp index f911375af..d8ad0e270 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/structs/interaction_vertices.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/structs/interaction_vertices.hpp @@ -91,7 +91,7 @@ class InteractionVertices { enum PartnershipType { NONE, SAME_SITE, ALL_SITES }; std::vector cumulative_weigths_; - double total_weigth_ = 0; + double total_weigth_{0}; bool interband_propagator_ = false; PartnershipType partnership_type_ = NONE; }; diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp index 9ed7e52eb..98a8fa2b2 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp @@ -235,6 +235,10 @@ CtintWalkerBase::CtintWalkerBase(const Parameters& parameters_ thread_id_(id), stream_{&linalg::util::getStreamContainer()(thread_id_, 0)}, rng_(rng_ref), + + configuration_(parameters_.get_beta(), Bdmn::dmn_size(), vertices_, + parameters_.getDoubleUpdateProbability()), + beta_(parameters_.get_beta()), total_interaction_(vertices_.integratedInteraction()) {} diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/CMakeLists.txt b/test/unit/phys/dca_step/cluster_solver/ctint/walker/CMakeLists.txt index d98942217..2ce304bb3 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/CMakeLists.txt +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/CMakeLists.txt @@ -17,12 +17,12 @@ dca_add_gtest(ct_int_walker_submatrix_test LIBS ${TEST_LIBS} g0_interpolation ${DCA_GPU_LIBS} ) -dca_add_gtest(ct_int_walker_multinsertion_test - FAST - GTEST_MPI_MAIN - INCLUDE_DIRS ${TEST_INCLUDES} - LIBS ${TEST_LIBS} g0_interpolation - ) +# dca_add_gtest(ct_int_walker_multinsertion_test +# FAST +# GTEST_MPI_MAIN +# INCLUDE_DIRS ${TEST_INCLUDES} +# LIBS ${TEST_LIBS} g0_interpolation +# ) dca_add_gtest(ct_int_walker_submatrix_gpu_test CUDA diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp index f79a2f182..1c1980549 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp @@ -42,13 +42,12 @@ constexpr char input_name[] = template struct CtintWalkerMultiInsertTestT : public ::testing::Test { using G0Setup = dca::testing::G0SetupFromParam; + dca::ClusterSolverId::CT_INT, input_name>; using Parameters = typename G0Setup::Parameters; virtual void SetUp() {} - void setParameters(Parameters& param_single, Parameters& param_double) - { + void setParameters(Parameters& param_single, Parameters& param_double) { single_setup = std::make_unique(param_single); single_setup->setUp(); double_setup = std::make_unique(param_double); @@ -94,11 +93,12 @@ TYPED_TEST(CtintWalkerMultiInsertTest, NoSubmatrix) { using Matrix = typename Walker::Matrix; using MatrixPair = std::array; - Concurrency concurrency(0,nullptr); + Concurrency concurrency(0, nullptr); Parameters param_single("", concurrency); param_single.template readInput(input_name); param_single.setDoubleUpdateProbability(0); + param_single.setInitialConfigurationSize(2); param_single.broadcast(); Parameters param_double("", concurrency); @@ -108,14 +108,14 @@ TYPED_TEST(CtintWalkerMultiInsertTest, NoSubmatrix) { this->setParameters(param_single, param_double); - typename G0Setup::RngType rng(std::vector{}); + typename G0Setup::RngType rng(true, 200); G0Interpolation g0( dca::phys::solver::ctint::details::shrinkG0(this->single_setup->data_->G0_r_t)); typename G0Setup::LabelDomain label_dmn; using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder; DMatrixBuilder d_matrix_builder_single(g0, bands, RDmn()); - d_matrix_builder_single.setAlphas(param_single.getAlphas(), false); + d_matrix_builder_single.setAlphas(param_single.getAlphas(), param_single.adjustAlphaDd()); auto& data_single = this->single_setup->data_; diff --git a/test/unit/phys/dca_step/cluster_solver/stub_rng.hpp b/test/unit/phys/dca_step/cluster_solver/stub_rng.hpp index b80bb631a..e2340f995 100644 --- a/test/unit/phys/dca_step/cluster_solver/stub_rng.hpp +++ b/test/unit/phys/dca_step/cluster_solver/stub_rng.hpp @@ -20,6 +20,13 @@ namespace testing { class StubRng { public: + StubRng(bool bdummy, int n = 200) { + val_ = std::vector(n); + double ftes = 1 / n; + for (int i = 0; i < val_.size(); ++i) + val_[i] = ftes * i; + } + StubRng(int /*a*/ = 0, int /*b*/ = 0, int /*c*/ = 0, int /*d*/ = 0) { val_ = std::vector(200, 0.5); } @@ -43,7 +50,7 @@ class StubRng { int index_ = 0; }; -} // testing -} // dca +} // namespace testing +} // namespace dca #endif // TEST_UNIT_PHYS_DCA_STEP_CLUSTER_SOLVER_STUB_HPP