diff --git a/include/dca/linalg/matrix.hpp b/include/dca/linalg/matrix.hpp index bd7d96ced..8325b008a 100644 --- a/include/dca/linalg/matrix.hpp +++ b/include/dca/linalg/matrix.hpp @@ -30,6 +30,7 @@ #include "dca/linalg/util/memory.hpp" #include "dca/linalg/util/stream_functions.hpp" #include "dca/util/type_help.hpp" +#include "device_type.hpp" namespace dca { namespace linalg { @@ -38,7 +39,8 @@ namespace linalg { /** Matrix class for interfacing with Blas, Cublas, Rocblas * it is column-major, i.e. the row index is fast. */ -template > +template > class Matrix : public ALLOC { public: using ThisType = Matrix; @@ -72,7 +74,8 @@ class Matrix : public ALLOC { // Constructs a matrix with name name, size rhs.size() and a copy of the elements of rhs, where rhs // elements are stored on a different device. template - Matrix(const Matrix& rhs, const std::string& = default_name_); + Matrix(const Matrix& rhs, + const std::string& = default_name_); // Constructs a matrix with name name, size rhs.size() and a copy of the elements of rhs, where rhs // elements are stored on a different device. @@ -92,10 +95,12 @@ class Matrix : public ALLOC { // Resizes the matrix to rhs.size() and copies the elements, stored on a different device, of rhs. // Postcondition: The name of the matrix is unchanged. template - Matrix& operator=(const Matrix& rhs); + Matrix& operator=( + const Matrix& rhs); template - Matrix& operator=(const Matrix& rhs); + Matrix& operator=( + const Matrix& rhs); // Returns true if this is equal to other, false otherwise. // Two matrices are equal, if they have the same size and contain the same elements. Name and @@ -113,9 +118,9 @@ class Matrix : public ALLOC { template > ScalarType& operator()(int i, int j) { #ifndef NDEBUG - if(!(i >= 0 && i <= size_.first)) + if (!(i >= 0 && i < size_.first)) throw std::runtime_error("assertion of i >= 0 && i < size_.first failed!"); - if(!(j >= 0 && j <= size_.second)) + if (!(j >= 0 && j < size_.second)) throw std::runtime_error("assertion of j >= 0 && j < size_.second failed!"); #endif return data_[i + j * leadingDimension()]; @@ -123,9 +128,9 @@ class Matrix : public ALLOC { template > const ScalarType& operator()(int i, int j) const { #ifndef NDEBUG - if(!(i >= 0 && i <= size_.first)) + if (!(i >= 0 && i < size_.first)) throw std::runtime_error("assertion of i >= 0 && i < size_.first failed!"); - if(!(j >= 0 && j <= size_.second)) + if (!(j >= 0 && j < size_.second)) throw std::runtime_error("assertion of j >= 0 && j < size_.second failed!"); #endif return data_[i + j * leadingDimension()]; @@ -150,20 +155,20 @@ class Matrix : public ALLOC { // a pointer past the end of the range if i == size().first or j == size().second. // Preconditions: 0 <= i <= size().first, 0 <= j <= size().second.
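// A small worked example of these preconditions, assuming a CPU-resident Matrix m:
// the half-open range [m.ptr(0, j), m.ptr(m.size().first, j)) spans exactly column j,
// because storage is column-major with leadingDimension() >= size().first, e.g.
//
//   for (ValueType* p = m.ptr(0, j); p != m.ptr(m.size().first, j); ++p)
//     *p = 0;  // touches all of column j and nothing else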
ValueType* ptr(int i, int j) { - //These can be an annoyance debugging so making them "manual" asserts + // These can be an annoyance debugging so making them "manual" asserts #ifndef NDEBUG - if(!(i >= 0 && i <= size_.first)) + if (!(i >= 0 && i <= size_.first)) throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!"); - if(!(j >= 0 && j <= size_.second)) + if (!(j >= 0 && j <= size_.second)) throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!"); #endif return data_ + i + j * leadingDimension(); } const ValueType* ptr(int i, int j) const { #ifndef NDEBUG - if(!(i >= 0 && i <= size_.first)) + if (!(i >= 0 && i <= size_.first)) throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!"); - if(!(j >= 0 && j <= size_.second)) + if (!(j >= 0 && j <= size_.second)) throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!"); #endif return data_ + i + j * leadingDimension(); @@ -240,11 +245,13 @@ class Matrix : public ALLOC { // Asynchronous assignment. template - void setAsync(const Matrix& rhs, const util::GpuStream& stream); + void setAsync(const Matrix& rhs, + const util::GpuStream& stream); // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) template - void setAsync(const Matrix& rhs, int thread_id, int stream_id); + void setAsync(const Matrix& rhs, int thread_id, + int stream_id); void setToZero(const util::GpuStream& stream); @@ -256,6 +263,7 @@ class Matrix : public ALLOC { std::size_t deviceFingerprint() const; std::string toStr() const; + private: static std::pair capacityMultipleOfBlockSize(std::pair size); inline static size_t nrElements(std::pair size) { @@ -276,41 +284,41 @@ class Matrix : public ALLOC { }; template -const std::string Matrix::default_name_ = "no-name"; +const std::string Matrix::default_name_ = "no-name"; template -Matrix::Matrix(const std::string& name) : Matrix(name, 0) {} +Matrix::Matrix(const std::string& name) : Matrix(name, 0) {} template -Matrix::Matrix(int size) : Matrix(std::make_pair(size, size)) {} +Matrix::Matrix(int size) : Matrix(std::make_pair(size, size)) {} template -Matrix::Matrix(const std::string& name, int size) +Matrix::Matrix(const std::string& name, int size) : Matrix(name, std::make_pair(size, size)) {} template -Matrix::Matrix(int size, int capacity) +Matrix::Matrix(int size, int capacity) : Matrix(std::make_pair(size, size), std::make_pair(capacity, capacity)) {} template -Matrix::Matrix(const std::string& name, int size, int capacity) +Matrix::Matrix(const std::string& name, int size, int capacity) : Matrix(name, std::make_pair(size, size), std::make_pair(capacity, capacity)) {} template -Matrix::Matrix(std::pair size) : Matrix(size, size) {} +Matrix::Matrix(std::pair size) : Matrix(size, size) {} template -Matrix::Matrix(const std::string& name, std::pair size) +Matrix::Matrix(const std::string& name, std::pair size) : Matrix(name, size, size) {} template -Matrix::Matrix(std::pair size, std::pair capacity) +Matrix::Matrix(std::pair size, std::pair capacity) : Matrix(default_name_, size, capacity) {} template template Matrix::Matrix(const Matrix& rhs, - const std::string& name) + const std::string& name) : name_(name), size_(rhs.size_), capacity_(rhs.capacity_) { data_ = Allocator::allocate(nrElements(capacity_)); util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); @@ -318,18 +326,19 @@ Matrix::Matrix(const Matrix template -Matrix::Matrix(const Matrix& rhs, - const std::string& name) 
+Matrix::Matrix(const Matrix& rhs, + const std::string& name) : name_(name), size_(rhs.size_), capacity_(rhs.capacity_) { if (sizeof(ScalarType) != sizeof(ScalarRhs)) - throw std::runtime_error("conversion of both type and location of Matrix not currently possible!"); + throw std::runtime_error( + "conversion of both type and location of Matrix not currently possible!"); data_ = ALLOC::allocate(nrElements(capacity_)); util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); } template Matrix::Matrix(const std::string& name, std::pair size, - std::pair capacity) + std::pair capacity) : name_(name), size_(size), capacity_(capacityMultipleOfBlockSize(capacity)) { assert(size_.first >= 0 && size_.second >= 0); assert(capacity.first >= 0 && capacity.second >= 0); @@ -341,14 +350,15 @@ Matrix::Matrix(const std::string& name, std::pai } template -Matrix::Matrix(const Matrix& rhs, - const std::string& name) +Matrix::Matrix(const Matrix& rhs, + const std::string& name) : name_(name) { *this = rhs; } template -Matrix::Matrix(Matrix&& rhs, const std::string& name) +Matrix::Matrix(Matrix&& rhs, + const std::string& name) : name_(name), size_(rhs.size_), capacity_(rhs.capacity_), data_(rhs.data_) { rhs.capacity_ = std::make_pair(0, 0); rhs.size_ = std::make_pair(0, 0); @@ -356,20 +366,26 @@ Matrix::Matrix(Matrix -Matrix::~Matrix() { +Matrix::~Matrix() { Allocator::deallocate(data_); } template -void Matrix::resize(std::pair new_size) { - if (new_size.first == 0 || new_size.second ==0) { +void Matrix::resize(std::pair new_size) { + if (new_size.first == 0 || new_size.second == 0) { size_ = new_size; return; - } else if (new_size.first > capacity_.first || new_size.second > capacity_.second) { + } + else if (new_size.first > capacity_.first || new_size.second > capacity_.second) { std::pair new_capacity = capacityMultipleOfBlockSize(new_size); ValueType* new_data = nullptr; new_data = Allocator::allocate(nrElements(new_capacity)); +#ifndef NDEBUG + // This makes debugging matrix issues easier since there is never garbage in expanded matrices + if constexpr (device_name == linalg::DeviceType::CPU) + std::fill_n(new_data, nrElements(new_capacity), 0); +#endif // hip memcpy2D routines don't tolerate leadingDimension = 0 const std::pair copy_size(std::min(new_size.first, size_.first), std::min(new_size.second, size_.second)); @@ -379,14 +395,26 @@ void Matrix::resize(std::pair new_siz capacity_ = new_capacity; size_ = new_size; } - else { +#ifndef NDEBUG + else if (new_size.first > size_.first || new_size.second > size_.second) { + // We need to cover the case where we expand the matrix but no allocation is necessary. + // This is slow, but we're still thinking about what to do here: the legacy behavior is fine + // if we're just overwriting the new sections, but not if it's assumed they are 0, and not all + // client code is clear about this.
+ if constexpr (device_name == linalg::DeviceType::CPU) + // Zero the exposed region with column-major addressing: each new column in full, + // plus the new rows of the pre-existing columns. + for (int j = 0; j < new_size.second; ++j) + for (int i = (j < size_.second ? size_.first : 0); i < new_size.first; ++i) + data_[i + j * leadingDimension()] = 0; size_ = new_size; } +#endif + else + size_ = new_size; } template -Matrix& Matrix::operator=( - const Matrix& rhs) { +Matrix& Matrix::operator=( + const Matrix& rhs) { resizeNoCopy(rhs.size_); if (device_name == CPU) util::memoryCopyCpu(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); @@ -396,15 +424,15 @@ Matrix& Matrix: } template -Matrix& Matrix::operator=( - Matrix&& rhs) { +Matrix& Matrix::operator=( + Matrix&& rhs) { swap(rhs); return *this; } template template -Matrix& Matrix::operator=( +Matrix& Matrix::operator=( const Matrix& rhs) { resizeNoCopy(rhs.size_); util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); @@ -414,7 +442,7 @@ Matrix& Matrix #ifdef DCA_HAVE_GPU template template -Matrix& Matrix::operator=( +Matrix& Matrix::operator=( const Matrix& rhs) { static_assert(sizeof(ScalarType) == sizeof(ScalarRhs), "sizeof ScalarType and ScalarRhs are not equal"); @@ -426,7 +454,8 @@ Matrix& Matrix #endif template -bool Matrix::operator==(const Matrix& other) const { +bool Matrix::operator==( + const Matrix& other) const { if (device_name == GPU) return Matrix(*this) == Matrix(other); @@ -442,12 +471,13 @@ bool Matrix::operator==(const Matrix -bool Matrix::operator!=(const Matrix& other) const { +bool Matrix::operator!=( + const Matrix& other) const { return not(*this == other); } template -void Matrix::resizeNoCopy(std::pair new_size) { +void Matrix::resizeNoCopy(std::pair new_size) { if (new_size.first > capacity_.first || new_size.second > capacity_.second) { size_ = new_size; capacity_ = capacityMultipleOfBlockSize(new_size); @@ -461,28 +491,28 @@ void Matrix::resizeNoCopy(std::pair n } template -void Matrix::clear() { +void Matrix::clear() { Allocator::deallocate(data_); size_ = capacity_ = std::make_pair(0, 0); } template -void Matrix::swap(Matrix& rhs) { +void Matrix::swap(Matrix& rhs) { std::swap(size_, rhs.size_); std::swap(capacity_, rhs.capacity_); std::swap(data_, rhs.data_); } template -void Matrix::swapWithName(Matrix& rhs) { +void Matrix::swapWithName(Matrix& rhs) { std::swap(name_, rhs.name_); swap(rhs); } template template -void Matrix::set(const Matrix& rhs, - int thread_id, int stream_id) { +void Matrix::set( + const Matrix& rhs, int thread_id, int stream_id) { resize(rhs.size_); // This specialization is required since without unified memory CUDA doesn't know which memory locality the pointer has.
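// A minimal usage sketch of the compile-time dispatch below, assuming a hypothetical
// host matrix `h` and device matrix `d` holding the same scalar type:
//
//   d.set(h, thread_id, stream_id);  // resolves to the host-to-device copy path
//   h.set(d, thread_id, stream_id);  // resolves to the device-to-host copy path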
if constexpr (device_name == DeviceType::GPU && rhs_device_name == DeviceType::CPU) @@ -498,8 +528,9 @@ void Matrix::set(const Matrix template -void Matrix::set(const Matrix& rhs, - const util::GpuStream& stream [[maybe_unused]]) { +void Matrix::set( + const Matrix& rhs, + const util::GpuStream& stream [[maybe_unused]]) { resize(rhs.size_); if constexpr (device_name == DeviceType::GPU && rhs_device_name == DeviceType::CPU) util::memoryCopyH2D(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_); @@ -509,26 +540,27 @@ void Matrix::set(const Matrix template -void Matrix::setAsync(const Matrix& rhs, - const util::GpuStream& stream) { +void Matrix::setAsync( + const Matrix& rhs, const util::GpuStream& stream) { resizeNoCopy(rhs.size_); util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream); } template template -void Matrix::setAsync(const Matrix& rhs, - const int thread_id, const int stream_id) { +void Matrix::setAsync( + const Matrix& rhs, const int thread_id, + const int stream_id) { setAsync(rhs, util::getStream(thread_id, stream_id)); } template -void Matrix::setToZero(const util::GpuStream& stream) { +void Matrix::setToZero(const util::GpuStream& stream) { util::Memory::setToZeroAsync(data_, leadingDimension() * nrCols(), stream); } template -void Matrix::print() const { +void Matrix::print() const { if (device_name == GPU) return Matrix(*this).print(); @@ -549,7 +581,7 @@ void Matrix::print() const { } template -std::string Matrix::toStr() const { +std::string Matrix::toStr() const { if (device_name == GPU) return Matrix(*this).toStr(); @@ -566,9 +598,9 @@ std::string Matrix::toStr() const { return ss.str(); } - + template -void Matrix::printFingerprint() const { +void Matrix::printFingerprint() const { std::stringstream ss; ss << "\n"; @@ -581,7 +613,7 @@ void Matrix::printFingerprint() const { } template -std::pair Matrix::capacityMultipleOfBlockSize( +std::pair Matrix::capacityMultipleOfBlockSize( std::pair size) { assert(size.first >= 0); assert(size.second >= 0); @@ -597,7 +629,7 @@ std::pair Matrix::capacityMultipleOfB } template -std::size_t Matrix::deviceFingerprint() const { +std::size_t Matrix::deviceFingerprint() const { if (device_name == GPU) return capacity_.first * capacity_.second * sizeof(ScalarType); else @@ -608,7 +640,7 @@ std::size_t Matrix::deviceFingerprint() const { template auto makeDiagonalMatrix(Vector& diag) { int dsize = diag.size(); - Matrix matrix("diag_matrix", dsize); + Matrix matrix("diag_matrix", dsize); for (int i = 0; i < dsize; ++i) { matrix(i, i) = diag[i]; } @@ -619,7 +651,7 @@ auto makeDiagonalMatrix(Vector& diag) { template auto makeDiagonalMatrixInv(Vector& diag) { int dsize = diag.size(); - Matrix matrix("diag_matrix", dsize); + Matrix matrix("diag_matrix", dsize); // insure that if ScalarType is complex the 1 is as well. // then std::complex will give us a proper complex multiplicative inverse ScalarType the_one{}; diff --git a/include/dca/linalg/vector.hpp b/include/dca/linalg/vector.hpp index 3e8882b77..a21fd0203 100644 --- a/include/dca/linalg/vector.hpp +++ b/include/dca/linalg/vector.hpp @@ -34,7 +34,7 @@ namespace dca { namespace linalg { // dca::linalg:: - template > class Vector : public Allocator { public: @@ -52,9 +52,8 @@ class Vector : public Allocator { /** copy constructor except for name. * this is strange but for historical reasons is kept. 
- * has needed to be explicit because with the `const ThisType&` somehow lead to an implicit conversion - * from an int to a Vector& argument that landed here. - * This occurred in Debug with + * has needed to be explicit because the `const ThisType&` somehow led to an implicit + * conversion from an int to a Vector& argument that landed here. This occurred in Debug with */ explicit Vector(const ThisType& rhs, const std::string& name = default_name_); @@ -323,7 +322,8 @@ void Vector::setAsync(const Container& rhs, } template -void Vector::setToZeroAsync(const util::GpuStream& stream [[maybe_unused]]) { +void Vector::setToZeroAsync(const util::GpuStream& stream + [[maybe_unused]]) { // TODO: implement in copy.hpp. #ifdef DCA_HAVE_GPU checkRC(cudaMemsetAsync(data_, 0, size_ * sizeof(ScalarType), stream)); @@ -333,12 +333,14 @@ } template -void Vector::setToZero(const util::GpuStream& stream [[maybe_unused]]) { +void Vector::setToZero(const util::GpuStream& stream + [[maybe_unused]]) { dca::linalg::util::Memory::setToZero(data_, size_, stream); } // template -// void Vector::setToZero(const util::GpuStream& stream [[maybe_unused]]) { +// void Vector::setToZero(const util::GpuStream& stream +// [[maybe_unused]]) { // // TODO: implement in copy.hpp. // dca::linalg::util::memory::setToZero(data_, size_, stream); // } diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp index c63d70f18..94d9753bc 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp @@ -33,6 +33,8 @@ namespace solver { namespace ctint { // dca::phys::solver::ctint:: + +// I think the class hierarchy isn't helpful here.
class SolverConfiguration : public MatrixConfiguration { public: using BaseClass = MatrixConfiguration; diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp index 70cfb39c4..db2e560d5 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp @@ -41,6 +41,8 @@ #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp" #endif +//#define DEBUG_SUBMATRIX + namespace dca { namespace phys { namespace solver { @@ -238,7 +240,7 @@ CtintWalkerBase::CtintWalkerBase(const Parameters& parameters_ thread_id_(id), streams_{&linalg::util::getStreamContainer()(thread_id_, 0), - &linalg::util::getStreamContainer()(thread_id_, 1)}, + &linalg::util::getStreamContainer()(thread_id_, 0)}, rng_(rng_ref), diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp index 861df5cb6..c650d1e9c 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp @@ -55,7 +55,8 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase; - CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, DMatrixBuilder& d_matrix_builder, int id = 0); + CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, + DMatrixBuilder& d_matrix_builder, int id = 0); virtual ~CtintWalkerSubmatrixCpu() = default; @@ -64,11 +65,29 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase& d_matrix_builder_; + + BaseClass::MatrixPair getM(); + + /** The following methods are really only here to get decent unit testing + they shouldn't really be called outside of the base implementations + */ + void computeMInit() override; + void computeGInit() override; + BaseClass::MatrixPair getRawM(); + BaseClass::MatrixPair getRawG(); + +private: + /** This does a bunch of things, this is the majority of a step * For each delayed_move it: * * computes the acceptance prob * * compares std::abs(acceptance prob) and random value @@ -82,17 +101,6 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase& d_matrix_builder_; -private: - void doSubmatrixUpdate(); /** returns [acceptance_probability , mc_weight_ratio ] @@ -103,10 +111,7 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase::CtintWalkerSubmatrixCpu( const Parameters& parameters_ref, const Data& data, Rng& rng_ref, DMatrixBuilder& d_matrix_builder, int id) : SubmatrixBase(parameters_ref, data, rng_ref, id), d_matrix_builder_(d_matrix_builder) { - - for (int b = 0; b < n_bands_; ++b) { + for (int b = 0; b < n_bands_; ++b) { for (int i = 1; i <= 3; ++i) { f_[i][b] = d_matrix_builder_.computeF(i, b); f_[-i][b] = d_matrix_builder_.computeF(-i, b); @@ -202,17 +206,23 @@ CtintWalkerSubmatrixCpu::CtintWalkerSubmatrixCpu( } f_[0][b] = 1; } - } template void CtintWalkerSubmatrixCpu::setMFromConfig() { BaseClass::setMFromConfigImpl(d_matrix_builder_); - transformM(); + SubmatrixBase::transformM(); +#ifdef DEBUG_SUBMATRIX + std::cout << "cpu M post setMFromConfigImpl: \n"; + M_[0].set_name("cpu_M_0"); + M_[1].set_name("cpu_M_1"); + M_[0].print(); + M_[1].print(); +#endif } template -void 
CtintWalkerSubmatrixCpu::markThermalized() { +void CtintWalkerSubmatrixCpu::markThermalized() { thermalized_ = true; nb_steps_per_sweep_ = std::max(1., std::ceil(sweeps_per_meas_ * partial_order_avg_.mean())); @@ -225,7 +235,7 @@ void CtintWalkerSubmatrixCpu::markThermalized() { // Recompute the Monte Carlo weight. setMFromConfig(); #ifndef NDEBUG - //writeAlphas(); + // writeAlphas(); #endif } @@ -242,26 +252,55 @@ void CtintWalkerSubmatrixCpu::computeMInit() { Scalar f_j; D_.resize(std::make_pair(delta, n_init_[s])); + if (delta == 0 || n_init_[s] == 0) + throw std::runtime_error( + "expansion factor dropped to 0 or below use a higher beta or larger interaction!"); + d_matrix_builder_.computeG0(D_, configuration_.getSector(s), n_init_[s], n_max_[s], 0); +#ifdef DEBUG_SUBMATRIX + D_.print(); +#endif + + std::array, 2> f_values; + f_values[s].resize(n_init_[s]); for (int j = 0; j < n_init_[s]; ++j) { const auto field_type = configuration_.getSector(s).getAuxFieldType(j); const auto b = configuration_.getSector(s).getRightB(j); f_j = f_[field_type][b] - 1; - + f_values[s][j] = f_[field_type][b]; for (int i = 0; i < delta; ++i) { D_(i, j) *= f_j; } } +#ifdef DEBUG_SUBMATRIX + f_values[0].set_name("cpu_f_values_0"); + f_values[1].set_name("cpu_f_values_1"); + f_values[0].print(); + f_values[1].print(); + std::cout << "cpu D post f factor mult\n"; + D_.print(); + using namespace dca::addt_str_oper; + std::cout << "M_[" << s << "] size: " << M_[s].size() << '\n'; +#endif + M_[s].resize(n_max_[s]); MatrixView M(M_[s], 0, 0, n_init_[s], n_init_[s]); MatrixView D_M(M_[s], n_init_[s], 0, delta, n_init_[s]); +#ifdef DEBUG_SUBMATRIX + std::cout << "cpu M pre gemm\n"; + M_[s].print(); +#endif + linalg::matrixop::gemm(D_, M, D_M); flop_ += 2 * D_.nrRows() * D_.nrCols() * M.nrCols(); +#ifdef DEBUG_SUBMATRIX + D_M.print(); +#endif for (int i = 0; i < n_max_[s]; ++i) { for (int j = n_init_[s]; j < n_max_[s]; ++j) { M_[s](i, j) = 0; @@ -270,6 +309,9 @@ void CtintWalkerSubmatrixCpu::computeMInit() { for (int i = n_init_[s]; i < n_max_[s]; ++i) M_[s](i, i) = 1; +#ifdef DEBUG_SUBMATRIX + M_[s].print(); +#endif } } } @@ -386,6 +428,8 @@ void CtintWalkerSubmatrixCpu::updateM() { linalg::matrixop::copyCols(M_[s], source_list_[s], M_[s], removal_list_[s]); M_[s].resize(configuration_.getSector(s).size()); } + + } template @@ -493,20 +537,6 @@ void CtintWalkerSubmatrixCpu::recomputeGammaInv() { } } -template -void CtintWalkerSubmatrixCpu::transformM() { - for (int s = 0; s < 2; ++s) { - for (int j = 0; j < M_[s].size().second; ++j) { - for (int i = 0; i < M_[s].size().first; ++i) { - const auto field_type = configuration_.getSector(s).getAuxFieldType(i); - const auto b = configuration_.getSector(s).getLeftB(i); - const Scalar f_i = -(f_[field_type][b] - 1); - M_[s](i, j) /= f_i; - } - } - } -} - template void CtintWalkerSubmatrixCpu::computeM(typename BaseClass::MatrixPair& m_accum) { for (int s = 0; s < 2; ++s) { @@ -591,6 +621,34 @@ void CtintWalkerSubmatrixCpu::computeMixedInsertionAndRemoval( computeRemovalMatrix(s); } +template +CtintWalkerSubmatrixCpu::BaseClass::MatrixPair CtintWalkerSubmatrixCpu< + Parameters, DIST>::getRawM() { + typename BaseClass::MatrixPair M; + M = M_; + M[0].set_name("subMatrixCPU::M[0]"); + M[1].set_name("subMatrixCPU::M[1]"); + return M; +} + +template +CtintWalkerSubmatrixCpu::BaseClass::MatrixPair CtintWalkerSubmatrixCpu< + Parameters, DIST>::getRawG() { + typename BaseClass::MatrixPair G; + G = G_; + G[0].set_name("subMatrixCPU::G[0]"); + G[1].set_name("subMatrixCPU::G[1]"); + 
return G; +} + +template +CtintWalkerSubmatrixCpu::BaseClass::MatrixPair CtintWalkerSubmatrixCpu::getM() { + typename BaseClass::MatrixPair M; + computeM(M); + return M; +} + } // namespace ctint } // namespace solver } // namespace phys diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp index e098599e6..a6096373b 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp @@ -29,6 +29,7 @@ #include "dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/walker/kernels_interface.hpp" +#include "dca/util/to_string.hpp" namespace dca { namespace phys { namespace solver { @@ -65,6 +66,8 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase& m_accum); + MatrixPair getM(); + void doSweep() override; void synchronize() const override; @@ -75,19 +78,29 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase getRawG(); + MatrixPair getRawM(); + + void updateM() override; private: void doStep() override; - void computeMInit(); - void computeGInit(); - void updateM(); - - void uploadConfiguration(); - protected: using BaseClass::configuration_; using BaseClass::M_; @@ -100,8 +113,6 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase::CtintWalkerSubmatrixGpu( if (concurrency_.id() == concurrency_.first() && thread_id_ == 0) std::cout << "\nCT-INT submatrix walker extended to GPU." << std::endl; } - + template void CtintWalkerSubmatrixGpu::setMFromConfig() { BaseClass::setMFromConfigImpl(d_matrix_builder_); + SubmatrixBase::transformM(); +#ifdef DEBUG_SUBMATRIX + M_[0].set_name("gpu_M_0"); + M_[1].set_name("gpu_M_1"); +#endif for (int s = 0; s < 2; ++s) { +#ifdef DEBUG_SUBMATRIX + std::cout << "GPU pre set M: \n"; + M_[s].print(); +#endif M_dev_[s].setAsync(M_[s], get_stream(s)); } } @@ -181,9 +201,7 @@ void CtintWalkerSubmatrixGpu::setMFromConfig() { template void CtintWalkerSubmatrixGpu::synchronize() const { Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_); - checkRC(cudaStreamSynchronize(get_stream(0))); - checkRC(cudaStreamSynchronize(get_stream(1))); } template @@ -197,8 +215,9 @@ void CtintWalkerSubmatrixGpu::doSweep() { template void CtintWalkerSubmatrixGpu::doStep(const int n_moves_to_delay) { SubmatrixBase::nbr_of_moves_to_delay_ = n_moves_to_delay; - doStep(); uploadConfiguration(); + synchronize(); + doStep(); } template @@ -232,6 +251,12 @@ void CtintWalkerSubmatrixGpu::uploadConfiguration() { f_dev_[s].setAsync(values, get_stream(s)); } +#ifdef DEBUG_SUBMATRIX + std::cout << "GPU f_values: \n"; + f_values_[0].print(); + f_values_[1].print(); +#endif + for (int s = 0; s < 2; ++s) config_copied_[s].record(get_stream(s)); } @@ -240,24 +265,58 @@ template void CtintWalkerSubmatrixGpu::computeMInit() { // Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_); - for (int s = 0; s < 2; ++s) + for (int s = 0; s < 2; ++s) { + const GpuStream& stream = get_stream(s); + stream.sync(); M_dev_[s].resize(n_max_[s]); + } for (int s = 0; s < 2; ++s) { const int delta = n_max_[s] - n_init_[s]; if (delta > 0) { D_dev_[s].resizeNoCopy(std::make_pair(delta, n_init_[s])); + + if (delta == 0 || n_init_[s] == 0) + throw
std::runtime_error( + "expansion factor dropped to 0 or below; use a higher beta or a larger interaction!"); + d_matrix_builder_.computeG0(D_dev_[s], device_config_.getDeviceData(s), n_init_[s], false, - get_stream(s)); + get_stream(s)); flop_ += D_dev_[s].nrCols() * D_dev_[s].nrRows() * 10; +#ifdef DEBUG_SUBMATRIX + // debugging + SubmatrixBase::D_ = D_dev_[s]; + SubmatrixBase::D_.print(); +#endif MatrixView D_view(D_dev_[s]); details::multiplyByFColFactor(D_view, f_dev_[s].ptr(), get_stream(s)); +#ifdef DEBUG_SUBMATRIX + std::cout << "GPU D multiplied by FCol Factor\n"; + SubmatrixBase::D_ = D_dev_[s]; + SubmatrixBase::D_.print(); +#endif MatrixView M(M_dev_[s], 0, 0, n_init_[s], n_init_[s]); MatrixView D_M(M_dev_[s], n_init_[s], 0, delta, n_init_[s]); +#ifdef DEBUG_SUBMATRIX + SubmatrixBase::M_[s] = M_dev_[s]; + std::cout << "GPU M pre gemm M\n"; + SubmatrixBase::M_[s].print(); +#endif + linalg::matrixop::gemm(D_dev_[s], M, D_M, thread_id_, s); + +#ifdef DEBUG_SUBMATRIX + SubmatrixBase::M_[s] = M_dev_[s]; + std::cout << "GPU M post gemm M\n"; + SubmatrixBase::M_[s].print(); + MatrixView D_M_host(SubmatrixBase::M_[s], n_init_[s], 0, delta, n_init_[s]); + std::cout << "gpu D_M_ print\n"; + D_M_host.print(); +#endif + flop_ += 2 * D_dev_[s].nrRows() * D_dev_[s].nrCols() * M.nrCols(); details::setRightSectorToId(M_dev_[s].ptr(), M_dev_[s].leadingDimension(), n_init_[s], @@ -272,27 +331,31 @@ void CtintWalkerSubmatrixGpu::computeGInit() { for (int s = 0; s < 2; ++s) { const int delta = n_max_[s] - n_init_[s]; + + // On the CPU we only do all this if delta > 0. auto& f_dev = f_dev_[s]; G_dev_[s].resizeNoCopy(n_max_[s]); - MatrixView G(G_dev_[s]); + MatrixView G_view_dev(G_dev_[s]); const MatrixView M(M_dev_[s]); - details::computeGLeft(G, M, f_dev.ptr(), n_init_[s], get_stream(s)); + details::computeGLeft(G_view_dev, M, f_dev.ptr(), n_init_[s], get_stream(s)); flop_ += n_init_[s] * n_max_[s] * 2; if (delta > 0) { G0_dev_[s].resizeNoCopy(std::make_pair(n_max_[s], delta)); + // This doesn't do flops itself, but the g0 interpolation data it uses does somewhere. d_matrix_builder_.computeG0(G0_dev_[s], device_config_.getDeviceData(s), n_init_[s], true, - get_stream(s)); + get_stream(s)); flop_ += G0_dev_[s].nrCols() * G0_dev_[s].nrRows() * 10; - MatrixView G(G_dev_[s], 0, n_init_[s], n_max_[s], delta); + MatrixView G_view_reduced(G_dev_[s], 0, n_init_[s], n_max_[s], delta); // compute G right. - linalg::matrixop::gemm(M_dev_[s], G0_dev_[s], G, thread_id_, s); + linalg::matrixop::gemm(M_dev_[s], G0_dev_[s], G_view_reduced, thread_id_, s); flop_ += 2 * M_dev_[s].nrRows() * M_dev_[s].nrCols() * G0_dev_[s].nrCols(); + + G_[s].setAsync(G_dev_[s], get_stream(s)); } - G_[s].setAsync(G_dev_[s], get_stream(s)); } } @@ -325,7 +388,10 @@ void CtintWalkerSubmatrixGpu::updateM() { old_G.resizeNoCopy(std::make_pair(n_max_[s], gamma_size)); old_M.resizeNoCopy(std::make_pair(gamma_size, n_max_[s])); - assert(dca::linalg::util::getStream(thread_id_, s) == get_stream(s)); + // We no longer use one stream per spin. + // This isn't portable, and e.g. AMD doesn't really benefit from multiple streams per thread, + // and it adds much complexity. + // assert(dca::linalg::util::getStream(thread_id_, s) == get_stream(s)); move_indices_dev_[s].setAsync(move_indices_[s], get_stream(s)); // Note: an event synchronization might be necessary if the order of operation is changed.
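// A minimal sketch of the event synchronization mentioned in the note above, assuming
// the plain CUDA runtime API (mapped to HIP by cuda2hip.h); `copy_done` and
// `consumer_stream` are hypothetical:
//
//   cudaEvent_t copy_done;
//   checkRC(cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming));
//   checkRC(cudaEventRecord(copy_done, get_stream(s)));          // after the setAsync
//   checkRC(cudaStreamWaitEvent(consumer_stream, copy_done, 0)); // device-side wait, host not blocked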
linalg::matrixop::copyCols(G_dev_[s], move_indices_dev_[s], old_G, thread_id_, s); @@ -392,6 +458,52 @@ std::size_t CtintWalkerSubmatrixGpu::deviceFingerprint() const return res; } +template +CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSubmatrixGpu< + Parameters, DIST>::getRawM() { + std::array, 2> M_copy{M_dev_[0].size(), M_dev_[1].size()}; + // Synchronous copies for testing + linalg::util::memoryCopyD2H(M_copy[0].ptr(), M_copy[0].leadingDimension(), M_dev_[0].ptr(), + M_dev_[0].leadingDimension(), M_dev_[0].size()); + linalg::util::memoryCopyD2H(M_copy[1].ptr(), M_copy[1].leadingDimension(), M_dev_[1].ptr(), + M_dev_[1].leadingDimension(), M_dev_[1].size()); + M_copy[0].set_name("subMatrixGPU::M[0]"); + M_copy[1].set_name("subMatrixGPU::M[1]"); + + return M_copy; +} + +template +CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSubmatrixGpu< + Parameters, DIST>::getRawG() { + std::array, 2> G_copy{G_dev_[0].size(), G_dev_[1].size()}; + // Synchronous copies for testing + linalg::util::memoryCopyD2H(G_copy[0].ptr(), G_copy[0].leadingDimension(), G_dev_[0].ptr(), + G_dev_[0].leadingDimension(), G_dev_[0].size()); + linalg::util::memoryCopyD2H(G_copy[1].ptr(), G_copy[1].leadingDimension(), G_dev_[1].ptr(), + G_dev_[1].leadingDimension(), G_dev_[1].size()); + G_copy[0].set_name("subMatrixGPU::G[0]"); + G_copy[1].set_name("subMatrixGPU::G[1]"); + + return G_copy; +} + +template +CtintWalkerSubmatrixGpu::MatrixPair CtintWalkerSubmatrixGpu< + Parameters, DIST>::getM() { + std::array, 2> M; + synchronize(); + computeM(M); + checkRC(cudaDeviceSynchronize()); + + std::array, 2> M_copy{M[0].size(), M[1].size()}; + linalg::util::memoryCopyD2H(M_copy[0].ptr(), M_copy[0].leadingDimension(), M[0].ptr(), + M[0].leadingDimension(), M[0].size()); + linalg::util::memoryCopyD2H(M_copy[1].ptr(), M_copy[1].leadingDimension(), M[1].ptr(), + M[1].leadingDimension(), M[1].size()); + return M_copy; +} + } // namespace ctint } // namespace solver } // namespace phys diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp index 3b2901aa6..7336a43fe 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp @@ -50,6 +50,7 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { using BaseClass::order; virtual void setMFromConfig() = 0; + auto getF() const { return f_; } protected: virtual void doStep() override; void doSteps(); @@ -80,6 +81,8 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { virtual void updateM() = 0; + virtual void computeMInit() = 0; + private: void doSubmatrixUpdate(); @@ -92,8 +95,6 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { void removeRowAndColOfGammaInv(); - virtual void computeMInit() = 0; - // void computeG0Init(); virtual void computeGInit() = 0; @@ -113,7 +114,7 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { assert(move_idx < sector_indices_[s].size()); return sector_indices_[s][move_idx] >= n_init_[s]; } - + protected: using MatrixView = linalg::MatrixView; using Matrix = linalg::Matrix; @@ -209,7 +210,7 @@ class CtintWalkerSubmatrixBase : public CtintWalkerBase { std::array Gamma_q_; Matrix workspace_; - Matrix D_; + Matrix D_{"SubMatrixHostD"}; using BaseClass::flop_; }; @@ -248,6 +249,7 @@ void CtintWalkerSubmatrixBase::doSweep() { doSteps(); } + template void 
CtintWalkerSubmatrixBase::doSteps() { // Here nbr_of_steps is the number of single steps/moves during the current sweep, @@ -346,6 +348,20 @@ void CtintWalkerSubmatrixBase::doSubmatrixUpdate() { updateM(); } +template +void CtintWalkerSubmatrixBase::transformM() { + for (int s = 0; s < 2; ++s) { + for (int j = 0; j < M_[s].size().second; ++j) { + for (int i = 0; i < M_[s].size().first; ++i) { + const auto field_type = configuration_.getSector(s).getAuxFieldType(i); + const auto b = configuration_.getSector(s).getLeftB(i); + const Scalar f_i = -(f_[field_type][b] - 1); + M_[s](i, j) /= f_i; + } + } + } +} + template void CtintWalkerSubmatrixBase::mainSubmatrixProcess() { Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_); @@ -778,20 +794,6 @@ void CtintWalkerSubmatrixBase::recomputeGammaInv() { } } -template -void CtintWalkerSubmatrixBase::transformM() { - for (int s = 0; s < 2; ++s) { - for (int j = 0; j < M_[s].size().second; ++j) { - for (int i = 0; i < M_[s].size().first; ++i) { - const auto field_type = configuration_.getSector(s).getAuxFieldType(i); - const auto b = configuration_.getSector(s).getLeftB(i); - const Scalar f_i = -(f_[field_type][b] - 1); - M_[s](i, j) /= f_i; - } - } - } -} - template void CtintWalkerSubmatrixBase::computeInsertionMatrices( const std::vector& insertion_indices, const int s) { diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp index ecec506e0..40bdb92a4 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp @@ -83,6 +83,8 @@ class DMatrixBuilder { void computeG0(Matrix& G0, const Sector& configuration, const int n_init, const int n_max, const int which_section) const; + auto& getSiteDiff() { return site_diff_; } + #ifdef DCA_HAVE_GPU virtual void computeG0(linalg::Matrix& /*G0*/, const details::DeviceConfiguration& /*configuration*/, int /*n_init*/, diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh index b36133118..8da2658ac 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh @@ -96,6 +96,10 @@ protected: // Global instance to be used in the tp accumulation kernel. extern __device__ __constant__ G4Helper g4_helper; +extern __CONSTANT__ int* w_ex_indices_actual; +extern __CONSTANT__ int* k_ex_indices_actual; + + inline __device__ int G4Helper::addWex(const int w_idx, const int w_ex_idx) const { return w_idx + w_ex_indices_[w_ex_idx]; } diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh b/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh index 0dba00d07..83962ef68 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cuh @@ -39,12 +39,15 @@ public: // returns the index of -id __DEVICE__ inline int minus(int id) const; -private: + // private: int nc_; const int* add_matrix_; const int* sub_matrix_; }; +extern __CONSTANT__ int* cluster_add_matrix; +extern __CONSTANT__ int* cluster_sub_matrix; + // Global instance for real space and momentum clusters. 
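// A minimal sketch of how such a __constant__ pointer gets populated, assuming a device
// allocation `dev_add` that already holds the table (this mirrors what cluster_helper.cu
// does below with cudaMemcpyToSymbol):
//
//   int* dev_add = nullptr;
//   checkRC(cudaMalloc(&dev_add, nc * lda * sizeof(int)));
//   checkRC(cudaMemcpy(dev_add, host_add, nc * lda * sizeof(int), cudaMemcpyHostToDevice));
//   checkRC(cudaMemcpyToSymbol(cluster_add_matrix, &dev_add, sizeof(int*)));  // copies the pointer value, not the table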
extern __DEVICE__ __CONSTANT__ ClusterHelper cluster_real_helper; extern __DEVICE__ __CONSTANT__ ClusterHelper cluster_momentum_helper; diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh b/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh index 8f2b75355..bdc861b3b 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/solver_helper.cuh @@ -44,7 +44,6 @@ public: private: static bool initialized_; - std::size_t subdm_step_[2]; }; diff --git a/include/dca/phys/parameters/mc_solver_parameters.hpp b/include/dca/phys/parameters/mc_solver_parameters.hpp index 595b946b5..c2440a50c 100644 --- a/include/dca/phys/parameters/mc_solver_parameters.hpp +++ b/include/dca/phys/parameters/mc_solver_parameters.hpp @@ -99,7 +99,7 @@ int McSolverParameters::getBufferSize(const Concurrency template void McSolverParameters::pack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) const { + int buffer_size, int& position) const { concurrency.pack(buffer, buffer_size, position, expansion_parameter_K_); concurrency.pack(buffer, buffer_size, position, initial_configuration_size_); concurrency.pack(buffer, buffer_size, position, initial_matrix_size_); @@ -110,7 +110,7 @@ void McSolverParameters::pack(const Concurrency& concur template void McSolverParameters::unpack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) { + int buffer_size, int& position) { concurrency.unpack(buffer, buffer_size, position, expansion_parameter_K_); concurrency.unpack(buffer, buffer_size, position, initial_configuration_size_); concurrency.unpack(buffer, buffer_size, position, initial_matrix_size_); @@ -207,16 +207,18 @@ int McSolverParameters::getBufferSize(const Concurre } template -void McSolverParameters::pack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) const { +void McSolverParameters::pack(const Concurrency& concurrency, + char* buffer, int buffer_size, + int& position) const { concurrency.pack(buffer, buffer_size, position, self_energy_tail_cutoff_); concurrency.pack(buffer, buffer_size, position, steps_per_sweep_); concurrency.pack(buffer, buffer_size, position, shifts_per_sweep_); } template -void McSolverParameters::unpack(const Concurrency& concurrency, char* buffer, - int buffer_size, int& position) { +void McSolverParameters::unpack(const Concurrency& concurrency, + char* buffer, int buffer_size, + int& position) { concurrency.unpack(buffer, buffer_size, position, self_energy_tail_cutoff_); concurrency.unpack(buffer, buffer_size, position, steps_per_sweep_); concurrency.unpack(buffer, buffer_size, position, shifts_per_sweep_); @@ -252,6 +254,17 @@ void McSolverParameters::readWrite(ReaderOrWriter& r // Specialization for CT-INT template <> class McSolverParameters { +private: + /// code does not currently cover corner case of configuration size 0, as such this isn't a sensible default to use. 
+ int initial_configuration_size_ = 2; + double alpha_dd_pos_ = 0.501; + double alpha_dd_neg_ = 0; + double alpha_ndd_ = 1e-4; + bool adjust_alpha_dd_ = false; + double double_update_probability_ = 0; + bool all_sites_partnership_ = 0; + int max_submatrix_size_ = 1; + public: template int getBufferSize(const Concurrency& concurrency) const; @@ -298,16 +311,6 @@ class McSolverParameters { void setMaxSubmatrixSize(const int size) { max_submatrix_size_ = size; } - -private: - int initial_configuration_size_ = 0; - double alpha_dd_pos_ = 0.501; - double alpha_dd_neg_ = 0; - double alpha_ndd_ = 1e-4; - bool adjust_alpha_dd_ = false; - double double_update_probability_ = 0; - bool all_sites_partnership_ = 0; - int max_submatrix_size_ = 1; }; template @@ -327,7 +330,7 @@ int McSolverParameters::getBufferSize(const Concurrency template void McSolverParameters::pack(const Concurrency& concurrency, char* buffer, - const int buffer_size, int& position) const { + const int buffer_size, int& position) const { concurrency.pack(buffer, buffer_size, position, initial_configuration_size_); concurrency.pack(buffer, buffer_size, position, alpha_dd_pos_); concurrency.pack(buffer, buffer_size, position, alpha_dd_neg_); @@ -340,7 +343,7 @@ void McSolverParameters::pack(const Concurrency& concur template void McSolverParameters::unpack(const Concurrency& concurrency, char* buffer, - const int buffer_size, int& position) { + const int buffer_size, int& position) { concurrency.unpack(buffer, buffer_size, position, initial_configuration_size_); concurrency.unpack(buffer, buffer_size, position, alpha_dd_pos_); concurrency.unpack(buffer, buffer_size, position, alpha_dd_neg_); diff --git a/include/dca/phys/parameters/physics_parameters.hpp b/include/dca/phys/parameters/physics_parameters.hpp index 51f53cba4..12ff59554 100644 --- a/include/dca/phys/parameters/physics_parameters.hpp +++ b/include/dca/phys/parameters/physics_parameters.hpp @@ -57,6 +57,7 @@ class PhysicsParameters { return adjust_chemical_potential_; } + void set_beta(double beta) { beta_ = beta; } private: double beta_; double density_; diff --git a/include/dca/util/cuda2hip.h b/include/dca/util/cuda2hip.h index 4de6d8caa..256897d86 100644 --- a/include/dca/util/cuda2hip.h +++ b/include/dca/util/cuda2hip.h @@ -141,6 +141,7 @@ #define cudaIpcMemHandle_t hipIpcMemHandle_t #define cudaIpcMemLazyEnablePeerAccess hipIpcMemLazyEnablePeerAccess #define cudaIpcOpenMemHandle hipIpcOpenMemHandle +#define cudaGetSymbolSize hipGetSymbolSize #define cudaMalloc hipMalloc #define cudaMallocArray hipMallocArray #define cudaMallocManaged hipMallocManaged diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp index c188ee348..14365bbba 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp +++ b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.cpp @@ -126,7 +126,7 @@ auto DMatrixBuilder::computeGamma(const int aux_spin_type, const in // Compute only the parts of G0 required at a given moment. (Re)Computing every element is not needed in most situations.
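// A sketch of that idea, assuming the configuration grew from n_init to n_max entries:
// only the new rows and the new columns of G0 have to be filled in,
//
//   for (int j = 0; j < n_max; ++j)
//     for (int i = (j < n_init ? n_init : 0); i < n_max; ++i)
//       G0(i, j) = /* interpolated g0 for vertices i and j */;
//
// (hypothetical loop; the which_section argument below selects which of these blocks is computed).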
template -void DMatrixBuilder::computeG0(Matrix& G0, const Sector& configuration, const int n_init, + void DMatrixBuilder::computeG0(Matrix& G0, const Sector& configuration, const int n_init, const int n_max, const int which_section) const { int b_i, b_j, r_i, r_j; Real tau_i, tau_j; diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp index 2b59fffa6..4f2b171f9 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp +++ b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp @@ -32,6 +32,8 @@ DMatrixBuilder::DMatrixBuilder(const G0Interpolation G0, const int b_j = config.getRightB(j); const auto tau_j = config.getTau(j); + const int label = dca::phys::solver::details::solver_helper.index(b_i, b_j, config.getLeftR(i), config.getRightR(j)); diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu b/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu index ef354ba10..a116287c6 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu +++ b/src/phys/dca_step/cluster_solver/ctint/walker/walker_kernels.cu @@ -34,6 +34,7 @@ using dca::util::castGPUType; template __global__ void setRightSectorToIdKernel(Scalar* m, const int ldm, const int n0, const int n_max) { const int i = threadIdx.x + blockDim.x * blockIdx.x; + // this preserves the behavior where we skip the lower left sector. const int j = threadIdx.y + blockDim.y * blockIdx.y + n0; if (i >= n_max || j >= n_max) @@ -43,8 +44,9 @@ __global__ void setRightSectorToIdKernel(Scalar* m, const int ldm, const int n0, Scalar the_one{}; the_one += 1.0; Scalar the_zero{}; + //assert(the_one == (the_zero -= 1.0)); - m[i + ldm * j] = (i == j) ? the_one : the_zero; + m[i + ldm * j] = (i == j) ? the_one : the_zero; } template @@ -79,18 +81,18 @@ __global__ void computeGLeftKernel(MatrixView G, const MatrixView -void computeGLeft(MatrixView& G, const MatrixView& M, const Real* f, +void computeGLeft(MatrixView& G_dev, const MatrixView& M_dev, const Real* f_dev, int n_init, cudaStream_t stream) { if (n_init == 0) return; - const int n = G.nrRows(); + const int n = G_dev.nrRows(); constexpr int thread_j = 4; constexpr int thread_i = 64; dim3 threads(thread_i, thread_j); dim3 blocks(std::max(n / (10 * thread_i), 1), dca::util::ceilDiv(n_init, thread_j)); - computeGLeftKernel<<>>(G, M, f, n_init); + computeGLeftKernel<<>>(G_dev, M_dev, f_dev, n_init); } template void computeGLeft(MatrixView&, const MatrixView&, const float*, diff --git a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cu b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cu index 62dc9f69c..73613c596 100644 --- a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cu +++ b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cu @@ -10,6 +10,7 @@ // This file implements G4Helper::set. 
#include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/g4_helper.cuh" +#include "dca/platform/dca_gpu.h" #include #include @@ -27,6 +28,8 @@ namespace details { // dca::phys::solver::accumulator::details:: __device__ __constant__ G4Helper g4_helper; +__CONSTANT__ int* w_ex_indices_actual; +__CONSTANT__ int* k_ex_indices_actual; void G4Helper::set(int nb, int nk, int nw, const std::vector& delta_k, const std::vector& delta_w, const int extension_offset, const int* add_k, int lda, const int* sub_k, @@ -72,7 +75,9 @@ void G4Helper::set(int nb, int nk, int nw, const std::vector& delta_k, #ifndef NDEBUG cudaMalloc(&host_helper.bad_indicies_, sizeof(int) * 1024); #endif - cudaMemcpyToSymbol(g4_helper, &host_helper, sizeof(G4Helper)); + checkRC(cudaMemcpyToSymbol(w_ex_indices_actual, &host_helper.w_ex_indices_, sizeof(int*))); + checkRC(cudaMemcpyToSymbol(k_ex_indices_actual, &host_helper.k_ex_indices_, sizeof(int*))); + checkRC(cudaMemcpyToSymbol(g4_helper, &host_helper, sizeof(G4Helper))); } __device__ bool G4Helper::extendGIndices(int& k1, int& k2, int& w1, int& w2) const { diff --git a/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu b/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu index f6fa66987..4b3debc5f 100644 --- a/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu +++ b/src/phys/dca_step/cluster_solver/shared_tools/cluster_helper.cu @@ -13,6 +13,7 @@ #include #include +#include #include "dca/platform/dca_gpu.h" #include "dca/linalg/util/allocators/vectors_typedefs.hpp" @@ -24,6 +25,21 @@ namespace details { __device__ __constant__ ClusterHelper cluster_real_helper; __device__ __constant__ ClusterHelper cluster_momentum_helper; +__CONSTANT__ int* cluster_add_matrix; +__CONSTANT__ int* cluster_sub_matrix; + + __global__ void checkClusterHelper(int nc, int lds) { + const int i_t = threadIdx.x + blockDim.x * blockIdx.x; + const int j = threadIdx.y + blockDim.y * blockIdx.y; + if (i_t == 0 && j == 0) { + for (int ch_i = 0; ch_i < nc; ++ch_i) { + for (int ch_j = 0; ch_j < lds; ++ch_j) { + printf("%d ", cluster_real_helper.sub_matrix_[ch_i * lds + ch_j]); + } + printf("\n"); + } + } +} void ClusterHelper::set(int nc, const int* add, int lda, const int* sub, int lds, bool momentum) { static std::array flags; @@ -44,13 +60,29 @@ void ClusterHelper::set(int nc, const int* add, int lda, const int* sub, int lds compact_transfer(add, lda, const_cast(&host_helper.add_matrix_)); compact_transfer(sub, lds, const_cast(&host_helper.sub_matrix_)); + // The logic here for CUDA is transfers from unpinned memory only block for the copy from host + // memory to a DMA buffer. This is on the default queue here and so a stream synch will not + // work to insure the ClusterHelper copy is actually complete. + cudaDeviceSynchronize(); if (momentum) { - cudaMemcpyToSymbol(cluster_momentum_helper, &host_helper, sizeof(ClusterHelper)); + + cudaMemcpyToSymbol(cluster_momentum_helper, &host_helper, sizeof(ClusterHelper), + cudaMemcpyHostToDevice); } else { - cudaMemcpyToSymbol(cluster_real_helper, &host_helper, sizeof(ClusterHelper)); + // In debug on sdgx-2 for CTINT I see know evidence this actually works, it appears not to. 
+ size_t cluster_helper_size; + checkRC(cudaGetSymbolSize(&cluster_helper_size, cluster_real_helper)); + assert(cluster_helper_size == sizeof(ClusterHelper)); + checkRC(cudaMemcpyToSymbol(cluster_add_matrix, &host_helper.add_matrix_, sizeof(int*))); + checkRC(cudaMemcpyToSymbol(cluster_sub_matrix, &host_helper.sub_matrix_, sizeof(int*))); + checkRC(cudaMemcpyToSymbol(cluster_real_helper, &host_helper, cluster_helper_size)); + cudaDeviceSynchronize(); } +#ifndef NDEBUG + checkClusterHelper<<<1, 1, 0, 0>>>(nc, lds); +#endif }); } diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp index 1f727cd4e..fd837332a 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp @@ -30,6 +30,9 @@ using McOptions = MockMcOptions; #include "walker_wrapper_submatrix.hpp" #include "dca/linalg/matrixop.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp" +#include "dca/util/to_string.hpp" + +using namespace dca::addt_str_oper; constexpr char input_name[] = DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json"; @@ -59,12 +62,12 @@ using namespace dca::phys::solver; template using CtintWalkerSubmatrixGpuTest = CtINTWalkerSubmatrixGPUTestT; -template +template using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder; // Currently testing float isn't really possible due to the way the Scalar type is // carried through from mc_options. See test_setup.hpp PD -using ScalarTypes = ::testing::Types; //double, +using ScalarTypes = ::testing::Types; // double, TYPED_TEST_CASE(CtintWalkerSubmatrixGpuTest, ScalarTypes); // Compare the submatrix update with a direct computation of the M matrix, and compare the @@ -77,9 +80,9 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { using Matrix = typename Walker::Matrix; using MatrixPair = std::array; using SbmWalkerCpu = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; + testing::phys::solver::ctint::WalkerWrapperSubmatrix; using SbmWalkerGpu = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; + testing::phys::solver::ctint::WalkerWrapperSubmatrix; std::vector setup_rngs{0., 0.00, 0.9, 0.5, 0.01, 0, 0.75, 0.02, 0, 0.6, 0.03, 1, 0.99, 0.04, 0.99}; @@ -97,15 +100,14 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { G0Interpolation g0_gpu(g0_func_gpu); typename CtintWalkerSubmatrixGpuTest::G0Setup::LabelDomain label_dmn; - constexpr int bands = dca::testing::LatticeBilayer::BANDS; DMatrixBuilder d_matrix_cpu(g0_cpu, bands, RDmn()); + d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false); // cpu_parameters.adjustAlphaDd()); SbmWalkerCpu::setInteractionVertices(cpu_data, cpu_parameters); - d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false); //cpu_parameters.adjustAlphaDd()); DMatrixBuilder d_matrix_gpu(g0_gpu, bands, RDmn()); + d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false); // gpu_parameters.adjustAlphaDd()); SbmWalkerGpu::setInteractionVertices(cpu_data, gpu_parameters); - d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false); //gpu_parameters.adjustAlphaDd()); // ************************************ // Test vertex insertion / removal **** @@ -128,10 +130,29 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { 0, 0.99, 0.4, 0.2, -1, // Insertion }; - for (const int initial_size : 
std::array{0, 5}) { + constexpr Scalar tolerance = 10e-12; + auto compareSubMatrix = [tolerance](auto& mat_cpu, auto& mat_gpu, const std::string& mat_tag) { + int fail = 0; + for (int s = 0; s < 2; ++s) { + std::cout << mat_tag << "_cpu[" << s << "] size: " << mat_cpu[s].size() << '\n'; + std::cout << mat_tag << "_gpu[" << s << "] size: " << mat_gpu[s].size() << '\n'; + for (int j = 0; j < mat_cpu[s].size().second; ++j) + for (int i = 0; i < mat_cpu[s].size().first; ++i) { + if (std::abs(mat_cpu[s](i, j) - mat_gpu[s](i, j)) > tolerance) { + std::cout << mat_tag << " element: " << s << "," << i << "," << j << " cpu(" + << mat_cpu[s](i, j) << ") - gpu(" << mat_gpu[s](i, j) << ") > " << tolerance + << '\n'; + ++fail; + } + } + } + return fail; + }; + + for (const int initial_size : std::array{2}) { cpu_parameters.setInitialConfigurationSize(initial_size); gpu_parameters.setInitialConfigurationSize(initial_size); - for (int steps = 1; steps <= 8; ++steps) { + for (int steps = 2; steps <= 8; ++steps) { cpu_rng.setNewValues(setup_rngs); SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng, d_matrix_cpu); gpu_rng.setNewValues(setup_rngs); @@ -139,15 +160,12 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { cpu_rng.setNewValues(rng_vals); walker_cpu.doStep(steps); + gpu_rng.setNewValues(rng_vals); walker_gpu.doStep(steps); - constexpr Scalar tolerance = std::numeric_limits::epsilon() * 100; - - auto M_cpu = walker_cpu.getM(); - auto M_gpu = walker_gpu.getM(); - for (int s = 0; s < 2; ++s) - EXPECT_TRUE(dca::linalg::matrixop::areNear(M_cpu[s], M_gpu[s], tolerance)); + // doSweep does this + walker_gpu.uploadConfiguration(); // The final configuration is the same. const auto& config1 = walker_cpu.getWalkerConfiguration(); @@ -156,6 +174,211 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { for (int i = 0; i < config1.size(); ++i) EXPECT_EQ(config1[i], config2[i]); EXPECT_EQ(walker_cpu.get_sign(), walker_gpu.get_sign()); + + auto M_cpu = walker_cpu.getM(); + auto M_gpu = walker_gpu.getM(); + + auto fail = compareSubMatrix(M_cpu, M_gpu, "M"); + EXPECT_FALSE(fail); } } } + +// This test still fails if its run second in the batch. 
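// If the fixed absolute cutoff in compareSubMatrix proves too tight for float, a relative
// comparison is a common alternative; a minimal sketch with a hypothetical helper (not used
// by these tests):
//
//   auto nearRel = [](double a, double b, double rtol) {
//     return std::abs(a - b) <= rtol * std::max(1.0, std::max(std::abs(a), std::abs(b)));
//   };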
+ +TYPED_TEST(CtintWalkerSubmatrixGpuTest, fineGrained) { + using Scalar = TypeParam; + using Parameters = typename CtintWalkerSubmatrixGpuTest::G0Setup::Parameters; + using Walker = testing::phys::solver::ctint::WalkerWrapper; + using Matrix = typename Walker::Matrix; + using MatrixPair = std::array; + using SbmWalkerCpu = + testing::phys::solver::ctint::WalkerWrapperSubmatrix; + using SbmWalkerGpu = + testing::phys::solver::ctint::WalkerWrapperSubmatrix; + + std::vector setup_rngs{0., 0.00, 0.9, 0.5, 0.01, 0, 0.75, 0.02, + 0, 0.6, 0.03, 1, 0.99, 0.04, 0.99}; + typename CtintWalkerSubmatrixGpuTest::G0Setup::RngType cpu_rng(setup_rngs); + typename CtintWalkerSubmatrixGpuTest::G0Setup::RngType gpu_rng(setup_rngs); + + auto& cpu_data = *this->host_setup.data_; + auto& gpu_data = *this->gpu_setup.data_; + auto& cpu_parameters = this->host_setup.parameters_; + auto& gpu_parameters = this->gpu_setup.parameters_; + + const auto g0_func_cpu = dca::phys::solver::ctint::details::shrinkG0(cpu_data.G0_r_t); + G0Interpolation g0_cpu(g0_func_cpu); + const auto g0_func_gpu = dca::phys::solver::ctint::details::shrinkG0(gpu_data.G0_r_t); + G0Interpolation g0_gpu(g0_func_gpu); + typename CtintWalkerSubmatrixGpuTest::G0Setup::LabelDomain label_dmn; + + constexpr int bands = dca::testing::LatticeBilayer::BANDS; + + DMatrixBuilder d_matrix_cpu(g0_cpu, bands, RDmn()); + d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false); // cpu_parameters.adjustAlphaDd()); + SbmWalkerCpu::setInteractionVertices(cpu_data, cpu_parameters); + DMatrixBuilder d_matrix_gpu(g0_gpu, bands, RDmn()); + d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false); // gpu_parameters.adjustAlphaDd()); + SbmWalkerGpu::setInteractionVertices(cpu_data, gpu_parameters); + + // ************************************ + // Test vertex insertion / removal **** + // ************************************ + // Set rng values. + // + // Insertion, vertex_id, tau, aux_spin, acceptance_rng + // Removal, vertex_id, acceptance_rng + // ... + // Note: if acceptance_rng <= 0 the move is always accepted, if it is > 1 the move is always + // rejected. + const std::vector rng_vals{ + 0, 0, 0.1, 0.8, -1, // Insertion. + 0, 0.99, 0.2, 0.8, -1, // Insertion. + 0, 0, 0.3, 0.8, 2, // Insertion. Rejected. + 1, 0, -1, // Remove pre-existing. + 1, 0.99, -1, // Remove recently inserted. + 1, 0.99, 2, // Remove recently inserted. Rejected + 1, 0, 2, // Remove . 
+      0, 0.99, 0.4, 0.2, -1,  // Insertion.
+  };
+
+  int initial_size = 2;
+
+  cpu_parameters.setInitialConfigurationSize(initial_size);
+  gpu_parameters.setInitialConfigurationSize(initial_size);
+  int steps = 4;
+
+  cpu_rng.setNewValues(setup_rngs);
+  SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng, d_matrix_cpu);
+  gpu_rng.setNewValues(setup_rngs);
+  SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng, d_matrix_gpu);
+
+  constexpr Scalar tolerance = 10e-12;
+
+  cpu_rng.setNewValues(rng_vals);
+  walker_cpu.generateDelayedMoves(steps);
+  walker_cpu.computeMInit();
+
+  walker_gpu.uploadConfiguration();
+  gpu_rng.setNewValues(rng_vals);
+  walker_gpu.generateDelayedMoves(steps);
+  walker_gpu.uploadConfiguration();
+  walker_gpu.computeMInit();
+
+  // walker_gpu.computeGInit();
+  auto M_cpu = walker_cpu.getRawM();
+  auto M_gpu = walker_gpu.getRawM();
+
+  std::cout << "Minit cpu size " << M_cpu[0].size().first << "," << M_cpu[0].size().second
+            << " : " << M_cpu[1].size().first << "," << M_cpu[1].size().second << '\n';
+  std::cout << "Minit gpu size " << M_gpu[0].size().first << "," << M_gpu[0].size().second
+            << " : " << M_gpu[1].size().first << "," << M_gpu[1].size().second << '\n';
+
+  std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n';
+  std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n';
+  std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n';
+  std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n';
+
+  M_cpu[0].print();
+  M_cpu[1].print();
+  M_gpu[0].print();
+  M_gpu[1].print();
+
+  auto compareSubMatrix = [tolerance](auto& mat_cpu, auto& mat_gpu, const std::string& mat_tag) {
+    int fail = 0;
+    for (int s = 0; s < 2; ++s) {
+      std::cout << mat_tag << "_cpu[" << s << "] size: " << mat_cpu[s].size() << '\n';
+      std::cout << mat_tag << "_gpu[" << s << "] size: " << mat_gpu[s].size() << '\n';
+      for (int j = 0; j < mat_cpu[s].size().second; ++j)
+        for (int i = 0; i < mat_cpu[s].size().first; ++i) {
+          if (std::abs(mat_cpu[s](i, j) - mat_gpu[s](i, j)) > tolerance) {
+            std::cout << mat_tag << " element: " << s << "," << i << "," << j << " cpu("
+                      << mat_cpu[s](i, j) << ") - gpu(" << mat_gpu[s](i, j) << ") > " << tolerance
+                      << '\n';
+            ++fail;
+          }
+        }
+    }
+    return fail;
+  };
+
+  auto fail = compareSubMatrix(M_cpu, M_gpu, "M");
+  EXPECT_FALSE(fail);
+
+  walker_cpu.computeGInit();
+  walker_gpu.computeGInit();
+  walker_gpu.synchronize();
+
+  auto G_cpu = walker_cpu.getRawG();
+  auto G_gpu = walker_gpu.getRawG();
+
+  fail = compareSubMatrix(G_cpu, G_gpu, "G");
+  EXPECT_FALSE(fail);
+
+  walker_cpu.mainSubmatrixProcess();
+  walker_gpu.mainSubmatrixProcess();
+
+  walker_cpu.updateM();
+  walker_gpu.updateM();
+
+  M_cpu = walker_cpu.getRawM();
+  M_gpu = walker_gpu.getRawM();
+
+  std::cout << "M post update cpu size " << M_cpu[0].size().first << "," << M_cpu[0].size().second
+            << " : " << M_cpu[1].size().first << "," << M_cpu[1].size().second << '\n';
+  std::cout << "M post update gpu size " << M_gpu[0].size().first << "," << M_gpu[0].size().second
+            << " : " << M_gpu[1].size().first << "," << M_gpu[1].size().second << '\n';
+
+  std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n';
+  std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n';
+  std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n';
+  std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n';
+
+  M_cpu[0].print();
+  M_cpu[1].print();
+  M_gpu[0].print();
+  M_gpu[1].print();
+
+  fail = compareSubMatrix(M_cpu, M_gpu, "updated_M");
+  EXPECT_FALSE(fail);
+
+  // The final configuration is the same.
+  const auto& config1 = walker_cpu.getWalkerConfiguration();
+  const auto& config2 = walker_gpu.getWalkerConfiguration();
+  ASSERT_EQ(config1.size(), config2.size());
+  for (int i = 0; i < config1.size(); ++i)
+    EXPECT_EQ(config1[i], config2[i]);
+  EXPECT_EQ(walker_cpu.get_sign(), walker_gpu.get_sign());
+
+  walker_gpu.uploadConfiguration();
+
+  auto comp_M_cpu = walker_cpu.getM();
+  auto comp_M_gpu = walker_gpu.getM();
+
+  std::cout << "computed M cpu size " << comp_M_cpu[0].size().first << ","
+            << comp_M_cpu[0].size().second << " : " << comp_M_cpu[1].size().first << ","
+            << comp_M_cpu[1].size().second << '\n';
+  std::cout << "computed M gpu size " << comp_M_gpu[0].size().first << ","
+            << comp_M_gpu[0].size().second << " : " << comp_M_gpu[1].size().first << ","
+            << comp_M_gpu[1].size().second << '\n';
+
+  std::cout << walker_cpu.getF()[-1][0] << "," << walker_cpu.getF()[-1][1] << '\n';
+  std::cout << walker_cpu.getF()[1][0] << "," << walker_cpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getF()[-1][0] << "," << walker_gpu.getF()[-1][1] << '\n';
+  std::cout << walker_gpu.getF()[1][0] << "," << walker_gpu.getF()[1][1] << '\n';
+  std::cout << walker_gpu.getFValues()[0][0] << "," << walker_gpu.getFValues()[0][1] << '\n';
+  std::cout << walker_gpu.getFValues()[1][0] << "," << walker_gpu.getFValues()[1][1] << '\n';
+
+  comp_M_cpu[0].print();
+  comp_M_cpu[1].print();
+  comp_M_gpu[0].print();
+  comp_M_gpu[1].print();
+
+  fail = compareSubMatrix(comp_M_cpu, comp_M_gpu, "comp_M");
+  EXPECT_FALSE(fail);
+}
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json b/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json
index a6a5a4b05..88a9aeed0 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json
@@ -9,16 +9,18 @@
 
     "physics" :
     {
-        "beta" : 2,
+        "beta" : 4,
         "chemical-potential" : 0
     },
 
     "bilayer-Hubbard-model":
     {
-        "t" : 0,
-        "U" : 0,
-        "V" : 0,
-        "V-prime" : 2
+        "t" : 1.0,
+        "t_prime" : 0.1,
+        "t_perp" : 0.01,
+        "U" : 1.0,
+        "V" : 1.0,
+        "V-prime" : 2.0
     },
 
     "Hund-model":
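The scripted values consumed by the fineGrained test above follow the layout documented in its rng_vals comment (move type, vertex id, tau, aux spin, acceptance value). A minimal replay RNG in that spirit, a hypothetical stand-in rather than the actual G0Setup::RngType used by the tests, just pops preloaded values in order:

    // Hypothetical replay RNG sketch: operator() returns the scripted values in
    // order, so a move sequence can be encoded exactly as in rng_vals above.
    #include <cstddef>
    #include <stdexcept>
    #include <utility>
    #include <vector>

    class ReplayRng {
    public:
      explicit ReplayRng(std::vector<double> values) : values_(std::move(values)) {}

      void setNewValues(const std::vector<double>& values) {
        values_ = values;
        next_ = 0;
      }

      double operator()() {
        if (next_ >= values_.size())
          throw std::out_of_range("ReplayRng: scripted values exhausted");
        return values_[next_++];
      }

    private:
      std::vector<double> values_;
      std::size_t next_ = 0;
    };

Because an acceptance value <= 0 always accepts and a value > 1 always rejects, such a script pins down the walker's trajectory exactly, which is what makes the step-by-step CPU/GPU comparison deterministic.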
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
index 2d5dc66a8..4c451a525 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
@@ -50,7 +50,7 @@ struct WalkerSelector {
 #endif  // DCA_HAVE_GPU
 
 using namespace dca::phys::solver::ctint;
-  template <typename SCALAR, dca::linalg::DeviceType device_t>
+template <typename SCALAR, dca::linalg::DeviceType device_t>
 struct WalkerWrapperSubmatrix : public WalkerSelector<SCALAR, device_t>::type {
   using BaseClass = typename WalkerSelector<SCALAR, device_t>::type;
   using Scalar = SCALAR;
@@ -59,31 +59,19 @@ struct WalkerWrapperSubmatrix : public WalkerSelector<SCALAR, device_t>::type {
-  WalkerWrapperSubmatrix(const Parameters& parameters_ref, Rng& rng_ref,
-                         DMatrixBuilder<device_t, Scalar>& d_matrix_builder)
-      : BaseClass(parameters_ref, dca::phys::DcaData<Parameters>(parameters_ref), rng_ref, d_matrix_builder, 0),
+  WalkerWrapperSubmatrix(/*const*/ Parameters& parameters_ref, Rng& rng_ref,
+                         DMatrixBuilder<device_t, Scalar>& d_matrix_builder)
+      : BaseClass(parameters_ref, dca::phys::DcaData<Parameters>(parameters_ref), rng_ref,
+                  d_matrix_builder, 0),
         streams_(3) {
     BaseClass::initialize(0);
-
-  }
-
-  // This purposefully shadows
-  void doStep(const int n_steps_to_delay) {
-    BaseClass::SubmatrixBase::doStep(n_steps_to_delay);
   }
 
   using Matrix = dca::linalg::Matrix<Scalar, dca::linalg::CPU>;
   using MatrixPair = std::array<Matrix, 2>;
 
   MatrixPair getM() {
-    std::array<dca::linalg::Matrix<Scalar, device_t>, 2> M;
-
-    BaseClass::computeM(M);
-#ifdef DCA_HAVE_GPU
-    checkRC(cudaDeviceSynchronize());
-#endif
-
-    std::array<Matrix, 2> M_copy{M[0], M[1]};
-    return M_copy;
+    return BaseClass::getM();
   }
 
   const auto& getWalkerConfiguration() const {
@@ -94,6 +82,41 @@ struct WalkerWrapperSubmatrix : public WalkerSelector<SCALAR, device_t>::type {
   std::vector<dca::linalg::util::GpuStream> streams_;
 };
diff --git a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json
index 2d27da0f8..6b997974b 100644
--- a/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json
+++ b/test/unit/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/input_4x4.json
@@ -9,14 +9,18 @@
 
     "physics" :
    {
-        "beta" : 2,
+        "beta" : 4,
         "chemical-potential" : 0
     },
 
     "bilayer-Hubbard-model":
     {
-        "t" : 1,
-        "U" : 2
+        "t" : 1.0,
+        "t_prime" : 0.1,
+        "t_perp" : 0.01,
+        "U" : 1.0,
+        "V" : 1.0,
+        "V-prime" : 2.0
     },
 
     "Hund-model":
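A closing note on the tolerance change: the doSteps test used to scale its bound with the scalar type, std::numeric_limits<Scalar>::epsilon() * 100, while the rewritten checks use the fixed constant 10e-12 (that is, 1e-11). A small standalone sketch, assuming nothing beyond the standard library, shows what that switch means for each precision:

    // Prints the old type-dependent tolerance next to the new fixed one.
    #include <iostream>
    #include <limits>

    template <typename Scalar>
    void printTolerances(const char* name) {
      std::cout << name << ": epsilon * 100 = " << std::numeric_limits<Scalar>::epsilon() * 100
                << ", fixed = " << 10e-12 << '\n';
    }

    int main() {
      printTolerances<float>("float");    // ~1.19e-5, far looser than 1e-11
      printTolerances<double>("double");  // ~2.22e-14, tighter than 1e-11
    }

For double the fixed 1e-11 bound is somewhat looser than the old epsilon-based one; for float it is several orders of magnitude below what single precision can resolve, so the fixed constant is only appropriate if the typed tests run in double precision.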