13 changes: 5 additions & 8 deletions .github/workflows/ci-github-actions-self-hosted.yaml
@@ -9,20 +9,18 @@ on:
- master

jobs:
v100:
runs-on: [self-hosted, Linux, X64, V100, CUDA]
two_a30:
runs-on: [self-hosted, Linux, X64, A30]

env:
GH_JOBNAME: ${{matrix.jobname}}
GH_OS: Linux
strategy:
fail-fast: false
matrix:
jobname: [
GCC12-MPI-CUDA-Real-Full,
]
# GCC12-MPI-NoMPI-CUDA-Real-Full,
# GCC12-MPI-NoMPI-CUDA-Real-Debug-Full,
jobname: [GCC12-MPI-CUDA-Real-Full]
# GCC12-MPI-NoMPI-CUDA-Real-Full,
# GCC12-MPI-NoMPI-CUDA-Real-Debug-Full,

steps:
- name: Checkout PR branch
@@ -36,4 +34,3 @@ jobs:

- name: Test
run: test/test_automation/ci/run_step.sh test

12 changes: 6 additions & 6 deletions include/dca/linalg/lapack/magma.hpp
@@ -150,9 +150,9 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
int* ldc, const int batch_count, const magma_queue_t queue) {
using util::castMAGMAComplex;
magmablas_cgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
convertToMagmaComplex(alpha), castMAGMAComplex(a), lda,
castMAGMAComplex(b), ldb, convertToMagmaComplex(beta),
castMAGMAComplex(c), ldc, batch_count, queue);
convertToMagmaType(alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b),
ldb, convertToMagmaType(beta), castMAGMAComplex(c), ldc, batch_count,
queue);
checkErrorsCudaDebug();
}
inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k,
@@ -163,9 +163,9 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
int* ldc, const int batch_count, const magma_queue_t queue) {
using util::castMAGMAComplex;
magmablas_zgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
convertToMagmaComplex(alpha), castMAGMAComplex(a), lda,
castMAGMAComplex(b), ldb, convertToMagmaComplex(beta),
castMAGMAComplex(c), ldc, batch_count, queue);
convertToMagmaType(alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b),
ldb, convertToMagmaType(beta), castMAGMAComplex(c), ldc, batch_count,
queue);
checkErrorsCudaDebug();
}

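A minimal sketch of the kind of overload set `convertToMagmaType` resolves to for the scalar `alpha`/`beta` arguments above, assuming MAGMA's `MAGMA_C_MAKE`/`MAGMA_Z_MAKE` macros from `magma_v2.h`; the actual overloads live in `cast_gpu.hpp` below and use a `reinterpret_cast` instead:

```cpp
#include <complex>
#include <magma_v2.h>  // assumed available; provides magmaFloatComplex, magmaDoubleComplex

// Hypothetical stand-ins for the convertToMagmaType overloads: turn a host
// std::complex value into the matching MAGMA complex value type.
inline magmaFloatComplex toMagmaValue(std::complex<float> v) {
  return MAGMA_C_MAKE(v.real(), v.imag());
}

inline magmaDoubleComplex toMagmaValue(std::complex<double> v) {
  return MAGMA_Z_MAKE(v.real(), v.imag());
}
```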
12 changes: 7 additions & 5 deletions include/dca/linalg/matrixop.hpp
@@ -43,6 +43,7 @@
#include "dca/linalg/blas/use_device.hpp"
#include "dca/linalg/lapack/use_device.hpp"
#include "dca/linalg/matrix.hpp"
#include "dca/linalg/util/allocators/aligned_allocator.hpp"
#include "dca/linalg/util/util_lapack.hpp"
#include "dca/linalg/util/util_matrixop.hpp"
#include "dca/linalg/vector.hpp"
@@ -230,7 +231,6 @@ auto difference(const Matrix<Scalar, CPU, ALLOC>& a, const Matrix<Scalar, CPU, A

return max_diff;
}

template <typename Scalar, class ALLOC>
auto difference(const Matrix<Scalar, GPU>& a, const Matrix<Scalar, CPU, ALLOC>& b,
double diff_threshold = 1e-3) {
@@ -241,7 +241,7 @@ auto difference(const Matrix<Scalar, GPU>& a, const Matrix<Scalar, CPU, ALLOC>&
template <typename Scalar, class ALLOC>
auto difference(const Matrix<Scalar, CPU, ALLOC>& a, const Matrix<Scalar, GPU>& b,
double diff_threshold = 1e-3) {
Matrix<Scalar, CPU> cp_b(b);
Matrix<Scalar, CPU, ALLOC> cp_b(b);
return difference(a, cp_b, diff_threshold);
}

@@ -354,7 +354,7 @@ void smallInverse(Matrix<Scalar, CPU, ALLOC>& m_inv, Vector<int, CPU>& ipiv,
break;
}
case 3: {
const Matrix<Scalar, CPU, ALLOC> m(m_inv);
const Matrix<Scalar, CPU, dca::linalg::util::AlignedAllocator<Scalar>> m(m_inv);
const Scalar det = m(0, 0) * (m(1, 1) * m(2, 2) - m(2, 1) * m(1, 2)) -
m(1, 0) * (m(0, 1) * m(2, 2) - m(0, 2) * m(2, 1)) +
m(2, 0) * (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1));
@@ -1005,7 +1005,8 @@ inline void multiplyDiagonalLeft(const Vector<ScalarIn, device_name>& d,
template <typename ScalarIn, typename ScalarOut>
inline void multiplyDiagonalLeft(const Vector<ScalarIn, CPU>& d, const Matrix<ScalarIn, GPU>& a,
Matrix<ScalarOut, GPU>& b, int thread_id = 0, int stream_id = 0) {
Vector<ScalarIn, GPU> d_gpu(d);
Vector<ScalarIn, GPU> d_gpu;
d_gpu.setAsync(d, thread_id, stream_id);
multiplyDiagonalLeft(d_gpu, a, b, thread_id, stream_id);
}

@@ -1025,7 +1026,8 @@ inline void multiplyDiagonalRight(const Matrix<Scalar, device_name>& a,
template <typename Scalar>
inline void multiplyDiagonalRight(const Matrix<Scalar, GPU>& a, const Vector<Scalar, CPU>& d,
Matrix<Scalar, GPU>& b, int thread_id = 0, int stream_id = 0) {
Vector<Scalar, GPU> d_gpu(d);
Vector<Scalar, GPU> d_gpu;
d_gpu.setAsync(d, thread_id, stream_id);
multiplyDiagonalRight(a, d_gpu, b, thread_id, stream_id);
}

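The two overloads above now upload `d` with `setAsync` on the caller's stream instead of a blocking copy construction. A sketch of the underlying pattern, assuming raw CUDA runtime calls in place of the dca::linalg wrappers:

```cpp
#include <cuda_runtime.h>
#include <vector>

// Hypothetical helper: copy a host vector to device memory on the given
// stream. The copy is ordered before any kernel launched later on `stream`,
// so no host-side synchronization is needed before using `dev` there.
// (For a fully asynchronous transfer the host buffer should be page-locked.)
inline void uploadAsync(const std::vector<double>& host, double* dev, cudaStream_t stream) {
  cudaMemcpyAsync(dev, host.data(), host.size() * sizeof(double),
                  cudaMemcpyHostToDevice, stream);
}
```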
21 changes: 18 additions & 3 deletions include/dca/linalg/util/cast_gpu.hpp
@@ -202,14 +202,24 @@ template <typename Real>
using CudaComplex = typename details::ComplexContainer<Real>::type;
} // namespace dca::linalg::util

inline double2 convertToMagmaType(std::complex<double> var) {
inline magmaDoubleComplex convertToMagmaType(std::complex<double> var) {
return {reinterpret_cast<double (&)[2]>(var)[0], reinterpret_cast<double (&)[2]>(var)[1]};
}

inline float2 convertToMagmaType(std::complex<float> var) {
inline magmaFloatComplex convertToMagmaType(std::complex<float> var) {
return {reinterpret_cast<float (&)[2]>(var)[0], reinterpret_cast<float (&)[2]>(var)[1]};
}

#ifdef DCA_HAVE_HIP
inline magmaFloatComplex convertToMagmaType(HIP_vector_type<float, 2> var) {
return {reinterpret_cast<float (&)[2]>(var)[0], reinterpret_cast<float (&)[2]>(var)[1]};
}

inline magmaDoubleComplex convertToMagmaType(HIP_vector_type<double, 2> var) {
return {reinterpret_cast<double (&)[2]>(var)[0], reinterpret_cast<double (&)[2]>(var)[1]};
}
#endif

namespace dca::util {
template <typename T>
using MAGMATypeMap = typename std::disjunction<
@@ -231,8 +241,13 @@ using MAGMATypeMap = typename std::disjunction<
OnTypesEqual<T, const std::complex<double>**, const magmaDoubleComplex**>,
OnTypesEqual<T, const std::complex<float>* const*, const magmaFloatComplex* const*>,
OnTypesEqual<T, const std::complex<double>* const*, const magmaDoubleComplex* const*>,
OnTypesEqual<T, const double2* const*, const magmaDoubleComplex* const*>,
OnTypesEqual<T, const float2* const*, const magmaFloatComplex* const*>,
#ifdef DCA_HAVE_HIP
OnTypesEqual<T, const HIP_vector_type<float, 2>* const*, const magmaFloatComplex* const*>,
OnTypesEqual<T, const HIP_vector_type<double, 2>* const*, const magmaDoubleComplex* const*>,
#endif
default_type<void>>::type;

template <typename T>
__device__ __host__ MAGMATypeMap<T> castMagmaType(T var) {
return reinterpret_cast<MAGMATypeMap<T>>(var);
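A self-contained sketch of the type-map idiom `MAGMATypeMap` relies on, with assumed definitions for `OnTypesEqual` and `default_type` (the real helpers live in dca/util): `std::disjunction` inherits from the first alternative whose `::value` is true, so its nested `::type` yields the mapped MAGMA type, with `void` as the fallback.

```cpp
#include <complex>
#include <type_traits>
#include <magma_v2.h>  // assumed; provides magmaFloatComplex / magmaDoubleComplex

// Assumed shape of the helpers used by MAGMATypeMap above.
template <typename T, typename U, typename Mapped>
struct OnTypesEqual : std::is_same<T, U> {
  using type = Mapped;  // exposed when std::disjunction selects this branch
};

template <typename T>
struct default_type : std::true_type {
  using type = T;  // always-true fallback at the end of the list
};

template <typename T>
using MagmaValueMap = typename std::disjunction<
    OnTypesEqual<T, std::complex<float>, magmaFloatComplex>,
    OnTypesEqual<T, std::complex<double>, magmaDoubleComplex>,
    default_type<void>>::type;

static_assert(std::is_same_v<MagmaValueMap<std::complex<double>>, magmaDoubleComplex>);
static_assert(std::is_same_v<MagmaValueMap<int>, void>);
```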
16 changes: 9 additions & 7 deletions include/dca/linalg/vector.hpp
@@ -34,7 +34,7 @@ namespace dca {
namespace linalg {
// dca::linalg::

template <typename ScalarType, DeviceType device_name = DeviceType::CPU,
template <typename ScalarType, DeviceType device_name = DeviceType::CPU,
class Allocator = util::DefaultAllocator<ScalarType, device_name>>
class Vector : public Allocator {
public:
@@ -52,9 +52,8 @@ class Vector : public Allocator {

/** copy constructor except for name.
* this is strange but for historical reasons is kept.
* has needed to be explicit because with the `const ThisType&` somehow lead to an implicit conversion
* from an int to a Vector& argument that landed here.
* This occurred in Debug with
* has needed to be explicit because with the `const ThisType&` somehow lead to an implicit
* conversion from an int to a Vector& argument that landed here. This occurred in Debug with
*/
explicit Vector(const ThisType& rhs, const std::string& name = default_name_);

@@ -323,7 +322,8 @@ void Vector<ScalarType, device_name, Allocator>::setAsync(const Container& rhs,
}

template <typename ScalarType, DeviceType device_name, class Allocator>
void Vector<ScalarType, device_name, Allocator>::setToZeroAsync(const util::GpuStream& stream [[maybe_unused]]) {
void Vector<ScalarType, device_name, Allocator>::setToZeroAsync(const util::GpuStream& stream
[[maybe_unused]]) {
// TODO: implement in copy.hpp.
#ifdef DCA_HAVE_GPU
checkRC(cudaMemsetAsync(data_, 0, size_ * sizeof(ScalarType), stream));
Expand All @@ -333,12 +333,14 @@ void Vector<ScalarType, device_name, Allocator>::setToZeroAsync(const util::GpuS
}

template <typename ScalarType, DeviceType device_name, class Allocator>
void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) {
void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream
[[maybe_unused]]) {
dca::linalg::util::Memory<device_name>::setToZero(data_, size_, stream);
}

// template <typename ScalarType, DeviceType device_name, class Allocator>
// void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) {
// void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream
// [[maybe_unused]]) {
// // TODO: implement in copy.hpp.
// dca::linalg::util::memory<device_name>::setToZero(data_, size_, stream);
// }
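A sketch of what `setToZeroAsync` does on the GPU path, assuming a raw `cudaStream_t` in place of `util::GpuStream`:

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical free-function version of the member above: zero `size` elements
// of device memory on `stream` without blocking the host.
template <typename T>
void setToZeroAsync(T* data, std::size_t size, cudaStream_t stream) {
  cudaMemsetAsync(data, 0, size * sizeof(T), stream);
}
```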
@@ -35,8 +35,7 @@ namespace solver {
namespace ctint {
// dca::phys::solver::ctint::

template <class Parameters, linalg::DeviceType device,
DistType DIST = dca::DistType::NONE>
template <class Parameters, linalg::DeviceType device, DistType DIST = dca::DistType::NONE>
class CtintAccumulator : public MC_accumulator_data<typename Parameters::Scalar> {
public:
constexpr static ClusterSolverId solver_id{ClusterSolverId::CT_INT};
@@ -47,7 +46,7 @@ class CtintAccumulator : public MC_accumulator_data<typename Parameters::Scalar>
using Base::accumulated_phase_;
using Base::current_phase_;
using Base::number_of_measurements_;

using ParametersType = Parameters;
using DataType = phys::DcaData<Parameters, DIST>;
using SpAccumulator = accumulator::SpAccumulator<Parameters, device>;
@@ -112,7 +111,7 @@ class CtintAccumulator : public MC_accumulator_data<typename Parameters::Scalar>
int get_number_of_measurements() const {
std::cout << "number_of_measurements ==" << number_of_measurements_ << '\n';
std::cout << "accumulated_phase_.count() == " << accumulated_phase_.count() << '\n';
//assert(accumulated_phase_.count() == number_of_measurements_);
// assert(accumulated_phase_.count() == number_of_measurements_);
return number_of_measurements_;
}

@@ -147,7 +146,6 @@ class CtintAccumulator : public MC_accumulator_data<typename Parameters::Scalar>
MatrixConfiguration configuration_;

std::vector<const linalg::util::GpuStream*> streams_;
linalg::util::GpuEvent event_;

util::Accumulator<unsigned long> accumulated_order_;

@@ -170,7 +168,7 @@ class CtintAccumulator : public MC_accumulator_data<typename Parameters::Scalar>
template <class Parameters, linalg::DeviceType device, DistType DIST>
template <class Data>
CtintAccumulator<Parameters, device, DIST>::CtintAccumulator(const Parameters& pars,
const Data& data, int id)
const Data& data, int id)
: parameters_(pars),
thread_id_(id),
sp_accumulator_(pars),
@@ -187,7 +185,7 @@ void CtintAccumulator<Parameters, device, DIST>::initialize(const int dca_iterat
parameters_.dump_every_iteration());
accumulated_order_.reset();
accumulated_phase_.reset();

Base::initialize(dca_iteration);
sp_accumulator_.resetAccumulation();
sp_accumulator_.clearSingleMeasurement();
@@ -215,13 +213,7 @@ void CtintAccumulator<Parameters, device, DIST>::updateFrom(Walker& walker) {
measure_flops_ = M_[0].nrCols() * M_[0].nrCols() * 2 * 2 * 8 * 19;

if constexpr (device == linalg::GPU) {
for (int s = 0; s < 2; ++s) {
event_.record(walker.get_stream(s));
// Synchronize sp accumulator streams with walker.
event_.block(*sp_accumulator_.get_streams()[s]);
// Synchronize both walker streams with tp accumulator.
event_.block(*tp_accumulator_.get_stream());
}
walker.synchronize();
}

configuration_ = walker.getConfiguration();
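The `walker.synchronize()` call above replaces the accumulator-side event juggling that was deleted (the `GpuEvent event_` member and the record/block loop). A sketch of the cross-stream ordering it is assumed to encapsulate, using plain CUDA events:

```cpp
#include <cuda_runtime.h>

// Hypothetical helper: make `consumer` wait for all work already submitted to
// `producer`, without blocking the host thread.
inline void orderAfter(cudaStream_t producer, cudaStream_t consumer) {
  cudaEvent_t ev;
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  cudaEventRecord(ev, producer);         // capture the producer's current work
  cudaStreamWaitEvent(consumer, ev, 0);  // consumer waits on the GPU side
  cudaEventDestroy(ev);                  // release is deferred until the event completes
}
```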
@@ -37,7 +37,7 @@
#include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_choice.hpp"
#include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp"
#include "dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation.hpp"
//#include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator.hpp"
// #include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator.hpp"
#include "dca/phys/dca_data/dca_data.hpp"
#include "dca/phys/dca_loop/dca_loop_data.hpp"
#include "dca/phys/dca_step/symmetrization/symmetrize.hpp"
@@ -58,7 +58,7 @@ class CtintClusterSolver {
static constexpr ClusterSolverId solver_type{ClusterSolverId::CT_INT};

using Real = typename config::McOptions::MC_REAL;
using Scalar = typename dca::util::ScalarSelect<Real,Parameters::complex_g0>::type;
using Scalar = typename dca::util::ScalarSelect<Real, Parameters::complex_g0>::type;
using Concurrency = typename Parameters::concurrency_type;

using CDA = ClusterDomainAliases<Parameters::lattice_type::DIMENSION>;
@@ -111,8 +111,11 @@ class CtintClusterSolver {
// Returns the function G(k,w) without averaging across MPI ranks.
auto local_G_k_w() const;

DMatrixBuilder& getResource() { return *d_matrix_builder_; };
protected: // thread jacket interface.
DMatrixBuilder& getResource() {
return *d_matrix_builder_;
};

protected: // thread jacket interface.
using ParametersType = Parameters;
using DataType = Data;
using Rng = typename Parameters::random_number_generator;
@@ -140,8 +143,8 @@ class CtintClusterSolver {

/** gather all M and G4 and accumulated sign
* \param[out] Returns: average phase
* \param[in,out] G greens function has allreduce or leaveoneoutSum applied to it
* side effect seems undesirable and motivated by saving copy.
* \param[in,out] G greens function has allreduce or leaveoneoutSum applied to
* it side effect seems undesirable and motivated by saving copy.
* \param[in] compute_error does leave one out sum removing the local accumulated type.
*/
auto gatherMAndG4(SpGreensFunction& M, bool compute_error) const;
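A hedged sketch of the leave-one-out reduction the comment above refers to (assumed semantics, not the DCA implementation): allreduce the contributions, then subtract the local term so each rank keeps the sum over all other ranks, the kind of quantity a jackknife-style error estimate uses.

```cpp
#include <mpi.h>
#include <cstddef>
#include <vector>

// Hypothetical helper illustrating leaveOneOutSum-style behavior.
void leaveOneOutSum(std::vector<double>& local, MPI_Comm comm) {
  std::vector<double> total(local.size());
  MPI_Allreduce(local.data(), total.data(), static_cast<int>(local.size()), MPI_DOUBLE,
                MPI_SUM, comm);
  for (std::size_t i = 0; i < local.size(); ++i)
    local[i] = total[i] - local[i];  // sum over every rank except this one
}
```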
@@ -192,7 +195,7 @@ CtintClusterSolver<DEV, PARAM, use_submatrix, DIST>::CtintClusterSolver(
template <dca::linalg::DeviceType device_t, class Parameters, bool use_submatrix, DistType DIST>
void CtintClusterSolver<device_t, Parameters, use_submatrix, DIST>::initialize(int dca_iteration) {
dca_iteration_ = dca_iteration;

g0_.initializeShrinked(data_.G0_r_t_cluster_excluded);

d_matrix_builder_->setAlphas(parameters_.getAlphas(), parameters_.adjustAlphaDd());
@@ -202,7 +205,6 @@ void CtintClusterSolver<device_t, Parameters, use_submatrix, DIST>::initialize(i
if (concurrency_.id() == concurrency_.first())
std::cout << "\n\n\t CT-INT Integrator has initialized (DCA-iteration : " << dca_iteration
<< ")\n\n";

}

template <dca::linalg::DeviceType device_t, class Parameters, bool use_submatrix, DistType DIST>
@@ -353,10 +355,8 @@ void CtintClusterSolver<device_t, Parameters, use_submatrix, DIST>::warmUp() {
const int n_sweep = parameters_.get_warm_up_sweeps();
for (int i = 0; i < n_sweep; i++) {
walker_->doSweep();

walker_->updateShell(i, n_sweep);
}

walker_->markThermalized();
}

@@ -33,6 +33,8 @@ namespace solver {
namespace ctint {
// dca::phys::solver::ctint::


// I think the class hierarchy isn't helpful here.
class SolverConfiguration : public MatrixConfiguration {
public:
using BaseClass = MatrixConfiguration;
@@ -41,6 +41,8 @@
#include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp"
#endif

//#define DEBUG_SUBMATRIX

namespace dca {
namespace phys {
namespace solver {