diff --git a/CMakeLists.txt b/CMakeLists.txt index 30ea7c8ea..dd5c8b064 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,6 +122,7 @@ set(DCA_LIBS ctaux ctint ss_ct_hyb + g0_interpolation function_transform gaussian_quadrature nfft @@ -140,7 +141,6 @@ set(DCA_LIBS quantum_domains time_and_frequency_domains signals - symmetrization coarsegraining ${DCA_CONCURRENCY_LIB} ${DCA_THREADING_LIBS} @@ -148,13 +148,17 @@ set(DCA_LIBS cuda_utils ) +if(TARGET stdc++fs) # std::filesystem + list(APPEND DCA_LIBS stdc++fs) +endif() + if (DCA_HAVE_CUDA) list(APPEND DCA_LIBS blas_kernels - dnfft_kernels lapack_kernels mc_kernels special_transform_kernels + cuda_utils ${DCA_CUDA_LIBS}) endif() diff --git a/applications/cluster_solver_check/cluster_solver_check.cpp b/applications/cluster_solver_check/cluster_solver_check.cpp index bcecafb19..e40c19ec3 100644 --- a/applications/cluster_solver_check/cluster_solver_check.cpp +++ b/applications/cluster_solver_check/cluster_solver_check.cpp @@ -99,7 +99,7 @@ int main(int argc, char** argv) { ed_solver.finalize(dca_loop_data); const auto Sigma_ed(dca_data_imag.Sigma); - const int tested_frequencies = 10; + const int tested_frequencies = 5; const auto G_ed(dca::math::util::cutFrequency(dca_data_imag.G_k_w, tested_frequencies)); if (concurrency.id() == concurrency.first()) { diff --git a/applications/dca/main_dca.cpp b/applications/dca/main_dca.cpp index 2772c8d6b..c756ca293 100644 --- a/applications/dca/main_dca.cpp +++ b/applications/dca/main_dca.cpp @@ -22,6 +22,7 @@ #include "dca/util/git_version.hpp" #include "dca/util/modules.hpp" #include "dca/util/signal_handler.hpp" +#include "dca/application/dca_loop_dispatch.hpp" int main(int argc, char** argv) { if (argc < 2) { diff --git a/build-aux/summit.cmake b/build-aux/summit.cmake index 2737ee831..a5c131a07 100644 --- a/build-aux/summit.cmake +++ b/build-aux/summit.cmake @@ -42,3 +42,6 @@ set(MAGMA_DIR $ENV{OLCF_MAGMA_ROOT} CACHE PATH # FFTW paths. set(FFTW_INCLUDE_DIR $ENV{OLCF_FFTW_ROOT}/include CACHE PATH "Path to fftw3.h.") set(FFTW_LIBRARY $ENV{OLCF_FFTW_ROOT}/lib/libfftw3.so CACHE FILEPATH "The FFTW3(-compatible) library.") + +# Flags for summit. +set(CMAKE_CXX_FLAGS -march=power9) diff --git a/cmake/dca_config.cmake b/cmake/dca_config.cmake index 1629f081a..9f884fd6a 100644 --- a/cmake/dca_config.cmake +++ b/cmake/dca_config.cmake @@ -73,9 +73,9 @@ endif() # Lattice type set(DCA_LATTICE "square" CACHE STRING "Lattice type, options are: bilayer | square | triangular | - hund | twoband_Cu | threeband | FeAs.") + hund | twoband_Cu | threeband | FeAs | Rashba.") set_property(CACHE DCA_LATTICE PROPERTY STRINGS bilayer square triangular hund twoband_Cu threeband - FeAs) + FeAs Rashba) if (DCA_LATTICE STREQUAL "bilayer") set(DCA_LATTICE_TYPE dca::phys::models::bilayer_lattice) @@ -99,22 +99,24 @@ elseif (DCA_LATTICE STREQUAL "threeband") set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/threeband_hubbard.hpp") -elseif (DCA_LATTICE STREQUAL "twoband_chain") - set(DCA_LATTICE_TYPE dca::phys::models::twoband_chain>) - set(DCA_LATTICE_INCLUDE - "dca/phys/models/analytic_hamiltonians/hund_lattice.hpp") elseif (DCA_LATTICE STREQUAL "FeAs") set(DCA_LATTICE_TYPE dca::phys::models::FeAsLattice) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/fe_as_lattice.hpp") + elseif (DCA_LATTICE STREQUAL "twoband_Cu") set(DCA_LATTICE_TYPE dca::phys::models::TwoBandCu) set(DCA_LATTICE_INCLUDE "dca/phys/models/analytic_hamiltonians/twoband_Cu.hpp") +elseif (DCA_LATTICE STREQUAL "Rashba") + set(DCA_LATTICE_TYPE dca::phys::models::RashbaHubbard) + set(DCA_LATTICE_INCLUDE + "dca/phys/models/analytic_hamiltonians/rashba_hubbard.hpp") + else() message(FATAL_ERROR "Please set DCA_LATTICE to a valid option: bilayer | square | triangular | - hund | twoband_Cu | threeband | FeAs.") + hund | twoband_Cu | threeband | FeAs | Rashba.") endif() # Model type @@ -307,22 +309,14 @@ else() endif() option(DCA_WITH_SINGLE_PRECISION_MC "Perform Monte Carlo and measurements in single precision." OFF) -option(DCA_WITH_SINGLE_PRECISION_TP_MEASUREMENTS "Measure two particle function in single precision." OFF) if (DCA_WITH_SINGLE_PRECISION_MC) set(DCA_WITH_SINGLE_PRECISION_TP_MEASUREMENTS ON CACHE BOOL "Measure two particle function in single precision." FORCE) - set(MC_SCALAR float) -else() - set(MC_SCALAR double) -endif() - -if (DCA_WITH_SINGLE_PRECISION_TP_MEASUREMENTS) - set(TP_ACCUMULATION_SCALAR float) + set(MC_SINGLE_PRECISION true) else() - set(TP_ACCUMULATION_SCALAR double) + set(MC_SINGLE_PRECISION false) endif() - option(DCA_WITH_MANAGED_MEMORY "Use managed memory allocator." OFF) mark_as_advanced(DCA_WITH_MANAGED_MEMORY) if (DCA_WITH_MANAGED_MEMORY) @@ -335,15 +329,6 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/mc_options.hpp.in" "${CMAKE_BINARY_DIR}/include/dca/config/mc_options.hpp" @ONLY) -################################################################################ -# Symmetrization -option(DCA_SYMMETRIZE "Apply cluster, time and frequency symmetries to single particle functions." - ON) - -if(DCA_SYMMETRIZE) - add_compile_definitions(DCA_WITH_SYMMETRIZATION) -endif() - ################################################################################ # Generate applications' config files. configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/analysis.hpp.in" diff --git a/cmake/dca_cuda.cmake b/cmake/dca_cuda.cmake index bced45c11..b0b8a4fbf 100644 --- a/cmake/dca_cuda.cmake +++ b/cmake/dca_cuda.cmake @@ -20,6 +20,7 @@ if (CUDA_FOUND) list(APPEND DCA_CUDA_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cublas_LIBRARY}) CUDA_INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) set(CUDA_SEPARABLE_COMPILATION ON) + list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") set(CVD_LAUNCHER "" CACHE INTERNAL "launch script for setting the Cuda visible devices.") # Use the following script for systems with multiple gpus visible from a rank. diff --git a/cmake/dca_external_libs.cmake b/cmake/dca_external_libs.cmake index 3e5cd01d5..565d4d627 100644 --- a/cmake/dca_external_libs.cmake +++ b/cmake/dca_external_libs.cmake @@ -27,6 +27,7 @@ if (NOT HDF5_FOUND) find_package(HDF5 REQUIRED COMPONENTS C CXX) endif() +find_package(HDF5 REQUIRED COMPONENTS C CXX) list(APPEND DCA_EXTERNAL_LIBS ${HDF5_LIBRARIES}) list(APPEND DCA_EXTERNAL_INCLUDE_DIRS ${HDF5_INCLUDE_DIRS}) diff --git a/include/dca/config/cmake_options.hpp b/include/dca/config/cmake_options.hpp index ae1e95730..c1712e632 100644 --- a/include/dca/config/cmake_options.hpp +++ b/include/dca/config/cmake_options.hpp @@ -40,12 +40,10 @@ struct CMakeOptions { static const std::string dca_with_autotuning; static const std::string dca_with_gnuplot; static const std::string dca_with_single_precision_mc; - static const std::string dca_with_single_precision_tp_measurements; static const std::string dca_with_memory_savings; static const std::string dca_with_managed_memory; static const std::string dca_with_qmc_bit; - static void print(); }; diff --git a/include/dca/config/mc_options.hpp.in b/include/dca/config/mc_options.hpp.in index 70d181ae6..5e0403ce2 100644 --- a/include/dca/config/mc_options.hpp.in +++ b/include/dca/config/mc_options.hpp.in @@ -12,6 +12,8 @@ #ifndef DCA_CONFIG_MC_OPTIONS_HPP #define DCA_CONFIG_MC_OPTIONS_HPP +#include +#include #ifdef DCA_HAVE_CUDA #include "dca/linalg/util/allocators/device_allocator.hpp" #include "dca/linalg/util/allocators/managed_allocator.hpp" @@ -22,9 +24,9 @@ namespace config { // dca::config:: struct McOptions { - using MCScalar = @MC_SCALAR@; - - using TPAccumulationScalar = @TP_ACCUMULATION_SCALAR@; + static constexpr bool single_precision = @MC_SINGLE_PRECISION@; + using Real = std::conditional_t; + using Complex = std::complex; static constexpr bool memory_savings = @MEMORY_SAVINGS@; diff --git a/include/dca/function/domains/dmn.hpp b/include/dca/function/domains/dmn.hpp index dee11cb65..ad69bb82e 100644 --- a/include/dca/function/domains/dmn.hpp +++ b/include/dca/function/domains/dmn.hpp @@ -22,7 +22,7 @@ #include #include -#include "dca/util/type_utils.hpp" // for dca::util::type_name +#include "dca/util/print_type.hpp" // for dca::util::type_name namespace dca { namespace func { diff --git a/include/dca/function/function.hpp b/include/dca/function/function.hpp index ecfa179e3..fe636f7ad 100644 --- a/include/dca/function/function.hpp +++ b/include/dca/function/function.hpp @@ -32,6 +32,7 @@ #include "dca/function/set_to_zero.hpp" #include "dca/util/pack_operations.hpp" #include "dca/util/integer_division.hpp" +#include "dca/util/print_type.hpp" #include "dca/util/type_utils.hpp" namespace dca { @@ -64,6 +65,8 @@ class function { function(const function& other, const std::string& name) : function(other) { name_ = name; } + template + function(const function& other); // Move constructor // Constructs the function with elements and name of other using move semantics. @@ -82,9 +85,9 @@ class function { // Precondition: The other function has been resetted, if the domain had been initialized after // the other function's construction. // Postcondition: The function's name is unchanged. - function& operator=(const function& other); + function& operator=(const function& other); template - function& operator=(const function& other); + function& operator=(const function& other); // Move assignment operator // Replaces the function's elements with those of other using move semantics. @@ -105,7 +108,6 @@ class function { const std::string& get_name() const { return name_; } - // TODO: Remove this method and use constructor parameter instead. void set_name(const std::string& name) { name_ = name; } @@ -120,6 +122,8 @@ class function { void resize(std::size_t nb_elements_new) { fnc_values_.resize(nb_elements_new); } + + // TODO: use a more standard method name. // Returns the size of the leaf domain with the given index. // Does not return function values! int operator[](const int index) const { @@ -140,10 +144,10 @@ class function { auto end() { return fnc_values_.end(); } - const auto begin() const { + auto begin() const { return fnc_values_.begin(); } - const auto end() const { + auto end() const { return fnc_values_.end(); } @@ -190,7 +194,7 @@ class function { // Enable only if all arguments are integral to prevent subind_to_linind(int*, int) to resolve to // subind_to_linind(int...) rather than subind_to_linind(const int* const, int). template - std::enable_if_t::value...>::value, int> subind_2_linind( + std::enable_if_t::value...), int> subind_2_linind( const Ts... subindices) const { // We need to cast all subindices to the same type for dmn_variadic. return dmn(static_cast(subindices)...); @@ -347,6 +351,24 @@ function::function(const function& other throw std::logic_error("Copy construction from a not yet resetted function."); } +template +template +function::function(const function& other) + : name_(other.get_name()), + function_type(__PRETTY_FUNCTION__), + dmn(), + Nb_sbdms(dmn.get_leaf_domain_sizes().size()), + size_sbdm(dmn.get_leaf_domain_sizes()), + step_sbdm(dmn.get_leaf_domain_steps()), + fnc_values_(dmn.get_size()) { + if (size() != other.size()) { + // The other function has not been resetted after the domain was initialized. + throw std::logic_error("Copy construction from a not yet resetted function."); + } + + std::copy(other.begin(), other.end(), begin()); +} + template function::function(function&& other) : name_(std::move(other.name_)), @@ -362,8 +384,7 @@ function::function(function&& other) } template -function& function::operator=( - const function& other) { +function& function::operator=(const function& other) { if (this != &other) { if (dmn.get_size() != other.dmn.get_size()) { // Domain had not been initialized when the functions were created. @@ -385,10 +406,17 @@ template template function& function::operator=(const function& other) { if (size() != other.size()) { - throw(std::logic_error("Function size does not match.")); + // Domain had not been initialized when the functions were created. + // Reset this function and check again. + reset(); + } + + if (size() != other.size()) { + // The other function has not been resetted after the domain was initialized. + throw std::logic_error("Copy assignment from a not yet resetted function."); } - fnc_values_ = other.fnc_values_; + std::copy(other.begin(), other.end(), begin()); return *this; } diff --git a/include/dca/function/util/real_complex_conversion.hpp b/include/dca/function/util/real_complex_conversion.hpp index 562dfcc7b..d69a21d21 100644 --- a/include/dca/function/util/real_complex_conversion.hpp +++ b/include/dca/function/util/real_complex_conversion.hpp @@ -18,6 +18,10 @@ #include "dca/function/function.hpp" +#ifdef DCA_HAVE_MPI +#include +#endif + namespace dca { namespace func { namespace util { @@ -40,14 +44,35 @@ auto complex(const function& f) { template auto real(const function, Dmn>& f, const bool check_imaginary = false) { function f_real; + Scalartype max_img = 0, rel_max = 0; for (int i = 0; i < f_real.size(); ++i) { - if (check_imaginary && std::abs(f(i).imag()) > 500 * std::numeric_limits::epsilon()) - throw(std::logic_error("The function is not purely real.")); + const auto val = std::abs(f(i).imag()); + if (val > max_img) { + max_img = val; + rel_max = val / std::abs(f(i).real()); + } f_real(i) = f(i).real(); } + if (check_imaginary && max_img > 1e3 * std::numeric_limits::epsilon()) { + // throw(std::logic_error("The function is not purely real.")); + + int id = 0; +#ifdef DCA_HAVE_MPI + int initialized; + MPI_Initialized(&initialized); + if (initialized) { + MPI_Comm_rank(MPI_COMM_WORLD, &id); + } +#endif // DCA_HAVE_MPI + if (id == 0) { + std::cerr << "WARNING: The function" << f.get_name() << " is not purely real. Max img " + << max_img << " relative of max " << rel_max << std::endl; + } + } + return f_real; } diff --git a/include/dca/io/hdf5/hdf5_types.hpp b/include/dca/io/hdf5/hdf5_types.hpp index db594a3ff..c0320bd5c 100644 --- a/include/dca/io/hdf5/hdf5_types.hpp +++ b/include/dca/io/hdf5/hdf5_types.hpp @@ -119,6 +119,30 @@ class HDF5_TYPE { } }; +template <> +class HDF5_TYPE { +public: + static hid_t get() { + return H5T_NATIVE_INT8; + } + + static H5::PredType get_PredType() { + return H5::PredType::NATIVE_INT8; + } +}; + +template <> +class HDF5_TYPE { +public: + static hid_t get() { + return H5T_NATIVE_UINT16; + } + + static H5::PredType get_PredType() { + return H5::PredType::NATIVE_UINT16; + } +}; + template <> class HDF5_TYPE { public: diff --git a/include/dca/io/hdf5/hdf5_writer.hpp b/include/dca/io/hdf5/hdf5_writer.hpp index fd11ed1f5..1aa86284d 100644 --- a/include/dca/io/hdf5/hdf5_writer.hpp +++ b/include/dca/io/hdf5/hdf5_writer.hpp @@ -56,6 +56,8 @@ class HDF5Writer { std::string get_path(); + void erase(const std::string& name); + template static void to_file(const arbitrary_struct_t& arbitrary_struct, const std::string& file_name); @@ -108,6 +110,12 @@ class HDF5Writer { return execute(name, static_cast(buffer)); } + template + void rewrite(const std::string& name, const T& obj) { + erase(name); + execute(name, obj); + } + operator bool() const noexcept { return static_cast(file_); } diff --git a/include/dca/io/json/json_writer.hpp b/include/dca/io/json/json_writer.hpp index 81b20b759..c2618d7c7 100644 --- a/include/dca/io/json/json_writer.hpp +++ b/include/dca/io/json/json_writer.hpp @@ -76,6 +76,12 @@ class JSONWriter { execute(f->get_name(), *f); } + // "execute" already takes care of overwriting. + template + void rewrite(const std::string& name, const T& obj){ + execute(name, obj); + } + private: bool verbose_; std::stack open_groups_; diff --git a/include/dca/io/writer.hpp b/include/dca/io/writer.hpp index dc01320b2..9bb79cedd 100644 --- a/include/dca/io/writer.hpp +++ b/include/dca/io/writer.hpp @@ -60,6 +60,11 @@ class Writer { std::visit([&](auto& var) { var.execute(args...); }, writer_); } + template + void rewrite(const Args&... args) { + std::visit([&](auto& var) { var.rewrite(args...); }, writer_); + } + operator bool() const noexcept { return std::visit([&](const auto& var) { return static_cast(var); }, writer_); } diff --git a/include/dca/linalg/blas/cublas1.hpp b/include/dca/linalg/blas/cublas1.hpp index 14e68cfa8..7f8c8f9c9 100644 --- a/include/dca/linalg/blas/cublas1.hpp +++ b/include/dca/linalg/blas/cublas1.hpp @@ -37,16 +37,16 @@ inline void swap(cublasHandle_t handle, int n, double* x, int incx, double* y, i } inline void swap(cublasHandle_t handle, int n, std::complex* x, int incx, std::complex* y, int incy) { - cuComplex* cu_x = util::castCudaComplex(x); - cuComplex* cu_y = util::castCudaComplex(y); + cuComplex* cu_x = castCuda(x); + cuComplex* cu_y = castCuda(y); cublasStatus_t status = cublasCswap(handle, n, cu_x, incx, cu_y, incy); checkRC(status); checkErrorsCudaDebug(); } inline void swap(cublasHandle_t handle, int n, std::complex* x, int incx, std::complex* y, int incy) { - cuDoubleComplex* cu_x = util::castCudaComplex(x); - cuDoubleComplex* cu_y = util::castCudaComplex(y); + cuDoubleComplex* cu_x = castCuda(x); + cuDoubleComplex* cu_y = castCuda(y); cublasStatus_t status = cublasZswap(handle, n, cu_x, incx, cu_y, incy); checkRC(status); checkErrorsCudaDebug(); @@ -64,16 +64,16 @@ inline void scal(cublasHandle_t handle, int n, double alpha, double* x, int incx } inline void scal(cublasHandle_t handle, int n, std::complex alpha, std::complex* x, int incx) { - const cuComplex* cu_alpha = util::castCudaComplex(alpha); - cuComplex* cu_x = util::castCudaComplex(x); + const cuComplex* cu_alpha = castCuda(&alpha); + cuComplex* cu_x = castCuda(x); cublasStatus_t status = cublasCscal(handle, n, cu_alpha, cu_x, incx); checkRC(status); checkErrorsCudaDebug(); } inline void scal(cublasHandle_t handle, int n, std::complex alpha, std::complex* x, int incx) { - const cuDoubleComplex* cu_alpha = util::castCudaComplex(alpha); - cuDoubleComplex* cu_x = util::castCudaComplex(x); + const cuDoubleComplex* cu_alpha = castCuda(&alpha); + cuDoubleComplex* cu_x = castCuda(x); cublasStatus_t status = cublasZscal(handle, n, cu_alpha, cu_x, incx); checkRC(status); checkErrorsCudaDebug(); @@ -91,16 +91,16 @@ inline void copy(cublasHandle_t handle, int n, const double* x, int incx, double } inline void copy(cublasHandle_t handle, int n, const std::complex* x, int incx, std::complex* y, int incy) { - const cuComplex* cu_x = util::castCudaComplex(x); - cuComplex* cu_y = util::castCudaComplex(y); + const cuComplex* cu_x = castCuda(x); + cuComplex* cu_y = castCuda(y); cublasStatus_t status = cublasCcopy(handle, n, cu_x, incx, cu_y, incy); checkRC(status); checkErrorsCudaDebug(); } inline void copy(cublasHandle_t handle, int n, const std::complex* x, int incx, std::complex* y, int incy) { - const cuDoubleComplex* cu_x = util::castCudaComplex(x); - cuDoubleComplex* cu_y = util::castCudaComplex(y); + const cuDoubleComplex* cu_x = castCuda(x); + cuDoubleComplex* cu_y = castCuda(y); cublasStatus_t status = cublasZcopy(handle, n, cu_x, incx, cu_y, incy); checkRC(status); checkErrorsCudaDebug(); @@ -120,24 +120,24 @@ inline void axpy(cublasHandle_t handle, int n, double alpha, const double* x, in } inline void axpy(cublasHandle_t handle, int n, std::complex alpha, const std::complex* x, int incx, std::complex* y, int incy) { - const cuComplex* cu_alpha = util::castCudaComplex(alpha); - const cuComplex* cu_x = util::castCudaComplex(x); - cuComplex* cu_y = util::castCudaComplex(y); + const cuComplex* cu_alpha = castCuda(&alpha); + const cuComplex* cu_x = castCuda(x); + cuComplex* cu_y = castCuda(y); cublasStatus_t status = cublasCaxpy(handle, n, cu_alpha, cu_x, incx, cu_y, incy); checkRC(status); checkErrorsCudaDebug(); } inline void axpy(cublasHandle_t handle, int n, std::complex alpha, const std::complex* x, int incx, std::complex* y, int incy) { - const cuDoubleComplex* cu_alpha = util::castCudaComplex(alpha); - const cuDoubleComplex* cu_x = util::castCudaComplex(x); - cuDoubleComplex* cu_y = util::castCudaComplex(y); + const cuDoubleComplex* cu_alpha = castCuda(&alpha); + const cuDoubleComplex* cu_x = castCuda(x); + cuDoubleComplex* cu_y = castCuda(y); cublasStatus_t status = cublasZaxpy(handle, n, cu_alpha, cu_x, incx, cu_y, incy); checkRC(status); checkErrorsCudaDebug(); } -} // cublas -} // linalg -} // dca +} // namespace cublas +} // namespace linalg +} // namespace dca #endif // DCA_LINALG_BLAS_CUBLAS1_HPP diff --git a/include/dca/linalg/blas/cublas3.hpp b/include/dca/linalg/blas/cublas3.hpp index 8c0f43ad2..a8ea5bc32 100644 --- a/include/dca/linalg/blas/cublas3.hpp +++ b/include/dca/linalg/blas/cublas3.hpp @@ -51,11 +51,11 @@ inline void gemm(cublasHandle_t handle, const char* transa, const char* transb, const std::complex* b, int ldb, std::complex beta, std::complex* c, int ldc) { checkErrorsCudaDebug(); - const cuComplex* cu_alpha = util::castCudaComplex(alpha); - const cuComplex* cu_a = util::castCudaComplex(a); - const cuComplex* cu_b = util::castCudaComplex(b); - const cuComplex* cu_beta = util::castCudaComplex(beta); - cuComplex* cu_c = util::castCudaComplex(c); + const cuComplex* cu_alpha = castCuda(&alpha); + const cuComplex* cu_a = castCuda(a); + const cuComplex* cu_b = castCuda(b); + const cuComplex* cu_beta = castCuda(&beta); + cuComplex* cu_c = castCuda(c); cublasStatus_t status = cublasCgemm(handle, getCublasTransValue(*transa), getCublasTransValue(*transb), m, n, k, @@ -68,11 +68,11 @@ inline void gemm(cublasHandle_t handle, const char* transa, const char* transb, const std::complex* b, int ldb, std::complex beta, std::complex* c, int ldc) { checkErrorsCudaDebug(); - const cuDoubleComplex* cu_alpha = util::castCudaComplex(alpha); - const cuDoubleComplex* cu_a = util::castCudaComplex(a); - const cuDoubleComplex* cu_b = util::castCudaComplex(b); - const cuDoubleComplex* cu_beta = util::castCudaComplex(beta); - cuDoubleComplex* cu_c = util::castCudaComplex(c); + const cuDoubleComplex* cu_alpha = castCuda(&alpha); + const cuDoubleComplex* cu_a = castCuda(a); + const cuDoubleComplex* cu_b = castCuda(b); + const cuDoubleComplex* cu_beta = castCuda(&beta); + cuDoubleComplex* cu_c = castCuda(c); cublasStatus_t status = cublasZgemm(handle, getCublasTransValue(*transa), getCublasTransValue(*transb), m, n, k, @@ -105,9 +105,9 @@ inline void trsm(cublasHandle_t handle, const char* side, const char* uplo, cons const char* diag, int m, int n, std::complex alpha, const std::complex* a, int lda, std::complex* b, int ldb) { checkErrorsCudaDebug(); - const cuComplex* cu_alpha = util::castCudaComplex(alpha); - const cuComplex* cu_a = util::castCudaComplex(a); - cuComplex* cu_b = util::castCudaComplex(b); + const cuComplex* cu_alpha = castCuda(&alpha); + const cuComplex* cu_a = castCuda(a); + cuComplex* cu_b = castCuda(b); cublasStatus_t status = cublasCtrsm(handle, getCublasSideValue(*side), getCublasUploValue(*uplo), getCublasTransValue(*transa), getCublasDiagValue(*diag), m, n, @@ -119,9 +119,9 @@ inline void trsm(cublasHandle_t handle, const char* side, const char* uplo, cons const char* diag, int m, int n, std::complex alpha, const std::complex* a, int lda, std::complex* b, int ldb) { checkErrorsCudaDebug(); - const cuDoubleComplex* cu_alpha = util::castCudaComplex(alpha); - const cuDoubleComplex* cu_a = util::castCudaComplex(a); - cuDoubleComplex* cu_b = util::castCudaComplex(b); + const cuDoubleComplex* cu_alpha = castCuda(&alpha); + const cuDoubleComplex* cu_a = castCuda(a); + cuDoubleComplex* cu_b = castCuda(b); cublasStatus_t status = cublasZtrsm(handle, getCublasSideValue(*side), getCublasUploValue(*uplo), getCublasTransValue(*transa), getCublasDiagValue(*diag), m, n, @@ -129,8 +129,8 @@ inline void trsm(cublasHandle_t handle, const char* side, const char* uplo, cons checkRC(status); checkErrorsCudaDebug(); } -} // cublas -} // linalg -} // dca +} // namespace cublas +} // namespace linalg +} // namespace dca #endif // DCA_LINALG_BLAS_CUBLAS3_HPP diff --git a/include/dca/linalg/blas/cublas_conversion_char_types.hpp b/include/dca/linalg/blas/cublas_conversion_char_types.hpp index 992dae599..75f1dca7b 100644 --- a/include/dca/linalg/blas/cublas_conversion_char_types.hpp +++ b/include/dca/linalg/blas/cublas_conversion_char_types.hpp @@ -31,7 +31,7 @@ inline cublasDiagType_t getCublasDiagValue(char diag) { } throw std::logic_error(__FUNCTION__); - return cublasDiagType_t(); + return cublasDiagType_t(); } // Returns the corresponding cublasSideMode_t value to side. diff --git a/include/dca/linalg/blas/kernels_gpu.hpp b/include/dca/linalg/blas/kernels_gpu.hpp index 35b5f8f09..b7d8e36a1 100644 --- a/include/dca/linalg/blas/kernels_gpu.hpp +++ b/include/dca/linalg/blas/kernels_gpu.hpp @@ -21,15 +21,15 @@ namespace linalg { namespace blas { // dca::linalg::blas:: -// Type can be float, double, cuComplex, cuDoubleComplex, std::complex, std::complex. +// Type can be float, double, std::complex, std::complex. template void copyRows(int row_size, int n_rows, const int* i_x, const Type* x, int ldx, const int* i_y, Type* y, int ldy, int thread_id, int stream_id); template inline void copyRows(int row_size, int n_rows, const int* i_x, const std::complex* x, int ldx, const int* i_y, std::complex* y, int ldy, int thread_id, int stream_id) { - auto cu_x = util::castCudaComplex(x); - auto cu_y = util::castCudaComplex(y); + auto cu_x = castCuda(x); + auto cu_y = castCuda(y); copyRows(row_size, n_rows, i_x, cu_x, ldx, i_y, cu_y, ldy, thread_id, stream_id); } template @@ -42,8 +42,8 @@ void copyCols(int col_size, int n_cols, const int* j_x, const Type* x, int ldx, template inline void copyCols(int col_size, int n_cols, const int* j_x, const std::complex* x, int ldx, const int* j_y, std::complex* y, int ldy, int thread_id, int stream_id) { - auto cu_x = util::castCudaComplex(x); - auto cu_y = util::castCudaComplex(y); + auto cu_x = castCuda(x); + auto cu_y = castCuda(y); copyCols(col_size, n_cols, j_x, cu_x, ldx, j_y, cu_y, ldy, thread_id, stream_id); } template @@ -54,7 +54,7 @@ template void moveLeft(int m, int n, Type* a, int lda); template inline void moveLeft(int m, int n, std::complex* a, int lda) { - auto cu_a = util::castCudaComplex(a); + auto cu_a = castCuda(a); moveLeft(m, n, cu_a, lda); } @@ -62,7 +62,7 @@ template void moveUp(int m, int n, Type* a, int lda); template inline void moveUp(int m, int n, std::complex* a, int lda) { - auto cu_a = util::castCudaComplex(a); + auto cu_a = castCuda(a); moveUp(m, n, cu_a, lda); } @@ -72,8 +72,8 @@ void scaleRows(int row_size, int n_rows, const int* i, const Type* alpha, Type* template inline void scaleRows(int row_size, int n_rows, const int* i, const std::complex* alpha, std::complex* a, int lda, int thread_id, int stream_id) { - auto cu_alpha = util::castCudaComplex(alpha); - auto cu_a = util::castCudaComplex(a); + auto cu_alpha = castCuda(alpha); + auto cu_a = castCuda(a); scaleRows(row_size, n_rows, i, cu_alpha, cu_a, lda, thread_id, stream_id); } @@ -83,7 +83,7 @@ void swapRows(int row_size, int n_rows, const int* i_1, const int* i_2, Type* a, template inline void swapRows(int row_size, int n_rows, const int* i_1, const int* i_2, std::complex* a, int lda, int thread_id, int stream_id) { - auto cu_a = util::castCudaComplex(a); + auto cu_a = castCuda(a); swapRows(row_size, n_rows, i_1, i_2, cu_a, lda, thread_id, stream_id); } @@ -93,7 +93,7 @@ void swapCols(int col_size, int n_cols, const int* j_1, const int* j_2, Type* a, template inline void swapCols(int col_size, int n_cols, const int* j_1, const int* j_2, std::complex* a, int lda, int thread_id, int stream_id) { - auto cu_a = util::castCudaComplex(a); + auto cu_a = castCuda(a); swapCols(col_size, n_cols, j_1, j_2, cu_a, lda, thread_id, stream_id); } diff --git a/include/dca/linalg/lapack/laset_gpu.hpp b/include/dca/linalg/lapack/laset_gpu.hpp index 32ee45f9e..ce92c1077 100644 --- a/include/dca/linalg/lapack/laset_gpu.hpp +++ b/include/dca/linalg/lapack/laset_gpu.hpp @@ -29,14 +29,14 @@ void laset_gpu(int m, int n, Type offdiag, Type diag, Type* a, int lda, int thre template inline void laset_gpu(int m, int n, std::complex offdiag, std::complex diag, std::complex* a, int lda, int thread_id, int stream_id) { - auto cu_offdiag = linalg::util::castCudaComplex(offdiag); - auto cu_diag = linalg::util::castCudaComplex(&diag); - auto cu_a = linalg::util::castCudaComplex(a); - laset_gpu(m, n, *cu_offdiag, *cu_diag, cu_a, lda, thread_id, stream_id); + auto cu_offdiag = linalg::castCuda(offdiag); + auto cu_diag = linalg::castCuda(diag); + auto cu_a = linalg::castCuda(a); + laset_gpu(m, n, cu_offdiag, cu_diag, cu_a, lda, thread_id, stream_id); } -} // lapack -} // linalg -} // dca +} // namespace lapack +} // namespace linalg +} // namespace dca #endif // DCA_LINALG_LAPACK_LASET_GPU_HPP diff --git a/include/dca/linalg/lapack/magma.hpp b/include/dca/linalg/lapack/magma.hpp index 9e6d7cd83..48e315ac5 100644 --- a/include/dca/linalg/lapack/magma.hpp +++ b/include/dca/linalg/lapack/magma.hpp @@ -46,7 +46,7 @@ inline void getrf_gpu(int m, int n, double* a, int lda, int* ipiv) { } inline void getrf_gpu(int m, int n, std::complex* a, int lda, int* ipiv) { checkErrorsCudaDebug(); - auto cu_a = util::castCudaComplex(a); + auto cu_a = castCuda(a); int info = 0; magma_cgetrf_gpu(m, n, cu_a, lda, ipiv, &info); @@ -56,7 +56,7 @@ inline void getrf_gpu(int m, int n, std::complex* a, int lda, int* ipiv) } inline void getrf_gpu(int m, int n, std::complex* a, int lda, int* ipiv) { checkErrorsCudaDebug(); - auto cu_a = util::castCudaComplex(a); + auto cu_a = castCuda(a); int info = 0; magma_zgetrf_gpu(m, n, cu_a, lda, ipiv, &info); @@ -86,8 +86,8 @@ inline void getri_gpu(int n, double* a, int lda, int* ipiv, double* work, int lw inline void getri_gpu(int n, std::complex* a, int lda, int* ipiv, std::complex* work, int lwork) { checkErrorsCudaDebug(); - auto cu_a = util::castCudaComplex(a); - auto cu_work = util::castCudaComplex(work); + auto cu_a = castCuda(a); + auto cu_work = castCuda(work); int info = 0; magma_cgetri_gpu(n, cu_a, lda, ipiv, cu_work, lwork, &info); @@ -98,8 +98,8 @@ inline void getri_gpu(int n, std::complex* a, int lda, int* ipiv, std::co inline void getri_gpu(int n, std::complex* a, int lda, int* ipiv, std::complex* work, int lwork) { checkErrorsCudaDebug(); - auto cu_a = util::castCudaComplex(a); - auto cu_work = util::castCudaComplex(work); + auto cu_a = castCuda(a); + auto cu_work = castCuda(work); int info = 0; magma_zgetri_gpu(n, cu_a, lda, ipiv, cu_work, lwork, &info); @@ -143,10 +143,9 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m const std::complex* const* b, int* ldb, const std::complex beta, std::complex** c, int* ldc, const int batch_count, const magma_queue_t queue) { - using util::castCudaComplex; - magmablas_cgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, - *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), - ldb, *castCudaComplex(beta), castCudaComplex(c), ldc, batch_count, queue); + magmablas_cgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, castCuda(alpha), + castCuda(a), lda, castCuda(b), ldb, castCuda(beta), castCuda(c), ldc, + batch_count, queue); checkErrorsCudaDebug(); } inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k, @@ -155,10 +154,9 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m const std::complex* const* b, int* ldb, const std::complex beta, std::complex** c, int* ldc, const int batch_count, const magma_queue_t queue) { - using util::castCudaComplex; - magmablas_zgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, - *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), - ldb, *castCudaComplex(beta), castCudaComplex(c), ldc, batch_count, queue); + magmablas_zgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, castCuda(alpha), + castCuda(a), lda, castCuda(b), ldb, castCuda(beta), castCuda(c), ldc, + batch_count, queue); checkErrorsCudaDebug(); } @@ -193,11 +191,9 @@ inline void magmablas_gemm_vbatched_max_nocheck( const std::complex* const* a, int* lda, const std::complex* const* b, int* ldb, const std::complex beta, std::complex** c, int* ldc, const int batch_count, const int m_max, const int n_max, const int k_max, magma_queue_t queue) { - using util::castCudaComplex; magmablas_cgemm_vbatched_max_nocheck( - toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), - castCudaComplex(a), lda, castCudaComplex(b), ldb, *castCudaComplex(beta), castCudaComplex(c), - ldc, batch_count, m_max, n_max, k_max, queue); + toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, castCuda(alpha), castCuda(a), lda, + castCuda(b), ldb, castCuda(beta), castCuda(c), ldc, batch_count, m_max, n_max, k_max, queue); checkErrorsCudaDebug(); } @@ -206,11 +202,9 @@ inline void magmablas_gemm_vbatched_max_nocheck( const std::complex* const* a, int* lda, const std::complex* const* b, int* ldb, const std::complex beta, std::complex** c, int* ldc, const int batch_count, const int m_max, const int n_max, const int k_max, magma_queue_t queue) { - using util::castCudaComplex; magmablas_zgemm_vbatched_max_nocheck( - toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha), - castCudaComplex(a), lda, castCudaComplex(b), ldb, *castCudaComplex(beta), castCudaComplex(c), - ldc, batch_count, m_max, n_max, k_max, queue); + toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, castCuda(alpha), castCuda(a), lda, + castCuda(b), ldb, castCuda(beta), castCuda(c), ldc, batch_count, m_max, n_max, k_max, queue); checkErrorsCudaDebug(); } @@ -238,10 +232,9 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i const std::complex* const* b, const int ldb, const std::complex beta, std::complex** c, const int ldc, const int batch_count, const magma_queue_t queue) { - using util::castCudaComplex; - magmablas_cgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, - *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), ldb, - *castCudaComplex(beta), castCudaComplex(c), ldc, batch_count, queue); + magmablas_cgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, castCuda(alpha), + castCuda(a), lda, castCuda(b), ldb, castCuda(beta), castCuda(c), ldc, + batch_count, queue); checkErrorsCudaDebug(); } inline void magmablas_gemm_batched(const char transa, const char transb, const int m, const int n, @@ -250,10 +243,9 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i const std::complex* const* b, const int ldb, const std::complex beta, std::complex** c, const int ldc, const int batch_count, const magma_queue_t queue) { - using util::castCudaComplex; - magmablas_zgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, - *castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), ldb, - *castCudaComplex(beta), castCudaComplex(c), ldc, batch_count, queue); + magmablas_zgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, castCuda(alpha), + castCuda(a), lda, castCuda(b), ldb, castCuda(beta), castCuda(c), ldc, + batch_count, queue); checkErrorsCudaDebug(); } diff --git a/include/dca/linalg/lapack/multiply_diagonal_gpu.hpp b/include/dca/linalg/lapack/multiply_diagonal_gpu.hpp index 6e4403c1b..cea20ee01 100644 --- a/include/dca/linalg/lapack/multiply_diagonal_gpu.hpp +++ b/include/dca/linalg/lapack/multiply_diagonal_gpu.hpp @@ -33,9 +33,9 @@ template inline void multiplyDiagonalLeft_gpu(int m, int n, const std::complex* d, int inc_d, const std::complex* a, int lda, std::complex* b, int ldb, int thread_id, int stream_id) { - auto cu_a = linalg::util::castCudaComplex(a); - auto cu_d = linalg::util::castCudaComplex(d); - auto cu_b = linalg::util::castCudaComplex(b); + auto cu_a = linalg::castCuda(a); + auto cu_d = linalg::castCuda(d); + auto cu_b = linalg::castCuda(b); multiplyDiagonalLeft_gpu(m, n, cu_d, inc_d, cu_a, lda, cu_b, ldb, thread_id, stream_id); } @@ -51,9 +51,9 @@ template inline void multiplyDiagonalRight_gpu(int m, int n, const std::complex* a, int lda, const std::complex* d, int inc_d, std::complex* b, int ldb, int thread_id, int stream_id) { - auto cu_a = linalg::util::castCudaComplex(a); - auto cu_d = linalg::util::castCudaComplex(d); - auto cu_b = linalg::util::castCudaComplex(b); + auto cu_a = linalg::castCuda(a); + auto cu_d = linalg::castCuda(d); + auto cu_b = linalg::castCuda(b); multiplyDiagonalRight_gpu(m, n, cu_a, lda, cu_d, inc_d, cu_b, ldb, thread_id, stream_id); } diff --git a/include/dca/linalg/matrix.hpp b/include/dca/linalg/matrix.hpp index 8018e01ab..b740cdf4b 100644 --- a/include/dca/linalg/matrix.hpp +++ b/include/dca/linalg/matrix.hpp @@ -32,7 +32,7 @@ namespace dca { namespace linalg { // dca::linalg:: -template +template class Matrix : public util::DefaultAllocator { public: using ThisType = Matrix; @@ -194,22 +194,20 @@ class Matrix : public util::DefaultAllocator { // Swaps the contents of the matrix, included the name, with those of rhs. void swapWithName(Matrix& rhs); -#ifdef DCA_HAVE_CUDA - // Asynchronous assignment. + // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) + // + synchronization of stream template - void setAsync(const Matrix& rhs, cudaStream_t stream); + void set(const Matrix& rhs, int thread_id, int stream_id); - // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) + // Asynchronous assignment. template - void setAsync(const Matrix& rhs, int thread_id, int stream_id); + void setAsync(const Matrix& rhs, const util::CudaStream& stream); - void setToZero(cudaStream_t stream); -#else - // Synchronous assignment fallback for SetAsync. + // Asynchronous assignment (copy with stream = getStream(thread_id, stream_id)) template void setAsync(const Matrix& rhs, int thread_id, int stream_id); -#endif // DCA_HAVE_CUDA + void setToZero(const util::CudaStream& stream); // Prints the values of the matrix elements. void print() const; @@ -287,7 +285,9 @@ Matrix::Matrix(const std::string& name, std::pair= capacity.first && capacity_.second >= capacity.second); data_ = Allocator::allocate(nrElements(capacity_)); - util::Memory::setToZero(data_, nrElements(capacity_)); + if (nrElements(capacity_)) { // Avoid cuda calls when initializing static matrices. + util::Memory::setToZero(data_, nrElements(capacity_)); + } } template @@ -413,12 +413,19 @@ void Matrix::swapWithName(Matrix +template +void Matrix::set(const Matrix& rhs, + int thread_id, int stream_id) { + resize(rhs.size_); + util::memoryCopy(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, thread_id, + stream_id); +} template template void Matrix::setAsync(const Matrix& rhs, - const cudaStream_t stream) { + const util::CudaStream& stream) { resizeNoCopy(rhs.size_); util::memoryCopyAsync(data_, leadingDimension(), rhs.data_, rhs.leadingDimension(), size_, stream); } @@ -431,21 +438,10 @@ void Matrix::setAsync(const Matrix -void Matrix::setToZero(cudaStream_t stream) { - cudaMemsetAsync(data_, 0, leadingDimension() * nrCols() * sizeof(ScalarType), stream); +void Matrix::setToZero(const util::CudaStream& stream) { + util::Memory::setToZeroAsync(data_, leadingDimension() * nrCols(), stream); } -#else - -template -template -void Matrix::setAsync(const Matrix& rhs, - int /*thread_id*/, int /*stream_id*/) { - set(rhs); -} - -#endif // DCA_HAVE_CUDA - template void Matrix::print() const { if (device_name == GPU) diff --git a/include/dca/linalg/matrix_view.hpp b/include/dca/linalg/matrix_view.hpp index a7f7ff468..042afdeb0 100644 --- a/include/dca/linalg/matrix_view.hpp +++ b/include/dca/linalg/matrix_view.hpp @@ -43,10 +43,10 @@ class MatrixView { template