Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 18 additions & 16 deletions include/dca/linalg/lapack/magma.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#endif
#include "dca/linalg/lapack/lapack.hpp"


#include "dca/linalg/util/cast_gpu.hpp"

// C++ wrappers
Expand Down Expand Up @@ -103,7 +102,7 @@ inline void getri_gpu(int n, std::complex<float>* a, int lda, int* ipiv, std::co
inline void getri_gpu(int n, std::complex<double>* a, int lda, int* ipiv,
std::complex<double>* work, int lwork) {
checkErrorsCudaDebug();

auto cu_a = util::castMAGMAComplex(a);
auto cu_work = util::castMAGMAComplex(work);

Expand Down Expand Up @@ -151,8 +150,9 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
int* ldc, const int batch_count, const magma_queue_t queue) {
using util::castMAGMAComplex;
magmablas_cgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
*castMAGMAComplex(&alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b),
ldb, *castMAGMAComplex(&beta), castMAGMAComplex(c), ldc, batch_count, queue);
convertToMagmaComplex(alpha), castMAGMAComplex(a), lda,
castMAGMAComplex(b), ldb, convertToMagmaComplex(beta),
castMAGMAComplex(c), ldc, batch_count, queue);
checkErrorsCudaDebug();
}
inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k,
Expand All @@ -163,8 +163,9 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
int* ldc, const int batch_count, const magma_queue_t queue) {
using util::castMAGMAComplex;
magmablas_zgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
*castMAGMAComplex(&alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b),
ldb, *castMAGMAComplex(&beta), castMAGMAComplex(c), ldc, batch_count, queue);
convertToMagmaComplex(alpha), castMAGMAComplex(a), lda,
castMAGMAComplex(b), ldb, convertToMagmaComplex(beta),
castMAGMAComplex(c), ldc, batch_count, queue);
checkErrorsCudaDebug();
}

Expand Down Expand Up @@ -202,8 +203,8 @@ inline void magmablas_gemm_vbatched_max_nocheck(
using util::castMAGMAComplex;
magmablas_cgemm_vbatched_max_nocheck(
toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castMAGMAComplex(alpha),
castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb, *castMAGMAComplex(beta), castMAGMAComplex(c),
ldc, batch_count, m_max, n_max, k_max, queue);
castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb, *castMAGMAComplex(beta),
castMAGMAComplex(c), ldc, batch_count, m_max, n_max, k_max, queue);
checkErrorsCudaDebug();
}

Expand All @@ -214,9 +215,9 @@ inline void magmablas_gemm_vbatched_max_nocheck(
const int m_max, const int n_max, const int k_max, magma_queue_t queue) {
using util::castMAGMAComplex;
magmablas_zgemm_vbatched_max_nocheck(
toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castMAGMAComplex(alpha),
castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb, *castMAGMAComplex(beta), castMAGMAComplex(c),
ldc, batch_count, m_max, n_max, k_max, queue);
toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, convertToMagmaType(alpha),
castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb, convertToMagmaType(beta),
castMAGMAComplex(c), ldc, batch_count, m_max, n_max, k_max, queue);
checkErrorsCudaDebug();
}

Expand Down Expand Up @@ -246,8 +247,9 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
const int ldc, const int batch_count, const magma_queue_t queue) {
using util::castMAGMAComplex;
magmablas_cgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
*castMAGMAComplex(alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb,
*castMAGMAComplex(beta), castMAGMAComplex(c), ldc, batch_count, queue);
convertToMagmaType(alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b),
ldb, convertToMagmaType(beta), castMAGMAComplex(c), ldc, batch_count,
queue);
checkErrorsCudaDebug();
}
inline void magmablas_gemm_batched(const char transa, const char transb, const int m, const int n,
Expand All @@ -256,10 +258,10 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
const std::complex<double>* const* b, const int ldb,
const std::complex<double> beta, std::complex<double>** c,
const int ldc, const int batch_count, const magma_queue_t queue) {
using util::castMAGMAComplex;
using dca::util::castMagmaType;
magmablas_zgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
*castMAGMAComplex(alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb,
*castMAGMAComplex(beta), castMAGMAComplex(c), ldc, batch_count, queue);
convertToMagmaType(alpha), castMagmaType(a), lda, castMagmaType(b), ldb,
convertToMagmaType(beta), castMagmaType(c), ldc, batch_count, queue);
checkErrorsCudaDebug();
}

Expand Down
23 changes: 17 additions & 6 deletions include/dca/linalg/matrixop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,25 +223,36 @@ auto difference(const Matrix<Scalar, CPU, ALLOC>& a, const Matrix<Scalar, CPU, A
s << std::endl;
std::cout << s.str();
#endif // NDEBUG
std::cerr << "matrix difference in excess of threshold!\n";
std::cerr << "matrix difference (" << max_diff << ") in excess of threshold (" << diff_threshold
<< ")!\n";
throw std::logic_error(__FUNCTION__);
}

return max_diff;
}
template <typename Scalar, DeviceType device_name, class ALLOC>
auto difference(const Matrix<Scalar, device_name>& a, const Matrix<Scalar, CPU, ALLOC>& b,

template <typename Scalar, class ALLOC>
auto difference(const Matrix<Scalar, GPU>& a, const Matrix<Scalar, CPU, ALLOC>& b,
double diff_threshold = 1e-3) {
Matrix<Scalar, CPU, ALLOC> cp_a(a);
return difference(cp_a, b, diff_threshold);
}
template <typename Scalar, DeviceType device_name_a, DeviceType device_name_b>
auto difference(const Matrix<Scalar, device_name_a>& a, const Matrix<Scalar, device_name_b>& b,

template <typename Scalar, class ALLOC>
auto difference(const Matrix<Scalar, CPU, ALLOC>& a, const Matrix<Scalar, GPU>& b,
double diff_threshold = 1e-3) {
Matrix<Scalar, CPU> cp_b(b);
return difference(a, cp_b, diff_threshold);
}

template <typename Scalar>
auto difference(const Matrix<Scalar, GPU>& a, const Matrix<Scalar, GPU>& b,
double diff_threshold = 1e-3) {
Matrix<Scalar, CPU> cp_a(a);
Matrix<Scalar, CPU> cp_b(b);
return difference(cp_a, cp_b, diff_threshold);
}

// Returns the real part of a matrix.
// In: a
// TODO test.
Expand Down Expand Up @@ -314,7 +325,7 @@ void inverse(MatrixType<Scalar, device_name, ALLOC<Scalar>>& mat, Vector<int, CP
work.ptr(), lwork);
}

template <typename Scalar, DeviceType device_name, template<typename> class ALLOC,
template <typename Scalar, DeviceType device_name, template <typename> class ALLOC,
template <typename, DeviceType, class> class MatrixType>
void inverse(MatrixType<Scalar, device_name, ALLOC<Scalar>>& mat) {
Vector<int, CPU> ipiv;
Expand Down
62 changes: 46 additions & 16 deletions include/dca/linalg/util/cast_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,12 @@

#include "dca/config/haves_defines.hpp"
#include "dca/platform/dca_gpu_complex.h"
#include "dca/util/type_mapping.hpp"

#include <magma_v2.h>
namespace dca {
namespace linalg {
namespace util {
// dca::linalg::util::
namespace dca::linalg::util {

#if defined(DCA_HAVE_CUDA)
#if defined(DCA_HAVE_CUDA)
// returns a cuComplex pointer.
inline cuComplex** castCudaComplex(std::complex<float>** ptr) {
return reinterpret_cast<cuComplex**>(ptr);
Expand Down Expand Up @@ -139,16 +137,16 @@ inline const magmaFloatComplex* const* castMAGMAComplex(const std::complex<float
}
inline const magmaFloatComplex* castMAGMAComplex(const std::complex<float>* ptr) {
return reinterpret_cast<const magmaFloatComplex*>(ptr);
}
}
inline const magmaFloatComplex* castMAGMAComplex(const std::complex<float>& el) {
return castMAGMAComplex(&el);
}

#ifdef DCA_HAVE_CUDA
#define cublasDoubleComplex cuDoubleComplex
#define cublasComplex cuComplex
#define cublasDoubleComplex cuDoubleComplex
#define cublasComplex cuComplex
#endif

inline cublasDoubleComplex** castCUBLASComplex(std::complex<double>** ptr) {
return reinterpret_cast<cublasDoubleComplex**>(ptr);
}
Expand Down Expand Up @@ -181,11 +179,10 @@ inline const cublasComplex* const* castCUBLASComplex(const std::complex<float>*
}
inline const cublasComplex* castCUBLASComplex(const std::complex<float>* ptr) {
return reinterpret_cast<const cublasComplex*>(ptr);
}
}
inline const cublasComplex* castCUBLASComplex(const std::complex<float>& el) {
return castCUBLASComplex(&el);
}


// Provides a templated typedef.
namespace details {
Expand All @@ -201,13 +198,46 @@ struct ComplexContainer<float> {
using type = cuComplex;
};
} // namespace details
// dca::linalg::util::

template <typename Real>
using CudaComplex = typename details::ComplexContainer<Real>::type;
} // namespace dca::linalg::util

inline double2 convertToMagmaType(std::complex<double> var) {
return {reinterpret_cast<double (&)[2]>(var)[0], reinterpret_cast<double (&)[2]>(var)[1]};
}

inline float2 convertToMagmaType(std::complex<float> var) {
return {reinterpret_cast<float (&)[2]>(var)[0], reinterpret_cast<float (&)[2]>(var)[1]};
}

namespace dca::util {
template <typename T>
using MAGMATypeMap = typename std::disjunction<
OnTypesEqual<T, float, float>, OnTypesEqual<T, double, double>, OnTypesEqual<T, float*, float*>,
OnTypesEqual<T, double*, double*>, OnTypesEqual<T, const float*, const float*>,
OnTypesEqual<T, const double*, const double*>, OnTypesEqual<T, float**, float**>,
OnTypesEqual<T, const float**, const float**>, OnTypesEqual<T, double**, double**>,
OnTypesEqual<T, const double**, const double**>,
OnTypesEqual<T, std::complex<double>*, magmaDoubleComplex*>,
OnTypesEqual<T, std::complex<float>**, magmaFloatComplex**>,
OnTypesEqual<T, std::complex<double>**, magmaDoubleComplex**>,
OnTypesEqual<T, std::complex<float>*, magmaFloatComplex*>,
OnTypesEqual<T, float2, magmaFloatComplex>, OnTypesEqual<T, double2, magmaDoubleComplex>,
OnTypesEqual<T, const std::complex<double>*, const magmaDoubleComplex*>,
OnTypesEqual<T, const std::complex<float>*, const magmaFloatComplex*>,
OnTypesEqual<T, const std::complex<double>&, const magmaDoubleComplex&>,
OnTypesEqual<T, const std::complex<float>&, const magmaFloatComplex&>,
OnTypesEqual<T, const std::complex<float>**, const magmaFloatComplex**>,
OnTypesEqual<T, const std::complex<double>**, const magmaDoubleComplex**>,
OnTypesEqual<T, const std::complex<float>* const*, const magmaFloatComplex* const*>,
OnTypesEqual<T, const std::complex<double>* const*, const magmaDoubleComplex* const*>,
default_type<void>>::type;

template <typename T>
__device__ __host__ MAGMATypeMap<T> castMagmaType(T var) {
return reinterpret_cast<MAGMATypeMap<T>>(var);
}

} // namespace util
} // namespace linalg
} // namespace dca
} // namespace dca::util

#endif // DCA_LINALG_UTIL_CAST_CUDA_HPP
2 changes: 1 addition & 1 deletion test/unit/linalg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dca_add_gtest(matrixop_cpu_gpu_test
dca_add_gtest(matrixop_real_gpu_test
GTEST_MAIN
CUDA
LIBS ${DCA_LIBS} lapack gpu_utils magma::magma lapack_kernels blas_kernels)
LIBS ${DCA_LIBS} lapack gpu_utils magma::magma BLAS::BLAS lapack_kernels blas_kernels)

# lapack_kernels blas_kernels lapack_kernels
dca_add_gtest(matrixop_complex_gpu_test
Expand Down
5 changes: 3 additions & 2 deletions test/unit/linalg/matrixop_cpu_gpu_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "dca/testing/gtest_h_w_warning_blocking.h"
#include "dca/linalg/blas/blas3.hpp"
#include "dca/linalg/matrix.hpp"
#include "dca/linalg/util/allocators/pinned_allocator.hpp"
#include "cpu_test_util.hpp"
#include "gpu_test_util.hpp"

Expand All @@ -28,14 +29,14 @@ TEST(MatrixopCPUGPUTest, difference) {

auto val_a = [](int i, int j) { return 10 * i + j; };

dca::linalg::Matrix<double, dca::linalg::CPU> a(size2_a);
dca::linalg::Matrix<double, dca::linalg::CPU, dca::linalg::util::PinnedAllocator<double>> a(size2_a);
testing::setMatrixElements(a, val_a);
dca::linalg::Matrix<double, dca::linalg::GPU> da(a);

for (int sg : {1, -1})
for (int ia : {0, 1, 4})
for (int ja : {0, 2, 3}) {
dca::linalg::Matrix<double, dca::linalg::CPU> b(a);
dca::linalg::Matrix<double, dca::linalg::CPU, dca::linalg::util::PinnedAllocator<double>> b(a);
b(ia, ja) += sg * diff;
double err = std::abs(epsilon * b(ia, ja));

Expand Down
13 changes: 9 additions & 4 deletions test/unit/linalg/matrixop_real_gpu_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ class MatrixopRealGPUTest : public ::testing::Test {
static const ScalarType epsilon;
};
template <typename ScalarType>
const ScalarType MatrixopRealGPUTest<ScalarType>::epsilon = std::numeric_limits<ScalarType>::epsilon();
const ScalarType MatrixopRealGPUTest<ScalarType>::epsilon =
std::numeric_limits<ScalarType>::epsilon();

typedef ::testing::Types<float, double> FloatingPointTypes;
TYPED_TEST_CASE(MatrixopRealGPUTest, FloatingPointTypes);
Expand Down Expand Up @@ -765,22 +766,26 @@ TEST(MatrixopGPUTest, Difference) {

auto val_a = [](int i, int j) { return 10 * i + j; };

dca::linalg::Matrix<double, dca::linalg::CPU> a(size2_a);
dca::linalg::Matrix<double, dca::linalg::CPU, dca::linalg::util::PinnedAllocator<double>> a(size2_a);
testing::setMatrixElements(a, val_a);
dca::linalg::Matrix<double, dca::linalg::GPU> da(a);

for (int sg : {1, -1})
for (int ia : {0, 1, 4})
for (int ja : {0, 2, 3}) {
dca::linalg::Matrix<double, dca::linalg::CPU> b(a);
dca::linalg::Matrix<double, dca::linalg::CPU, dca::linalg::util::PinnedAllocator<double>> b(a);
b(ia, ja) += sg * diff;
double err = std::abs(epsilon * b(ia, ja));
dca::linalg::Matrix<double, dca::linalg::GPU> db(b);

// To make this clear the difference calls are expected to show
// differences!
EXPECT_NEAR(diff, dca::linalg::matrixop::difference(da, db, 2 * diff), err);
EXPECT_NEAR(diff, dca::linalg::matrixop::difference(da, db, diff + err), err);
auto diffcalc = dca::linalg::matrixop::difference(da, db, 2 * diff);
EXPECT_NEAR(diff, dca::linalg::matrixop::difference(da, db, diffcalc), err);
// This will result on output even though we expect and want a
// throw.
std::cerr << "difference output expected below\n";
EXPECT_THROW(dca::linalg::matrixop::difference(da, db, diffcalc - err), std::logic_error);
}
}
2 changes: 1 addition & 1 deletion test/unit/linalg/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ if(DCA_HAVE_CUDA)
dca_add_gtest(complex_op_cuda_test
GTEST_MAIN
CUDA
LIBS ${DCA_GPU_LIBS})
LIBS ${DCA_GPU_LIBS} magma::magma BLAS::BLAS)

dca_gpu_runtime_link(complex_op_cuda_test)
endif()
13 changes: 7 additions & 6 deletions test/unit/linalg/util/complex_op_cuda_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,24 @@
#include "dca/linalg/util/complex_operators_cuda.cu.hpp"
#include <stdexcept>
#include "dca/testing/gtest_h_w_warning_blocking.h"
#include <cstdint>

TEST(ComplexOpCuda, Assign) {
double2 d1_a{0.0, 0.0};
double2 d2_a{1.0,2.0};
dca::linalg::assign(d1_a,d2_a);
double2 d2_a{1.0, 2.0};
dca::linalg::assign(d1_a, d2_a);
EXPECT_EQ(d1_a.x, d2_a.x);
EXPECT_EQ(d1_a.y, d2_a.y);

std::complex<double> c1{1.3,2.4};
dca::linalg::assign(d1_a,c1);
std::complex<double> c1{1.3, 2.4};
dca::linalg::assign(d1_a, c1);
EXPECT_EQ(d1_a.x, c1.real());
EXPECT_EQ(d1_a.y, c1.imag());

std::complex<double> c2{0.0,0.5};
std::complex<double> c2{0.0, 0.5};
dca::linalg::assign(c2, d1_a);
EXPECT_EQ(c2.real(), c1.real());
EXPECT_EQ(c2.imag(),c1.imag());
EXPECT_EQ(c2.imag(), c1.imag());

std::int8_t i81 = 1;
dca::linalg::assign(d1_a, i81);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
# why this test pulls in a bunch of magma references is a mystery
dca_add_gtest(four_point_parameters_test
GTEST_MAIN
LIBS json enumerations ${DCA_GPU_LIBS})
LIBS json enumerations ${DCA_GPU_LIBS} magma::magma)
Loading
Loading