CompFUSE · PDoakORNL · Sep 12, 2025 · Sep 10, 2025
diff --git a/include/dca/linalg/lapack/magma.hpp b/include/dca/linalg/lapack/magma.hpp
@@ -22,7 +22,6 @@
 #endif
 #include "dca/linalg/lapack/lapack.hpp"
 
-
 #include "dca/linalg/util/cast_gpu.hpp"
 
 // C++ wrappers
@@ -103,7 +102,7 @@ inline void getri_gpu(int n, std::complex<float>* a, int lda, int* ipiv, std::co
 inline void getri_gpu(int n, std::complex<double>* a, int lda, int* ipiv,
                       std::complex<double>* work, int lwork) {
   checkErrorsCudaDebug();
-  
+
   auto cu_a = util::castMAGMAComplex(a);
   auto cu_work = util::castMAGMAComplex(work);
 
@@ -151,8 +150,9 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
                                     int* ldc, const int batch_count, const magma_queue_t queue) {
   using util::castMAGMAComplex;
   magmablas_cgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
-                           *castMAGMAComplex(&alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b),
-                           ldb, *castMAGMAComplex(&beta), castMAGMAComplex(c), ldc, batch_count, queue);
+                           convertToMagmaComplex(alpha), castMAGMAComplex(a), lda,
+                           castMAGMAComplex(b), ldb, convertToMagmaComplex(beta),
+                           castMAGMAComplex(c), ldc, batch_count, queue);
   checkErrorsCudaDebug();
 }
 inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k,
@@ -163,8 +163,9 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
                                     int* ldc, const int batch_count, const magma_queue_t queue) {
   using util::castMAGMAComplex;
   magmablas_zgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
-                           *castMAGMAComplex(&alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b),
-                           ldb, *castMAGMAComplex(&beta), castMAGMAComplex(c), ldc, batch_count, queue);
+                           convertToMagmaComplex(alpha), castMAGMAComplex(a), lda,
+                           castMAGMAComplex(b), ldb, convertToMagmaComplex(beta),
+                           castMAGMAComplex(c), ldc, batch_count, queue);
   checkErrorsCudaDebug();
 }
 
@@ -202,8 +203,8 @@ inline void magmablas_gemm_vbatched_max_nocheck(
   using util::castMAGMAComplex;
   magmablas_cgemm_vbatched_max_nocheck(
       toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castMAGMAComplex(alpha),
-      castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb, *castMAGMAComplex(beta), castMAGMAComplex(c),
-      ldc, batch_count, m_max, n_max, k_max, queue);
+      castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb, *castMAGMAComplex(beta),
+      castMAGMAComplex(c), ldc, batch_count, m_max, n_max, k_max, queue);
   checkErrorsCudaDebug();
 }
 
@@ -214,9 +215,9 @@ inline void magmablas_gemm_vbatched_max_nocheck(
     const int m_max, const int n_max, const int k_max, magma_queue_t queue) {
   using util::castMAGMAComplex;
   magmablas_zgemm_vbatched_max_nocheck(
-      toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castMAGMAComplex(alpha),
-      castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb, *castMAGMAComplex(beta), castMAGMAComplex(c),
-      ldc, batch_count, m_max, n_max, k_max, queue);
+      toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, convertToMagmaType(alpha),
+      castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb, convertToMagmaType(beta),
+      castMAGMAComplex(c), ldc, batch_count, m_max, n_max, k_max, queue);
   checkErrorsCudaDebug();
 }
 
@@ -246,8 +247,9 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
                                    const int ldc, const int batch_count, const magma_queue_t queue) {
   using util::castMAGMAComplex;
   magmablas_cgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
-                          *castMAGMAComplex(alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb,
-                          *castMAGMAComplex(beta), castMAGMAComplex(c), ldc, batch_count, queue);
+                          convertToMagmaType(alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b),
+                          ldb, convertToMagmaType(beta), castMAGMAComplex(c), ldc, batch_count,
+                          queue);
   checkErrorsCudaDebug();
 }
 inline void magmablas_gemm_batched(const char transa, const char transb, const int m, const int n,
@@ -256,10 +258,10 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
                                    const std::complex<double>* const* b, const int ldb,
                                    const std::complex<double> beta, std::complex<double>** c,
                                    const int ldc, const int batch_count, const magma_queue_t queue) {
-  using util::castMAGMAComplex;
+  using dca::util::castMagmaType;
   magmablas_zgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
-                          *castMAGMAComplex(alpha), castMAGMAComplex(a), lda, castMAGMAComplex(b), ldb,
-                          *castMAGMAComplex(beta), castMAGMAComplex(c), ldc, batch_count, queue);
+                          convertToMagmaType(alpha), castMagmaType(a), lda, castMagmaType(b), ldb,
+                          convertToMagmaType(beta), castMagmaType(c), ldc, batch_count, queue);
   checkErrorsCudaDebug();
 }
 

diff --git a/include/dca/linalg/matrixop.hpp b/include/dca/linalg/matrixop.hpp
@@ -223,25 +223,36 @@ auto difference(const Matrix<Scalar, CPU, ALLOC>& a, const Matrix<Scalar, CPU, A
     s << std::endl;
     std::cout << s.str();
 #endif  // NDEBUG
-    std::cerr << "matrix difference in excess of threshold!\n";
+    std::cerr << "matrix difference (" << max_diff << ") in excess of threshold (" << diff_threshold
+              << ")!\n";
     throw std::logic_error(__FUNCTION__);
   }
 
   return max_diff;
 }
-template <typename Scalar, DeviceType device_name, class ALLOC>
-auto difference(const Matrix<Scalar, device_name>& a, const Matrix<Scalar, CPU, ALLOC>& b,
+
+template <typename Scalar, class ALLOC>
+auto difference(const Matrix<Scalar, GPU>& a, const Matrix<Scalar, CPU, ALLOC>& b,
                 double diff_threshold = 1e-3) {
   Matrix<Scalar, CPU, ALLOC> cp_a(a);
   return difference(cp_a, b, diff_threshold);
 }
-template <typename Scalar, DeviceType device_name_a, DeviceType device_name_b>
-auto difference(const Matrix<Scalar, device_name_a>& a, const Matrix<Scalar, device_name_b>& b,
+
+template <typename Scalar, class ALLOC>
+auto difference(const Matrix<Scalar, CPU, ALLOC>& a, const Matrix<Scalar, GPU>& b,
                 double diff_threshold = 1e-3) {
   Matrix<Scalar, CPU> cp_b(b);
   return difference(a, cp_b, diff_threshold);
 }
 
+template <typename Scalar>
+auto difference(const Matrix<Scalar, GPU>& a, const Matrix<Scalar, GPU>& b,
+                double diff_threshold = 1e-3) {
+  Matrix<Scalar, CPU> cp_a(a);
+  Matrix<Scalar, CPU> cp_b(b);
+  return difference(cp_a, cp_b, diff_threshold);
+}
+
 // Returns the real part of a matrix.
 // In: a
 // TODO test.
@@ -314,7 +325,7 @@ void inverse(MatrixType<Scalar, device_name, ALLOC<Scalar>>& mat, Vector<int, CP
                                         work.ptr(), lwork);
 }
 
-  template <typename Scalar, DeviceType device_name, template<typename> class ALLOC,
+template <typename Scalar, DeviceType device_name, template <typename> class ALLOC,
           template <typename, DeviceType, class> class MatrixType>
 void inverse(MatrixType<Scalar, device_name, ALLOC<Scalar>>& mat) {
   Vector<int, CPU> ipiv;

diff --git a/include/dca/linalg/util/cast_gpu.hpp b/include/dca/linalg/util/cast_gpu.hpp
@@ -18,14 +18,12 @@
 
 #include "dca/config/haves_defines.hpp"
 #include "dca/platform/dca_gpu_complex.h"
+#include "dca/util/type_mapping.hpp"
 
 #include <magma_v2.h>
-namespace dca {
-namespace linalg {
-namespace util {
-// dca::linalg::util::
+namespace dca::linalg::util {
 
-#if defined(DCA_HAVE_CUDA) 
+#if defined(DCA_HAVE_CUDA)
 // returns a cuComplex pointer.
 inline cuComplex** castCudaComplex(std::complex<float>** ptr) {
   return reinterpret_cast<cuComplex**>(ptr);
@@ -139,16 +137,16 @@ inline const magmaFloatComplex* const* castMAGMAComplex(const std::complex<float
 }
 inline const magmaFloatComplex* castMAGMAComplex(const std::complex<float>* ptr) {
   return reinterpret_cast<const magmaFloatComplex*>(ptr);
-}  
+}
 inline const magmaFloatComplex* castMAGMAComplex(const std::complex<float>& el) {
   return castMAGMAComplex(&el);
 }
 
 #ifdef DCA_HAVE_CUDA
-  #define cublasDoubleComplex  cuDoubleComplex
-  #define cublasComplex        cuComplex
+#define cublasDoubleComplex cuDoubleComplex
+#define cublasComplex cuComplex
 #endif
-  
+
 inline cublasDoubleComplex** castCUBLASComplex(std::complex<double>** ptr) {
   return reinterpret_cast<cublasDoubleComplex**>(ptr);
 }
@@ -181,11 +179,10 @@ inline const cublasComplex* const* castCUBLASComplex(const std::complex<float>*
 }
 inline const cublasComplex* castCUBLASComplex(const std::complex<float>* ptr) {
   return reinterpret_cast<const cublasComplex*>(ptr);
-}  
+}
 inline const cublasComplex* castCUBLASComplex(const std::complex<float>& el) {
   return castCUBLASComplex(&el);
 }
-
 
 // Provides a templated typedef.
 namespace details {
@@ -201,13 +198,46 @@ struct ComplexContainer<float> {
   using type = cuComplex;
 };
 }  // namespace details
-// dca::linalg::util::
-
 template <typename Real>
 using CudaComplex = typename details::ComplexContainer<Real>::type;
+}  // namespace dca::linalg::util
+
+inline double2 convertToMagmaType(std::complex<double> var) {
+  return {reinterpret_cast<double (&)[2]>(var)[0], reinterpret_cast<double (&)[2]>(var)[1]};
+}
+
+inline float2 convertToMagmaType(std::complex<float> var) {
+  return {reinterpret_cast<float (&)[2]>(var)[0], reinterpret_cast<float (&)[2]>(var)[1]};
+}
+
+namespace dca::util {
+template <typename T>
+using MAGMATypeMap = typename std::disjunction<
+    OnTypesEqual<T, float, float>, OnTypesEqual<T, double, double>, OnTypesEqual<T, float*, float*>,
+    OnTypesEqual<T, double*, double*>, OnTypesEqual<T, const float*, const float*>,
+    OnTypesEqual<T, const double*, const double*>, OnTypesEqual<T, float**, float**>,
+    OnTypesEqual<T, const float**, const float**>, OnTypesEqual<T, double**, double**>,
+    OnTypesEqual<T, const double**, const double**>,
+    OnTypesEqual<T, std::complex<double>*, magmaDoubleComplex*>,
+    OnTypesEqual<T, std::complex<float>**, magmaFloatComplex**>,
+    OnTypesEqual<T, std::complex<double>**, magmaDoubleComplex**>,
+    OnTypesEqual<T, std::complex<float>*, magmaFloatComplex*>,
+    OnTypesEqual<T, float2, magmaFloatComplex>, OnTypesEqual<T, double2, magmaDoubleComplex>,
+    OnTypesEqual<T, const std::complex<double>*, const magmaDoubleComplex*>,
+    OnTypesEqual<T, const std::complex<float>*, const magmaFloatComplex*>,
+    OnTypesEqual<T, const std::complex<double>&, const magmaDoubleComplex&>,
+    OnTypesEqual<T, const std::complex<float>&, const magmaFloatComplex&>,
+    OnTypesEqual<T, const std::complex<float>**, const magmaFloatComplex**>,
+    OnTypesEqual<T, const std::complex<double>**, const magmaDoubleComplex**>,
+    OnTypesEqual<T, const std::complex<float>* const*, const magmaFloatComplex* const*>,
+    OnTypesEqual<T, const std::complex<double>* const*, const magmaDoubleComplex* const*>,
+    default_type<void>>::type;
+
+template <typename T>
+__device__ __host__ MAGMATypeMap<T> castMagmaType(T var) {
+  return reinterpret_cast<MAGMATypeMap<T>>(var);
+}
 
-}  // namespace util
-}  // namespace linalg
-}  // namespace dca
+}  // namespace dca::util
 
 #endif  // DCA_LINALG_UTIL_CAST_CUDA_HPP
diff --git a/test/unit/linalg/CMakeLists.txt b/test/unit/linalg/CMakeLists.txt
@@ -41,7 +41,7 @@ dca_add_gtest(matrixop_cpu_gpu_test
 dca_add_gtest(matrixop_real_gpu_test
               GTEST_MAIN
               CUDA
-              LIBS ${DCA_LIBS} lapack gpu_utils magma::magma lapack_kernels blas_kernels)
+              LIBS ${DCA_LIBS} lapack gpu_utils magma::magma BLAS::BLAS lapack_kernels blas_kernels)
 
 # lapack_kernels blas_kernels  lapack_kernels
 dca_add_gtest(matrixop_complex_gpu_test

diff --git a/test/unit/linalg/matrixop_cpu_gpu_test.cpp b/test/unit/linalg/matrixop_cpu_gpu_test.cpp
@@ -18,6 +18,7 @@
 #include "dca/testing/gtest_h_w_warning_blocking.h"
 #include "dca/linalg/blas/blas3.hpp"
 #include "dca/linalg/matrix.hpp"
+#include "dca/linalg/util/allocators/pinned_allocator.hpp"
 #include "cpu_test_util.hpp"
 #include "gpu_test_util.hpp"
 
@@ -28,14 +29,14 @@ TEST(MatrixopCPUGPUTest, difference) {
 
   auto val_a = [](int i, int j) { return 10 * i + j; };
 
-  dca::linalg::Matrix<double, dca::linalg::CPU> a(size2_a);
+  dca::linalg::Matrix<double, dca::linalg::CPU, dca::linalg::util::PinnedAllocator<double>> a(size2_a);
   testing::setMatrixElements(a, val_a);
   dca::linalg::Matrix<double, dca::linalg::GPU> da(a);
 
   for (int sg : {1, -1})
     for (int ia : {0, 1, 4})
       for (int ja : {0, 2, 3}) {
-        dca::linalg::Matrix<double, dca::linalg::CPU> b(a);
+        dca::linalg::Matrix<double, dca::linalg::CPU, dca::linalg::util::PinnedAllocator<double>> b(a);
         b(ia, ja) += sg * diff;
         double err = std::abs(epsilon * b(ia, ja));
 

diff --git a/test/unit/linalg/matrixop_real_gpu_test.cpp b/test/unit/linalg/matrixop_real_gpu_test.cpp
@@ -31,7 +31,8 @@ class MatrixopRealGPUTest : public ::testing::Test {
   static const ScalarType epsilon;
 };
 template <typename ScalarType>
-const ScalarType MatrixopRealGPUTest<ScalarType>::epsilon = std::numeric_limits<ScalarType>::epsilon();
+const ScalarType MatrixopRealGPUTest<ScalarType>::epsilon =
+    std::numeric_limits<ScalarType>::epsilon();
 
 typedef ::testing::Types<float, double> FloatingPointTypes;
 TYPED_TEST_CASE(MatrixopRealGPUTest, FloatingPointTypes);
@@ -765,22 +766,26 @@ TEST(MatrixopGPUTest, Difference) {
 
   auto val_a = [](int i, int j) { return 10 * i + j; };
 
-  dca::linalg::Matrix<double, dca::linalg::CPU> a(size2_a);
+  dca::linalg::Matrix<double, dca::linalg::CPU, dca::linalg::util::PinnedAllocator<double>> a(size2_a);
   testing::setMatrixElements(a, val_a);
   dca::linalg::Matrix<double, dca::linalg::GPU> da(a);
 
   for (int sg : {1, -1})
     for (int ia : {0, 1, 4})
       for (int ja : {0, 2, 3}) {
-        dca::linalg::Matrix<double, dca::linalg::CPU> b(a);
+        dca::linalg::Matrix<double, dca::linalg::CPU, dca::linalg::util::PinnedAllocator<double>> b(a);
         b(ia, ja) += sg * diff;
         double err = std::abs(epsilon * b(ia, ja));
         dca::linalg::Matrix<double, dca::linalg::GPU> db(b);
-
+        // To make this clear the difference calls are expected to show
+        // differences!
         EXPECT_NEAR(diff, dca::linalg::matrixop::difference(da, db, 2 * diff), err);
         EXPECT_NEAR(diff, dca::linalg::matrixop::difference(da, db, diff + err), err);
         auto diffcalc = dca::linalg::matrixop::difference(da, db, 2 * diff);
         EXPECT_NEAR(diff, dca::linalg::matrixop::difference(da, db, diffcalc), err);
+        // This will result on output even though we expect and want a
+        // throw.
+        std::cerr << "difference output expected below\n";
         EXPECT_THROW(dca::linalg::matrixop::difference(da, db, diffcalc - err), std::logic_error);
       }
 }
diff --git a/test/unit/linalg/util/CMakeLists.txt b/test/unit/linalg/util/CMakeLists.txt
@@ -29,7 +29,7 @@ if(DCA_HAVE_CUDA)
 dca_add_gtest(complex_op_cuda_test
   GTEST_MAIN
   CUDA
-  LIBS ${DCA_GPU_LIBS})
+  LIBS ${DCA_GPU_LIBS} magma::magma BLAS::BLAS)
 
 dca_gpu_runtime_link(complex_op_cuda_test)
 endif()
diff --git a/test/unit/linalg/util/complex_op_cuda_test.cpp b/test/unit/linalg/util/complex_op_cuda_test.cpp
@@ -13,23 +13,24 @@
 #include "dca/linalg/util/complex_operators_cuda.cu.hpp"
 #include <stdexcept>
 #include "dca/testing/gtest_h_w_warning_blocking.h"
+#include <cstdint>
 
 TEST(ComplexOpCuda, Assign) {
   double2 d1_a{0.0, 0.0};
-  double2 d2_a{1.0,2.0};
-  dca::linalg::assign(d1_a,d2_a);
+  double2 d2_a{1.0, 2.0};
+  dca::linalg::assign(d1_a, d2_a);
   EXPECT_EQ(d1_a.x, d2_a.x);
   EXPECT_EQ(d1_a.y, d2_a.y);
 
-  std::complex<double> c1{1.3,2.4};
-  dca::linalg::assign(d1_a,c1);
+  std::complex<double> c1{1.3, 2.4};
+  dca::linalg::assign(d1_a, c1);
   EXPECT_EQ(d1_a.x, c1.real());
   EXPECT_EQ(d1_a.y, c1.imag());
 
-  std::complex<double> c2{0.0,0.5};
+  std::complex<double> c2{0.0, 0.5};
   dca::linalg::assign(c2, d1_a);
   EXPECT_EQ(c2.real(), c1.real());
-  EXPECT_EQ(c2.imag(),c1.imag());
+  EXPECT_EQ(c2.imag(), c1.imag());
 
   std::int8_t i81 = 1;
   dca::linalg::assign(d1_a, i81);

diff --git a/test/unit/phys/parameters/four_point_parameters/CMakeLists.txt b/test/unit/phys/parameters/four_point_parameters/CMakeLists.txt
@@ -3,4 +3,4 @@
 # why this test pulls in a bunch of magma references is a mystery
 dca_add_gtest(four_point_parameters_test
   GTEST_MAIN
-  LIBS json enumerations ${DCA_GPU_LIBS})
+  LIBS json enumerations ${DCA_GPU_LIBS} magma::magma)