From c4f0c747c28a2e4bfe505b2f6966bac8c1125f5b Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Sat, 16 Aug 2025 22:38:33 -0400 Subject: [PATCH 01/36] new async self-energy update in qkpt strems --- src/cu_routines.cu | 9 ++- src/cugw_qpt.cu | 102 ++++++++++++++++++++++++++++--- src/green/gpu/cugw_qpt.h | 126 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 226 insertions(+), 11 deletions(-) diff --git a/src/cu_routines.cu b/src/cu_routines.cu index 4542006..809bc59 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -266,19 +266,20 @@ namespace green::gpu { r2(k, k1, k1_reduced_id, k_vector, V_Qim, Vk1k2_Qij, Gk1_stij, need_minus_k1); - gw_qkpt* qkpt = obtain_idle_qkpt(qkpts); + gw_qkpt* qkpt = obtain_idle_qkpt_for_sigma(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C); + qkpt->set_k_red_id(k_reduced_id); if (_low_device_memory) { if (!_X2C) { qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); qkpt->compute_second_tau_contraction(Sigmak_stij.data(), qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); - copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns); + // copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns); } else { // In 2cGW, G(-k) = G*(k) has already been addressed in r2() qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, false); qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(), qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); - copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts); + // copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts); } } else { qkpt->set_up_qkpt_second(nullptr, V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); @@ -288,6 +289,7 @@ namespace green::gpu { } } } + wait_and_clean_qkpts(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C); cudaDeviceSynchronize(); if (!_low_device_memory and !_X2C) { 
copy_Sigma_from_device_to_host(sigma_kstij_device, Sigma_tskij_host.data(), _ink, _nao, _nts, _ns); @@ -303,6 +305,7 @@ namespace green::gpu { } } } + template void cugw_utils::copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_4tij, int k, int nts) { size_t nao = Sigmak_4tij.shape()[3]; diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index a366155..715fae8 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -320,7 +320,7 @@ namespace green::gpu { g_ktij_(g_ktij), g_kmtij_(g_kmtij), sigma_ktij_(sigma_ktij), sigma_k_locks_(sigma_k_locks), nao_(nao), nao2_(nao * nao), nao3_(nao2_ * nao), naux_(naux), naux2_(naux * naux), nauxnao_(naux * nao), nauxnao2_(naux * nao * nao), ns_(ns), nt_(nt), nt_batch_(nt_batch), ntnaux_(nt * naux), ntnaux2_(nt * naux * naux), ntnao_(nt * nao), ntnao2_(nt * nao2_), - handle_(handle) { + handle_(handle), cleanup_req_(false) { _low_memory_requirement = (g_ktij == nullptr) ? true : false; if (cudaStreamCreate(&stream_) != cudaSuccess) throw std::runtime_error("main stream creation failed"); @@ -349,7 +349,9 @@ namespace green::gpu { throw std::runtime_error("failure allocating Gk_tsij on host"); if (cudaMallocHost(&Gk_smtij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) throw std::runtime_error("failure allocating Gk_tsij on host"); - Sigmak_stij_buffer_ = Gk_smtij_buffer_; + if (cudaMallocHost(&Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) + throw std::runtime_error("failure allocating Gk_tsij on host"); + // Sigmak_stij_buffer_ = Gk_smtij_buffer_; } if (cudaMalloc(&Pqk0_tQP_local_, nt_batch_ * naux2_ * sizeof(cuda_complex)) != cudaSuccess) @@ -383,6 +385,9 @@ namespace green::gpu { cudaFreeHost(Gk1_stij_buffer_); cudaFreeHost(Gk_smtij_buffer_); } + if require_cleanup() { + throw std::runtime_error("cleanup of self-energy was not done correctly."); + } } template @@ -525,6 +530,42 @@ namespace green::gpu { release_lock<<<1, 1, 0, stream_>>>(Pqk0_tQP_lock); } + template + void 
gw_qkpt::async_compute_second_tau_contraction(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP, + ztensor<5>& Sigma_tskij_host) { + cuda_complex one = cu_type_map::cast(1., 0.); + cuda_complex zero = cu_type_map::cast(0., 0.); + cuda_complex m1 = cu_type_map::cast(-1., 0.); + cuda_complex* Y1t_Qin = X1t_tmQ_; // name change, reuse memory + cuda_complex* Y2t_inP = X2t_Ptm_; // name change, reuse memory + cublasSetStream(*handle_, stream_); + for (int s = 0; s < ns_; ++s) { + for (int t = 0; t < nt_; t += nt_batch_) { + int st = s * nt_ + t; + int nt_mult = std::min(nt_batch_, nt_ - t); + // Y1_Qin = V_Qim * G1_mn; G1_mn = G^{k1}(t)_mn + if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_N, nao_, nauxnao_, nao_, &one, g_stij_ + st * nao2_, nao_, + nao2_, V_Qim_, nao_, 0, &zero, Y1t_Qin, nao_, nauxnao2_, nt_mult) != CUBLAS_STATUS_SUCCESS) { + throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); + } + // Y2_inP = Y1_Qin * Pq_QP + if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_T, naux_, nao2_, naux_, &one, Pqk_tQP + t * naux2_, naux_, + naux2_, Y1t_Qin, nao2_, nauxnao2_, &zero, Y2t_inP, naux_, nauxnao2_, + nt_mult) != CUBLAS_STATUS_SUCCESS) { + throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); + } + // Sigma_ij = Y2_inP V_nPj + if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_N, nao_, nao_, nauxnao_, &m1, V_nPj_, nao_, 0, Y2t_inP, + nauxnao_, nauxnao2_, &zero, sigmak_stij_ + st * nao2_, nao_, nao2_, + nt_mult) != CUBLAS_STATUS_SUCCESS) { + throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); + } + } + } + write_sigma(_low_memory_requirement, Sigmak_stij_host); + cudaEventRecord(all_done_event_); + } + template void gw_qkpt::compute_second_tau_contraction(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) { cuda_complex one = cu_type_map::cast(1., 0.); @@ -598,7 +639,7 @@ namespace green::gpu 
{ } template - void gw_qkpt::write_sigma(bool low_memory_mode, cxx_complex* Sigmak_stij_host) { + void gw_qkpt::write_sigma(bool low_memory_mode) { // write results. Make sure we have exclusive write access to sigma, then add array sigmak_tij to sigma_ktij acquire_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); scalar_t one = 1.; @@ -609,14 +650,61 @@ namespace green::gpu { } } else { // Copy sigmak_stij_ back to CPU - if (Sigmak_stij_host == nullptr) - throw std::runtime_error("gw_qkpt.write_sigma(): Sigmak_stij_host cannot be a null pointer"); - cudaMemcpy(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost); - std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); + cudaMemcpyAsync(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost, stream_); + cleanup_req_ = true; + // std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); } release_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); } + + template + void gw_qkpt::cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c) { + // Block the stream until all the tasks are completed + if require_cleanup() { + cudaStreamSynchronize(stream_); + // TODO: Why do we have another intermediate for Sigma(k)? + // I iunderstand that this is not the cuda malloc buffer, but simply a normal one. 
+ std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); + if (!x2c) { + copy_Sigma(Sigma_tskij_host, Sigmak_stij_host); + } else { + copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij_host); + } + cleanup_req_ = false; + } + } + + template + void gw_qkpt::copy_Sigma(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij) { + for (size_t t = 0; t < nt_; ++t) { + for (size_t s = 0; s < ns_; ++s) { + matrix(Sigma_tskij_host(t, s, k_red_id_)) += matrix(Sigmak_stij(s, t)).template cast>(); + } + } + } + + template + void gw_qkpt::copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_4tij) {} + size_t nao = Sigmak_4tij.shape()[3]; + size_t nso = 2 * nao; + MatrixXcf Sigma_ij(nso, nso); + for (size_t ss = 0; ss < 3; ++ss) { + size_t a = (ss % 2 == 0) ? 0 : 1; + size_t b = ((ss + 1) / 2 != 1) ? 0 : 1; + size_t i_shift = a * nao; + size_t j_shift = b * nao; + for (size_t t = 0; t < nt_; ++t) { + matrix(Sigma_tskij_host(t, 0, k_red_id_)).block(i_shift, j_shift, nao, nao) += + matrix(Sigmak_4tij(ss, t)).template cast>(); + if (ss == 2) { + matrix(Sigma_tskij_host(t, 0, k_red_id_)).block(j_shift, i_shift, nao, nao) += + matrix(Sigmak_4tij(ss, t)).conjugate().transpose().template cast>(); + } + } + } + } + template bool gw_qkpt::is_busy() { cudaError_t stream_status = cudaStreamQuery(stream_); diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index 292ac48..debc3b8 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -262,6 +262,25 @@ namespace green::gpu { */ void write_P0(int t, cuda_complex* Pqk0_tQP, int* Pqk0_tQP_lock); + /** + * \brief Using dressed GW polarization compute self-energy at a given momentum point + * + * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point + * \param Pqk_tQP Dressed polarization bubble + * \param Sigma_tskij_host host copy of full self-energy for MPI task assigned to the current device + */ + void 
async_compute_second_tau_contraction(cxx_complex* Sigma_stij_host=nullptr, + cuda_complex* Pqk_tQP=nullptr, ztensor<5>& Sigma_tskij_host); + /** + * \brief Using dressed GW polarization compute self-energy at a given momentum point (X2C version) + * + * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point + * \param Pqk_tQP Dressed polarization bubble + * \param Sigma_tskij_host host copy of full self-energy for MPI task assigned to the current device + */ + void async_compute_second_tau_contraction_2C(cxx_complex* Sigma_stij_host=nullptr, + cuda_complex* Pqk_tQP=nullptr, ztensor<5>& Sigma_tskij_host); + /** * \brief Using dressed GW polarization compute self-energy at a given momentum point * @@ -279,9 +298,43 @@ namespace green::gpu { /** * \brief For a given k-point copy self-energy back to a host memory * \param low_memory_mode - whether the whole self-energy allocated in memory or not + */ + void write_sigma(bool low_memory_mode = false); + + /** + * \brief return the status of copy_selfenergy from device to host + * + * \return false - not required, stream ready for next calculation + * \return true - required + */ + bool require_cleanup(){ + return cleanup_req_; + } + + /** + * \brief perform cleanup, i.e. 
copy data from Sigmak buffer (4-index array for a given momentum point) to Host shared memory Self-energy + * + * \param low_memory_mode - whether the whole self-energy allocated in memory or not * \param Sigmak_stij_host - Host stored self-energy object at a given momentum point + * \param Sigma_tskij_host - Host stored full self-energy object for each device + * \param x2c - use x2c specific functions or not + */ + void cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c); + + /** + * \brief + * + */ + void copy_Sigma(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij) + + /** + * \brief + * + * \param Sigma_tskij_host + * \param Sigmak_stij + * \param k_id */ - void write_sigma(bool low_memory_mode = false, cxx_complex* Sigmak_stij_host = nullptr); + void copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij) /** * \brief Check if cuda devices are budy @@ -289,6 +342,15 @@ namespace green::gpu { */ bool is_busy(); + /** + * \brief Set the k_red_id_ object + * + * \param k + */ + void set_k_red_id(int k) { + k_red_id_ = k; + } + // static std::size_t size(size_t nao, size_t naux, size_t nt, size_t nt_batch, size_t ns) { return (2 * naux * nao * nao // V_Qpm+V_pmQ @@ -374,6 +436,14 @@ namespace green::gpu { // pointer to cublas handle cublasHandle_t* handle_; + + // irreducible k-pt assigned to the qkpt stream + int k_red_id_; + + // status of data transfer / copy from Device to Host. 
+ // false: not required, stream ready for next calculation + // true: required, stream occupied + bool cleanup_req_ = false; }; template @@ -387,4 +457,58 @@ namespace green::gpu { return qkpts[pos]; } + /** + * \brief returns an idle qkpt stream, otherwise waits until a stream is available + * + * \tparam prec - precision for calculation + * \param qkpts - vector of qkpt workers (gw_qkpt type) + * \param low_memory_mode - low memory mode for read/write integrals + * \param Sigmak_stij_host - cudaMallocHost buffer for transfering Sigma + * \return gw_qkpt* - pointer to idle qkpt + */ + template + gw_qkpt* obtain_idle_qkpt_for_sigma(std::vector*>& qkpts, bool low_memory_mode, + tensor,4>& Sigmak_stij_host, + ztensor<5>& Sigma_tskij_host, bool x2c) { + static int pos = 0; + pos++; + if (pos >= qkpts.size()) pos = 0; + while (qkpts[pos]->is_busy()) { + pos = (pos + 1) % qkpts.size(); + } + qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host, Sigma_tskij_host, x2c); + return qkpts[pos]; + } + + /** + * \brief waits for all qkpts to complete and cleans them up + * + * \tparam prec - precision for calculation + * \param qkpts - vector of qkpt workers (gw_qkpt type) + * \param low_memory_mode - low memory mode for read/write integrals + * \param Sigmak_stij_host - cudaMallocHost buffer for transfering Sigma + */ + template + void wait_and_clean_qkpts(std::vector*>& qkpts, bool low_memory_mode, + tensor,4>& Sigmak_stij_host, + ztensor<5>& Sigma_tskij_shared, bool x2c) { + if (!utils::context.global_rank) { + std::cout << "\nDEBUG: Waiting for all qkpts to finish working\n" << std::endl; + } + static int pos = 0; + pos++; + if (pos >= qkpts.size()) pos = 0; + for (pos = 0; pos < qkpts.size(); pos++) { + // wait for qkpt to finish its tasks, then cleanup + while (qkpts[pos]->is_busy()) { + continue; + } + qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host, Sigma_tskij_shared, x2c); + } + if (!utils::context.global_rank) { + std::cout << "\nDEBUG: Cleanup of qkpts 
completed\n" << std::endl; + } + return; + } + } // namespace green::gpu From 1e7601354c86b533dc52d1c958100d23f488154e Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Sun, 17 Aug 2025 10:54:31 -0400 Subject: [PATCH 02/36] fix minor bugs --- src/cugw_qpt.cu | 78 ++++++++++++++++++++-------------------- src/green/gpu/cugw_qpt.h | 52 +++++++++++++-------------- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index 715fae8..621b457 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -385,7 +385,7 @@ namespace green::gpu { cudaFreeHost(Gk1_stij_buffer_); cudaFreeHost(Gk_smtij_buffer_); } - if require_cleanup() { + if (require_cleanup()) { throw std::runtime_error("cleanup of self-energy was not done correctly."); } } @@ -530,41 +530,41 @@ namespace green::gpu { release_lock<<<1, 1, 0, stream_>>>(Pqk0_tQP_lock); } - template - void gw_qkpt::async_compute_second_tau_contraction(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP, - ztensor<5>& Sigma_tskij_host) { - cuda_complex one = cu_type_map::cast(1., 0.); - cuda_complex zero = cu_type_map::cast(0., 0.); - cuda_complex m1 = cu_type_map::cast(-1., 0.); - cuda_complex* Y1t_Qin = X1t_tmQ_; // name change, reuse memory - cuda_complex* Y2t_inP = X2t_Ptm_; // name change, reuse memory - cublasSetStream(*handle_, stream_); - for (int s = 0; s < ns_; ++s) { - for (int t = 0; t < nt_; t += nt_batch_) { - int st = s * nt_ + t; - int nt_mult = std::min(nt_batch_, nt_ - t); - // Y1_Qin = V_Qim * G1_mn; G1_mn = G^{k1}(t)_mn - if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_N, nao_, nauxnao_, nao_, &one, g_stij_ + st * nao2_, nao_, - nao2_, V_Qim_, nao_, 0, &zero, Y1t_Qin, nao_, nauxnao2_, nt_mult) != CUBLAS_STATUS_SUCCESS) { - throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); - } - // Y2_inP = Y1_Qin * Pq_QP - if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_T, naux_, nao2_, naux_, &one, Pqk_tQP + t * 
naux2_, naux_, - naux2_, Y1t_Qin, nao2_, nauxnao2_, &zero, Y2t_inP, naux_, nauxnao2_, - nt_mult) != CUBLAS_STATUS_SUCCESS) { - throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); - } - // Sigma_ij = Y2_inP V_nPj - if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_N, nao_, nao_, nauxnao_, &m1, V_nPj_, nao_, 0, Y2t_inP, - nauxnao_, nauxnao2_, &zero, sigmak_stij_ + st * nao2_, nao_, nao2_, - nt_mult) != CUBLAS_STATUS_SUCCESS) { - throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); - } - } - } - write_sigma(_low_memory_requirement, Sigmak_stij_host); - cudaEventRecord(all_done_event_); - } + // template + // void gw_qkpt::async_compute_second_tau_contraction(ztensor<5>& Sigma_tskij_host, cxx_complex* Sigmak_stij_host, + // cuda_complex* Pqk_tQP) {} + // cuda_complex one = cu_type_map::cast(1., 0.); + // cuda_complex zero = cu_type_map::cast(0., 0.); + // cuda_complex m1 = cu_type_map::cast(-1., 0.); + // cuda_complex* Y1t_Qin = X1t_tmQ_; // name change, reuse memory + // cuda_complex* Y2t_inP = X2t_Ptm_; // name change, reuse memory + // cublasSetStream(*handle_, stream_); + // for (int s = 0; s < ns_; ++s) { + // for (int t = 0; t < nt_; t += nt_batch_) { + // int st = s * nt_ + t; + // int nt_mult = std::min(nt_batch_, nt_ - t); + // // Y1_Qin = V_Qim * G1_mn; G1_mn = G^{k1}(t)_mn + // if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_N, nao_, nauxnao_, nao_, &one, g_stij_ + st * nao2_, nao_, + // nao2_, V_Qim_, nao_, 0, &zero, Y1t_Qin, nao_, nauxnao2_, nt_mult) != CUBLAS_STATUS_SUCCESS) { + // throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); + // } + // // Y2_inP = Y1_Qin * Pq_QP + // if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_T, naux_, nao2_, naux_, &one, Pqk_tQP + t * naux2_, naux_, + // naux2_, Y1t_Qin, nao2_, nauxnao2_, &zero, Y2t_inP, naux_, nauxnao2_, + // nt_mult) != 
CUBLAS_STATUS_SUCCESS) { + // throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); + // } + // // Sigma_ij = Y2_inP V_nPj + // if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_N, nao_, nao_, nauxnao_, &m1, V_nPj_, nao_, 0, Y2t_inP, + // nauxnao_, nauxnao2_, &zero, sigmak_stij_ + st * nao2_, nao_, nao2_, + // nt_mult) != CUBLAS_STATUS_SUCCESS) { + // throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); + // } + // } + // } + // write_sigma(_low_memory_requirement, Sigmak_stij_host); + // cudaEventRecord(all_done_event_); + // } template void gw_qkpt::compute_second_tau_contraction(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) { @@ -597,7 +597,7 @@ namespace green::gpu { } } } - write_sigma(_low_memory_requirement, Sigmak_stij_host); + write_sigma(_low_memory_requirement); cudaEventRecord(all_done_event_); } @@ -634,7 +634,7 @@ namespace green::gpu { } } } - write_sigma(true, Sigmak_stij_host); + write_sigma(true); cudaEventRecord(all_done_event_); } @@ -661,7 +661,7 @@ namespace green::gpu { template void gw_qkpt::cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c) { // Block the stream until all the tasks are completed - if require_cleanup() { + if (require_cleanup()) { cudaStreamSynchronize(stream_); // TODO: Why do we have another intermediate for Sigma(k)? // I iunderstand that this is not the cuda malloc buffer, but simply a normal one. 
diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index debc3b8..9f541c9 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -262,24 +262,24 @@ namespace green::gpu { */ void write_P0(int t, cuda_complex* Pqk0_tQP, int* Pqk0_tQP_lock); - /** - * \brief Using dressed GW polarization compute self-energy at a given momentum point - * - * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point - * \param Pqk_tQP Dressed polarization bubble - * \param Sigma_tskij_host host copy of full self-energy for MPI task assigned to the current device - */ - void async_compute_second_tau_contraction(cxx_complex* Sigma_stij_host=nullptr, - cuda_complex* Pqk_tQP=nullptr, ztensor<5>& Sigma_tskij_host); - /** - * \brief Using dressed GW polarization compute self-energy at a given momentum point (X2C version) - * - * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point - * \param Pqk_tQP Dressed polarization bubble - * \param Sigma_tskij_host host copy of full self-energy for MPI task assigned to the current device - */ - void async_compute_second_tau_contraction_2C(cxx_complex* Sigma_stij_host=nullptr, - cuda_complex* Pqk_tQP=nullptr, ztensor<5>& Sigma_tskij_host); + // /** + // * \brief Using dressed GW polarization compute self-energy at a given momentum point + // * + // * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point + // * \param Pqk_tQP Dressed polarization bubble + // * \param Sigma_tskij_host host copy of full self-energy for MPI task assigned to the current device + // */ + // void async_compute_second_tau_contraction(ztensor<5>& Sigma_tskij_host, cxx_complex* Sigma_stij_host=nullptr, + // cuda_complex* Pqk_tQP=nullptr); + // /** + // * \brief Using dressed GW polarization compute self-energy at a given momentum point (X2C version) + // * + // * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point + // * \param 
Pqk_tQP Dressed polarization bubble + // * \param Sigma_tskij_host host copy of full self-energy for MPI task assigned to the current device + // */ + // void async_compute_second_tau_contraction_2C(ztensor<5>& Sigma_tskij_host, cxx_complex* Sigma_stij_host=nullptr, + // cuda_complex* Pqk_tQP=nullptr); /** * \brief Using dressed GW polarization compute self-energy at a given momentum point @@ -325,7 +325,7 @@ namespace green::gpu { * \brief * */ - void copy_Sigma(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij) + void copy_Sigma(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij); /** * \brief @@ -334,7 +334,7 @@ namespace green::gpu { * \param Sigmak_stij * \param k_id */ - void copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij) + void copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij); /** * \brief Check if cuda devices are budy @@ -492,9 +492,9 @@ namespace green::gpu { void wait_and_clean_qkpts(std::vector*>& qkpts, bool low_memory_mode, tensor,4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_shared, bool x2c) { - if (!utils::context.global_rank) { - std::cout << "\nDEBUG: Waiting for all qkpts to finish working\n" << std::endl; - } + // if (!utils::context.global_rank) { + // std::cout << "\nDEBUG: Waiting for all qkpts to finish working\n" << std::endl; + // } static int pos = 0; pos++; if (pos >= qkpts.size()) pos = 0; @@ -505,9 +505,9 @@ namespace green::gpu { } qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host, Sigma_tskij_shared, x2c); } - if (!utils::context.global_rank) { - std::cout << "\nDEBUG: Cleanup of qkpts completed\n" << std::endl; - } + // if (!utils::context.global_rank) { + // std::cout << "\nDEBUG: Cleanup of qkpts completed\n" << std::endl; + // } return; } From b386b6d74cd96217eefa0abc9c7bdd211213003a Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Sun, 17 Aug 2025 12:17:35 -0400 Subject: [PATCH 03/36] more bugs --- src/cugw_qpt.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 
deletions(-) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index 621b457..6026c64 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -659,13 +659,13 @@ namespace green::gpu { template - void gw_qkpt::cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c) { + void gw_qkpt::cleanup(bool low_memory_mode, tensor, 4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c) { // Block the stream until all the tasks are completed if (require_cleanup()) { cudaStreamSynchronize(stream_); // TODO: Why do we have another intermediate for Sigma(k)? // I iunderstand that this is not the cuda malloc buffer, but simply a normal one. - std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); + std::memcpy(Sigmak_stij_host.data(), Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); if (!x2c) { copy_Sigma(Sigma_tskij_host, Sigmak_stij_host); } else { @@ -685,8 +685,8 @@ namespace green::gpu { } template - void gw_qkpt::copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_4tij) {} - size_t nao = Sigmak_4tij.shape()[3]; + void gw_qkpt::copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij) { + size_t nao = Sigmak_stij.shape()[3]; size_t nso = 2 * nao; MatrixXcf Sigma_ij(nso, nso); for (size_t ss = 0; ss < 3; ++ss) { @@ -696,10 +696,10 @@ namespace green::gpu { size_t j_shift = b * nao; for (size_t t = 0; t < nt_; ++t) { matrix(Sigma_tskij_host(t, 0, k_red_id_)).block(i_shift, j_shift, nao, nao) += - matrix(Sigmak_4tij(ss, t)).template cast>(); + matrix(Sigmak_stij(ss, t)).template cast>(); if (ss == 2) { matrix(Sigma_tskij_host(t, 0, k_red_id_)).block(j_shift, i_shift, nao, nao) += - matrix(Sigmak_4tij(ss, t)).conjugate().transpose().template cast>(); + matrix(Sigmak_stij(ss, t)).conjugate().transpose().template cast>(); } } } From 5fa22129c4e37e50437f00aedb258d19fb788738 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Sun, 17 Aug 2025 12:45:41 -0400 Subject: [PATCH 
04/36] update definition of the cleanup function --- src/green/gpu/cugw_qpt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index 9f541c9..85c5e2c 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -319,7 +319,7 @@ namespace green::gpu { * \param Sigma_tskij_host - Host stored full self-energy object for each device * \param x2c - use x2c specific functions or not */ - void cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c); + void cleanup(bool low_memory_mode, tensor, 4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c); /** * \brief From f8bc8ec3bd5d9e164e66e76dfd4af3a32a10345a Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Sun, 17 Aug 2025 13:15:22 -0400 Subject: [PATCH 05/36] add profile arguments --- src/cu_routines.cu | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/cu_routines.cu b/src/cu_routines.cu index 809bc59..8cdd4d5 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -21,6 +21,30 @@ #include +// #ifdef USE_NVTX +#include "nvtx3/nvToolsExt.h" + +const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; +const int num_colors = sizeof(colors)/sizeof(uint32_t); + +#define PUSH_RANGE(name,cid) { \ + int color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ +} +#define POP_RANGE nvtxRangePop(); +// #else +// #define PUSH_RANGE(name,cid) +// #define POP_RANGE +// #endif + __global__ void initialize_array(cuDoubleComplex* array, cuDoubleComplex 
value, int count) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= count) return; @@ -213,6 +237,7 @@ namespace green::gpu { int verbose, irre_pos_callback& irre_pos, mom_cons_callback& momentum_conservation, gw_reader1_callback& r1, gw_reader2_callback& r2) { // this is the main GW loop + if (!_devices_rank) PUSH_RANGE("Solve in CU_ROUTINES begin", 0); if (!_devices_rank && verbose > 0) std::cout << "GW main loop" << std::endl; qpt.verbose() = verbose; @@ -220,6 +245,7 @@ namespace green::gpu { if (verbose > 2) std::cout << "q = " << q_reduced_id << std::endl; size_t q = reduced_to_full[q_reduced_id]; qpt.reset_Pqk0(); + if (!_devices_rank) PUSH_RANGE("Build P0", 1); for (size_t k = 0; k < _nk; ++k) { std::array k_vector = momentum_conservation({ {k, 0, q} @@ -230,9 +256,13 @@ namespace green::gpu { bool need_minus_k = reduced_to_full[k_reduced_id] != k; bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1; + if (!_devices_rank) PUSH_RANGE("r1: read ints and G(k2)", 2); r1(k, k1, k_reduced_id, k1_reduced_id, k_vector, V_Qpm, Vk1k2_Qij, Gk_smtij, Gk1_stij, need_minus_k, need_minus_k1); + if (!_devices_rank) POP_RANGE; gw_qkpt* qkpt = obtain_idle_qkpt(qkpts); + + if (!_devices_rank) PUSH_RANGE("setup: copy data to device", 3); if (_low_device_memory) { if (!_X2C) { qkpt->set_up_qkpt_first(Gk1_stij.data(), Gk_smtij.data(), V_Qpm.data(), k_reduced_id, need_minus_k, k1_reduced_id, @@ -244,13 +274,24 @@ namespace green::gpu { } else { qkpt->set_up_qkpt_first(nullptr, nullptr, V_Qpm.data(), k_reduced_id, need_minus_k, k1_reduced_id, need_minus_k1); } + if (!_devices_rank) POP_RANGE; + if (!_devices_rank) PUSH_RANGE("P0 contraction", 4); qkpt->compute_first_tau_contraction(qpt.Pqk0_tQP(qkpt->all_done_event()), qpt.Pqk0_tQP_lock()); + if (!_devices_rank) POP_RANGE; } + + if (!_devices_rank) POP_RANGE; + if (!_devices_rank) PUSH_RANGE("Build P", 1); + qpt.wait_for_kpts(); qpt.scale_Pq0_tQP(1. 
/ _nk); qpt.transform_tw(); qpt.compute_Pq(); qpt.transform_wt(); + + if (!_devices_rank) POP_RANGE; + if (!_devices_rank) PUSH_RANGE("Build Sigma", 1); + // Write to Sigma(k), k belongs to _ink for (size_t k_reduced_id = 0; k_reduced_id < _ink; ++k_reduced_id) { size_t k = reduced_to_full[k_reduced_id]; @@ -264,21 +305,32 @@ namespace green::gpu { bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1; bool need_minus_q = reduced_to_full[q_reduced_id] != q_or_qinv; + if (!_devices_rank) PUSH_RANGE("r2: read ints and G(k3)", 2); r2(k, k1, k1_reduced_id, k_vector, V_Qim, Vk1k2_Qij, Gk1_stij, need_minus_k1); + if (!_devices_rank) POP_RANGE; + if (!_devices_rank) PUSH_RANGE("r1: wait / copy data back to host and free qkpt stream", 2); gw_qkpt* qkpt = obtain_idle_qkpt_for_sigma(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C); + if (!_devices_rank) POP_RANGE; + qkpt->set_k_red_id(k_reduced_id); if (_low_device_memory) { if (!_X2C) { + if (!_devices_rank) PUSH_RANGE("setup: copy data to device", 3); qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); qkpt->compute_second_tau_contraction(Sigmak_stij.data(), qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + if (!_devices_rank) POP_RANGE; // copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns); } else { + if (!_devices_rank) PUSH_RANGE("setup: copy data to device", 3); // In 2cGW, G(-k) = G*(k) has already been addressed in r2() qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, false); + if (!_devices_rank) POP_RANGE; + if (!_devices_rank) PUSH_RANGE("Sigma contraction", 4); qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(), qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + if (!_devices_rank) POP_RANGE; // copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts); } } else { @@ -288,12 +340,16 @@ namespace green::gpu { } } } + if (!_devices_rank) 
POP_RANGE; } + if (!_devices_rank) PUSH_RANGE("Wait for remaining qkpt workers", 1); wait_and_clean_qkpts(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C); + if (!_devices_rank) POP_RANGE; cudaDeviceSynchronize(); if (!_low_device_memory and !_X2C) { copy_Sigma_from_device_to_host(sigma_kstij_device, Sigma_tskij_host.data(), _ink, _nao, _nts, _ns); } + if (!_devices_rank) POP_RANGE; } template From c15e7b48354e72676a2266d80b74e2e62e4518d9 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Sun, 17 Aug 2025 22:59:01 -0400 Subject: [PATCH 06/36] modify profile colors --- src/cu_routines.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/cu_routines.cu b/src/cu_routines.cu index 8cdd4d5..1f8b5bc 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -309,25 +309,27 @@ namespace green::gpu { r2(k, k1, k1_reduced_id, k_vector, V_Qim, Vk1k2_Qij, Gk1_stij, need_minus_k1); if (!_devices_rank) POP_RANGE; - if (!_devices_rank) PUSH_RANGE("r1: wait / copy data back to host and free qkpt stream", 2); + if (!_devices_rank) PUSH_RANGE("r1: wait / copy data back to host and free qkpt stream", 3); gw_qkpt* qkpt = obtain_idle_qkpt_for_sigma(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C); if (!_devices_rank) POP_RANGE; qkpt->set_k_red_id(k_reduced_id); if (_low_device_memory) { if (!_X2C) { - if (!_devices_rank) PUSH_RANGE("setup: copy data to device", 3); + if (!_devices_rank) PUSH_RANGE("setup: copy data to device", 4); qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); + if (!_devices_rank) POP_RANGE; + if (!_devices_rank) PUSH_RANGE("Sigma contraction", 5); qkpt->compute_second_tau_contraction(Sigmak_stij.data(), qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); if (!_devices_rank) POP_RANGE; // copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns); } else { - if (!_devices_rank) PUSH_RANGE("setup: copy data to device", 3); + if 
(!_devices_rank) PUSH_RANGE("setup: copy data to device", 4); // In 2cGW, G(-k) = G*(k) has already been addressed in r2() qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, false); if (!_devices_rank) POP_RANGE; - if (!_devices_rank) PUSH_RANGE("Sigma contraction", 4); + if (!_devices_rank) PUSH_RANGE("Sigma contraction", 5); qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(), qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); if (!_devices_rank) POP_RANGE; From 90a761e851d68ebaa538c5e45dad9bfb6ea9875b Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 21 Aug 2025 15:55:15 -0400 Subject: [PATCH 07/36] adding different handles for each qkpt stream to encourage concurrency --- src/cu_routines.cu | 9 ++++++++- src/green/gpu/cu_routines.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/cu_routines.cu b/src/cu_routines.cu index 1f8b5bc..87499aa 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -224,8 +224,12 @@ namespace green::gpu { qpt.init(&_handle, &_solver_handle); // Each process gets one cuda runner for qpoints + _qkpt_handles.resize(_nqkpt); for (int i = 0; i < _nqkpt; ++i) { - qkpts[i] = new gw_qkpt(_nao, _NQ, _ns, _nts, _nt_batch, &_handle, g_kstij_device, g_ksmtij_device, sigma_kstij_device, + // TODO: Do we need to add the same handle to qkpt as qpt? 
+ if (cublasCreate(&_qkpt_handles[i]) != CUBLAS_STATUS_SUCCESS) + throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas"); + qkpts[i] = new gw_qkpt(_nao, _NQ, _ns, _nts, _nt_batch, &_qkpt_handles[i], g_kstij_device, g_ksmtij_device, sigma_kstij_device, sigma_k_locks); } } @@ -392,6 +396,9 @@ namespace green::gpu { } if (cublasDestroy(_handle) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("cublas error destroying handle"); if (cusolverDnDestroy(_solver_handle) != CUSOLVER_STATUS_SUCCESS) throw std::runtime_error("culapck error destroying handle"); + for (int i = 0; i < qkpts.size(); ++i) { + if (cublasDestroy(_qkpt_handles[i]) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("cublas error destroying handle"); + } if (!_low_device_memory) cudaFree(g_kstij_device); if (!_low_device_memory) cudaFree(g_ksmtij_device); cudaFree(sigma_kstij_device); diff --git a/src/green/gpu/cu_routines.h b/src/green/gpu/cu_routines.h index b2d9c71..f397d26 100644 --- a/src/green/gpu/cu_routines.h +++ b/src/green/gpu/cu_routines.h @@ -155,6 +155,7 @@ namespace green::gpu { bool _low_device_memory; cublasHandle_t _handle; cusolverDnHandle_t _solver_handle; + std::vector _qkpt_handles; gw_qpt qpt; std::vector*> qkpts; From 7f7ff16b4ed84249426962bf690c77ed6e43bff5 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Fri, 22 Aug 2025 23:15:53 -0400 Subject: [PATCH 08/36] add profile statement for polarization retrieval --- src/cu_routines.cu | 3 +-- src/cugw_qpt.cu | 6 ++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/cu_routines.cu b/src/cu_routines.cu index 87499aa..9687f62 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -195,7 +195,7 @@ namespace green::gpu { int _devCount_per_node) : _low_device_memory(low_device_memory), qkpts(_nqkpt), V_Qpm(_NQ, _nao, _nao), V_Qim(_NQ, _nao, _nao), Gk1_stij(_ns, _nts, _nao, _nao), Gk_smtij(_ns, _nts, _nao, _nao), - qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data(), 
cuda_lin_solver) { + qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data(), cuda_lin_solver), _qkpt_handles(_nqkpt) { if (cudaSetDevice(_intranode_rank % _devCount_per_node) != cudaSuccess) throw std::runtime_error("Error in cudaSetDevice2"); if (cublasCreate(&_handle) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas"); @@ -224,7 +224,6 @@ namespace green::gpu { qpt.init(&_handle, &_solver_handle); // Each process gets one cuda runner for qpoints - _qkpt_handles.resize(_nqkpt); for (int i = 0; i < _nqkpt; ++i) { // TODO: Do we need to add the same handle to qkpt as qpt? if (cublasCreate(&_qkpt_handles[i]) != CUBLAS_STATUS_SUCCESS) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index 6026c64..82e09a0 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -20,6 +20,7 @@ */ #include +#include namespace green::gpu { template gw_qpt::gw_qpt(int nao, int naux, int nt, int nw_b, @@ -152,6 +153,7 @@ namespace green::gpu { template typename gw_qpt::cuda_complex* gw_qpt::Pqk_tQP(cudaEvent_t all_done_event, cudaStream_t calc_stream, int need_minus_q) { + nvtx3::scped_range pol_range{"Getting P for Sigma contraction"}; // make sure the other stream waits until our data is ready (i.e. the equation system solved) if (cudaStreamWaitEvent(calc_stream, polarization_ready_event_, 0 /*cudaEventWaitDefault*/)) throw std::runtime_error("could not wait for data"); @@ -641,20 +643,20 @@ namespace green::gpu { template void gw_qkpt::write_sigma(bool low_memory_mode) { // write results. 
Make sure we have exclusive write access to sigma, then add array sigmak_tij to sigma_ktij - acquire_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); scalar_t one = 1.; if (!low_memory_mode) { + acquire_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); if (RAXPY(*handle_, 2 * ns_ * ntnao2_, &one, (scalar_t*)sigmak_stij_, 1, (scalar_t*)(sigma_ktij_ + k_ * ns_ * ntnao2_), 1) != CUBLAS_STATUS_SUCCESS) { throw std::runtime_error("RAXPY fails on gw_qkpt.write_sigma()."); } + release_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); } else { // Copy sigmak_stij_ back to CPU cudaMemcpyAsync(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost, stream_); cleanup_req_ = true; // std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); } - release_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); } From 48e610c37949a4e4bf6328c19e20dc24aba89bb1 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Fri, 22 Aug 2025 23:31:05 -0400 Subject: [PATCH 09/36] typo in nvtx3 function --- src/cugw_qpt.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index 82e09a0..f4ee4af 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -153,7 +153,7 @@ namespace green::gpu { template typename gw_qpt::cuda_complex* gw_qpt::Pqk_tQP(cudaEvent_t all_done_event, cudaStream_t calc_stream, int need_minus_q) { - nvtx3::scped_range pol_range{"Getting P for Sigma contraction"}; + nvtx3::scoped_range pol_range{"Getting P for Sigma contraction"}; // make sure the other stream waits until our data is ready (i.e. 
the equation system solved) if (cudaStreamWaitEvent(calc_stream, polarization_ready_event_, 0 /*cudaEventWaitDefault*/)) throw std::runtime_error("could not wait for data"); From 1118a67b089975c0a37e02f32c8c0b0cb9f8848b Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Sat, 23 Aug 2025 09:12:07 -0400 Subject: [PATCH 10/36] use push/pop range for Pqk_tQP function --- src/cu_routines.cu | 4 ++-- src/cugw_qpt.cu | 29 ++++++++++++++++++++++++++--- src/green/gpu/cugw_qpt.h | 2 +- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/cu_routines.cu b/src/cu_routines.cu index 9687f62..dcc3db1 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -324,7 +324,7 @@ namespace green::gpu { if (!_devices_rank) POP_RANGE; if (!_devices_rank) PUSH_RANGE("Sigma contraction", 5); qkpt->compute_second_tau_contraction(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q, !_devices_rank)); if (!_devices_rank) POP_RANGE; // copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns); } else { @@ -334,7 +334,7 @@ namespace green::gpu { if (!_devices_rank) POP_RANGE; if (!_devices_rank) PUSH_RANGE("Sigma contraction", 5); qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q, !_devices_rank)); if (!_devices_rank) POP_RANGE; // copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts); } diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index f4ee4af..937eb56 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -20,7 +20,29 @@ */ #include -#include +// #ifdef USE_NVTX +#include "nvtx3/nvToolsExt.h" + +const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; +const int num_colors = sizeof(colors)/sizeof(uint32_t); + +#define PUSH_RANGE(name,cid) { \ + int 
color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ +} +#define POP_RANGE nvtxRangePop(); +// #else +// #define PUSH_RANGE(name,cid) +// #define POP_RANGE +// #endif namespace green::gpu { template gw_qpt::gw_qpt(int nao, int naux, int nt, int nw_b, @@ -152,14 +174,15 @@ namespace green::gpu { template typename gw_qpt::cuda_complex* gw_qpt::Pqk_tQP(cudaEvent_t all_done_event, cudaStream_t calc_stream, - int need_minus_q) { - nvtx3::scoped_range pol_range{"Getting P for Sigma contraction"}; + int need_minus_q, bool profile) { + if (profile) PUSH_RANGE("Retrieve P for Sigma contraction", 6); // make sure the other stream waits until our data is ready (i.e. the equation system solved) if (cudaStreamWaitEvent(calc_stream, polarization_ready_event_, 0 /*cudaEventWaitDefault*/)) throw std::runtime_error("could not wait for data"); // make sure this stream waits until the other calculation is done if (cudaStreamWaitEvent(stream_, all_done_event, 0 /*cudaEventWaitDefault*/)) throw std::runtime_error("could not wait for data"); + if (profile) POP_RANGE; return (!need_minus_q) ? 
Pqk_tQP_ : Pqk_tQP_conj_; } diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index 85c5e2c..56122e2 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -74,7 +74,7 @@ namespace green::gpu { * \param need_minus_q we need to get Pqk_tQP_conj_ for a (-q) point * \return Pqk_tQP or Pqk_tQP_conj array for dressed polarization */ - cuda_complex* Pqk_tQP(cudaEvent_t all_done_event, cudaStream_t calc_stream, int need_minus_q); + cuda_complex* Pqk_tQP(cudaEvent_t all_done_event, cudaStream_t calc_stream, int need_minus_q, bool profile=false); /** * \return synchronization lock to Pqk0_tQP array From 304e158ad55983cc3590cb54753dada5c6a3e867 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Tue, 28 Oct 2025 13:37:18 -0500 Subject: [PATCH 11/36] remove nvtx tags --- src/cu_routines.cu | 55 ++-------------------------------------- src/cugw_qpt.cu | 29 +++------------------ src/green/gpu/cugw_qpt.h | 2 +- 3 files changed, 6 insertions(+), 80 deletions(-) diff --git a/src/cu_routines.cu b/src/cu_routines.cu index dcc3db1..7c04941 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -21,29 +21,6 @@ #include -// #ifdef USE_NVTX -#include "nvtx3/nvToolsExt.h" - -const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; -const int num_colors = sizeof(colors)/sizeof(uint32_t); - -#define PUSH_RANGE(name,cid) { \ - int color_id = cid; \ - color_id = color_id%num_colors;\ - nvtxEventAttributes_t eventAttrib = {0}; \ - eventAttrib.version = NVTX_VERSION; \ - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ - eventAttrib.colorType = NVTX_COLOR_ARGB; \ - eventAttrib.color = colors[color_id]; \ - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ - eventAttrib.message.ascii = name; \ - nvtxRangePushEx(&eventAttrib); \ -} -#define POP_RANGE nvtxRangePop(); -// #else -// #define PUSH_RANGE(name,cid) -// #define POP_RANGE -// #endif __global__ void initialize_array(cuDoubleComplex* array, 
cuDoubleComplex value, int count) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -240,7 +217,6 @@ namespace green::gpu { int verbose, irre_pos_callback& irre_pos, mom_cons_callback& momentum_conservation, gw_reader1_callback& r1, gw_reader2_callback& r2) { // this is the main GW loop - if (!_devices_rank) PUSH_RANGE("Solve in CU_ROUTINES begin", 0); if (!_devices_rank && verbose > 0) std::cout << "GW main loop" << std::endl; qpt.verbose() = verbose; @@ -248,7 +224,6 @@ namespace green::gpu { if (verbose > 2) std::cout << "q = " << q_reduced_id << std::endl; size_t q = reduced_to_full[q_reduced_id]; qpt.reset_Pqk0(); - if (!_devices_rank) PUSH_RANGE("Build P0", 1); for (size_t k = 0; k < _nk; ++k) { std::array k_vector = momentum_conservation({ {k, 0, q} @@ -259,13 +234,10 @@ namespace green::gpu { bool need_minus_k = reduced_to_full[k_reduced_id] != k; bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1; - if (!_devices_rank) PUSH_RANGE("r1: read ints and G(k2)", 2); r1(k, k1, k_reduced_id, k1_reduced_id, k_vector, V_Qpm, Vk1k2_Qij, Gk_smtij, Gk1_stij, need_minus_k, need_minus_k1); - if (!_devices_rank) POP_RANGE; gw_qkpt* qkpt = obtain_idle_qkpt(qkpts); - if (!_devices_rank) PUSH_RANGE("setup: copy data to device", 3); if (_low_device_memory) { if (!_X2C) { qkpt->set_up_qkpt_first(Gk1_stij.data(), Gk_smtij.data(), V_Qpm.data(), k_reduced_id, need_minus_k, k1_reduced_id, @@ -277,14 +249,9 @@ namespace green::gpu { } else { qkpt->set_up_qkpt_first(nullptr, nullptr, V_Qpm.data(), k_reduced_id, need_minus_k, k1_reduced_id, need_minus_k1); } - if (!_devices_rank) POP_RANGE; - if (!_devices_rank) PUSH_RANGE("P0 contraction", 4); qkpt->compute_first_tau_contraction(qpt.Pqk0_tQP(qkpt->all_done_event()), qpt.Pqk0_tQP_lock()); - if (!_devices_rank) POP_RANGE; } - if (!_devices_rank) POP_RANGE; - if (!_devices_rank) PUSH_RANGE("Build P", 1); qpt.wait_for_kpts(); qpt.scale_Pq0_tQP(1. 
/ _nk); @@ -292,8 +259,6 @@ namespace green::gpu { qpt.compute_Pq(); qpt.transform_wt(); - if (!_devices_rank) POP_RANGE; - if (!_devices_rank) PUSH_RANGE("Build Sigma", 1); // Write to Sigma(k), k belongs to _ink for (size_t k_reduced_id = 0; k_reduced_id < _ink; ++k_reduced_id) { @@ -308,34 +273,22 @@ namespace green::gpu { bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1; bool need_minus_q = reduced_to_full[q_reduced_id] != q_or_qinv; - if (!_devices_rank) PUSH_RANGE("r2: read ints and G(k3)", 2); r2(k, k1, k1_reduced_id, k_vector, V_Qim, Vk1k2_Qij, Gk1_stij, need_minus_k1); - if (!_devices_rank) POP_RANGE; - if (!_devices_rank) PUSH_RANGE("r1: wait / copy data back to host and free qkpt stream", 3); gw_qkpt* qkpt = obtain_idle_qkpt_for_sigma(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C); - if (!_devices_rank) POP_RANGE; qkpt->set_k_red_id(k_reduced_id); if (_low_device_memory) { if (!_X2C) { - if (!_devices_rank) PUSH_RANGE("setup: copy data to device", 4); qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); - if (!_devices_rank) POP_RANGE; - if (!_devices_rank) PUSH_RANGE("Sigma contraction", 5); qkpt->compute_second_tau_contraction(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q, !_devices_rank)); - if (!_devices_rank) POP_RANGE; + qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); // copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns); } else { - if (!_devices_rank) PUSH_RANGE("setup: copy data to device", 4); // In 2cGW, G(-k) = G*(k) has already been addressed in r2() qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, false); - if (!_devices_rank) POP_RANGE; - if (!_devices_rank) PUSH_RANGE("Sigma contraction", 5); qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q, !_devices_rank)); - if 
(!_devices_rank) POP_RANGE; + qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); // copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts); } } else { @@ -345,16 +298,12 @@ namespace green::gpu { } } } - if (!_devices_rank) POP_RANGE; } - if (!_devices_rank) PUSH_RANGE("Wait for remaining qkpt workers", 1); wait_and_clean_qkpts(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C); - if (!_devices_rank) POP_RANGE; cudaDeviceSynchronize(); if (!_low_device_memory and !_X2C) { copy_Sigma_from_device_to_host(sigma_kstij_device, Sigma_tskij_host.data(), _ink, _nao, _nts, _ns); } - if (!_devices_rank) POP_RANGE; } template diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index 937eb56..0fda82a 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -20,29 +20,8 @@ */ #include -// #ifdef USE_NVTX -#include "nvtx3/nvToolsExt.h" - -const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; -const int num_colors = sizeof(colors)/sizeof(uint32_t); - -#define PUSH_RANGE(name,cid) { \ - int color_id = cid; \ - color_id = color_id%num_colors;\ - nvtxEventAttributes_t eventAttrib = {0}; \ - eventAttrib.version = NVTX_VERSION; \ - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ - eventAttrib.colorType = NVTX_COLOR_ARGB; \ - eventAttrib.color = colors[color_id]; \ - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ - eventAttrib.message.ascii = name; \ - nvtxRangePushEx(&eventAttrib); \ -} -#define POP_RANGE nvtxRangePop(); -// #else -// #define PUSH_RANGE(name,cid) -// #define POP_RANGE -// #endif + + namespace green::gpu { template gw_qpt::gw_qpt(int nao, int naux, int nt, int nw_b, @@ -174,15 +153,13 @@ namespace green::gpu { template typename gw_qpt::cuda_complex* gw_qpt::Pqk_tQP(cudaEvent_t all_done_event, cudaStream_t calc_stream, - int need_minus_q, bool profile) { - if (profile) PUSH_RANGE("Retrieve P for Sigma contraction", 6); + int need_minus_q) { // make sure the other stream 
waits until our data is ready (i.e. the equation system solved) if (cudaStreamWaitEvent(calc_stream, polarization_ready_event_, 0 /*cudaEventWaitDefault*/)) throw std::runtime_error("could not wait for data"); // make sure this stream waits until the other calculation is done if (cudaStreamWaitEvent(stream_, all_done_event, 0 /*cudaEventWaitDefault*/)) throw std::runtime_error("could not wait for data"); - if (profile) POP_RANGE; return (!need_minus_q) ? Pqk_tQP_ : Pqk_tQP_conj_; } diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index 56122e2..85c5e2c 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -74,7 +74,7 @@ namespace green::gpu { * \param need_minus_q we need to get Pqk_tQP_conj_ for a (-q) point * \return Pqk_tQP or Pqk_tQP_conj array for dressed polarization */ - cuda_complex* Pqk_tQP(cudaEvent_t all_done_event, cudaStream_t calc_stream, int need_minus_q, bool profile=false); + cuda_complex* Pqk_tQP(cudaEvent_t all_done_event, cudaStream_t calc_stream, int need_minus_q); /** * \return synchronization lock to Pqk0_tQP array From 5dd3a8eb92455d578efe24eb425c86b4dffb4cd6 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Tue, 28 Oct 2025 23:47:19 -0500 Subject: [PATCH 12/36] add comments and clean up function signatures --- src/cu_routines.cu | 18 +++++++++------- src/cugw_qpt.cu | 45 +++------------------------------------- src/green/gpu/cugw_qpt.h | 8 +++---- 3 files changed, 18 insertions(+), 53 deletions(-) diff --git a/src/cu_routines.cu b/src/cu_routines.cu index 7c04941..25fd071 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -234,10 +234,13 @@ namespace green::gpu { bool need_minus_k = reduced_to_full[k_reduced_id] != k; bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1; + // Read V_Qpm and Gk_smtij r1(k, k1, k_reduced_id, k1_reduced_id, k_vector, V_Qpm, Vk1k2_Qij, Gk_smtij, Gk1_stij, need_minus_k, need_minus_k1); + // Get an idle qkpt worker gw_qkpt* qkpt = obtain_idle_qkpt(qkpts); + // Copy 
data from host to device and set up qkpt if (_low_device_memory) { if (!_X2C) { qkpt->set_up_qkpt_first(Gk1_stij.data(), Gk_smtij.data(), V_Qpm.data(), k_reduced_id, need_minus_k, k1_reduced_id, @@ -249,10 +252,12 @@ namespace green::gpu { } else { qkpt->set_up_qkpt_first(nullptr, nullptr, V_Qpm.data(), k_reduced_id, need_minus_k, k1_reduced_id, need_minus_k1); } + // Compute Polarization Pqk0 qkpt->compute_first_tau_contraction(qpt.Pqk0_tQP(qkpt->all_done_event()), qpt.Pqk0_tQP_lock()); } + // Compute Polarization Pq from irreducible polarization Pqk0 qpt.wait_for_kpts(); qpt.scale_Pq0_tQP(1. / _nk); qpt.transform_tw(); @@ -273,27 +278,26 @@ namespace green::gpu { bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1; bool need_minus_q = reduced_to_full[q_reduced_id] != q_or_qinv; + // Read V_Qij and Gk1_stij r2(k, k1, k1_reduced_id, k_vector, V_Qim, Vk1k2_Qij, Gk1_stij, need_minus_k1); + // Get an idle qkpt worker gw_qkpt* qkpt = obtain_idle_qkpt_for_sigma(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C); qkpt->set_k_red_id(k_reduced_id); + // Copy data from host to device, compute Sigma(k), and schedule async memcpy to host pinned memory "qkpt::Sigmak_stij_buffer_" if (_low_device_memory) { if (!_X2C) { qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); - qkpt->compute_second_tau_contraction(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); - // copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns); + qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); } else { // In 2cGW, G(-k) = G*(k) has already been addressed in r2() qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, false); - qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); - // copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, 
k_reduced_id, _nts); + qkpt->compute_second_tau_contraction_2C(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); } } else { qkpt->set_up_qkpt_second(nullptr, V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); - qkpt->compute_second_tau_contraction(nullptr, qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); } } } diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index 0fda82a..0faec3c 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -532,44 +532,8 @@ namespace green::gpu { release_lock<<<1, 1, 0, stream_>>>(Pqk0_tQP_lock); } - // template - // void gw_qkpt::async_compute_second_tau_contraction(ztensor<5>& Sigma_tskij_host, cxx_complex* Sigmak_stij_host, - // cuda_complex* Pqk_tQP) {} - // cuda_complex one = cu_type_map::cast(1., 0.); - // cuda_complex zero = cu_type_map::cast(0., 0.); - // cuda_complex m1 = cu_type_map::cast(-1., 0.); - // cuda_complex* Y1t_Qin = X1t_tmQ_; // name change, reuse memory - // cuda_complex* Y2t_inP = X2t_Ptm_; // name change, reuse memory - // cublasSetStream(*handle_, stream_); - // for (int s = 0; s < ns_; ++s) { - // for (int t = 0; t < nt_; t += nt_batch_) { - // int st = s * nt_ + t; - // int nt_mult = std::min(nt_batch_, nt_ - t); - // // Y1_Qin = V_Qim * G1_mn; G1_mn = G^{k1}(t)_mn - // if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_N, nao_, nauxnao_, nao_, &one, g_stij_ + st * nao2_, nao_, - // nao2_, V_Qim_, nao_, 0, &zero, Y1t_Qin, nao_, nauxnao2_, nt_mult) != CUBLAS_STATUS_SUCCESS) { - // throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); - // } - // // Y2_inP = Y1_Qin * Pq_QP - // if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_T, naux_, nao2_, naux_, &one, Pqk_tQP + t * naux2_, naux_, - // naux2_, Y1t_Qin, nao2_, nauxnao2_, &zero, Y2t_inP, naux_, nauxnao2_, - // nt_mult) != CUBLAS_STATUS_SUCCESS) { - // 
throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); - // } - // // Sigma_ij = Y2_inP V_nPj - // if (GEMM_STRIDED_BATCHED(*handle_, CUBLAS_OP_N, CUBLAS_OP_N, nao_, nao_, nauxnao_, &m1, V_nPj_, nao_, 0, Y2t_inP, - // nauxnao_, nauxnao2_, &zero, sigmak_stij_ + st * nao2_, nao_, nao2_, - // nt_mult) != CUBLAS_STATUS_SUCCESS) { - // throw std::runtime_error("GEMM_STRIDED_BATCHED fails on gw_qkpt.compute_second_tau_contraction()."); - // } - // } - // } - // write_sigma(_low_memory_requirement, Sigmak_stij_host); - // cudaEventRecord(all_done_event_); - // } - template - void gw_qkpt::compute_second_tau_contraction(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) { + void gw_qkpt::compute_second_tau_contraction(cuda_complex* Pqk_tQP) { cuda_complex one = cu_type_map::cast(1., 0.); cuda_complex zero = cu_type_map::cast(0., 0.); cuda_complex m1 = cu_type_map::cast(-1., 0.); @@ -604,7 +568,7 @@ namespace green::gpu { } template - void gw_qkpt::compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) { + void gw_qkpt::compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP) { cuda_complex one = cu_type_map::cast(1., 0.); cuda_complex zero = cu_type_map::cast(0., 0.); cuda_complex m1 = cu_type_map::cast(-1., 0.); @@ -654,8 +618,7 @@ namespace green::gpu { } else { // Copy sigmak_stij_ back to CPU cudaMemcpyAsync(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost, stream_); - cleanup_req_ = true; - // std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); + cleanup_req_ = true; // This is required in addition to the all_done_event_ to indicate that we need to copy back to host. For initial states, this is not needed. 
} } @@ -665,8 +628,6 @@ namespace green::gpu { // Block the stream until all the tasks are completed if (require_cleanup()) { cudaStreamSynchronize(stream_); - // TODO: Why do we have another intermediate for Sigma(k)? - // I iunderstand that this is not the cuda malloc buffer, but simply a normal one. std::memcpy(Sigmak_stij_host.data(), Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); if (!x2c) { copy_Sigma(Sigma_tskij_host, Sigmak_stij_host); diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index 85c5e2c..918a76e 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -284,16 +284,16 @@ namespace green::gpu { /** * \brief Using dressed GW polarization compute self-energy at a given momentum point * - * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point * \param Pqk_tQP Dressed polarization bubble */ - void compute_second_tau_contraction(cxx_complex* Sigmak_stij_host = nullptr, cuda_complex* Pqk_tQP = nullptr); + void compute_second_tau_contraction(cuda_complex* Pqk_tQP = nullptr); + /** * \brief Using dressed GW polarization compute self-energy at a given momentum point (X2C version) - * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point + * * \param Pqk_tQP Dressed polarization bubble */ - void compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host = nullptr, cuda_complex* Pqk_tQP = nullptr); + void compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP = nullptr); /** * \brief For a given k-point copy self-energy back to a host memory From 259d3f53af5d85afd67504f7bfa04fff9000e5ac Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Tue, 28 Oct 2025 23:48:46 -0500 Subject: [PATCH 13/36] remove unnecessary stream synchronization calls from obtain_Pq functions --- src/cugw_qpt.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index 0faec3c..f12b375 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ 
-154,12 +154,6 @@ namespace green::gpu { template typename gw_qpt::cuda_complex* gw_qpt::Pqk_tQP(cudaEvent_t all_done_event, cudaStream_t calc_stream, int need_minus_q) { - // make sure the other stream waits until our data is ready (i.e. the equation system solved) - if (cudaStreamWaitEvent(calc_stream, polarization_ready_event_, 0 /*cudaEventWaitDefault*/)) - throw std::runtime_error("could not wait for data"); - // make sure this stream waits until the other calculation is done - if (cudaStreamWaitEvent(stream_, all_done_event, 0 /*cudaEventWaitDefault*/)) - throw std::runtime_error("could not wait for data"); return (!need_minus_q) ? Pqk_tQP_ : Pqk_tQP_conj_; } From 0592ace393f00c93ed7e83dadd8fbcee3d865ef9 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Tue, 28 Oct 2025 23:56:03 -0500 Subject: [PATCH 14/36] more refactoring --- src/cu_routines.cu | 6 +++--- src/green/gpu/cu_routines.h | 2 +- src/green/gpu/cugw_qpt.h | 26 -------------------------- 3 files changed, 4 insertions(+), 30 deletions(-) diff --git a/src/cu_routines.cu b/src/cu_routines.cu index 25fd071..808d84f 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -202,11 +202,11 @@ namespace green::gpu { qpt.init(&_handle, &_solver_handle); // Each process gets one cuda runner for qpoints for (int i = 0; i < _nqkpt; ++i) { - // TODO: Do we need to add the same handle to qkpt as qpt? 
if (cublasCreate(&_qkpt_handles[i]) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas"); - qkpts[i] = new gw_qkpt(_nao, _NQ, _ns, _nts, _nt_batch, &_qkpt_handles[i], g_kstij_device, g_ksmtij_device, sigma_kstij_device, - sigma_k_locks); + // initialize qkpt workers + qkpts[i] = new gw_qkpt(_nao, _NQ, _ns, _nts, _nt_batch, &_qkpt_handles[i], + g_kstij_device, g_ksmtij_device, sigma_kstij_device, sigma_k_locks); } } diff --git a/src/green/gpu/cu_routines.h b/src/green/gpu/cu_routines.h index f397d26..a25495c 100644 --- a/src/green/gpu/cu_routines.h +++ b/src/green/gpu/cu_routines.h @@ -155,7 +155,7 @@ namespace green::gpu { bool _low_device_memory; cublasHandle_t _handle; cusolverDnHandle_t _solver_handle; - std::vector _qkpt_handles; + std::vector _qkpt_handles; // list of cublas handles for qkpt streams gw_qpt qpt; std::vector*> qkpts; diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index 918a76e..bff64ec 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -262,25 +262,6 @@ namespace green::gpu { */ void write_P0(int t, cuda_complex* Pqk0_tQP, int* Pqk0_tQP_lock); - // /** - // * \brief Using dressed GW polarization compute self-energy at a given momentum point - // * - // * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point - // * \param Pqk_tQP Dressed polarization bubble - // * \param Sigma_tskij_host host copy of full self-energy for MPI task assigned to the current device - // */ - // void async_compute_second_tau_contraction(ztensor<5>& Sigma_tskij_host, cxx_complex* Sigma_stij_host=nullptr, - // cuda_complex* Pqk_tQP=nullptr); - // /** - // * \brief Using dressed GW polarization compute self-energy at a given momentum point (X2C version) - // * - // * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point - // * \param Pqk_tQP Dressed polarization bubble - // * \param Sigma_tskij_host host copy of 
full self-energy for MPI task assigned to the current device - // */ - // void async_compute_second_tau_contraction_2C(ztensor<5>& Sigma_tskij_host, cxx_complex* Sigma_stij_host=nullptr, - // cuda_complex* Pqk_tQP=nullptr); - /** * \brief Using dressed GW polarization compute self-energy at a given momentum point * @@ -492,9 +473,6 @@ namespace green::gpu { void wait_and_clean_qkpts(std::vector*>& qkpts, bool low_memory_mode, tensor,4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_shared, bool x2c) { - // if (!utils::context.global_rank) { - // std::cout << "\nDEBUG: Waiting for all qkpts to finish working\n" << std::endl; - // } static int pos = 0; pos++; if (pos >= qkpts.size()) pos = 0; @@ -505,10 +483,6 @@ namespace green::gpu { } qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host, Sigma_tskij_shared, x2c); } - // if (!utils::context.global_rank) { - // std::cout << "\nDEBUG: Cleanup of qkpts completed\n" << std::endl; - // } - return; } } // namespace green::gpu From d2fffabd40e458bc7e0c3edac50a9e5d55985ff2 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Wed, 29 Oct 2025 00:36:35 -0500 Subject: [PATCH 15/36] redo Pqk_tQP function -- checking stream readiness has some effect on small jobs --- src/cugw_qpt.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index f12b375..0faec3c 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -154,6 +154,12 @@ namespace green::gpu { template typename gw_qpt::cuda_complex* gw_qpt::Pqk_tQP(cudaEvent_t all_done_event, cudaStream_t calc_stream, int need_minus_q) { + // make sure the other stream waits until our data is ready (i.e. 
the equation system solved) + if (cudaStreamWaitEvent(calc_stream, polarization_ready_event_, 0 /*cudaEventWaitDefault*/)) + throw std::runtime_error("could not wait for data"); + // make sure this stream waits until the other calculation is done + if (cudaStreamWaitEvent(stream_, all_done_event, 0 /*cudaEventWaitDefault*/)) + throw std::runtime_error("could not wait for data"); return (!need_minus_q) ? Pqk_tQP_ : Pqk_tQP_conj_; } From 1aa3a28ffe378f730e82f9360efff3dd25301dfd Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Wed, 5 Nov 2025 16:28:09 -0500 Subject: [PATCH 16/36] Update src/green/gpu/cugw_qpt.h Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/green/gpu/cugw_qpt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index bff64ec..c475210 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -303,7 +303,7 @@ namespace green::gpu { void cleanup(bool low_memory_mode, tensor, 4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c); /** - * \brief + * \brief Copies the self-energy from the per-k buffer to the full host tensor for non-X2C calculations. * */ void copy_Sigma(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij); From 552b7e2a9b11578abbf6decd16cb0af0e901ffb4 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Wed, 5 Nov 2025 16:28:41 -0500 Subject: [PATCH 17/36] Update src/green/gpu/cugw_qpt.h Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/green/gpu/cugw_qpt.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index c475210..e787fa1 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -309,11 +309,10 @@ namespace green::gpu { void copy_Sigma(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij); /** - * \brief + * \brief Copy 2-component self-energy data from device to host storage. 
* - * \param Sigma_tskij_host - * \param Sigmak_stij - * \param k_id + * \param Sigma_tskij_host Host storage for the full self-energy tensor. + * \param Sigmak_stij Self-energy tensor for a given momentum point. */ void copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij); From 06cc9ac98fec6e2c82afae91109c49e7f7121ad2 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Wed, 5 Nov 2025 16:29:20 -0500 Subject: [PATCH 18/36] Update src/green/gpu/cugw_qpt.h Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/green/gpu/cugw_qpt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index e787fa1..16bf4f6 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -317,7 +317,7 @@ namespace green::gpu { void copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij); /** - * \brief Check if cuda devices are budy + * \brief Check if cuda devices are busy * \return true if asynchronous calculations are still running */ bool is_busy(); From fb6a62caadbe2942065465cf9ee1b398fa863e7a Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Wed, 5 Nov 2025 16:45:55 -0500 Subject: [PATCH 19/36] address Copilot reviews --- src/cugw_qpt.cu | 1 - src/green/gpu/cugw_qpt.h | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index 0faec3c..a39e86c 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -651,7 +651,6 @@ namespace green::gpu { void gw_qkpt::copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij) { size_t nao = Sigmak_stij.shape()[3]; size_t nso = 2 * nao; - MatrixXcf Sigma_ij(nso, nso); for (size_t ss = 0; ss < 3; ++ss) { size_t a = (ss % 2 == 0) ? 0 : 1; size_t b = ((ss + 1) / 2 != 1) ? 
0 : 1; diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index 16bf4f6..c97e907 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -303,13 +303,15 @@ namespace green::gpu { void cleanup(bool low_memory_mode, tensor, 4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c); /** - * \brief Copies the self-energy from the per-k buffer to the full host tensor for non-X2C calculations. + * \brief Copy the non-relativistic self-energy from the per-k buffer to the full host tensor. * + * \param Sigma_tskij_host Host storage for the full self-energy tensor. + * \param Sigmak_stij Self-energy tensor for a given momentum point. */ void copy_Sigma(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij); /** - * \brief Copy 2-component self-energy data from device to host storage. + * \brief Copy 2-component self-energy from per-k buffer to full host tensor. * * \param Sigma_tskij_host Host storage for the full self-energy tensor. * \param Sigmak_stij Self-energy tensor for a given momentum point. 
@@ -472,9 +474,7 @@ namespace green::gpu { void wait_and_clean_qkpts(std::vector*>& qkpts, bool low_memory_mode, tensor,4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_shared, bool x2c) { - static int pos = 0; - pos++; - if (pos >= qkpts.size()) pos = 0; + if (int pos >= qkpts.size()) pos = 0; for (pos = 0; pos < qkpts.size(); pos++) { // wait for qkpt to finish its tasks, then cleanup while (qkpts[pos]->is_busy()) { From 366602591d901960f652a7283a02e872bc06a022 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 00:41:04 -0500 Subject: [PATCH 20/36] rename share coulomb allocation event --- src/green/gpu/gpu_kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/green/gpu/gpu_kernel.h b/src/green/gpu/gpu_kernel.h index f1b0028..ade922d 100644 --- a/src/green/gpu/gpu_kernel.h +++ b/src/green/gpu/gpu_kernel.h @@ -66,7 +66,7 @@ namespace green::gpu { */ inline void set_shared_Coulomb() { if (_coul_int_reading_type == as_a_whole) { - statistics.start("Read"); + statistics.start("Allocate shared Coulomb"); // Allocate Coulomb integrals in double precision and cast them to single precision whenever needed allocate_shared_Coulomb(&_Vk1k2_Qij); statistics.end(); From 91ebc4beda0bb8874eb6a026a01b6de968ca3a35 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 00:41:57 -0500 Subject: [PATCH 21/36] clean up and modify estimation of flops achieved --- src/green/gpu/gw_gpu_kernel.h | 14 +++-- src/gw_gpu_kernel.cpp | 100 +++++++++++++++++++++------------- 2 files changed, 73 insertions(+), 41 deletions(-) diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index c2cba49..143b4c8 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -87,10 +87,9 @@ namespace green::gpu { /** * \brief calculate effective floating points operations per second reached on GPU. - * This is not representative of the GPU capabilities, but instead, accounts for read/write overheads. 
- * The value is entirely in the context Green-MBPT solver. + * This is not representative of the GPU capabilities, but instead, includes read/write overheads. */ - void flops_achieved(MPI_Comm comm); + void flops_achieved(); /** * \brief print the effective FLOPs achieved for the iteration. @@ -146,7 +145,10 @@ namespace green::gpu { protected: void gw_innerloop(G_type& g, St_type& sigma_tau) override; private: - void complexity_estimation(); + /** + * @brief Calculate and print complexity estimation for each device + */ + void complexity_estimation(); template void compute_gw_selfenergy(G_type& g, St_type& sigma_tau); @@ -182,6 +184,10 @@ namespace green::gpu { protected: void gw_innerloop(G_type& g, St_type& sigma_tau) override; private: + /** + * @brief Calculate and print complexity estimation for each device + * + */ void complexity_estimation(); template diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 4fe3b56..9ca8d49 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -26,27 +26,40 @@ namespace green::gpu { void scalar_gw_gpu_kernel::complexity_estimation() { + if (_devices_comm == MPI_COMM_NULL) return; + // Get number of k-points assigned to each device + size_t ink_on_device = 1 + (_ink - 1 - _devices_rank) / _devices_size; // Calculate the complexity of GW double NQsq=(double)_NQ*_NQ; //first set of matmuls - double flop_count_firstmatmul= _ink*_nk*_ns*_nts/2.*( + double flop_count_firstmatmul = ink_on_device*_nk*_ns*_nts/2.*( matmul_cost(_nao*_NQ, _nao, _nao) //X1_t_mQ = G_t_p * V_pmQ; - +matmul_cost(_NQ*_nao, _nao, _nao)//X2_Pt_m = (V_Pt_n)* * G_m_n; - +matmul_cost(_NQ, _naosq, _NQ) //Pq0_QP=X2_Ptm Q1_tmQ + + matmul_cost(_NQ*_nao, _nao, _nao)//X2_Pt_m = (V_Pt_n)* * G_m_n; + + matmul_cost(_NQ, _naosq, _NQ) //Pq0_QP=X2_Ptm Q1_tmQ ); - double flop_count_fourier=_ink*(matmul_cost(NQsq, _nts, _nw_b)+matmul_cost(NQsq, _nw_b, _nts)); //Fourier transform forward and back - double 
flop_count_solver=2./3.*_ink*_nw_b*(NQsq*_NQ+NQsq); //approximate LU and backsubst cost (note we are doing cholesky which is cheaper + double flop_count_fourier = ink_on_device*( + matmul_cost(NQsq, _nts, _nw_b) + matmul_cost(NQsq, _nw_b, _nts) + ); //Fourier transform forward and back + double flop_count_solver = 2./3.*ink_on_device*_nw_b*(NQsq*_NQ+NQsq); //approximate LU and backsubst cost (note we are doing cholesky which is cheaper //secondset of matmuls - double flop_count_secondmatmul=_ink*_nk*_ns*_nts*( + double flop_count_secondmatmul = ink_on_device*_nk*_ns*_nts*( matmul_cost(_NQ*_nao, _nao, _nao) //Y1_Qin = V_Qim * G1_mn; - +matmul_cost(_naosq, _NQ, _NQ) //Y2_inP = Y1_Qin * Pq_QP - +matmul_cost(_nao, _NQ*_nao, _nao)//Sigma_ij = Y2_inP V_nPj + + matmul_cost(_naosq, _NQ, _NQ) //Y2_inP = Y1_Qin * Pq_QP + + matmul_cost(_nao, _NQ*_nao, _nao)//Sigma_ij = Y2_inP V_nPj ); - _flop_count= flop_count_firstmatmul+flop_count_fourier+flop_count_solver+flop_count_secondmatmul; + _flop_count = flop_count_firstmatmul + flop_count_fourier + flop_count_solver + flop_count_secondmatmul; + + // Sum over all devices to get toal FLOP count; but don't broadcast back since each device only needs its own flop count to calculate achieved FLOPs + double total_flop_count = _flop_count; + MPI_Allreduce(MPI_IN_PLACE, &flop_count_firstmatmul, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); + MPI_Allreduce(MPI_IN_PLACE, &flop_count_fourier, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); + MPI_Allreduce(MPI_IN_PLACE, &flop_count_solver, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); + MPI_Allreduce(MPI_IN_PLACE, &flop_count_secondmatmul, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); + MPI_Allreduce(MPI_IN_PLACE, &total_flop_count, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); if (!utils::context.global_rank && _verbose > 1) { std::cout << "############ Total GW Operations per Iteration ############" << std::endl; - std::cout << "Total: " << _flop_count << std::endl; + std::cout << "Total: " << total_flop_count << 
std::endl; std::cout << "First matmul: " << flop_count_firstmatmul << std::endl; std::cout << "Fourier: " << flop_count_fourier << std::endl; std::cout << "Solver: " << flop_count_solver << std::endl; @@ -56,22 +69,33 @@ namespace green::gpu { } void x2c_gw_gpu_kernel::complexity_estimation() { + if (_devices_comm == MPI_COMM_NULL) return; + // Get number of k-points assigned to each device + size_t ink_on_device = 1 + (_ink - 1 - _devices_rank) / _devices_size; // Calculate the complexity of GW // TODO virtual function for complexity calculation double NQsq=(double)_NQ*_NQ; //first set of matmuls // Doesn't fit for generalized GW. - double flop_count_firstmatmul= _ink*_nk*4*_nts/2.* + double flop_count_firstmatmul= ink_on_device*_nk*4*_nts/2.* (matmul_cost(_nao*_NQ, _nao, _nao)+matmul_cost(_nao, _NQ*_nao, _nao)+matmul_cost(_NQ, _NQ, _naosq)); - double flop_count_fourier=_ink*(matmul_cost(NQsq, _nts, _nw_b)+matmul_cost(NQsq, _nw_b, _nts)); //Fourier transform forward and back - double flop_count_solver=2./3.*_ink*_nw_b*(NQsq*_NQ+NQsq); //approximate LU and backsubst cost + double flop_count_fourier=ink_on_device*(matmul_cost(NQsq, _nts, _nw_b)+matmul_cost(NQsq, _nw_b, _nts)); //Fourier transform forward and back + double flop_count_solver=2./3.*ink_on_device*_nw_b*(NQsq*_NQ+NQsq); //approximate LU and backsubst cost //secondset of matmuls - double flop_count_secondmatmul=_ink*_nk*4*_nts*(matmul_cost(_nao*_NQ, _nao, _nao)+matmul_cost(_NQ, _naosq, _NQ)+matmul_cost(_nao, _nao, _NQ*_nao)); + double flop_count_secondmatmul=ink_on_device*_nk*4*_nts*(matmul_cost(_nao*_NQ, _nao, _nao)+matmul_cost(_NQ, _naosq, _NQ)+matmul_cost(_nao, _nao, _NQ*_nao)); _flop_count= flop_count_firstmatmul+flop_count_fourier+flop_count_solver+flop_count_secondmatmul; + // Sum over all devices to get toal FLOP count; but don't broadcast back since each device only needs its own flop count to calculate achieved FLOPs + double total_flop_count = _flop_count; + MPI_Allreduce(MPI_IN_PLACE, 
&flop_count_firstmatmul, 1, MPI_DOUBLE, MPI_SUM, comm_device); + MPI_Allreduce(MPI_IN_PLACE, &flop_count_fourier, 1, MPI_DOUBLE, MPI_SUM, comm_device); + MPI_Allreduce(MPI_IN_PLACE, &flop_count_solver, 1, MPI_DOUBLE, MPI_SUM, comm_device); + MPI_Allreduce(MPI_IN_PLACE, &flop_count_secondmatmul, 1, MPI_DOUBLE, MPI_SUM, comm_device); + MPI_Allreduce(MPI_IN_PLACE, &total_flop_count, 1, MPI_DOUBLE, MPI_SUM, comm_device); + if (!utils::context.global_rank && _verbose > 1) { std::cout << "############ Total Two-Component GW Operations per Iteration ############" << std::endl; - std::cout << "Total: " << _flop_count << std::endl; + std::cout << "Total: " << total_flop_count << std::endl; std::cout << "First matmul: " << flop_count_firstmatmul << std::endl; std::cout << "Fourier: " << flop_count_fourier << std::endl; std::cout << "Solver: " << flop_count_solver << std::endl; @@ -84,7 +108,7 @@ namespace green::gpu { MPI_Datatype dt_matrix = utils::create_matrix_datatype>(_nso*_nso); MPI_Op matrix_sum_op = utils::create_matrix_operation>(); statistics.start("total"); - statistics.start("Initialization"); + statistics.start("Initialization: CPU"); sigma_tau.fence(); if (!utils::context.node_rank) sigma_tau.object().set_zero(); sigma_tau.fence(); @@ -122,32 +146,34 @@ namespace green::gpu { MPI_Op_free(&matrix_sum_op); } - void gw_gpu_kernel::flops_achieved(MPI_Comm comm) { + void gw_gpu_kernel::flops_achieved() { + if (_devices_comm == MPI_COMM_NULL) return; double gpu_time=0.; - if (comm != MPI_COMM_NULL) { - utils::event_t& cugw_event = statistics.event("Solve cuGW"); - if (!cugw_event.active) { - gpu_time = cugw_event.duration; - } else { - throw std::runtime_error("'Solve cuGW' still active, but it should not be."); - } - } - - if (!utils::context.global_rank) { - MPI_Reduce(MPI_IN_PLACE, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); + utils::event_t& cugw_event = statistics.event("Solve cuGW"); + if (!cugw_event.active) { + gpu_time = 
cugw_event.duration; } else { - MPI_Reduce(&gpu_time, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); + throw std::runtime_error("'Solve cuGW' still active, but it should not be."); } - _eff_flops = _flop_count / gpu_time; } void gw_gpu_kernel::print_effective_flops() { + if (_devices_comm == MPI_COMM_NULL) return; + double average_eff_flops = _eff_flops; + double max_eff_flops = _eff_flops; + double min_eff_flops = _eff_flops; + MPI_Reduce(&average_eff_flops, &max_eff_flops, 1, MPI_DOUBLE, MPI_MAX, 0, _devices_comm); + MPI_Reduce(&average_eff_flops, &min_eff_flops, 1, MPI_DOUBLE, MPI_MIN, 0, _devices_comm); + MPI_Reduce(&average_eff_flops, &average_eff_flops, 1, MPI_DOUBLE, MPI_SUM, 0, _devices_comm); + average_eff_flops /= _devCount_total; if (!utils::context.global_rank && _verbose > 1) { auto old_precision = std::cout.precision(); std::cout << std::setprecision(6); std::cout << "=================== GPU Performance ====================" << std::endl; - std::cout << "Effective FLOPs in the GW iteration: " << _eff_flops / 1.0e9 << " Giga flops." 
<< std::endl; + std::cout << "Average GFLOPs achieved in the GW iteration: " << average_eff_flops / (1024 * 1024 * 1024) << std::endl; + std::cout << "Minimum GFLOPs achieved in the GW iteration: " << min_eff_flops / (1024 * 1024 * 1024) << std::endl; + std::cout << "Maximum GFLOPs achieved in the GW iteration: " << max_eff_flops / (1024 * 1024 * 1024) << std::endl; std::cout << "============================================================" << std::endl; std::cout << std::setprecision(old_precision); } @@ -173,7 +199,7 @@ namespace green::gpu { void scalar_gw_gpu_kernel::compute_gw_selfenergy(G_type& g, St_type& sigma_tau) { // check devices' free space and space requirements GW_check_devices_free_space(); - statistics.start("Initialization"); + statistics.start("Initialization: GPU"); cugw_utils cugw(_nts, _nt_batch, _nw_b, _ns, _nk, _ink, _nqkpt, _NQ, _nao, g.object(), _low_device_memory, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver, utils::context.global_rank, utils::context.node_rank, _devCount_per_node); @@ -185,7 +211,7 @@ namespace green::gpu { tensor,3>& V_Qpm, std::complex *Vk1k2_Qij, tensor,4>&Gk_smtij, tensor,4>&Gk1_stij, bool need_minus_k, bool need_minus_k1) { - statistics.start("read"); + statistics.start("Read Integrals", true); int q = k_vector[2]; if (_coul_int_reading_type == chunks) { read_next(k_vector); @@ -203,7 +229,7 @@ namespace green::gpu { tensor,3>& V_Qim, std::complex *Vk1k2_Qij, tensor,4>&Gk1_stij, bool need_minus_k1) { - statistics.start("read"); + statistics.start("Read Integrals", true); int q = k_vector[1]; if (_coul_int_reading_type == chunks) { read_next(k_vector); @@ -222,7 +248,7 @@ namespace green::gpu { // and do MPIAllreduce on CPU later on. Since the number of processes with a GPU is very // limited, the additional memory overhead is fairly limited. 
ztensor<5> Sigma_tskij_host_local(_nts, _ns, _ink, _nao, _nao); - statistics.start("Solve cuGW"); + statistics.start("Solve cuGW", false); cugw.solve(_nts, _ns, _nk, _ink, _nao, _bz_utils.symmetry().reduced_to_full(), _bz_utils.symmetry().full_to_reduced(), _Vk1k2_Qij, Sigma_tskij_host_local, _devices_rank, _devices_size, _low_device_memory, _verbose, irre_pos, mom_cons, r1, r2); @@ -308,7 +334,7 @@ namespace green::gpu { tensor,4>&Gk_4mtij, tensor,4>&Gk1_4tij, bool need_minus_k, bool need_minus_k1) { - statistics.start("read"); + statistics.start("Read Integrals", true); int q = k_vector[2]; if (q == 0 or _coul_int_reading_type == chunks) { read_next(k_vector); @@ -324,7 +350,7 @@ namespace green::gpu { tensor,3>& V_Qim, std::complex *Vk1k2_Qij, tensor,4>&Gk1_4tij, bool need_minus_k1) { - statistics.start("read"); + statistics.start("Read Integrals", true); int q = k_vector[1]; if (q == 0 or _coul_int_reading_type == chunks) { read_next(k_vector); From 2c6866320d9ba964f80f8387e8411bcd233632f0 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 00:52:52 -0500 Subject: [PATCH 22/36] fix typo in wait_and_clean_qkpts --- src/green/gpu/cugw_qpt.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index c97e907..af87ae1 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -474,8 +474,7 @@ namespace green::gpu { void wait_and_clean_qkpts(std::vector*>& qkpts, bool low_memory_mode, tensor,4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_shared, bool x2c) { - if (int pos >= qkpts.size()) pos = 0; - for (pos = 0; pos < qkpts.size(); pos++) { + for (int pos = 0; pos < qkpts.size(); pos++) { // wait for qkpt to finish its tasks, then cleanup while (qkpts[pos]->is_busy()) { continue; From a7f6141d75f146932ec43570b4e1fcff3346fadd Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 09:58:56 -0500 Subject: [PATCH 23/36] rename qkpt_handles --- src/cu_routines.cu | 8 
++++---- src/green/gpu/cu_routines.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cu_routines.cu b/src/cu_routines.cu index 808d84f..866af76 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -172,7 +172,7 @@ namespace green::gpu { int _devCount_per_node) : _low_device_memory(low_device_memory), qkpts(_nqkpt), V_Qpm(_NQ, _nao, _nao), V_Qim(_NQ, _nao, _nao), Gk1_stij(_ns, _nts, _nao, _nao), Gk_smtij(_ns, _nts, _nao, _nao), - qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data(), cuda_lin_solver), _qkpt_handles(_nqkpt) { + qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data(), cuda_lin_solver), _qkpt_cublas_handles(_nqkpt) { if (cudaSetDevice(_intranode_rank % _devCount_per_node) != cudaSuccess) throw std::runtime_error("Error in cudaSetDevice2"); if (cublasCreate(&_handle) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas"); @@ -202,10 +202,10 @@ namespace green::gpu { qpt.init(&_handle, &_solver_handle); // Each process gets one cuda runner for qpoints for (int i = 0; i < _nqkpt; ++i) { - if (cublasCreate(&_qkpt_handles[i]) != CUBLAS_STATUS_SUCCESS) + if (cublasCreate(&_qkpt_cublas_handles[i]) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas"); // initialize qkpt workers - qkpts[i] = new gw_qkpt(_nao, _NQ, _ns, _nts, _nt_batch, &_qkpt_handles[i], + qkpts[i] = new gw_qkpt(_nao, _NQ, _ns, _nts, _nt_batch, &_qkpt_cublas_handles[i], g_kstij_device, g_ksmtij_device, sigma_kstij_device, sigma_k_locks); } } @@ -349,7 +349,7 @@ namespace green::gpu { if (cublasDestroy(_handle) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("cublas error destroying handle"); if (cusolverDnDestroy(_solver_handle) != CUSOLVER_STATUS_SUCCESS) throw std::runtime_error("culapck error destroying handle"); for (int i = 0; i < qkpts.size(); ++i) { - if (cublasDestroy(_qkpt_handles[i]) != CUBLAS_STATUS_SUCCESS) throw 
std::runtime_error("cublas error destroying handle"); + if (cublasDestroy(_qkpt_cublas_handles[i]) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("cublas error destroying handle"); } if (!_low_device_memory) cudaFree(g_kstij_device); if (!_low_device_memory) cudaFree(g_ksmtij_device); diff --git a/src/green/gpu/cu_routines.h b/src/green/gpu/cu_routines.h index a25495c..9e0e32a 100644 --- a/src/green/gpu/cu_routines.h +++ b/src/green/gpu/cu_routines.h @@ -155,7 +155,7 @@ namespace green::gpu { bool _low_device_memory; cublasHandle_t _handle; cusolverDnHandle_t _solver_handle; - std::vector _qkpt_handles; // list of cublas handles for qkpt streams + std::vector _qkpt_cublas_handles; // list of cublas handles for qkpt streams gw_qpt qpt; std::vector*> qkpts; From 6ad23d9fa4af0f9e4c1cb28d924f055b7cdb7042 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 11:45:56 -0500 Subject: [PATCH 24/36] naming and documentation updates; use reset events at end of solve cycle --- src/cugw_qpt.cu | 6 +++--- src/green/gpu/cugw_qpt.h | 4 ++-- src/green/gpu/gpu_kernel.h | 2 +- src/gw_gpu_kernel.cpp | 2 ++ 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index a39e86c..b3d1350 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -348,11 +348,11 @@ namespace green::gpu { throw std::runtime_error("failure allocating V on host"); if (_low_memory_requirement) { if (cudaMallocHost(&Gk1_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) - throw std::runtime_error("failure allocating Gk_tsij on host"); + throw std::runtime_error("failure allocating Gk1_tsij on host"); if (cudaMallocHost(&Gk_smtij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) - throw std::runtime_error("failure allocating Gk_tsij on host"); + throw std::runtime_error("failure allocating Gk_smtij on host"); if (cudaMallocHost(&Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) - throw 
std::runtime_error("failure allocating Gk_tsij on host"); + throw std::runtime_error("failure allocating Sigmak_stij on host"); // Sigmak_stij_buffer_ = Gk_smtij_buffer_; } diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index af87ae1..d49616a 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -325,9 +325,9 @@ namespace green::gpu { bool is_busy(); /** - * \brief Set the k_red_id_ object + * \brief Set the irreducible k-point index for gw_qkpt worker * - * \param k + * \param k incoming k-point index */ void set_k_red_id(int k) { k_red_id_ = k; diff --git a/src/green/gpu/gpu_kernel.h b/src/green/gpu/gpu_kernel.h index ade922d..7ddf66f 100644 --- a/src/green/gpu/gpu_kernel.h +++ b/src/green/gpu/gpu_kernel.h @@ -66,7 +66,7 @@ namespace green::gpu { */ inline void set_shared_Coulomb() { if (_coul_int_reading_type == as_a_whole) { - statistics.start("Allocate shared Coulomb"); + statistics.start("read whole integral"); // Allocate Coulomb integrals in double precision and cast them to single precision whenever needed allocate_shared_Coulomb(&_Vk1k2_Qij); statistics.end(); diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 9ca8d49..11ea4e7 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -137,6 +137,8 @@ namespace green::gpu { statistics.end(); statistics.print(utils::context.global); print_effective_flops(); + // Reset all timing stats for next iteration + statistics.reset(); clean_MPI_structure(); clean_shared_Coulomb(); From 8e0e0efd29f979f95f16fb6922bcf1fb6af310aa Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 11:49:20 -0500 Subject: [PATCH 25/36] fix function call for flops_achieved() --- src/gw_gpu_kernel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 11ea4e7..52b79c0 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -125,7 +125,7 @@ namespace green::gpu { 
MPI_Barrier(utils::context.global); sigma_tau.fence(); // Print effective FLOPs achieved in the calculation - flops_achieved(_devices_comm); + flops_achieved(); if (!utils::context.node_rank) { if (_devices_comm != MPI_COMM_NULL) statistics.start("selfenergy_reduce"); utils::allreduce(MPI_IN_PLACE, sigma_tau.object().data(), sigma_tau.object().size()/(_nso*_nso), dt_matrix, matrix_sum_op, utils::context.internode_comm); From 422947ee6d007c4df38e7dc9b9b71c2556853081 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 11:58:30 -0500 Subject: [PATCH 26/36] documentation fixes from copilot review --- src/green/gpu/cugw_qpt.h | 6 +++--- src/green/gpu/gw_gpu_kernel.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index d49616a..a0bdf3f 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -283,10 +283,10 @@ namespace green::gpu { void write_sigma(bool low_memory_mode = false); /** - * \brief return the status of copy_selfenergy from device to host + * \brief return the status of copy_selfenergy from device to host. + * Return value of `false` means stream is ready for next calculation without cleanup. 
* - * \return false - not required, stream ready for next calculation - * \return true - required + * \return true if cleanup is required, false otherwise */ bool require_cleanup(){ return cleanup_req_; diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index 143b4c8..5d3b51b 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -186,7 +186,6 @@ namespace green::gpu { private: /** * @brief Calculate and print complexity estimation for each device - * */ void complexity_estimation(); From d22fb80a9bd709781ffe0031ec5e7018f554748f Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 11:59:03 -0500 Subject: [PATCH 27/36] documentation and logic fix thanks to copilot --- src/gw_gpu_kernel.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 52b79c0..f2e864b 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -49,7 +49,7 @@ namespace green::gpu { ); _flop_count = flop_count_firstmatmul + flop_count_fourier + flop_count_solver + flop_count_secondmatmul; - // Sum over all devices to get toal FLOP count; but don't broadcast back since each device only needs its own flop count to calculate achieved FLOPs + // Sum over all devices to get total FLOP count; but don't broadcast back since each device only needs its own flop count to calculate achieved FLOPs double total_flop_count = _flop_count; MPI_Allreduce(MPI_IN_PLACE, &flop_count_firstmatmul, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); MPI_Allreduce(MPI_IN_PLACE, &flop_count_fourier, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); @@ -85,13 +85,13 @@ namespace green::gpu { double flop_count_secondmatmul=ink_on_device*_nk*4*_nts*(matmul_cost(_nao*_NQ, _nao, _nao)+matmul_cost(_NQ, _naosq, _NQ)+matmul_cost(_nao, _nao, _NQ*_nao)); _flop_count= flop_count_firstmatmul+flop_count_fourier+flop_count_solver+flop_count_secondmatmul; - // Sum over all devices to get toal FLOP 
count; but don't broadcast back since each device only needs its own flop count to calculate achieved FLOPs + // Sum over all devices to get total FLOP count; but don't broadcast back since each device only needs its own flop count to calculate achieved FLOPs double total_flop_count = _flop_count; - MPI_Allreduce(MPI_IN_PLACE, &flop_count_firstmatmul, 1, MPI_DOUBLE, MPI_SUM, comm_device); - MPI_Allreduce(MPI_IN_PLACE, &flop_count_fourier, 1, MPI_DOUBLE, MPI_SUM, comm_device); - MPI_Allreduce(MPI_IN_PLACE, &flop_count_solver, 1, MPI_DOUBLE, MPI_SUM, comm_device); - MPI_Allreduce(MPI_IN_PLACE, &flop_count_secondmatmul, 1, MPI_DOUBLE, MPI_SUM, comm_device); - MPI_Allreduce(MPI_IN_PLACE, &total_flop_count, 1, MPI_DOUBLE, MPI_SUM, comm_device); + MPI_Allreduce(MPI_IN_PLACE, &flop_count_firstmatmul, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); + MPI_Allreduce(MPI_IN_PLACE, &flop_count_fourier, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); + MPI_Allreduce(MPI_IN_PLACE, &flop_count_solver, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); + MPI_Allreduce(MPI_IN_PLACE, &flop_count_secondmatmul, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); + MPI_Allreduce(MPI_IN_PLACE, &total_flop_count, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); if (!utils::context.global_rank && _verbose > 1) { std::cout << "############ Total Two-Component GW Operations per Iteration ############" << std::endl; @@ -168,14 +168,14 @@ namespace green::gpu { MPI_Reduce(&average_eff_flops, &max_eff_flops, 1, MPI_DOUBLE, MPI_MAX, 0, _devices_comm); MPI_Reduce(&average_eff_flops, &min_eff_flops, 1, MPI_DOUBLE, MPI_MIN, 0, _devices_comm); MPI_Reduce(&average_eff_flops, &average_eff_flops, 1, MPI_DOUBLE, MPI_SUM, 0, _devices_comm); - average_eff_flops /= _devCount_total; + average_eff_flops /= _devices_size; if (!utils::context.global_rank && _verbose > 1) { auto old_precision = std::cout.precision(); std::cout << std::setprecision(6); std::cout << "=================== GPU Performance ====================" << std::endl; - std::cout << 
"Average GFLOPs achieved in the GW iteration: " << average_eff_flops / (1024 * 1024 * 1024) << std::endl; - std::cout << "Minimum GFLOPs achieved in the GW iteration: " << min_eff_flops / (1024 * 1024 * 1024) << std::endl; - std::cout << "Maximum GFLOPs achieved in the GW iteration: " << max_eff_flops / (1024 * 1024 * 1024) << std::endl; + std::cout << "Average GFLOPs achieved in the GW iteration: " << average_eff_flops / 1e9 << std::endl; + std::cout << "Minimum GFLOPs achieved in the GW iteration: " << min_eff_flops / 1e9 << std::endl; + std::cout << "Maximum GFLOPs achieved in the GW iteration: " << max_eff_flops / 1e9 << std::endl; std::cout << "============================================================" << std::endl; std::cout << std::setprecision(old_precision); } From d07819ee3e3274285f7bd062be2069feb0679dd5 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 14:32:37 -0500 Subject: [PATCH 28/36] optimize host pinned memory buffers --- src/cugw_qpt.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index b3d1350..7ab1a51 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -351,9 +351,8 @@ namespace green::gpu { throw std::runtime_error("failure allocating Gk1_tsij on host"); if (cudaMallocHost(&Gk_smtij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) throw std::runtime_error("failure allocating Gk_smtij on host"); - if (cudaMallocHost(&Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) - throw std::runtime_error("failure allocating Sigmak_stij on host"); - // Sigmak_stij_buffer_ = Gk_smtij_buffer_; + // Reuse Gk_smtij_buffer_ for sigmak_stij_buffer_ to save memory since they are not needed at the same time + Sigmak_stij_buffer_ = Gk_smtij_buffer_; } if (cudaMalloc(&Pqk0_tQP_local_, nt_batch_ * naux2_ * sizeof(cuda_complex)) != cudaSuccess) From 25acf9cdbacc51671b5ab8bbab1e369664d8d23d Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 
2025 14:33:29 -0500 Subject: [PATCH 29/36] use only specific cuda arch for pauli-master gpu quadro p1000 --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 8be4f02..5f8e5bc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ jobs: module load BuildEnv/gcc-12.2.0.lua cuda-sdk; mkdir -p build; cd build; - cmake -DGPU_ARCHS="61;70;75;80;86;90" -DCMAKE_CXX_FLAGS=" --coverage -fno-inline -fno-inline-small-functions -fno-default-inline -fprofile-arcs -ftest-coverage " ..; + cmake -DGPU_ARCHS="61" -DCMAKE_CXX_FLAGS=" --coverage -fno-inline -fno-inline-small-functions -fno-default-inline -fprofile-arcs -ftest-coverage " ..; make -j 8; - name: Test From c01e1c894ce5ed168ac9c9e79968b7c88b469d34 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 14:40:55 -0500 Subject: [PATCH 30/36] undo update for test.yaml --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5f8e5bc..8be4f02 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ jobs: module load BuildEnv/gcc-12.2.0.lua cuda-sdk; mkdir -p build; cd build; - cmake -DGPU_ARCHS="61" -DCMAKE_CXX_FLAGS=" --coverage -fno-inline -fno-inline-small-functions -fno-default-inline -fprofile-arcs -ftest-coverage " ..; + cmake -DGPU_ARCHS="61;70;75;80;86;90" -DCMAKE_CXX_FLAGS=" --coverage -fno-inline -fno-inline-small-functions -fno-default-inline -fprofile-arcs -ftest-coverage " ..; make -j 8; - name: Test From 1ff204a3112ea4387ca7ad159b7111eea6963300 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 14:47:30 -0500 Subject: [PATCH 31/36] fix mpi_reduce for performance metrics --- src/gw_gpu_kernel.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/gw_gpu_kernel.cpp 
b/src/gw_gpu_kernel.cpp index f2e864b..b1f373f 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -162,18 +162,24 @@ namespace green::gpu { void gw_gpu_kernel::print_effective_flops() { if (_devices_comm == MPI_COMM_NULL) return; - double average_eff_flops = _eff_flops; + double avg_eff_flops = _eff_flops; double max_eff_flops = _eff_flops; double min_eff_flops = _eff_flops; - MPI_Reduce(&average_eff_flops, &max_eff_flops, 1, MPI_DOUBLE, MPI_MAX, 0, _devices_comm); - MPI_Reduce(&average_eff_flops, &min_eff_flops, 1, MPI_DOUBLE, MPI_MIN, 0, _devices_comm); - MPI_Reduce(&average_eff_flops, &average_eff_flops, 1, MPI_DOUBLE, MPI_SUM, 0, _devices_comm); - average_eff_flops /= _devices_size; + if (!_devices_rank) { // using the fact that when devices_rank = 0, global_rank = 0 + MPI_Reduce(MPI_IN_PLACE, &max_eff_flops, 1, MPI_DOUBLE, MPI_MAX, 0, _devices_comm); + MPI_Reduce(MPI_IN_PLACE, &min_eff_flops, 1, MPI_DOUBLE, MPI_MIN, 0, _devices_comm); + MPI_Reduce(MPI_IN_PLACE, &avg_eff_flops, 1, MPI_DOUBLE, MPI_SUM, 0, _devices_comm); + avg_eff_flops /= _devices_size; + } else { + MPI_Reduce(&max_eff_flops, &max_eff_flops, 1, MPI_DOUBLE, MPI_MAX, 0, _devices_comm); + MPI_Reduce(&min_eff_flops, &min_eff_flops, 1, MPI_DOUBLE, MPI_MIN, 0, _devices_comm); + MPI_Reduce(&avg_eff_flops, &avg_eff_flops, 1, MPI_DOUBLE, MPI_SUM, 0, _devices_comm); + } if (!utils::context.global_rank && _verbose > 1) { auto old_precision = std::cout.precision(); std::cout << std::setprecision(6); std::cout << "=================== GPU Performance ====================" << std::endl; - std::cout << "Average GFLOPs achieved in the GW iteration: " << average_eff_flops / 1e9 << std::endl; + std::cout << "Average GFLOPs achieved in the GW iteration: " << avg_eff_flops / 1e9 << std::endl; std::cout << "Minimum GFLOPs achieved in the GW iteration: " << min_eff_flops / 1e9 << std::endl; std::cout << "Maximum GFLOPs achieved in the GW iteration: " << max_eff_flops / 1e9 << std::endl; std::cout << 
"============================================================" << std::endl; From c36ee8189e8aebb063498e27fbea15f39d0f5df1 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 16:21:48 -0500 Subject: [PATCH 32/36] fix sigmak_stij_buffer allocation - same memory as Gk is conflicting --- src/cugw_qpt.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index 7ab1a51..d8b40a9 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -351,8 +351,8 @@ namespace green::gpu { throw std::runtime_error("failure allocating Gk1_tsij on host"); if (cudaMallocHost(&Gk_smtij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) throw std::runtime_error("failure allocating Gk_smtij on host"); - // Reuse Gk_smtij_buffer_ for sigmak_stij_buffer_ to save memory since they are not needed at the same time - Sigmak_stij_buffer_ = Gk_smtij_buffer_; + if (cudaMallocHost(&Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) + throw std::runtime_error("failure allocating Sigmak_stij on host"); } if (cudaMalloc(&Pqk0_tQP_local_, nt_batch_ * naux2_ * sizeof(cuda_complex)) != cudaSuccess) From 7bca91536f8eb24651a6a22ec8d04faf6e1bc774 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 16:28:51 -0500 Subject: [PATCH 33/36] modify complexity estimation and performance metrics --- src/green/gpu/gw_gpu_kernel.h | 8 +++---- src/gw_gpu_kernel.cpp | 42 +++++++++-------------------------- 2 files changed, 15 insertions(+), 35 deletions(-) diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index 5d3b51b..2d96f80 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -145,10 +145,10 @@ namespace green::gpu { protected: void gw_innerloop(G_type& g, St_type& sigma_tau) override; private: - /** - * @brief Calculate and print complexity estimation for each device - */ - void complexity_estimation(); + /** + * @brief Calculate and print complexity 
estimation for each device + */ + void complexity_estimation(); template void compute_gw_selfenergy(G_type& g, St_type& sigma_tau); diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index b1f373f..f773cd9 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -26,37 +26,26 @@ namespace green::gpu { void scalar_gw_gpu_kernel::complexity_estimation() { - if (_devices_comm == MPI_COMM_NULL) return; - // Get number of k-points assigned to each device - size_t ink_on_device = 1 + (_ink - 1 - _devices_rank) / _devices_size; // Calculate the complexity of GW double NQsq=(double)_NQ*_NQ; //first set of matmuls - double flop_count_firstmatmul = ink_on_device*_nk*_ns*_nts/2.*( + double flop_count_firstmatmul = _ink*_nk*_ns*_nts/2.*( matmul_cost(_nao*_NQ, _nao, _nao) //X1_t_mQ = G_t_p * V_pmQ; + matmul_cost(_NQ*_nao, _nao, _nao)//X2_Pt_m = (V_Pt_n)* * G_m_n; + matmul_cost(_NQ, _naosq, _NQ) //Pq0_QP=X2_Ptm Q1_tmQ ); - double flop_count_fourier = ink_on_device*( + double flop_count_fourier = _ink*( matmul_cost(NQsq, _nts, _nw_b) + matmul_cost(NQsq, _nw_b, _nts) ); //Fourier transform forward and back - double flop_count_solver = 2./3.*ink_on_device*_nw_b*(NQsq*_NQ+NQsq); //approximate LU and backsubst cost (note we are doing cholesky which is cheaper + double flop_count_solver = 2./3.*_ink*_nw_b*(NQsq*_NQ+NQsq); //approximate LU and backsubst cost (note we are doing cholesky which is cheaper //secondset of matmuls - double flop_count_secondmatmul = ink_on_device*_nk*_ns*_nts*( + double flop_count_secondmatmul = _ink*_nk*_ns*_nts*( matmul_cost(_NQ*_nao, _nao, _nao) //Y1_Qin = V_Qim * G1_mn; + matmul_cost(_naosq, _NQ, _NQ) //Y2_inP = Y1_Qin * Pq_QP + matmul_cost(_nao, _NQ*_nao, _nao)//Sigma_ij = Y2_inP V_nPj ); _flop_count = flop_count_firstmatmul + flop_count_fourier + flop_count_solver + flop_count_secondmatmul; - // Sum over all devices to get total FLOP count; but don't broadcast back since each device only needs its own flop count to calculate achieved 
FLOPs - double total_flop_count = _flop_count; - MPI_Allreduce(MPI_IN_PLACE, &flop_count_firstmatmul, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); - MPI_Allreduce(MPI_IN_PLACE, &flop_count_fourier, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); - MPI_Allreduce(MPI_IN_PLACE, &flop_count_solver, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); - MPI_Allreduce(MPI_IN_PLACE, &flop_count_secondmatmul, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); - MPI_Allreduce(MPI_IN_PLACE, &total_flop_count, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); - if (!utils::context.global_rank && _verbose > 1) { std::cout << "############ Total GW Operations per Iteration ############" << std::endl; std::cout << "Total: " << total_flop_count << std::endl; @@ -69,30 +58,19 @@ namespace green::gpu { } void x2c_gw_gpu_kernel::complexity_estimation() { - if (_devices_comm == MPI_COMM_NULL) return; - // Get number of k-points assigned to each device - size_t ink_on_device = 1 + (_ink - 1 - _devices_rank) / _devices_size; // Calculate the complexity of GW // TODO virtual function for complexity calculation double NQsq=(double)_NQ*_NQ; //first set of matmuls // Doesn't fit for generalized GW. 
- double flop_count_firstmatmul= ink_on_device*_nk*4*_nts/2.* + double flop_count_firstmatmul= _ink*_nk*4*_nts/2.* (matmul_cost(_nao*_NQ, _nao, _nao)+matmul_cost(_nao, _NQ*_nao, _nao)+matmul_cost(_NQ, _NQ, _naosq)); - double flop_count_fourier=ink_on_device*(matmul_cost(NQsq, _nts, _nw_b)+matmul_cost(NQsq, _nw_b, _nts)); //Fourier transform forward and back - double flop_count_solver=2./3.*ink_on_device*_nw_b*(NQsq*_NQ+NQsq); //approximate LU and backsubst cost + double flop_count_fourier=_ink*(matmul_cost(NQsq, _nts, _nw_b)+matmul_cost(NQsq, _nw_b, _nts)); //Fourier transform forward and back + double flop_count_solver=2./3.*_ink*_nw_b*(NQsq*_NQ+NQsq); //approximate LU and backsubst cost //secondset of matmuls - double flop_count_secondmatmul=ink_on_device*_nk*4*_nts*(matmul_cost(_nao*_NQ, _nao, _nao)+matmul_cost(_NQ, _naosq, _NQ)+matmul_cost(_nao, _nao, _NQ*_nao)); + double flop_count_secondmatmul=_ink*_nk*4*_nts*(matmul_cost(_nao*_NQ, _nao, _nao)+matmul_cost(_NQ, _naosq, _NQ)+matmul_cost(_nao, _nao, _NQ*_nao)); _flop_count= flop_count_firstmatmul+flop_count_fourier+flop_count_solver+flop_count_secondmatmul; - // Sum over all devices to get total FLOP count; but don't broadcast back since each device only needs its own flop count to calculate achieved FLOPs - double total_flop_count = _flop_count; - MPI_Allreduce(MPI_IN_PLACE, &flop_count_firstmatmul, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); - MPI_Allreduce(MPI_IN_PLACE, &flop_count_fourier, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); - MPI_Allreduce(MPI_IN_PLACE, &flop_count_solver, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); - MPI_Allreduce(MPI_IN_PLACE, &flop_count_secondmatmul, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); - MPI_Allreduce(MPI_IN_PLACE, &total_flop_count, 1, MPI_DOUBLE, MPI_SUM, _devices_comm); - if (!utils::context.global_rank && _verbose > 1) { std::cout << "############ Total Two-Component GW Operations per Iteration ############" << std::endl; std::cout << "Total: " << total_flop_count << std::endl; @@ 
-157,7 +135,9 @@ namespace green::gpu { } else { throw std::runtime_error("'Solve cuGW' still active, but it should not be."); } - _eff_flops = _flop_count / gpu_time; + size_t ink_on_device = 1 + (_ink - 1 - _devices_rank) / _devices_size; + double ops_on_device = _flop_count * ink_on_device / _ink; // Get flop count for this device + _eff_flops = ops_on_device / gpu_time; } void gw_gpu_kernel::print_effective_flops() { From 86bc13c851be62a8e3954f84907eb10c54765bef Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 16:30:53 -0500 Subject: [PATCH 34/36] minor fix in complexity estimation --- src/gw_gpu_kernel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index f773cd9..f59d512 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -48,7 +48,7 @@ namespace green::gpu { if (!utils::context.global_rank && _verbose > 1) { std::cout << "############ Total GW Operations per Iteration ############" << std::endl; - std::cout << "Total: " << total_flop_count << std::endl; + std::cout << "Total: " << _flop_count << std::endl; std::cout << "First matmul: " << flop_count_firstmatmul << std::endl; std::cout << "Fourier: " << flop_count_fourier << std::endl; std::cout << "Solver: " << flop_count_solver << std::endl; @@ -73,7 +73,7 @@ namespace green::gpu { if (!utils::context.global_rank && _verbose > 1) { std::cout << "############ Total Two-Component GW Operations per Iteration ############" << std::endl; - std::cout << "Total: " << total_flop_count << std::endl; + std::cout << "Total: " << _flop_count << std::endl; std::cout << "First matmul: " << flop_count_firstmatmul << std::endl; std::cout << "Fourier: " << flop_count_fourier << std::endl; std::cout << "Solver: " << flop_count_solver << std::endl; From 1bbc9080b2e52fc28e9a9e36cd8908ca1fa2bf84 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 16:44:15 -0500 Subject: [PATCH 35/36] Update src/cugw_qpt.cu
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cugw_qpt.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index d8b40a9..382e015 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -348,7 +348,7 @@ namespace green::gpu { throw std::runtime_error("failure allocating V on host"); if (_low_memory_requirement) { if (cudaMallocHost(&Gk1_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) - throw std::runtime_error("failure allocating Gk1_tsij on host"); + throw std::runtime_error("failure allocating Gk1_stij on host"); if (cudaMallocHost(&Gk_smtij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) throw std::runtime_error("failure allocating Gk_smtij on host"); if (cudaMallocHost(&Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) From 69c13e54486654b6b0d2917b64556fb240551d9e Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Thu, 6 Nov 2025 16:44:28 -0500 Subject: [PATCH 36/36] Update src/cugw_qpt.cu Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cugw_qpt.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index 382e015..8acc15f 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -385,6 +385,7 @@ namespace green::gpu { if (_low_memory_requirement) { cudaFreeHost(Gk1_stij_buffer_); cudaFreeHost(Gk_smtij_buffer_); + cudaFreeHost(Sigmak_stij_buffer_); } if (require_cleanup()) { throw std::runtime_error("cleanup of self-energy was not done correctly.");