diff --git a/src/cu_routines.cu b/src/cu_routines.cu index 4542006..866af76 100644 --- a/src/cu_routines.cu +++ b/src/cu_routines.cu @@ -21,6 +21,7 @@ #include + __global__ void initialize_array(cuDoubleComplex* array, cuDoubleComplex value, int count) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= count) return; @@ -171,7 +172,7 @@ namespace green::gpu { int _devCount_per_node) : _low_device_memory(low_device_memory), qkpts(_nqkpt), V_Qpm(_NQ, _nao, _nao), V_Qim(_NQ, _nao, _nao), Gk1_stij(_ns, _nts, _nao, _nao), Gk_smtij(_ns, _nts, _nao, _nao), - qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data(), cuda_lin_solver) { + qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data(), cuda_lin_solver), _qkpt_cublas_handles(_nqkpt) { if (cudaSetDevice(_intranode_rank % _devCount_per_node) != cudaSuccess) throw std::runtime_error("Error in cudaSetDevice2"); if (cublasCreate(&_handle) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas"); @@ -201,8 +202,11 @@ namespace green::gpu { qpt.init(&_handle, &_solver_handle); // Each process gets one cuda runner for qpoints for (int i = 0; i < _nqkpt; ++i) { - qkpts[i] = new gw_qkpt(_nao, _NQ, _ns, _nts, _nt_batch, &_handle, g_kstij_device, g_ksmtij_device, sigma_kstij_device, - sigma_k_locks); + if (cublasCreate(&_qkpt_cublas_handles[i]) != CUBLAS_STATUS_SUCCESS) + throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas"); + // initialize qkpt workers + qkpts[i] = new gw_qkpt(_nao, _NQ, _ns, _nts, _nt_batch, &_qkpt_cublas_handles[i], + g_kstij_device, g_ksmtij_device, sigma_kstij_device, sigma_k_locks); } } @@ -230,9 +234,13 @@ namespace green::gpu { bool need_minus_k = reduced_to_full[k_reduced_id] != k; bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1; + // Read V_Qpm and Gk_smtij r1(k, k1, k_reduced_id, k1_reduced_id, k_vector, V_Qpm, Vk1k2_Qij, Gk_smtij, Gk1_stij, need_minus_k, need_minus_k1); + // Get an idle qkpt worker gw_qkpt* qkpt = obtain_idle_qkpt(qkpts); + + // Copy data from host to device and set up qkpt if (_low_device_memory) { if (!_X2C) { qkpt->set_up_qkpt_first(Gk1_stij.data(), Gk_smtij.data(), V_Qpm.data(), k_reduced_id, need_minus_k, k1_reduced_id, @@ -244,13 +252,19 @@ namespace green::gpu { } else { qkpt->set_up_qkpt_first(nullptr, nullptr, V_Qpm.data(), k_reduced_id, need_minus_k, k1_reduced_id, need_minus_k1); } + // Compute Polarization Pqk0 qkpt->compute_first_tau_contraction(qpt.Pqk0_tQP(qkpt->all_done_event()), qpt.Pqk0_tQP_lock()); } + + + // Compute Polarization Pq from irreducible polarization Pqk0 qpt.wait_for_kpts(); qpt.scale_Pq0_tQP(1. / _nk); qpt.transform_tw(); qpt.compute_Pq(); qpt.transform_wt(); + + // Write to Sigma(k), k belongs to _ink for (size_t k_reduced_id = 0; k_reduced_id < _ink; ++k_reduced_id) { size_t k = reduced_to_full[k_reduced_id]; @@ -264,30 +278,32 @@ namespace green::gpu { bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1; bool need_minus_q = reduced_to_full[q_reduced_id] != q_or_qinv; + // Read V_Qij and Gk1_stij r2(k, k1, k1_reduced_id, k_vector, V_Qim, Vk1k2_Qij, Gk1_stij, need_minus_k1); - gw_qkpt* qkpt = obtain_idle_qkpt(qkpts); + // Get an idle qkpt worker + gw_qkpt* qkpt = obtain_idle_qkpt_for_sigma(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C); + + qkpt->set_k_red_id(k_reduced_id); + // Copy data from host to device, compute Sigma(k), and schedule async memcpy to host pinned memory "qkpt::Sigmak_stij_buffer_" if (_low_device_memory) { if (!_X2C) { qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); - qkpt->compute_second_tau_contraction(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); - copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns); + qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); } else { // In 2cGW, G(-k) = G*(k) has already been addressed in r2() qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, false); - qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); - copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts); + qkpt->compute_second_tau_contraction_2C(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); } } else { qkpt->set_up_qkpt_second(nullptr, V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); - qkpt->compute_second_tau_contraction(nullptr, qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); } } } } } + wait_and_clean_qkpts(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C); cudaDeviceSynchronize(); if (!_low_device_memory and !_X2C) { copy_Sigma_from_device_to_host(sigma_kstij_device, Sigma_tskij_host.data(), _ink, _nao, _nts, _ns); @@ -303,6 +319,7 @@ namespace green::gpu { } } } + template void cugw_utils::copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_4tij, int k, int nts) { size_t nao = Sigmak_4tij.shape()[3]; @@ -331,6 +348,9 @@ namespace green::gpu { } if (cublasDestroy(_handle) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("cublas error destroying handle"); if (cusolverDnDestroy(_solver_handle) != CUSOLVER_STATUS_SUCCESS) throw std::runtime_error("culapck error destroying handle"); + for (int i = 0; i < qkpts.size(); ++i) { + if (cublasDestroy(_qkpt_cublas_handles[i]) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("cublas error destroying handle"); + } if (!_low_device_memory) cudaFree(g_kstij_device); if (!_low_device_memory) cudaFree(g_ksmtij_device); cudaFree(sigma_kstij_device); diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index a366155..8acc15f 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -20,6 +20,8 @@ */ #include + + namespace green::gpu { template gw_qpt::gw_qpt(int nao, int naux, int nt, int nw_b, @@ -320,7 +322,7 @@ namespace green::gpu { g_ktij_(g_ktij), g_kmtij_(g_kmtij), sigma_ktij_(sigma_ktij), sigma_k_locks_(sigma_k_locks), nao_(nao), nao2_(nao * nao), nao3_(nao2_ * nao), naux_(naux), naux2_(naux * naux), nauxnao_(naux * nao), nauxnao2_(naux * nao * nao), ns_(ns), nt_(nt), nt_batch_(nt_batch), ntnaux_(nt * naux), ntnaux2_(nt * naux * naux), ntnao_(nt * nao), ntnao2_(nt * nao2_), - handle_(handle) { + handle_(handle), cleanup_req_(false) { _low_memory_requirement = (g_ktij == nullptr) ? true : false; if (cudaStreamCreate(&stream_) != cudaSuccess) throw std::runtime_error("main stream creation failed"); @@ -346,10 +348,11 @@ namespace green::gpu { throw std::runtime_error("failure allocating V on host"); if (_low_memory_requirement) { if (cudaMallocHost(&Gk1_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) - throw std::runtime_error("failure allocating Gk_tsij on host"); + throw std::runtime_error("failure allocating Gk1_stij on host"); if (cudaMallocHost(&Gk_smtij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) - throw std::runtime_error("failure allocating Gk_tsij on host"); - Sigmak_stij_buffer_ = Gk_smtij_buffer_; + throw std::runtime_error("failure allocating Gk_smtij on host"); + if (cudaMallocHost(&Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) + throw std::runtime_error("failure allocating Sigmak_stij on host"); } if (cudaMalloc(&Pqk0_tQP_local_, nt_batch_ * naux2_ * sizeof(cuda_complex)) != cudaSuccess) @@ -382,6 +385,10 @@ namespace green::gpu { if (_low_memory_requirement) { cudaFreeHost(Gk1_stij_buffer_); cudaFreeHost(Gk_smtij_buffer_); + cudaFreeHost(Sigmak_stij_buffer_); + } + if (require_cleanup()) { + throw std::runtime_error("cleanup of self-energy was not done correctly."); } } @@ -526,7 +533,7 @@ namespace green::gpu { } template - void gw_qkpt::compute_second_tau_contraction(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) { + void gw_qkpt::compute_second_tau_contraction(cuda_complex* Pqk_tQP) { cuda_complex one = cu_type_map::cast(1., 0.); cuda_complex zero = cu_type_map::cast(0., 0.); cuda_complex m1 = cu_type_map::cast(-1., 0.); @@ -556,12 +563,12 @@ namespace green::gpu { } } } - write_sigma(_low_memory_requirement, Sigmak_stij_host); + write_sigma(_low_memory_requirement); cudaEventRecord(all_done_event_); } template - void gw_qkpt::compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) { + void gw_qkpt::compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP) { cuda_complex one = cu_type_map::cast(1., 0.); cuda_complex zero = cu_type_map::cast(0., 0.); cuda_complex m1 = cu_type_map::cast(-1., 0.); @@ -593,28 +600,71 @@ namespace green::gpu { } } } - write_sigma(true, Sigmak_stij_host); + write_sigma(true); cudaEventRecord(all_done_event_); } template - void gw_qkpt::write_sigma(bool low_memory_mode, cxx_complex* Sigmak_stij_host) { + void gw_qkpt::write_sigma(bool low_memory_mode) { // write results. Make sure we have exclusive write access to sigma, then add array sigmak_tij to sigma_ktij - acquire_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); scalar_t one = 1.; if (!low_memory_mode) { + acquire_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); if (RAXPY(*handle_, 2 * ns_ * ntnao2_, &one, (scalar_t*)sigmak_stij_, 1, (scalar_t*)(sigma_ktij_ + k_ * ns_ * ntnao2_), 1) != CUBLAS_STATUS_SUCCESS) { throw std::runtime_error("RAXPY fails on gw_qkpt.write_sigma()."); } + release_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); } else { // Copy sigmak_stij_ back to CPU - if (Sigmak_stij_host == nullptr) - throw std::runtime_error("gw_qkpt.write_sigma(): Sigmak_stij_host cannot be a null pointer"); - cudaMemcpy(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost); - std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); + cudaMemcpyAsync(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost, stream_); + cleanup_req_ = true; // This is required in addition to the all_done_event_ to indicate that we need to copy back to host. For initial states, this is not needed. + } + } + + + template + void gw_qkpt::cleanup(bool low_memory_mode, tensor, 4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c) { + // Block the stream until all the tasks are completed + if (require_cleanup()) { + cudaStreamSynchronize(stream_); + std::memcpy(Sigmak_stij_host.data(), Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); + if (!x2c) { + copy_Sigma(Sigma_tskij_host, Sigmak_stij_host); + } else { + copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij_host); + } + cleanup_req_ = false; + } + } + + template + void gw_qkpt::copy_Sigma(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij) { + for (size_t t = 0; t < nt_; ++t) { + for (size_t s = 0; s < ns_; ++s) { + matrix(Sigma_tskij_host(t, s, k_red_id_)) += matrix(Sigmak_stij(s, t)).template cast>(); + } + } + } + + template + void gw_qkpt::copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij) { + size_t nao = Sigmak_stij.shape()[3]; + size_t nso = 2 * nao; + for (size_t ss = 0; ss < 3; ++ss) { + size_t a = (ss % 2 == 0) ? 0 : 1; + size_t b = ((ss + 1) / 2 != 1) ? 0 : 1; + size_t i_shift = a * nao; + size_t j_shift = b * nao; + for (size_t t = 0; t < nt_; ++t) { + matrix(Sigma_tskij_host(t, 0, k_red_id_)).block(i_shift, j_shift, nao, nao) += + matrix(Sigmak_stij(ss, t)).template cast>(); + if (ss == 2) { + matrix(Sigma_tskij_host(t, 0, k_red_id_)).block(j_shift, i_shift, nao, nao) += + matrix(Sigmak_stij(ss, t)).conjugate().transpose().template cast>(); + } + } } - release_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); } template diff --git a/src/green/gpu/cu_routines.h b/src/green/gpu/cu_routines.h index b2d9c71..9e0e32a 100644 --- a/src/green/gpu/cu_routines.h +++ b/src/green/gpu/cu_routines.h @@ -155,6 +155,7 @@ namespace green::gpu { bool _low_device_memory; cublasHandle_t _handle; cusolverDnHandle_t _solver_handle; + std::vector _qkpt_cublas_handles; // list of cublas handles for qkpt streams gw_qpt qpt; std::vector*> qkpts; diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index 292ac48..a0bdf3f 100644 --- a/src/green/gpu/cugw_qpt.h +++ b/src/green/gpu/cugw_qpt.h @@ -265,30 +265,74 @@ namespace green::gpu { /** * \brief Using dressed GW polarization compute self-energy at a given momentum point * - * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point * \param Pqk_tQP Dressed polarization bubble */ - void compute_second_tau_contraction(cxx_complex* Sigmak_stij_host = nullptr, cuda_complex* Pqk_tQP = nullptr); + void compute_second_tau_contraction(cuda_complex* Pqk_tQP = nullptr); + /** * \brief Using dressed GW polarization compute self-energy at a given momentum point (X2C version) - * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point + * * \param Pqk_tQP Dressed polarization bubble */ - void compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host = nullptr, cuda_complex* Pqk_tQP = nullptr); + void compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP = nullptr); /** * \brief For a given k-point copy self-energy back to a host memory * \param low_memory_mode - whether the whole self-energy allocated in memory or not + */ + void write_sigma(bool low_memory_mode = false); + + /** + * \brief return the status of copy_selfenergy from device to host. + * Return value of `false` means stream is ready for next calculation without cleanup. + * + * \return true if cleanup is required, false otherwise + */ + bool require_cleanup(){ + return cleanup_req_; + } + + /** + * \brief perform cleanup, i.e. copy data from Sigmak buffer (4-index array for a given momentum point) to Host shared memory Self-energy + * + * \param low_memory_mode - whether the whole self-energy allocated in memory or not * \param Sigmak_stij_host - Host stored self-energy object at a given momentum point + * \param Sigma_tskij_host - Host stored full self-energy object for each device + * \param x2c - use x2c specific functions or not + */ + void cleanup(bool low_memory_mode, tensor, 4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c); + + /** + * \brief Copy the non-relativistic self-energy from the per-k buffer to the full host tensor. + * + * \param Sigma_tskij_host Host storage for the full self-energy tensor. + * \param Sigmak_stij Self-energy tensor for a given momentum point. + */ + void copy_Sigma(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij); + + /** + * \brief Copy 2-component self-energy from per-k buffer to full host tensor. + * + * \param Sigma_tskij_host Host storage for the full self-energy tensor. + * \param Sigmak_stij Self-energy tensor for a given momentum point. */ - void write_sigma(bool low_memory_mode = false, cxx_complex* Sigmak_stij_host = nullptr); + void copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor, 4>& Sigmak_stij); /** - * \brief Check if cuda devices are budy + * \brief Check if cuda devices are busy * \return true if asynchronous calculations are still running */ bool is_busy(); + /** + * \brief Set the irreducible k-point index for gw_qkpt worker + * + * \param k incoming k-point index + */ + void set_k_red_id(int k) { + k_red_id_ = k; + } + // static std::size_t size(size_t nao, size_t naux, size_t nt, size_t nt_batch, size_t ns) { return (2 * naux * nao * nao // V_Qpm+V_pmQ @@ -374,6 +418,14 @@ namespace green::gpu { // pointer to cublas handle cublasHandle_t* handle_; + + // irreducible k-pt assigned to the qkpt stream + int k_red_id_; + + // status of data transfer / copy from Device to Host. + // false: not required, stream ready for next calculation + // true: required, stream occupied + bool cleanup_req_ = false; }; template @@ -387,4 +439,48 @@ namespace green::gpu { return qkpts[pos]; } + /** + * \brief returns an idle qkpt stream, otherwise waits until a stream is available + * + * \tparam prec - precision for calculation + * \param qkpts - vector of qkpt workers (gw_qkpt type) + * \param low_memory_mode - low memory mode for read/write integrals + * \param Sigmak_stij_host - cudaMallocHost buffer for transfering Sigma + * \return gw_qkpt* - pointer to idle qkpt + */ + template + gw_qkpt* obtain_idle_qkpt_for_sigma(std::vector*>& qkpts, bool low_memory_mode, + tensor,4>& Sigmak_stij_host, + ztensor<5>& Sigma_tskij_host, bool x2c) { + static int pos = 0; + pos++; + if (pos >= qkpts.size()) pos = 0; + while (qkpts[pos]->is_busy()) { + pos = (pos + 1) % qkpts.size(); + } + qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host, Sigma_tskij_host, x2c); + return qkpts[pos]; + } + + /** + * \brief waits for all qkpts to complete and cleans them up + * + * \tparam prec - precision for calculation + * \param qkpts - vector of qkpt workers (gw_qkpt type) + * \param low_memory_mode - low memory mode for read/write integrals + * \param Sigmak_stij_host - cudaMallocHost buffer for transfering Sigma + */ + template + void wait_and_clean_qkpts(std::vector*>& qkpts, bool low_memory_mode, + tensor,4>& Sigmak_stij_host, + ztensor<5>& Sigma_tskij_shared, bool x2c) { + for (int pos = 0; pos < qkpts.size(); pos++) { + // wait for qkpt to finish its tasks, then cleanup + while (qkpts[pos]->is_busy()) { + continue; + } + qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host, Sigma_tskij_shared, x2c); + } + } + } // namespace green::gpu diff --git a/src/green/gpu/gpu_kernel.h b/src/green/gpu/gpu_kernel.h index f1b0028..7ddf66f 100644 --- a/src/green/gpu/gpu_kernel.h +++ b/src/green/gpu/gpu_kernel.h @@ -66,7 +66,7 @@ namespace green::gpu { */ inline void set_shared_Coulomb() { if (_coul_int_reading_type == as_a_whole) { - statistics.start("Read"); + statistics.start("read whole integral"); // Allocate Coulomb integrals in double precision and cast them to single precision whenever needed allocate_shared_Coulomb(&_Vk1k2_Qij); statistics.end(); diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index c2cba49..2d96f80 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -87,10 +87,9 @@ namespace green::gpu { /** * \brief calculate effective floating points operations per second reached on GPU. - * This is not representative of the GPU capabilities, but instead, accounts for read/write overheads. - * The value is entirely in the context Green-MBPT solver. + * This is not representative of the GPU capabilities, but instead, includes read/write overheads. */ - void flops_achieved(MPI_Comm comm); + void flops_achieved(); /** * \brief print the effective FLOPs achieved for the iteration. @@ -146,6 +145,9 @@ namespace green::gpu { protected: void gw_innerloop(G_type& g, St_type& sigma_tau) override; private: + /** + * @brief Calculate and print complexity estimation for each device + */ void complexity_estimation(); template @@ -182,6 +184,9 @@ namespace green::gpu { protected: void gw_innerloop(G_type& g, St_type& sigma_tau) override; private: + /** + * @brief Calculate and print complexity estimation for each device + */ void complexity_estimation(); template diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 4fe3b56..f59d512 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -29,20 +29,22 @@ namespace green::gpu { // Calculate the complexity of GW double NQsq=(double)_NQ*_NQ; //first set of matmuls - double flop_count_firstmatmul= _ink*_nk*_ns*_nts/2.*( + double flop_count_firstmatmul = _ink*_nk*_ns*_nts/2.*( matmul_cost(_nao*_NQ, _nao, _nao) //X1_t_mQ = G_t_p * V_pmQ; - +matmul_cost(_NQ*_nao, _nao, _nao)//X2_Pt_m = (V_Pt_n)* * G_m_n; - +matmul_cost(_NQ, _naosq, _NQ) //Pq0_QP=X2_Ptm Q1_tmQ + + matmul_cost(_NQ*_nao, _nao, _nao)//X2_Pt_m = (V_Pt_n)* * G_m_n; + + matmul_cost(_NQ, _naosq, _NQ) //Pq0_QP=X2_Ptm Q1_tmQ ); - double flop_count_fourier=_ink*(matmul_cost(NQsq, _nts, _nw_b)+matmul_cost(NQsq, _nw_b, _nts)); //Fourier transform forward and back - double flop_count_solver=2./3.*_ink*_nw_b*(NQsq*_NQ+NQsq); //approximate LU and backsubst cost (note we are doing cholesky which is cheaper + double flop_count_fourier = _ink*( + matmul_cost(NQsq, _nts, _nw_b) + matmul_cost(NQsq, _nw_b, _nts) + ); //Fourier transform forward and back + double flop_count_solver = 2./3.*_ink*_nw_b*(NQsq*_NQ+NQsq); //approximate LU and backsubst cost (note we are doing cholesky which is cheaper //secondset of matmuls - double flop_count_secondmatmul=_ink*_nk*_ns*_nts*( + double flop_count_secondmatmul = _ink*_nk*_ns*_nts*( matmul_cost(_NQ*_nao, _nao, _nao) //Y1_Qin = V_Qim * G1_mn; - +matmul_cost(_naosq, _NQ, _NQ) //Y2_inP = Y1_Qin * Pq_QP - +matmul_cost(_nao, _NQ*_nao, _nao)//Sigma_ij = Y2_inP V_nPj + + matmul_cost(_naosq, _NQ, _NQ) //Y2_inP = Y1_Qin * Pq_QP + + matmul_cost(_nao, _NQ*_nao, _nao)//Sigma_ij = Y2_inP V_nPj ); - _flop_count= flop_count_firstmatmul+flop_count_fourier+flop_count_solver+flop_count_secondmatmul; + _flop_count = flop_count_firstmatmul + flop_count_fourier + flop_count_solver + flop_count_secondmatmul; if (!utils::context.global_rank && _verbose > 1) { std::cout << "############ Total GW Operations per Iteration ############" << std::endl; @@ -84,7 +86,7 @@ namespace green::gpu { MPI_Datatype dt_matrix = utils::create_matrix_datatype>(_nso*_nso); MPI_Op matrix_sum_op = utils::create_matrix_operation>(); statistics.start("total"); - statistics.start("Initialization"); + statistics.start("Initialization: CPU"); sigma_tau.fence(); if (!utils::context.node_rank) sigma_tau.object().set_zero(); sigma_tau.fence(); @@ -101,7 +103,7 @@ namespace green::gpu { MPI_Barrier(utils::context.global); sigma_tau.fence(); // Print effective FLOPs achieved in the calculation - flops_achieved(_devices_comm); + flops_achieved(); if (!utils::context.node_rank) { if (_devices_comm != MPI_COMM_NULL) statistics.start("selfenergy_reduce"); utils::allreduce(MPI_IN_PLACE, sigma_tau.object().data(), sigma_tau.object().size()/(_nso*_nso), dt_matrix, matrix_sum_op, utils::context.internode_comm); @@ -113,6 +115,8 @@ namespace green::gpu { statistics.end(); statistics.print(utils::context.global); print_effective_flops(); + // Reset all timing stats for next iteration + statistics.reset(); clean_MPI_structure(); clean_shared_Coulomb(); @@ -122,32 +126,42 @@ namespace green::gpu { MPI_Op_free(&matrix_sum_op); } - void gw_gpu_kernel::flops_achieved(MPI_Comm comm) { + void gw_gpu_kernel::flops_achieved() { + if (_devices_comm == MPI_COMM_NULL) return; double gpu_time=0.; - if (comm != MPI_COMM_NULL) { - utils::event_t& cugw_event = statistics.event("Solve cuGW"); - if (!cugw_event.active) { - gpu_time = cugw_event.duration; - } else { - throw std::runtime_error("'Solve cuGW' still active, but it should not be."); - } - } - - if (!utils::context.global_rank) { - MPI_Reduce(MPI_IN_PLACE, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); + utils::event_t& cugw_event = statistics.event("Solve cuGW"); + if (!cugw_event.active) { + gpu_time = cugw_event.duration; } else { - MPI_Reduce(&gpu_time, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); + throw std::runtime_error("'Solve cuGW' still active, but it should not be."); } - - _eff_flops = _flop_count / gpu_time; + size_t ink_on_device = 1 + (_ink - 1 - _devices_rank) / _devices_size; + double ops_on_device = _flop_count * ink_on_device / _ink; // Get flop count for this device + _eff_flops = ops_on_device / gpu_time; } void gw_gpu_kernel::print_effective_flops() { + if (_devices_comm == MPI_COMM_NULL) return; + double avg_eff_flops = _eff_flops; + double max_eff_flops = _eff_flops; + double min_eff_flops = _eff_flops; + if (!_devices_rank) { // using the fact that when devices_rank = 0, global_rank = 0 + MPI_Reduce(MPI_IN_PLACE, &max_eff_flops, 1, MPI_DOUBLE, MPI_MAX, 0, _devices_comm); + MPI_Reduce(MPI_IN_PLACE, &min_eff_flops, 1, MPI_DOUBLE, MPI_MIN, 0, _devices_comm); + MPI_Reduce(MPI_IN_PLACE, &avg_eff_flops, 1, MPI_DOUBLE, MPI_SUM, 0, _devices_comm); + avg_eff_flops /= _devices_size; + } else { + MPI_Reduce(&max_eff_flops, &max_eff_flops, 1, MPI_DOUBLE, MPI_MAX, 0, _devices_comm); + MPI_Reduce(&min_eff_flops, &min_eff_flops, 1, MPI_DOUBLE, MPI_MIN, 0, _devices_comm); + MPI_Reduce(&avg_eff_flops, &avg_eff_flops, 1, MPI_DOUBLE, MPI_SUM, 0, _devices_comm); + } if (!utils::context.global_rank && _verbose > 1) { auto old_precision = std::cout.precision(); std::cout << std::setprecision(6); std::cout << "=================== GPU Performance ====================" << std::endl; - std::cout << "Effective FLOPs in the GW iteration: " << _eff_flops / 1.0e9 << " Giga flops." << std::endl; + std::cout << "Average GFLOPs achieved in the GW iteration: " << avg_eff_flops / 1e9 << std::endl; + std::cout << "Minimum GFLOPs achieved in the GW iteration: " << min_eff_flops / 1e9 << std::endl; + std::cout << "Maximum GFLOPs achieved in the GW iteration: " << max_eff_flops / 1e9 << std::endl; std::cout << "============================================================" << std::endl; std::cout << std::setprecision(old_precision); } @@ -173,7 +187,7 @@ namespace green::gpu { void scalar_gw_gpu_kernel::compute_gw_selfenergy(G_type& g, St_type& sigma_tau) { // check devices' free space and space requirements GW_check_devices_free_space(); - statistics.start("Initialization"); + statistics.start("Initialization: GPU"); cugw_utils cugw(_nts, _nt_batch, _nw_b, _ns, _nk, _ink, _nqkpt, _NQ, _nao, g.object(), _low_device_memory, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver, utils::context.global_rank, utils::context.node_rank, _devCount_per_node); @@ -185,7 +199,7 @@ namespace green::gpu { tensor,3>& V_Qpm, std::complex *Vk1k2_Qij, tensor,4>&Gk_smtij, tensor,4>&Gk1_stij, bool need_minus_k, bool need_minus_k1) { - statistics.start("read"); + statistics.start("Read Integrals", true); int q = k_vector[2]; if (_coul_int_reading_type == chunks) { read_next(k_vector); @@ -203,7 +217,7 @@ namespace green::gpu { tensor,3>& V_Qim, std::complex *Vk1k2_Qij, tensor,4>&Gk1_stij, bool need_minus_k1) { - statistics.start("read"); + statistics.start("Read Integrals", true); int q = k_vector[1]; if (_coul_int_reading_type == chunks) { read_next(k_vector); @@ -222,7 +236,7 @@ namespace green::gpu { // and do MPIAllreduce on CPU later on. Since the number of processes with a GPU is very // limited, the additional memory overhead is fairly limited. ztensor<5> Sigma_tskij_host_local(_nts, _ns, _ink, _nao, _nao); - statistics.start("Solve cuGW"); + statistics.start("Solve cuGW", false); cugw.solve(_nts, _ns, _nk, _ink, _nao, _bz_utils.symmetry().reduced_to_full(), _bz_utils.symmetry().full_to_reduced(), _Vk1k2_Qij, Sigma_tskij_host_local, _devices_rank, _devices_size, _low_device_memory, _verbose, irre_pos, mom_cons, r1, r2); @@ -308,7 +322,7 @@ namespace green::gpu { tensor,4>&Gk_4mtij, tensor,4>&Gk1_4tij, bool need_minus_k, bool need_minus_k1) { - statistics.start("read"); + statistics.start("Read Integrals", true); int q = k_vector[2]; if (q == 0 or _coul_int_reading_type == chunks) { read_next(k_vector); @@ -324,7 +338,7 @@ namespace green::gpu { tensor,3>& V_Qim, std::complex *Vk1k2_Qij, tensor,4>&Gk1_4tij, bool need_minus_k1) { - statistics.start("read"); + statistics.start("Read Integrals", true); int q = k_vector[1]; if (q == 0 or _coul_int_reading_type == chunks) { read_next(k_vector);