diff --git a/src/cu_compute_Pq.cu b/src/cu_compute_Pq.cu index bfc1224..a600fd3 100644 --- a/src/cu_compute_Pq.cu +++ b/src/cu_compute_Pq.cu @@ -26,8 +26,8 @@ __global__ void validate_info(int *info){ if(idx>0) return; if(*info!=0){ printf("info is: %d\n",*info); - printf("nonzero info. Aborting.\n"); - asm("exit;"); + printf("nonzero info. Aborting application.\n"); + asm("trap;"); // nonzero info = cholesky or LU fails, then all threads should be stopped } } __global__ void validate_info(int *info, int N){ @@ -36,8 +36,8 @@ __global__ void validate_info(int *info, int N){ for(int i=0;i void cugw_utils::solve(int _nts, int _ns, int _nk, int _ink, int _nao, const std::vector& reduced_to_full, const std::vector& full_to_reduced, std::complex* Vk1k2_Qij, - ztensor<5>& Sigma_tskij_host, int _devices_rank, int _devices_size, bool low_device_memory, + St_type& sigma_tau_host_shared, int _devices_rank, int _devices_size, bool low_device_memory, int verbose, irre_pos_callback& irre_pos, mom_cons_callback& momentum_conservation, gw_reader1_callback& r1, gw_reader2_callback& r2) { // this is the main GW loop @@ -252,6 +252,7 @@ namespace green::gpu { qpt.compute_Pq(); qpt.transform_wt(); // Write to Sigma(k), k belongs to _ink + MPI_Win_lock_all(MPI_MODE_NOCHECK, sigma_tau_host_shared.win()); for (size_t k_reduced_id = 0; k_reduced_id < _ink; ++k_reduced_id) { size_t k = reduced_to_full[k_reduced_id]; for (size_t q_or_qinv = 0; q_or_qinv < _nk; ++q_or_qinv) { @@ -264,33 +265,42 @@ namespace green::gpu { bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1; bool need_minus_q = reduced_to_full[q_reduced_id] != q_or_qinv; + // read and prepare G(k-q), V(k, k-q) and V(k-q, k) r2(k, k1, k1_reduced_id, k_vector, V_Qim, Vk1k2_Qij, Gk1_stij, need_minus_k1); - gw_qkpt* qkpt = obtain_idle_qkpt(qkpts); + gw_qkpt* qkpt = obtain_idle_qkpt_for_sigma(qkpts, _low_device_memory, Sigmak_stij.data()); if (_low_device_memory) { if (!_X2C) { qkpt->set_up_qkpt_second(Gk1_stij.data(), 
V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); - qkpt->compute_second_tau_contraction(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); - copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns); + qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + copy_Sigma(sigma_tau_host_shared.object(), Sigmak_stij, k_reduced_id, _nts, _ns); } else { // In 2cGW, G(-k) = G*(k) has already been addressed in r2() qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, false); - qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); - copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts); + qkpt->compute_second_tau_contraction_2C(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + copy_Sigma_2c(sigma_tau_host_shared.object(), Sigmak_stij, k_reduced_id, _nts); } } else { qkpt->set_up_qkpt_second(nullptr, V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); - qkpt->compute_second_tau_contraction(nullptr, qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); } } } } + MPI_Win_sync(sigma_tau_host_shared.win()); + MPI_Barrier(utils::context.node_comm); + MPI_Win_unlock_all(sigma_tau_host_shared.win()); } cudaDeviceSynchronize(); + MPI_Win_lock_all(MPI_MODE_NOCHECK, sigma_tau_host_shared.win()); + wait_and_clean_qkpts(qkpts, _low_device_memory, Sigmak_stij.data()); + MPI_Win_sync(sigma_tau_host_shared.win()); + MPI_Win_unlock_all(sigma_tau_host_shared.win()); + // wait for all qkpts to complete if (!_low_device_memory and !_X2C) { - copy_Sigma_from_device_to_host(sigma_kstij_device, Sigma_tskij_host.data(), _ink, _nao, _nts, _ns); + MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, sigma_tau_host_shared.win()); + 
copy_Sigma_from_device_to_host(sigma_kstij_device, sigma_tau_host_shared.object().data(), _ink, _nao, _nts, _ns); + MPI_Win_unlock(0, sigma_tau_host_shared.win()); } } diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index a366155..24c7c8c 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -252,7 +252,7 @@ namespace green::gpu { naux_, Pivot_, d_info_, nw_b_) != CUBLAS_STATUS_SUCCESS) { throw std::runtime_error("CUDA GETRF failed!"); } - validate_info<<<1, 1, 0, stream_>>>(d_info_, nw_b_); + validate_info<<<1, 1, 0, stream_>>>(d_info_, nw_b_); cudaEventRecord(LU_decomposition_ready_event_, stream_); if (cudaStreamWaitEvent(stream_, LU_decomposition_ready_event_, 0 /*cudaEventWaitDefault*/)) @@ -320,7 +320,7 @@ namespace green::gpu { g_ktij_(g_ktij), g_kmtij_(g_kmtij), sigma_ktij_(sigma_ktij), sigma_k_locks_(sigma_k_locks), nao_(nao), nao2_(nao * nao), nao3_(nao2_ * nao), naux_(naux), naux2_(naux * naux), nauxnao_(naux * nao), nauxnao2_(naux * nao * nao), ns_(ns), nt_(nt), nt_batch_(nt_batch), ntnaux_(nt * naux), ntnaux2_(nt * naux * naux), ntnao_(nt * nao), ntnao2_(nt * nao2_), - handle_(handle) { + handle_(handle), cleanup_req_(false) { _low_memory_requirement = (g_ktij == nullptr) ? true : false; if (cudaStreamCreate(&stream_) != cudaSuccess) throw std::runtime_error("main stream creation failed"); @@ -349,7 +349,11 @@ namespace green::gpu { throw std::runtime_error("failure allocating Gk_tsij on host"); if (cudaMallocHost(&Gk_smtij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) throw std::runtime_error("failure allocating Gk_tsij on host"); - Sigmak_stij_buffer_ = Gk_smtij_buffer_; + // ! GH: I think this will interfere with our cudaMemcpyAsync. Should we simply allocate a different array for Sigmak_stij_buffer_? + // ! The more I think, this here is the real reason why we had to use cudaMemcpy and not the asynchronous version. + // ! 
Sigmak_stij_buffer_ = Gk_smtij_buffer_; + if (cudaMallocHost(&Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) + throw std::runtime_error("failure allocating Gk_tsij on host"); } if (cudaMalloc(&Pqk0_tQP_local_, nt_batch_ * naux2_ * sizeof(cuda_complex)) != cudaSuccess) @@ -383,6 +387,9 @@ namespace green::gpu { cudaFreeHost(Gk1_stij_buffer_); cudaFreeHost(Gk_smtij_buffer_); } + if (cleanup_req_ == true) { + throw std::runtime_error("cleanup of self-energy was not done correctly."); + } } template @@ -526,7 +533,7 @@ namespace green::gpu { } template - void gw_qkpt::compute_second_tau_contraction(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) { + void gw_qkpt::compute_second_tau_contraction(cuda_complex* Pqk_tQP) { cuda_complex one = cu_type_map::cast(1., 0.); cuda_complex zero = cu_type_map::cast(0., 0.); cuda_complex m1 = cu_type_map::cast(-1., 0.); @@ -556,12 +563,12 @@ namespace green::gpu { } } } - write_sigma(_low_memory_requirement, Sigmak_stij_host); + write_sigma(_low_memory_requirement); cudaEventRecord(all_done_event_); } template - void gw_qkpt::compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) { + void gw_qkpt::compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP) { cuda_complex one = cu_type_map::cast(1., 0.); cuda_complex zero = cu_type_map::cast(0., 0.); cuda_complex m1 = cu_type_map::cast(-1., 0.); @@ -593,13 +600,14 @@ namespace green::gpu { } } } - write_sigma(true, Sigmak_stij_host); + write_sigma(true); cudaEventRecord(all_done_event_); } template - void gw_qkpt::write_sigma(bool low_memory_mode, cxx_complex* Sigmak_stij_host) { + void gw_qkpt::write_sigma(bool low_memory_mode) { // write results. 
Make sure we have exclusive write access to sigma, then add array sigmak_tij to sigma_ktij + // TODO: In my understanding, the lock is only required for RAXPY part now, so we should move them inside the first if condition acquire_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); scalar_t one = 1.; if (!low_memory_mode) { @@ -608,15 +616,23 @@ namespace green::gpu { throw std::runtime_error("RAXPY fails on gw_qkpt.write_sigma()."); } } else { - // Copy sigmak_stij_ back to CPU - if (Sigmak_stij_host == nullptr) - throw std::runtime_error("gw_qkpt.write_sigma(): Sigmak_stij_host cannot be a null pointer"); - cudaMemcpy(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost); - std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); + // Copy sigmak_stij_ asynchronously back to CPU + cudaMemcpyAsync(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost, stream_); + // cudaMemcpyAsync will require a cleanup at later stage. 
+ // So, we update the cleanup_req_ status to true + cleanup_req_ = true; } release_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); } + template + void gw_qkpt::cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host) { + if (cleanup_req_) { + std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); + cleanup_req_ = false; + } + } + template bool gw_qkpt::is_busy() { cudaError_t stream_status = cudaStreamQuery(stream_); diff --git a/src/green/gpu/cu_routines.h b/src/green/gpu/cu_routines.h index b2d9c71..1467cdc 100644 --- a/src/green/gpu/cu_routines.h +++ b/src/green/gpu/cu_routines.h @@ -22,6 +22,8 @@ #ifndef GREEN_GPU_CU_ROUTINES_H #define GREEN_GPU_CU_ROUTINES_H #include +#include +#include #include @@ -132,6 +134,7 @@ namespace green::gpu { using scalar_t = typename cu_type_map>::cxx_base_type; using cxx_complex = typename cu_type_map>::cxx_type; using cuda_complex = typename cu_type_map>::cuda_type; + using St_type = utils::shared_object>; public: cugw_utils(int _nts, int _nt_batch, int _nw_b, int _ns, int _nk, int _ink, int _nqkpt, int _NQ, int _nao, @@ -141,7 +144,7 @@ namespace green::gpu { ~cugw_utils(); void solve(int _nts, int _ns, int _nk, int _ink, int _nao, const std::vector& reduced_to_full, - const std::vector& full_to_reduced, std::complex* Vk1k2_Qij, ztensor<5>& Sigma_tskij_host, + const std::vector& full_to_reduced, std::complex* Vk1k2_Qij, St_type& Sigma_tskij_host, int _devices_rank, int _devices_size, bool _low_device_memory, int verbose, irre_pos_callback& irre_pos, mom_cons_callback& momentum_conservation, gw_reader1_callback& r1, gw_reader2_callback& r2); @@ -163,7 +166,7 @@ namespace green::gpu { tensor, 3> V_Qim; tensor, 4> Gk1_stij; tensor, 4> Gk_smtij; - tensor, 4>& Sigmak_stij = Gk_smtij; + tensor, 4> Sigmak_stij; cuda_complex* g_kstij_device; cuda_complex* g_ksmtij_device; diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index 292ac48..5ede49a 100644 --- a/src/green/gpu/cugw_qpt.h 
+++ b/src/green/gpu/cugw_qpt.h @@ -30,7 +30,19 @@ #include "cublas_routines_prec.h" #include "cuda_common.h" +/** + * \brief checks success of a LU or Cholesky decomposition + * + * \param info output of Cuda equivalent of decomposition function, e.g., POTRF + */ __global__ void validate_info(int* info); + +/** + * \brief checks success of a LU or Cholesky decomposition + * + * \param info (vector) output of Cuda equivalent of decomposition function, e.g., POTRF + * \param N length of info + */ __global__ void validate_info(int* info, int N); __global__ void set_up_one_minus_P(cuDoubleComplex* one_minus_P, cuDoubleComplex* P, int naux); __global__ void set_up_one_minus_P(cuComplex* one_minus_P, cuComplex* P, int naux); @@ -265,30 +277,46 @@ namespace green::gpu { /** * \brief Using dressed GW polarization compute self-energy at a given momentum point * - * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point * \param Pqk_tQP Dressed polarization bubble */ - void compute_second_tau_contraction(cxx_complex* Sigmak_stij_host = nullptr, cuda_complex* Pqk_tQP = nullptr); + void compute_second_tau_contraction(cuda_complex* Pqk_tQP = nullptr); /** * \brief Using dressed GW polarization compute self-energy at a given momentum point (X2C version) - * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point + * * \param Pqk_tQP Dressed polarization bubble */ - void compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host = nullptr, cuda_complex* Pqk_tQP = nullptr); + void compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP = nullptr); /** * \brief For a given k-point copy self-energy back to a host memory * \param low_memory_mode - whether the whole self-energy allocated in memory or not - * \param Sigmak_stij_host - Host stored self-energy object at a given momentum point */ - void write_sigma(bool low_memory_mode = false, cxx_complex* Sigmak_stij_host = nullptr); + void write_sigma(bool low_memory_mode = 
false); /** - * \brief Check if cuda devices are budy - * \return true if asynchronous calculations are still running + * \brief Check if cuda devices are busy + * \return true - if asynchronous calculations are still running */ bool is_busy(); + /** + * \brief return the status of copy_selfenergy from device to host + * + * \return false - not required, stream ready for next calculation + * \return true - required + */ + bool require_cleanup(){ + return cleanup_req_; + } + + /** + * \brief perform cleanup, i.e. copy data from Sigmak buffer (4-index array for a given momentum point) to Host shared memory Self-energy + * + * \param low_memory_mode - whether the whole self-energy allocated in memory or not + * \param Sigmak_stij_host - Host stored self-energy object at a given momentum point + */ + void cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host); + // static std::size_t size(size_t nao, size_t naux, size_t nt, size_t nt_batch, size_t ns) { return (2 * naux * nao * nao // V_Qpm+V_pmQ @@ -374,8 +402,20 @@ namespace green::gpu { // pointer to cublas handle cublasHandle_t* handle_; + + // status of data transfer / copy from Device to Host. 
+ // false: not required, stream ready for next calculation + // true: required + bool cleanup_req_; }; + /** + * \brief returns an idle qkpt stream, otherwise waits until a stream is available + * + * \tparam prec - precision for calculation + * \param qkpts - vector of qkpt workers (gw_qkpt type) + * \return gw_qkpt* - pointer to idle qkpt + */ template gw_qkpt* obtain_idle_qkpt(std::vector*>& qkpts) { static int pos = 0; @@ -387,4 +427,48 @@ namespace green::gpu { return qkpts[pos]; } + /** + * \brief returns an idle qkpt stream, otherwise waits until a stream is available + * + * \tparam prec - precision for calculation + * \param qkpts - vector of qkpt workers (gw_qkpt type) + * \param low_memory_mode - low memory mode for read/write integrals + * \param Sigmak_stij_host - cudaMallocHost buffer for transferring Sigma + * \return gw_qkpt* - pointer to idle qkpt + */ + template + gw_qkpt* obtain_idle_qkpt_for_sigma(std::vector*>& qkpts, bool low_memory_mode, + typename cu_type_map>::cxx_type* Sigmak_stij_host) { + static int pos = 0; + pos++; + if (pos >= qkpts.size()) pos = 0; + while (qkpts[pos]->is_busy()) { + pos = (pos + 1) % qkpts.size(); + } + qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host); + return qkpts[pos]; + } + + /** + * \brief waits for all qkpts to complete and cleans them up + * + * \tparam prec - precision for calculation + * \param qkpts - vector of qkpt workers (gw_qkpt type) + * \param low_memory_mode - low memory mode for read/write integrals + * \param Sigmak_stij_host - cudaMallocHost buffer for transferring Sigma + */ + template + void wait_and_clean_qkpts(std::vector*>& qkpts, bool low_memory_mode, + typename cu_type_map>::cxx_type* Sigmak_stij_host) { + static int pos = 0; + pos++; + if (pos >= qkpts.size()) pos = 0; + for (pos = 0; pos < qkpts.size(); pos++) { + while (qkpts[pos]->is_busy()) { + continue; + } + qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host); + } + return; + } } // namespace green::gpu diff --git 
a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index 2627a6a..0eeed37 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -80,6 +80,13 @@ namespace green::gpu { virtual void gw_innerloop(G_type& g, St_type& sigma_tau) = 0; void GW_check_devices_free_space(); + /** + * \brief count and floating points operations per second achieved on GPU. + * This is not representative of the GPU capabilities, but instead, accounts for read/write overheads. + * The value is entirely in the context Green-MBPT solver. + */ + void flops_achieved(); + /* * Read a chunk of Coulomb integral with given (k[0], k[3]) k-pair */ diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 0908335..7326a9c 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -110,6 +110,7 @@ namespace green::gpu { MPI_Barrier(utils::context.global); statistics.end(); statistics.print(utils::context.global); + // flops_achieved(); clean_MPI_structure(); clean_shared_Coulomb(); @@ -119,6 +120,26 @@ namespace green::gpu { MPI_Op_free(&matrix_sum_op); } + void gw_gpu_kernel::flops_achieved() { + double gpu_time; + utils::event_t cugw_event = statistics.event("Solve cuGW"); + gpu_time = cugw_event.duration; + + if (!utils::context.global_rank) { + MPI_Reduce(MPI_IN_PLACE, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); + } else { + MPI_Reduce(&gpu_time, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); + } + + double flops = _flop_count / gpu_time; + + if (!utils::context.global_rank && _verbose > 1) { + std::cout << "############ GPU FLOPs achieved ############" << std::endl; + std::cout << "FLOPs achieved: " << flops / 1.0e9 << " Giga flops." 
<< std::endl; + std::cout << "###########################################################" << std::endl; + } + } + void scalar_gw_gpu_kernel::gw_innerloop(G_type& g, St_type& sigma_tau) { if (!_sp) { compute_gw_selfenergy(g, sigma_tau); @@ -183,22 +204,14 @@ namespace green::gpu { statistics.end(); }; - // Since all process in _devices_comm will write to the self-energy simultaneously, - // instaed of adding locks in cugw.solve(), we allocate private _Sigma_tskij_local_host - // and do MPIAllreduce on CPU later on. Since the number of processes with a GPU is very - // limited, the additional memory overhead is fairly limited. - ztensor<5> Sigma_tskij_host_local(_nts, _ns, _ink, _nao, _nao); + // Previous implementation avoided writing to the shared self-energy directly + // But the memory overhead can be costly for large applications where GPUs are a must. + // In revised implementation, cugw.solve() will overwrite the shared self-energy directly. statistics.start("Solve cuGW"); cugw.solve(_nts, _ns, _nk, _ink, _nao, _bz_utils.symmetry().reduced_to_full(), _bz_utils.symmetry().full_to_reduced(), - _Vk1k2_Qij, Sigma_tskij_host_local, _devices_rank, _devices_size, _low_device_memory, _verbose, + _Vk1k2_Qij, sigma_tau, _devices_rank, _devices_size, _low_device_memory, _verbose, irre_pos, mom_cons, r1, r2); statistics.end(); - statistics.start("Update Host Self-energy"); - // Copy back to Sigma_tskij_local_host - MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, sigma_tau.win()); - sigma_tau.object() += Sigma_tskij_host_local; - MPI_Win_unlock(0, sigma_tau.win()); - statistics.end(); } void gw_gpu_kernel::GW_check_devices_free_space() { @@ -302,15 +315,11 @@ namespace green::gpu { statistics.end(); }; - ztensor<5> Sigma_tskij_host_local(_nts, 1, _ink, _nso, _nso); + statistics.start("Solve cuGW"); cugw.solve(_nts, psuedo_ns, _nk, _ink, _nao, _bz_utils.symmetry().reduced_to_full(), _bz_utils.symmetry().full_to_reduced(), - _Vk1k2_Qij, Sigma_tskij_host_local, _devices_rank, 
_devices_size, true, _verbose, + _Vk1k2_Qij, sigma_tau, _devices_rank, _devices_size, true, _verbose, irre_pos, mom_cons, r1, r2); - // Convert Sigma_tskij_host_local to (_nts, 1, _ink, _nso, _nso) - // Copy back to Sigma_tskij_local_host - MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, sigma_tau.win()); - sigma_tau.object() += Sigma_tskij_host_local; - MPI_Win_unlock(0, sigma_tau.win()); + statistics.end(); } void x2c_gw_gpu_kernel::copy_Gk_2c(const ztensor<5> &G_tskij_host, tensor,4> &Gk_4tij, int k, bool need_minus_k, bool minus_t) {