Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/cu_compute_Pq.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ __global__ void validate_info(int *info){
if(idx>0) return;
if(*info!=0){
printf("info is: %d\n",*info);
printf("nonzero info. Aborting.\n");
asm("exit;");
printf("nonzero info. Aborting application.\n");
asm("trap;"); // nonzero info = cholesky or LU fails, then all threads should be stopped
}
}
__global__ void validate_info(int *info, int N){
Expand All @@ -36,8 +36,8 @@ __global__ void validate_info(int *info, int N){
for(int i=0;i<N;++i){
if(*(info+i)!=0){
printf("info is: %d\n",*(info+i));
printf("nonzero info for batched job: %d. Aborting.\n",i);
asm("exit;");
printf("nonzero info for batched job: %d. Aborting application.\n",i);
asm("trap;"); // nonzero info = cholesky or LU fails, then all threads should be stopped
}
}
}
Expand Down
30 changes: 20 additions & 10 deletions src/cu_routines.cu
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ namespace green::gpu {
template <typename prec>
void cugw_utils<prec>::solve(int _nts, int _ns, int _nk, int _ink, int _nao, const std::vector<size_t>& reduced_to_full,
const std::vector<size_t>& full_to_reduced, std::complex<double>* Vk1k2_Qij,
ztensor<5>& Sigma_tskij_host, int _devices_rank, int _devices_size, bool low_device_memory,
St_type& sigma_tau_host_shared, int _devices_rank, int _devices_size, bool low_device_memory,
int verbose, irre_pos_callback& irre_pos, mom_cons_callback& momentum_conservation,
gw_reader1_callback<prec>& r1, gw_reader2_callback<prec>& r2) {
// this is the main GW loop
Expand Down Expand Up @@ -252,6 +252,7 @@ namespace green::gpu {
qpt.compute_Pq();
qpt.transform_wt();
// Write to Sigma(k), k belongs to _ink
MPI_Win_lock_all(MPI_MODE_NOCHECK, sigma_tau_host_shared.win());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This we need to discuss. It's fairly catastrophic in a multi-GPU environment: only ONE MPI process will enter the section below at a time... I believe there's no reason for that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@egull that's not how MPI_Win_lock_all(MPI_MODE_NOCHECK, win) works. This call only asserts the start of the communication epoch, and no synchronization is done here. To do the memory synchronization one has to call MPI_Win_sync, which is done below, after the loop.

However, there is much more dangerous things going on here. Since all processes that have GPU enter the loop, there would be a guaranteed race condition in this loop, as we run over all k-points and do a summation over all q-points.

Doing similar synchronization pattern as implemented here is safe (and this is actually advised to reduce synchronizations) if we know that there is no overlap between memory regions that are accessed by different processes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're a bit late to that party. We'll discuss today how to do that in a way that is both performant (which your solution is not) and correct (which his solution is not). The way I think it works is via

MPI_Win_lock_all
MPI_Win_sync
...then do the update/access
MPI_Win_sync
MPI_Win_flush_all
MPI_Win_unlock_all

the MPI_Win_sync at most synchronizes a private with a public version, but may not synchronize the private version of other threads. The standard section 11 has more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In what way is my implementation not performant?

Also, mpi_win_flush is not needed here, we don't do any RMA operations on shared window here.

for (size_t k_reduced_id = 0; k_reduced_id < _ink; ++k_reduced_id) {
size_t k = reduced_to_full[k_reduced_id];
for (size_t q_or_qinv = 0; q_or_qinv < _nk; ++q_or_qinv) {
Expand All @@ -264,33 +265,42 @@ namespace green::gpu {
bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1;
bool need_minus_q = reduced_to_full[q_reduced_id] != q_or_qinv;

// read and prepare G(k-q), V(k, k-q) and V(k-q, k)
r2(k, k1, k1_reduced_id, k_vector, V_Qim, Vk1k2_Qij, Gk1_stij, need_minus_k1);

gw_qkpt<prec>* qkpt = obtain_idle_qkpt(qkpts);
gw_qkpt<prec>* qkpt = obtain_idle_qkpt_for_sigma(qkpts, _low_device_memory, Sigmak_stij.data());
if (_low_device_memory) {
if (!_X2C) {
qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1);
qkpt->compute_second_tau_contraction(Sigmak_stij.data(),
qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns);
qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
copy_Sigma(sigma_tau_host_shared.object(), Sigmak_stij, k_reduced_id, _nts, _ns);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why can't the lock be around this?

} else {
// In 2cGW, G(-k) = G*(k) has already been addressed in r2()
qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, false);
qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(),
qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts);
qkpt->compute_second_tau_contraction_2C(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
copy_Sigma_2c(sigma_tau_host_shared.object(), Sigmak_stij, k_reduced_id, _nts);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

...and around this?

}
} else {
qkpt->set_up_qkpt_second(nullptr, V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1);
qkpt->compute_second_tau_contraction(nullptr, qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

even here, you probably don't want to lock out everybody but do the lock/unlock just around the memcpy

}
}
}
}
MPI_Win_sync(sigma_tau_host_shared.win());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why so far outside the write?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually I'm totally confused about the need to sync at all.

MPI_Barrier(utils::context.node_comm);
MPI_Win_unlock_all(sigma_tau_host_shared.win());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is silly here? Why? we always have lock/unlock in pairs

}
cudaDeviceSynchronize();
MPI_Win_lock_all(MPI_MODE_NOCHECK, sigma_tau_host_shared.win());
wait_and_clean_qkpts(qkpts, _low_device_memory, Sigmak_stij.data());
MPI_Win_sync(sigma_tau_host_shared.win());
MPI_Win_unlock_all(sigma_tau_host_shared.win());
// wait for all qkpts to complete
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

propose to put the lock/unlock magic right around the memcpy and keep it very local there.

if (!_low_device_memory and !_X2C) {
copy_Sigma_from_device_to_host(sigma_kstij_device, Sigma_tskij_host.data(), _ink, _nao, _nts, _ns);
MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, sigma_tau_host_shared.win());
copy_Sigma_from_device_to_host(sigma_kstij_device, sigma_tau_host_shared.object().data(), _ink, _nao, _nts, _ns);
MPI_Win_unlock(0, sigma_tau_host_shared.win());
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

}

Expand Down
42 changes: 29 additions & 13 deletions src/cugw_qpt.cu
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ namespace green::gpu {
naux_, Pivot_, d_info_, nw_b_) != CUBLAS_STATUS_SUCCESS) {
throw std::runtime_error("CUDA GETRF failed!");
}
validate_info<<<1, 1, 0, stream_>>>(d_info_, nw_b_);
validate_info<<<1, 1, 0, stream_>>>(d_info_, nw_b_);
cudaEventRecord(LU_decomposition_ready_event_, stream_);

if (cudaStreamWaitEvent(stream_, LU_decomposition_ready_event_, 0 /*cudaEventWaitDefault*/))
Expand Down Expand Up @@ -320,7 +320,7 @@ namespace green::gpu {
g_ktij_(g_ktij), g_kmtij_(g_kmtij), sigma_ktij_(sigma_ktij), sigma_k_locks_(sigma_k_locks), nao_(nao), nao2_(nao * nao),
nao3_(nao2_ * nao), naux_(naux), naux2_(naux * naux), nauxnao_(naux * nao), nauxnao2_(naux * nao * nao), ns_(ns), nt_(nt),
nt_batch_(nt_batch), ntnaux_(nt * naux), ntnaux2_(nt * naux * naux), ntnao_(nt * nao), ntnao2_(nt * nao2_),
handle_(handle) {
handle_(handle), cleanup_req_(false) {
_low_memory_requirement = (g_ktij == nullptr) ? true : false;
if (cudaStreamCreate(&stream_) != cudaSuccess) throw std::runtime_error("main stream creation failed");

Expand Down Expand Up @@ -349,7 +349,11 @@ namespace green::gpu {
throw std::runtime_error("failure allocating Gk_tsij on host");
if (cudaMallocHost(&Gk_smtij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess)
throw std::runtime_error("failure allocating Gk_tsij on host");
Sigmak_stij_buffer_ = Gk_smtij_buffer_;
// ! GH: I think this will interfere with our cudaMemcpyAsync. Should we simply allocate a different array for Sigmak_stij_buffer_?
// ! The more I think, this here is the real reason why we had to use cudaMemcpy and not the asynchronous version.
// ! <previously> Sigmak_stij_buffer_ = Gk_smtij_buffer_;
if (cudaMallocHost(&Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess)
throw std::runtime_error("failure allocating Gk_tsij on host");
}

if (cudaMalloc(&Pqk0_tQP_local_, nt_batch_ * naux2_ * sizeof(cuda_complex)) != cudaSuccess)
Expand Down Expand Up @@ -383,6 +387,9 @@ namespace green::gpu {
cudaFreeHost(Gk1_stij_buffer_);
cudaFreeHost(Gk_smtij_buffer_);
}
if (cleanup_req_ == true) {
throw std::runtime_error("cleanup of self-energy was not done correctly.");
}
}

template <typename prec>
Expand Down Expand Up @@ -526,7 +533,7 @@ namespace green::gpu {
}

template <typename prec>
void gw_qkpt<prec>::compute_second_tau_contraction(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) {
void gw_qkpt<prec>::compute_second_tau_contraction(cuda_complex* Pqk_tQP) {
cuda_complex one = cu_type_map<cxx_complex>::cast(1., 0.);
cuda_complex zero = cu_type_map<cxx_complex>::cast(0., 0.);
cuda_complex m1 = cu_type_map<cxx_complex>::cast(-1., 0.);
Expand Down Expand Up @@ -556,12 +563,12 @@ namespace green::gpu {
}
}
}
write_sigma(_low_memory_requirement, Sigmak_stij_host);
write_sigma(_low_memory_requirement);
cudaEventRecord(all_done_event_);
}

template <typename prec>
void gw_qkpt<prec>::compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) {
void gw_qkpt<prec>::compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP) {
cuda_complex one = cu_type_map<cxx_complex>::cast(1., 0.);
cuda_complex zero = cu_type_map<cxx_complex>::cast(0., 0.);
cuda_complex m1 = cu_type_map<cxx_complex>::cast(-1., 0.);
Expand Down Expand Up @@ -593,13 +600,14 @@ namespace green::gpu {
}
}
}
write_sigma(true, Sigmak_stij_host);
write_sigma(true);
cudaEventRecord(all_done_event_);
}

template <typename prec>
void gw_qkpt<prec>::write_sigma(bool low_memory_mode, cxx_complex* Sigmak_stij_host) {
void gw_qkpt<prec>::write_sigma(bool low_memory_mode) {
// write results. Make sure we have exclusive write access to sigma, then add array sigmak_tij to sigma_ktij
// TODO: In my understanding, the lock is only required for RAXPY part now, so we should move them inside the first if condition
acquire_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_);
scalar_t one = 1.;
if (!low_memory_mode) {
Expand All @@ -608,15 +616,23 @@ namespace green::gpu {
throw std::runtime_error("RAXPY fails on gw_qkpt.write_sigma().");
}
} else {
// Copy sigmak_stij_ back to CPU
if (Sigmak_stij_host == nullptr)
throw std::runtime_error("gw_qkpt.write_sigma(): Sigmak_stij_host cannot be a null pointer");
cudaMemcpy(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost);
std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex));
// Copy sigmak_stij_ asynchronously back to CPU
cudaMemcpyAsync(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost, stream_);
// cudaMemcpyAsync will require a cleanup at later stage.
// So, we update the cleanup_req_ status to true
cleanup_req_ = true;
}
release_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_);
}

template <typename prec>
void gw_qkpt<prec>::cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host) {
if (cleanup_req_) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would put the shared window lock just here

std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex));
cleanup_req_ = false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would put the shared window unlock just here.

}
}

template <typename prec>
bool gw_qkpt<prec>::is_busy() {
cudaError_t stream_status = cudaStreamQuery(stream_);
Expand Down
7 changes: 5 additions & 2 deletions src/green/gpu/cu_routines.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#ifndef GREEN_GPU_CU_ROUTINES_H
#define GREEN_GPU_CU_ROUTINES_H
#include <green/gpu/common_defs.h>
#include <green/utils/mpi_shared.h>
#include <green/utils/mpi_utils.h>

#include <cstring>

Expand Down Expand Up @@ -132,6 +134,7 @@ namespace green::gpu {
using scalar_t = typename cu_type_map<std::complex<prec>>::cxx_base_type;
using cxx_complex = typename cu_type_map<std::complex<prec>>::cxx_type;
using cuda_complex = typename cu_type_map<std::complex<prec>>::cuda_type;
using St_type = utils::shared_object<ztensor<5>>;

public:
cugw_utils(int _nts, int _nt_batch, int _nw_b, int _ns, int _nk, int _ink, int _nqkpt, int _NQ, int _nao,
Expand All @@ -141,7 +144,7 @@ namespace green::gpu {
~cugw_utils();

void solve(int _nts, int _ns, int _nk, int _ink, int _nao, const std::vector<size_t>& reduced_to_full,
const std::vector<size_t>& full_to_reduced, std::complex<double>* Vk1k2_Qij, ztensor<5>& Sigma_tskij_host,
const std::vector<size_t>& full_to_reduced, std::complex<double>* Vk1k2_Qij, St_type& Sigma_tskij_host,
int _devices_rank, int _devices_size, bool _low_device_memory, int verbose, irre_pos_callback& irre_pos,
mom_cons_callback& momentum_conservation, gw_reader1_callback<prec>& r1, gw_reader2_callback<prec>& r2);

Expand All @@ -163,7 +166,7 @@ namespace green::gpu {
tensor<std::complex<prec>, 3> V_Qim;
tensor<std::complex<prec>, 4> Gk1_stij;
tensor<std::complex<prec>, 4> Gk_smtij;
tensor<std::complex<prec>, 4>& Sigmak_stij = Gk_smtij;
tensor<std::complex<prec>, 4> Sigmak_stij;

cuda_complex* g_kstij_device;
cuda_complex* g_ksmtij_device;
Expand Down
100 changes: 92 additions & 8 deletions src/green/gpu/cugw_qpt.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,19 @@
#include "cublas_routines_prec.h"
#include "cuda_common.h"

/**
* \brief checks success of a LU or Cholesky decomposition
*
* \param info output of Cuda equivalent of decomposition function, e.g., POTRF
*/
__global__ void validate_info(int* info);

/**
* \brief checks success of a LU or Cholesky decomposition
*
* \param info (vector) output of Cuda equivalent of decomposition function, e.g., POTRF
* \param N length of info
*/
__global__ void validate_info(int* info, int N);
__global__ void set_up_one_minus_P(cuDoubleComplex* one_minus_P, cuDoubleComplex* P, int naux);
__global__ void set_up_one_minus_P(cuComplex* one_minus_P, cuComplex* P, int naux);
Expand Down Expand Up @@ -265,30 +277,46 @@ namespace green::gpu {
/**
* \brief Using dressed GW polarization compute self-energy at a given momentum point
*
* \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point
* \param Pqk_tQP Dressed polarization bubble
*/
void compute_second_tau_contraction(cxx_complex* Sigmak_stij_host = nullptr, cuda_complex* Pqk_tQP = nullptr);
void compute_second_tau_contraction(cuda_complex* Pqk_tQP = nullptr);
/**
* \brief Using dressed GW polarization compute self-energy at a given momentum point (X2C version)
* \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point
*
* \param Pqk_tQP Dressed polarization bubble
*/
void compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host = nullptr, cuda_complex* Pqk_tQP = nullptr);
void compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP = nullptr);

/**
* \brief For a given k-point copy self-energy back to a host memory
* \param low_memory_mode - whether the whole self-energy allocated in memory or not
* \param Sigmak_stij_host - Host stored self-energy object at a given momentum point
*/
void write_sigma(bool low_memory_mode = false, cxx_complex* Sigmak_stij_host = nullptr);
void write_sigma(bool low_memory_mode = false);

/**
* \brief Check if cuda devices are budy
* \return true if asynchronous calculations are still running
* \brief Check if cuda devices are busy
* \return true - if asynchronous calculations are still running
*/
bool is_busy();

/**
* \brief return the status of copy_selfenergy from device to host
*
* \return false - not required, stream ready for next calculation
* \return true - required
*/
bool require_cleanup(){
return cleanup_req_;
}

/**
* \brief perform cleanup, i.e. copy data from Sigmak buffer (4-index array for a given momentum point) to Host shared memory Self-energy
*
* \param low_memory_mode - whether the whole self-energy allocated in memory or not
* \param Sigmak_stij_host - Host stored self-energy object at a given momentum point
*/
void cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host);

//
static std::size_t size(size_t nao, size_t naux, size_t nt, size_t nt_batch, size_t ns) {
return (2 * naux * nao * nao // V_Qpm+V_pmQ
Expand Down Expand Up @@ -374,8 +402,20 @@ namespace green::gpu {

// pointer to cublas handle
cublasHandle_t* handle_;

// status of data transfer / copy from Device to Host.
// false: not required, stream ready for next calculation
// true: required
bool cleanup_req_;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

propose making a destructor that throws an exception if cleanup_req_ is true and a constructor that sets it to false.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The C++ standard advises not to throw exceptions in destructors.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's right. I guess we need to print an error and abort the program. This should never happen unless there's a logic error anyway.

};

/**
* \brief returns an idle qkpt stream, otherwise waits until a stream is available
*
* \tparam prec - precision for calculation
* \param qkpts - vector of qkpt workers (gw_qkpt<prec> type)
* \return gw_qkpt<prec>* - pointer to idle qkpt
*/
template <typename prec>
gw_qkpt<prec>* obtain_idle_qkpt(std::vector<gw_qkpt<prec>*>& qkpts) {
static int pos = 0;
Expand All @@ -387,4 +427,48 @@ namespace green::gpu {
return qkpts[pos];
}

/**
* \brief returns an idle qkpt stream, otherwise waits until a stream is available
*
* \tparam prec - precision for calculation
* \param qkpts - vector of qkpt workers (gw_qkpt<prec> type)
* \param low_memory_mode - low memory mode for read/write integrals
* \param Sigmak_stij_host - cudaMallocHost buffer for transfering Sigma
* \return gw_qkpt<prec>* - pointer to idle qkpt
*/
template <typename prec>
gw_qkpt<prec>* obtain_idle_qkpt_for_sigma(std::vector<gw_qkpt<prec>*>& qkpts, bool low_memory_mode,
typename cu_type_map<std::complex<prec>>::cxx_type* Sigmak_stij_host) {
static int pos = 0;
pos++;
if (pos >= qkpts.size()) pos = 0;
while (qkpts[pos]->is_busy()) {
pos = (pos + 1) % qkpts.size();
}
qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host);
return qkpts[pos];
}

/**
* \brief waits for all qkpts to complete and cleans them up
*
* \tparam prec - precision for calculation
* \param qkpts - vector of qkpt workers (gw_qkpt<prec> type)
* \param low_memory_mode - low memory mode for read/write integrals
* \param Sigmak_stij_host - cudaMallocHost buffer for transfering Sigma
*/
template <typename prec>
void wait_and_clean_qkpts(std::vector<gw_qkpt<prec>*>& qkpts, bool low_memory_mode,
typename cu_type_map<std::complex<prec>>::cxx_type* Sigmak_stij_host) {
static int pos = 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The next three lines do not make sense to me.

pos++;
if (pos >= qkpts.size()) pos = 0;
for (pos = 0; pos < qkpts.size(); pos++) {
while (qkpts[pos]->is_busy()) {
continue;
}
qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host);
}
return;
}
} // namespace green::gpu
7 changes: 7 additions & 0 deletions src/green/gpu/gw_gpu_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,13 @@ namespace green::gpu {
virtual void gw_innerloop(G_type& g, St_type& sigma_tau) = 0;
void GW_check_devices_free_space();

/**
* \brief counts floating-point operations per second achieved on GPU.
* This is not representative of the GPU capabilities, but instead accounts for read/write overheads.
* The value is entirely specific to the Green-MBPT solver context.
*/
void flops_achieved();

/*
* Read a chunk of Coulomb integral with given (k[0], k[3]) k-pair
*/
Expand Down
Loading