Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
c4f0c74
new async self-energy update in qkpt streams
gauravharsha Aug 17, 2025
1e76013
fix minor bugs
gauravharsha Aug 17, 2025
b386b6d
more bugs
gauravharsha Aug 17, 2025
5fa2212
update definition of the cleanup function
gauravharsha Aug 17, 2025
f8bc8ec
add profile arguments
gauravharsha Aug 17, 2025
c15e7b4
modify profile colors
gauravharsha Aug 18, 2025
90a761e
adding different handles for each qkpt stream to encourage concurrency
gauravharsha Aug 21, 2025
7f7ff16
add profile statement for polarization retrieval
gauravharsha Aug 23, 2025
48e610c
typo in nvtx3 function
gauravharsha Aug 23, 2025
1118a67
use push/pop range for Pqk_tQP function
gauravharsha Aug 23, 2025
304e158
remove nvtx tags
gauravharsha Oct 28, 2025
5dd3a8e
add comments and clean up function signatures
gauravharsha Oct 29, 2025
259d3f5
remove unnecessary stream synchronization calls from obtain_Pq functions
gauravharsha Oct 29, 2025
0592ace
more refactoring
gauravharsha Oct 29, 2025
d2fffab
redo Pqk_tQP function -- checking stream readiness has some effect on…
gauravharsha Oct 29, 2025
1aa3a28
Update src/green/gpu/cugw_qpt.h
gauravharsha Nov 5, 2025
552b7e2
Update src/green/gpu/cugw_qpt.h
gauravharsha Nov 5, 2025
06cc9ac
Update src/green/gpu/cugw_qpt.h
gauravharsha Nov 5, 2025
fb6a62c
address Copilot reviews
gauravharsha Nov 5, 2025
3666025
rename share coulomb allocation event
gauravharsha Nov 6, 2025
91ebc4b
clean up and modify estimation of flops achieved
gauravharsha Nov 6, 2025
2c68663
fix typo in wait_and_clean_qkpts
gauravharsha Nov 6, 2025
a7f6141
rename qkpt_handles
gauravharsha Nov 6, 2025
6ad23d9
naming and documentation updates; use reset events at end of solve cycle
gauravharsha Nov 6, 2025
8e0e0ef
fix function call for flops_achieved()
gauravharsha Nov 6, 2025
422947e
documentation fixes from copilot review
gauravharsha Nov 6, 2025
d22fb80
documentation and logic fix thanks to copilot
gauravharsha Nov 6, 2025
d07819e
optimize host pinned memory buffers
gauravharsha Nov 6, 2025
25acf9c
use only specific cuda arch for pauli-master gpu quadro p1000
gauravharsha Nov 6, 2025
c01e1c8
undo update for test.yaml
gauravharsha Nov 6, 2025
1ff204a
fix mpi_reduce for performance metrics
gauravharsha Nov 6, 2025
c36ee81
fix sigmak_stij_buffer allocation - same memory as Gk is conflicting
gauravharsha Nov 6, 2025
7bca915
modify complexity estimation and performance metrics
gauravharsha Nov 6, 2025
86bc13c
minor fix in complexity estimation
gauravharsha Nov 6, 2025
1bbc908
Update src/cugw_qpt.cu
gauravharsha Nov 6, 2025
69c13e5
Update src/cugw_qpt.cu
gauravharsha Nov 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 31 additions & 11 deletions src/cu_routines.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <green/gpu/cu_routines.h>


__global__ void initialize_array(cuDoubleComplex* array, cuDoubleComplex value, int count) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= count) return;
Expand Down Expand Up @@ -171,7 +172,7 @@ namespace green::gpu {
int _devCount_per_node) :
_low_device_memory(low_device_memory), qkpts(_nqkpt), V_Qpm(_NQ, _nao, _nao), V_Qim(_NQ, _nao, _nao),
Gk1_stij(_ns, _nts, _nao, _nao), Gk_smtij(_ns, _nts, _nao, _nao),
qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data(), cuda_lin_solver) {
qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data(), cuda_lin_solver), _qkpt_cublas_handles(_nqkpt) {
if (cudaSetDevice(_intranode_rank % _devCount_per_node) != cudaSuccess) throw std::runtime_error("Error in cudaSetDevice2");
if (cublasCreate(&_handle) != CUBLAS_STATUS_SUCCESS)
throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas");
Expand Down Expand Up @@ -201,8 +202,11 @@ namespace green::gpu {
qpt.init(&_handle, &_solver_handle);
// Each process gets one cuda runner for qpoints
for (int i = 0; i < _nqkpt; ++i) {
qkpts[i] = new gw_qkpt<prec>(_nao, _NQ, _ns, _nts, _nt_batch, &_handle, g_kstij_device, g_ksmtij_device, sigma_kstij_device,
sigma_k_locks);
if (cublasCreate(&_qkpt_cublas_handles[i]) != CUBLAS_STATUS_SUCCESS)
throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas");
// initialize qkpt workers
qkpts[i] = new gw_qkpt<prec>(_nao, _NQ, _ns, _nts, _nt_batch, &_qkpt_cublas_handles[i],
g_kstij_device, g_ksmtij_device, sigma_kstij_device, sigma_k_locks);
}
}

Expand Down Expand Up @@ -230,9 +234,13 @@ namespace green::gpu {
bool need_minus_k = reduced_to_full[k_reduced_id] != k;
bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1;

// Read V_Qpm and Gk_smtij
r1(k, k1, k_reduced_id, k1_reduced_id, k_vector, V_Qpm, Vk1k2_Qij, Gk_smtij, Gk1_stij, need_minus_k, need_minus_k1);

// Get an idle qkpt worker
gw_qkpt<prec>* qkpt = obtain_idle_qkpt(qkpts);

// Copy data from host to device and set up qkpt
if (_low_device_memory) {
if (!_X2C) {
qkpt->set_up_qkpt_first(Gk1_stij.data(), Gk_smtij.data(), V_Qpm.data(), k_reduced_id, need_minus_k, k1_reduced_id,
Expand All @@ -244,13 +252,19 @@ namespace green::gpu {
} else {
qkpt->set_up_qkpt_first(nullptr, nullptr, V_Qpm.data(), k_reduced_id, need_minus_k, k1_reduced_id, need_minus_k1);
}
// Compute Polarization Pqk0
qkpt->compute_first_tau_contraction(qpt.Pqk0_tQP(qkpt->all_done_event()), qpt.Pqk0_tQP_lock());
}


// Compute Polarization Pq from irreducible polarization Pqk0
qpt.wait_for_kpts();
qpt.scale_Pq0_tQP(1. / _nk);
qpt.transform_tw();
qpt.compute_Pq();
qpt.transform_wt();


// Write to Sigma(k), k belongs to _ink
for (size_t k_reduced_id = 0; k_reduced_id < _ink; ++k_reduced_id) {
size_t k = reduced_to_full[k_reduced_id];
Expand All @@ -264,30 +278,32 @@ namespace green::gpu {
bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1;
bool need_minus_q = reduced_to_full[q_reduced_id] != q_or_qinv;

// Read V_Qij and Gk1_stij
r2(k, k1, k1_reduced_id, k_vector, V_Qim, Vk1k2_Qij, Gk1_stij, need_minus_k1);

gw_qkpt<prec>* qkpt = obtain_idle_qkpt(qkpts);
// Get an idle qkpt worker
gw_qkpt<prec>* qkpt = obtain_idle_qkpt_for_sigma(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C);

qkpt->set_k_red_id(k_reduced_id);
// Copy data from host to device, compute Sigma(k), and schedule async memcpy to host pinned memory "qkpt::Sigmak_stij_buffer_"
if (_low_device_memory) {
if (!_X2C) {
qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1);
qkpt->compute_second_tau_contraction(Sigmak_stij.data(),
qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns);
qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
} else {
// In 2cGW, G(-k) = G*(k) has already been addressed in r2()
qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, false);
qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(),
qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts);
qkpt->compute_second_tau_contraction_2C(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
}
} else {
qkpt->set_up_qkpt_second(nullptr, V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1);
qkpt->compute_second_tau_contraction(nullptr, qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q));
}
}
}
}
}
wait_and_clean_qkpts(qkpts, _low_device_memory, Sigmak_stij, Sigma_tskij_host, _X2C);
cudaDeviceSynchronize();
if (!_low_device_memory and !_X2C) {
copy_Sigma_from_device_to_host(sigma_kstij_device, Sigma_tskij_host.data(), _ink, _nao, _nts, _ns);
Expand All @@ -303,6 +319,7 @@ namespace green::gpu {
}
}
}

template <typename prec>
void cugw_utils<prec>::copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor<std::complex<prec>, 4>& Sigmak_4tij, int k, int nts) {
size_t nao = Sigmak_4tij.shape()[3];
Expand Down Expand Up @@ -331,6 +348,9 @@ namespace green::gpu {
}
if (cublasDestroy(_handle) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("cublas error destroying handle");
if (cusolverDnDestroy(_solver_handle) != CUSOLVER_STATUS_SUCCESS) throw std::runtime_error("culapck error destroying handle");
for (int i = 0; i < qkpts.size(); ++i) {
if (cublasDestroy(_qkpt_cublas_handles[i]) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("cublas error destroying handle");
}
if (!_low_device_memory) cudaFree(g_kstij_device);
if (!_low_device_memory) cudaFree(g_ksmtij_device);
cudaFree(sigma_kstij_device);
Expand Down
80 changes: 65 additions & 15 deletions src/cugw_qpt.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
*/

#include <green/gpu/cugw_qpt.h>


namespace green::gpu {
template<typename prec>
gw_qpt<prec>::gw_qpt(int nao, int naux, int nt, int nw_b,
Expand Down Expand Up @@ -320,7 +322,7 @@ namespace green::gpu {
g_ktij_(g_ktij), g_kmtij_(g_kmtij), sigma_ktij_(sigma_ktij), sigma_k_locks_(sigma_k_locks), nao_(nao), nao2_(nao * nao),
nao3_(nao2_ * nao), naux_(naux), naux2_(naux * naux), nauxnao_(naux * nao), nauxnao2_(naux * nao * nao), ns_(ns), nt_(nt),
nt_batch_(nt_batch), ntnaux_(nt * naux), ntnaux2_(nt * naux * naux), ntnao_(nt * nao), ntnao2_(nt * nao2_),
handle_(handle) {
handle_(handle), cleanup_req_(false) {
_low_memory_requirement = (g_ktij == nullptr) ? true : false;
if (cudaStreamCreate(&stream_) != cudaSuccess) throw std::runtime_error("main stream creation failed");

Expand All @@ -346,10 +348,11 @@ namespace green::gpu {
throw std::runtime_error("failure allocating V on host");
if (_low_memory_requirement) {
if (cudaMallocHost(&Gk1_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess)
throw std::runtime_error("failure allocating Gk_tsij on host");
throw std::runtime_error("failure allocating Gk1_stij on host");
if (cudaMallocHost(&Gk_smtij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess)
throw std::runtime_error("failure allocating Gk_tsij on host");
Sigmak_stij_buffer_ = Gk_smtij_buffer_;
throw std::runtime_error("failure allocating Gk_smtij on host");
if (cudaMallocHost(&Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess)
throw std::runtime_error("failure allocating Sigmak_stij on host");
}

if (cudaMalloc(&Pqk0_tQP_local_, nt_batch_ * naux2_ * sizeof(cuda_complex)) != cudaSuccess)
Expand Down Expand Up @@ -382,6 +385,10 @@ namespace green::gpu {
if (_low_memory_requirement) {
cudaFreeHost(Gk1_stij_buffer_);
cudaFreeHost(Gk_smtij_buffer_);
cudaFreeHost(Sigmak_stij_buffer_);
}
if (require_cleanup()) {
throw std::runtime_error("cleanup of self-energy was not done correctly.");
}
}

Expand Down Expand Up @@ -526,7 +533,7 @@ namespace green::gpu {
}

template <typename prec>
void gw_qkpt<prec>::compute_second_tau_contraction(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) {
void gw_qkpt<prec>::compute_second_tau_contraction(cuda_complex* Pqk_tQP) {
cuda_complex one = cu_type_map<cxx_complex>::cast(1., 0.);
cuda_complex zero = cu_type_map<cxx_complex>::cast(0., 0.);
cuda_complex m1 = cu_type_map<cxx_complex>::cast(-1., 0.);
Expand Down Expand Up @@ -556,12 +563,12 @@ namespace green::gpu {
}
}
}
write_sigma(_low_memory_requirement, Sigmak_stij_host);
write_sigma(_low_memory_requirement);
cudaEventRecord(all_done_event_);
}

template <typename prec>
void gw_qkpt<prec>::compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) {
void gw_qkpt<prec>::compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP) {
cuda_complex one = cu_type_map<cxx_complex>::cast(1., 0.);
cuda_complex zero = cu_type_map<cxx_complex>::cast(0., 0.);
cuda_complex m1 = cu_type_map<cxx_complex>::cast(-1., 0.);
Expand Down Expand Up @@ -593,28 +600,71 @@ namespace green::gpu {
}
}
}
write_sigma(true, Sigmak_stij_host);
write_sigma(true);
cudaEventRecord(all_done_event_);
}

template <typename prec>
void gw_qkpt<prec>::write_sigma(bool low_memory_mode) {
  // Publish this qkpt's self-energy contribution sigmak_stij_.
  //
  // - Normal mode: accumulate directly into the shared device array
  //   sigma_ktij_ under the per-k spin lock (several qkpt streams may
  //   target the same k).
  // - Low-memory mode: stage the result asynchronously into this qkpt's
  //   private pinned host buffer; cleanup() later synchronizes the stream
  //   and folds the buffer into the host-side Sigma. No lock is needed
  //   because the staging buffer is private to this qkpt.
  //
  // Fix: the previous version launched acquire_lock unconditionally before
  // the branch AND again inside the !low_memory_mode branch, so in
  // low-memory mode the lock was taken twice / never released (deadlock on
  // the next acquire for the same k). The lock is now scoped entirely to
  // the device-accumulation branch.
  scalar_t one = 1.;
  if (!low_memory_mode) {
    // Exclusive write access to sigma_ktij_ for this k-point.
    acquire_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_);
    if (RAXPY(*handle_, 2 * ns_ * ntnao2_, &one, (scalar_t*)sigmak_stij_, 1, (scalar_t*)(sigma_ktij_ + k_ * ns_ * ntnao2_),
              1) != CUBLAS_STATUS_SUCCESS) {
      throw std::runtime_error("RAXPY fails on gw_qkpt.write_sigma().");
    }
    release_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_);
  } else {
    // Async D2H copy into the per-qkpt pinned staging buffer.
    cudaMemcpyAsync(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost, stream_);
    // Flag that a host-side accumulation is still pending; checked by
    // cleanup() in addition to all_done_event_. Initial (never-used) qkpt
    // states do not set this.
    cleanup_req_ = true;
  }
}


template <typename prec>
void gw_qkpt<prec>::cleanup(bool low_memory_mode, tensor<std::complex<prec>, 4>& Sigmak_stij_host, ztensor<5>& Sigma_tskij_host, bool x2c) {
  // Finalize a pending low-memory self-energy write: drain this qkpt's
  // stream, then move the staged result from the pinned buffer into the
  // caller-provided host tensors. No-op when write_sigma() scheduled
  // nothing (cleanup_req_ not set).
  if (!require_cleanup()) return;

  // Block until every task queued on this stream — including the async
  // D2H copy into Sigmak_stij_buffer_ — has completed.
  cudaStreamSynchronize(stream_);

  // Unpack the pinned staging buffer into the scratch host tensor ...
  std::memcpy(Sigmak_stij_host.data(), Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex));

  // ... and accumulate it into the full Sigma(t, s, k) host array,
  // using the 2-component layout when running in x2c mode.
  if (x2c) {
    copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij_host);
  } else {
    copy_Sigma(Sigma_tskij_host, Sigmak_stij_host);
  }

  cleanup_req_ = false;
}

template <typename prec>
void gw_qkpt<prec>::copy_Sigma(ztensor<5>& Sigma_tskij_host, tensor<std::complex<prec>, 4>& Sigmak_stij) {
  // Accumulate this qkpt's self-energy slice into the global host array at
  // reduced k-index k_red_id_, transposing the (spin, tau) storage of
  // Sigmak_stij into the (tau, spin) layout of Sigma_tskij_host and casting
  // to std::complex<double>. Each (spin, tau) update is independent.
  for (size_t spin = 0; spin < ns_; ++spin) {
    for (size_t tau = 0; tau < nt_; ++tau) {
      matrix(Sigma_tskij_host(tau, spin, k_red_id_)) +=
          matrix(Sigmak_stij(spin, tau)).template cast<typename std::complex<double>>();
    }
  }
}

template <typename prec>
void gw_qkpt<prec>::copy_Sigma_2c(ztensor<5>& Sigma_tskij_host, tensor<std::complex<prec>, 4>& Sigmak_stij) {
  // 2-component (x2c) accumulation into the spinor self-energy at reduced
  // k-index k_red_id_. Sigmak_stij stores three nao x nao spin sectors
  // (ss = 0, 1, 2) that map onto blocks of the 2*nao x 2*nao matrix:
  //   ss=0 -> block (0, 0),  ss=1 -> block (nao, nao),  ss=2 -> block (0, nao),
  // and the Hermitian conjugate of sector 2 fills the mirrored (nao, 0)
  // block. NOTE(review): presumably the sectors are (aa, bb, ab) spin
  // components — confirm against the producer of Sigmak_stij.
  //
  // Fixes relative to the previous text: removed the unused local `nso`
  // and a stray release_lock<<<...>>> launch that released
  // sigma_k_locks_[k_] without any matching acquire in this host-side
  // path (leftover from the old locked copy flow in write_sigma()).
  size_t nao = Sigmak_stij.shape()[3];
  for (size_t ss = 0; ss < 3; ++ss) {
    // Row/column block offsets for this sector (see mapping above).
    size_t a = (ss % 2 == 0) ? 0 : 1;
    size_t b = ((ss + 1) / 2 != 1) ? 0 : 1;
    size_t i_shift = a * nao;
    size_t j_shift = b * nao;
    for (size_t t = 0; t < nt_; ++t) {
      matrix(Sigma_tskij_host(t, 0, k_red_id_)).block(i_shift, j_shift, nao, nao) +=
          matrix(Sigmak_stij(ss, t)).template cast<typename std::complex<double>>();
      if (ss == 2) {
        // Off-diagonal sector: add its conjugate transpose into the
        // mirrored block so the spinor matrix stays Hermitian in structure.
        matrix(Sigma_tskij_host(t, 0, k_red_id_)).block(j_shift, i_shift, nao, nao) +=
            matrix(Sigmak_stij(ss, t)).conjugate().transpose().template cast<typename std::complex<double>>();
      }
    }
  }
}

template <typename prec>
Expand Down
1 change: 1 addition & 0 deletions src/green/gpu/cu_routines.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ namespace green::gpu {
bool _low_device_memory;
cublasHandle_t _handle;
cusolverDnHandle_t _solver_handle;
std::vector<cublasHandle_t> _qkpt_cublas_handles; // list of cublas handles for qkpt streams

gw_qpt<prec> qpt;
std::vector<gw_qkpt<prec>*> qkpts;
Expand Down
Loading