diff --git a/src/cu_compute_Pq.cu b/src/cu_compute_Pq.cu index bfc1224..a600fd3 100644 --- a/src/cu_compute_Pq.cu +++ b/src/cu_compute_Pq.cu @@ -26,8 +26,8 @@ __global__ void validate_info(int *info){ if(idx>0) return; if(*info!=0){ printf("info is: %d\n",*info); - printf("nonzero info. Aborting.\n"); - asm("exit;"); + printf("nonzero info. Aborting application.\n"); + asm("trap;"); // nonzero info = cholesky or LU fails, then all threads should be stopped } } __global__ void validate_info(int *info, int N){ @@ -36,8 +36,8 @@ __global__ void validate_info(int *info, int N){ for(int i=0;i void cugw_utils::solve(int _nts, int _ns, int _nk, int _ink, int _nao, const std::vector& reduced_to_full, const std::vector& full_to_reduced, std::complex* Vk1k2_Qij, - ztensor<5>& Sigma_tskij_host, int _devices_rank, int _devices_size, bool low_device_memory, + St_type& sigma_tau_host_shared, int _devices_rank, int _devices_size, bool low_device_memory, int verbose, irre_pos_callback& irre_pos, mom_cons_callback& momentum_conservation, gw_reader1_callback& r1, gw_reader2_callback& r2) { // this is the main GW loop @@ -252,6 +252,7 @@ namespace green::gpu { qpt.compute_Pq(); qpt.transform_wt(); // Write to Sigma(k), k belongs to _ink + MPI_Win_lock_all(MPI_MODE_NOCHECK, sigma_tau_host_shared.win()); for (size_t k_reduced_id = 0; k_reduced_id < _ink; ++k_reduced_id) { size_t k = reduced_to_full[k_reduced_id]; for (size_t q_or_qinv = 0; q_or_qinv < _nk; ++q_or_qinv) { @@ -264,33 +265,42 @@ namespace green::gpu { bool need_minus_k1 = reduced_to_full[k1_reduced_id] != k1; bool need_minus_q = reduced_to_full[q_reduced_id] != q_or_qinv; + // read and prepare G(k-q), V(k, k-q) and V(k-q, k) r2(k, k1, k1_reduced_id, k_vector, V_Qim, Vk1k2_Qij, Gk1_stij, need_minus_k1); - gw_qkpt* qkpt = obtain_idle_qkpt(qkpts); + gw_qkpt* qkpt = obtain_idle_qkpt_for_sigma(qkpts, _low_device_memory, Sigmak_stij.data()); if (_low_device_memory) { if (!_X2C) { qkpt->set_up_qkpt_second(Gk1_stij.data(), 
V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); - qkpt->compute_second_tau_contraction(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); - copy_Sigma(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts, _ns); + qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + copy_Sigma(sigma_tau_host_shared.object(), Sigmak_stij, k_reduced_id, _nts, _ns); } else { // In 2cGW, G(-k) = G*(k) has already been addressed in r2() qkpt->set_up_qkpt_second(Gk1_stij.data(), V_Qim.data(), k_reduced_id, k1_reduced_id, false); - qkpt->compute_second_tau_contraction_2C(Sigmak_stij.data(), - qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); - copy_Sigma_2c(Sigma_tskij_host, Sigmak_stij, k_reduced_id, _nts); + qkpt->compute_second_tau_contraction_2C(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + copy_Sigma_2c(sigma_tau_host_shared.object(), Sigmak_stij, k_reduced_id, _nts); } } else { qkpt->set_up_qkpt_second(nullptr, V_Qim.data(), k_reduced_id, k1_reduced_id, need_minus_k1); - qkpt->compute_second_tau_contraction(nullptr, qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); + qkpt->compute_second_tau_contraction(qpt.Pqk_tQP(qkpt->all_done_event(), qkpt->stream(), need_minus_q)); } } } } + MPI_Win_sync(sigma_tau_host_shared.win()); + MPI_Barrier(utils::context.node_comm); + MPI_Win_unlock_all(sigma_tau_host_shared.win()); } cudaDeviceSynchronize(); + MPI_Win_lock_all(MPI_MODE_NOCHECK, sigma_tau_host_shared.win()); + wait_and_clean_qkpts(qkpts, _low_device_memory, Sigmak_stij.data()); + MPI_Win_sync(sigma_tau_host_shared.win()); + MPI_Win_unlock_all(sigma_tau_host_shared.win()); + // wait for all qkpts to complete if (!_low_device_memory and !_X2C) { - copy_Sigma_from_device_to_host(sigma_kstij_device, Sigma_tskij_host.data(), _ink, _nao, _nts, _ns); + MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, sigma_tau_host_shared.win()); + 
copy_Sigma_from_device_to_host(sigma_kstij_device, sigma_tau_host_shared.object().data(), _ink, _nao, _nts, _ns); + MPI_Win_unlock(0, sigma_tau_host_shared.win()); } } diff --git a/src/cugw_qpt.cu b/src/cugw_qpt.cu index a366155..24c7c8c 100644 --- a/src/cugw_qpt.cu +++ b/src/cugw_qpt.cu @@ -252,7 +252,7 @@ namespace green::gpu { naux_, Pivot_, d_info_, nw_b_) != CUBLAS_STATUS_SUCCESS) { throw std::runtime_error("CUDA GETRF failed!"); } - validate_info<<<1, 1, 0, stream_>>>(d_info_, nw_b_); + validate_info<<<1, 1, 0, stream_>>>(d_info_, nw_b_); cudaEventRecord(LU_decomposition_ready_event_, stream_); if (cudaStreamWaitEvent(stream_, LU_decomposition_ready_event_, 0 /*cudaEventWaitDefault*/)) @@ -320,7 +320,7 @@ namespace green::gpu { g_ktij_(g_ktij), g_kmtij_(g_kmtij), sigma_ktij_(sigma_ktij), sigma_k_locks_(sigma_k_locks), nao_(nao), nao2_(nao * nao), nao3_(nao2_ * nao), naux_(naux), naux2_(naux * naux), nauxnao_(naux * nao), nauxnao2_(naux * nao * nao), ns_(ns), nt_(nt), nt_batch_(nt_batch), ntnaux_(nt * naux), ntnaux2_(nt * naux * naux), ntnao_(nt * nao), ntnao2_(nt * nao2_), - handle_(handle) { + handle_(handle), cleanup_req_(false) { _low_memory_requirement = (g_ktij == nullptr) ? true : false; if (cudaStreamCreate(&stream_) != cudaSuccess) throw std::runtime_error("main stream creation failed"); @@ -349,7 +349,11 @@ namespace green::gpu { throw std::runtime_error("failure allocating Gk_tsij on host"); if (cudaMallocHost(&Gk_smtij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) throw std::runtime_error("failure allocating Gk_tsij on host"); - Sigmak_stij_buffer_ = Gk_smtij_buffer_; + // ! GH: I think this will interfere with our cudaMemcpyAsync. Should we simply allocate a different array for Sigmak_stij_buffer_? + // ! The more I think, this here is the real reason why we had to use cudaMemcpy and not the asynchronous version. + // ! 
Sigmak_stij_buffer_ = Gk_smtij_buffer_; + if (cudaMallocHost(&Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)) != cudaSuccess) + throw std::runtime_error("failure allocating Gk_tsij on host"); } if (cudaMalloc(&Pqk0_tQP_local_, nt_batch_ * naux2_ * sizeof(cuda_complex)) != cudaSuccess) @@ -383,6 +387,9 @@ namespace green::gpu { cudaFreeHost(Gk1_stij_buffer_); cudaFreeHost(Gk_smtij_buffer_); } + if (cleanup_req_ == true) { + throw std::runtime_error("cleanup of self-energy was not done correctly."); + } } template @@ -526,7 +533,7 @@ namespace green::gpu { } template - void gw_qkpt::compute_second_tau_contraction(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) { + void gw_qkpt::compute_second_tau_contraction(cuda_complex* Pqk_tQP) { cuda_complex one = cu_type_map::cast(1., 0.); cuda_complex zero = cu_type_map::cast(0., 0.); cuda_complex m1 = cu_type_map::cast(-1., 0.); @@ -556,12 +563,12 @@ namespace green::gpu { } } } - write_sigma(_low_memory_requirement, Sigmak_stij_host); + write_sigma(_low_memory_requirement); cudaEventRecord(all_done_event_); } template - void gw_qkpt::compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host, cuda_complex* Pqk_tQP) { + void gw_qkpt::compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP) { cuda_complex one = cu_type_map::cast(1., 0.); cuda_complex zero = cu_type_map::cast(0., 0.); cuda_complex m1 = cu_type_map::cast(-1., 0.); @@ -593,13 +600,14 @@ namespace green::gpu { } } } - write_sigma(true, Sigmak_stij_host); + write_sigma(true); cudaEventRecord(all_done_event_); } template - void gw_qkpt::write_sigma(bool low_memory_mode, cxx_complex* Sigmak_stij_host) { + void gw_qkpt::write_sigma(bool low_memory_mode) { // write results. 
Make sure we have exclusive write access to sigma, then add array sigmak_tij to sigma_ktij + // TODO: In my understanding, the lock is only required for RAXPY part now, so we should move them inside the first if condition acquire_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); scalar_t one = 1.; if (!low_memory_mode) { @@ -608,15 +616,23 @@ namespace green::gpu { throw std::runtime_error("RAXPY fails on gw_qkpt.write_sigma()."); } } else { - // Copy sigmak_stij_ back to CPU - if (Sigmak_stij_host == nullptr) - throw std::runtime_error("gw_qkpt.write_sigma(): Sigmak_stij_host cannot be a null pointer"); - cudaMemcpy(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost); - std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); + // Copy sigmak_stij_ asynchronously back to CPU + cudaMemcpyAsync(Sigmak_stij_buffer_, sigmak_stij_, ns_ * ntnao2_ * sizeof(cuda_complex), cudaMemcpyDeviceToHost, stream_); + // cudaMemcpyAsync will require a cleanup at later stage. 
+ // So, we update the cleanup_req_ status to true + cleanup_req_ = true; } release_lock<<<1, 1, 0, stream_>>>(sigma_k_locks_ + k_); } + template + void gw_qkpt::cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host) { + if (cleanup_req_) { + std::memcpy(Sigmak_stij_host, Sigmak_stij_buffer_, ns_ * ntnao2_ * sizeof(cxx_complex)); + cleanup_req_ = false; + } + } + template bool gw_qkpt::is_busy() { cudaError_t stream_status = cudaStreamQuery(stream_); diff --git a/src/green/gpu/cu_routines.h b/src/green/gpu/cu_routines.h index b2d9c71..1467cdc 100644 --- a/src/green/gpu/cu_routines.h +++ b/src/green/gpu/cu_routines.h @@ -22,6 +22,8 @@ #ifndef GREEN_GPU_CU_ROUTINES_H #define GREEN_GPU_CU_ROUTINES_H #include +#include +#include #include @@ -132,6 +134,7 @@ namespace green::gpu { using scalar_t = typename cu_type_map>::cxx_base_type; using cxx_complex = typename cu_type_map>::cxx_type; using cuda_complex = typename cu_type_map>::cuda_type; + using St_type = utils::shared_object>; public: cugw_utils(int _nts, int _nt_batch, int _nw_b, int _ns, int _nk, int _ink, int _nqkpt, int _NQ, int _nao, @@ -141,7 +144,7 @@ namespace green::gpu { ~cugw_utils(); void solve(int _nts, int _ns, int _nk, int _ink, int _nao, const std::vector& reduced_to_full, - const std::vector& full_to_reduced, std::complex* Vk1k2_Qij, ztensor<5>& Sigma_tskij_host, + const std::vector& full_to_reduced, std::complex* Vk1k2_Qij, St_type& Sigma_tskij_host, int _devices_rank, int _devices_size, bool _low_device_memory, int verbose, irre_pos_callback& irre_pos, mom_cons_callback& momentum_conservation, gw_reader1_callback& r1, gw_reader2_callback& r2); @@ -163,7 +166,7 @@ namespace green::gpu { tensor, 3> V_Qim; tensor, 4> Gk1_stij; tensor, 4> Gk_smtij; - tensor, 4>& Sigmak_stij = Gk_smtij; + tensor, 4> Sigmak_stij; cuda_complex* g_kstij_device; cuda_complex* g_ksmtij_device; diff --git a/src/green/gpu/cugw_qpt.h b/src/green/gpu/cugw_qpt.h index 292ac48..5ede49a 100644 --- a/src/green/gpu/cugw_qpt.h 
+++ b/src/green/gpu/cugw_qpt.h @@ -30,7 +30,19 @@ #include "cublas_routines_prec.h" #include "cuda_common.h" +/** + * \brief checks success of a LU or Cholesky decomposition + * + * \param info output of Cuda equivalent of decomposition function, e.g., POTRF + */ __global__ void validate_info(int* info); + +/** + * \brief checks success of a LU or Cholesky decomposition + * + * \param info (vector) output of Cuda equivalent of decomposition function, e.g., POTRF + * \param N length of info + */ __global__ void validate_info(int* info, int N); __global__ void set_up_one_minus_P(cuDoubleComplex* one_minus_P, cuDoubleComplex* P, int naux); __global__ void set_up_one_minus_P(cuComplex* one_minus_P, cuComplex* P, int naux); @@ -265,30 +277,46 @@ namespace green::gpu { /** * \brief Using dressed GW polarization compute self-energy at a given momentum point * - * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point * \param Pqk_tQP Dressed polarization bubble */ - void compute_second_tau_contraction(cxx_complex* Sigmak_stij_host = nullptr, cuda_complex* Pqk_tQP = nullptr); + void compute_second_tau_contraction(cuda_complex* Pqk_tQP = nullptr); /** * \brief Using dressed GW polarization compute self-energy at a given momentum point (X2C version) - * \param Sigmak_stij_host Host stored array for Self-energy at a given momentum point + * * \param Pqk_tQP Dressed polarization bubble */ - void compute_second_tau_contraction_2C(cxx_complex* Sigmak_stij_host = nullptr, cuda_complex* Pqk_tQP = nullptr); + void compute_second_tau_contraction_2C(cuda_complex* Pqk_tQP = nullptr); /** * \brief For a given k-point copy self-energy back to a host memory * \param low_memory_mode - whether the whole self-energy allocated in memory or not - * \param Sigmak_stij_host - Host stored self-energy object at a given momentum point */ - void write_sigma(bool low_memory_mode = false, cxx_complex* Sigmak_stij_host = nullptr); + void write_sigma(bool low_memory_mode = 
false); /** - * \brief Check if cuda devices are budy - * \return true if asynchronous calculations are still running + * \brief Check if cuda devices are busy + * \return true - if asynchronous calculations are still running */ bool is_busy(); + /** + * \brief return the status of copy_selfenergy from device to host + * + * \return false - not required, stream ready for next calculation + * \return true - required + */ + bool require_cleanup(){ + return cleanup_req_; + } + + /** + * \brief perform cleanup, i.e. copy data from Sigmak buffer (4-index array for a given momentum point) to Host shared memory Self-energy + * + * \param low_memory_mode - whether the whole self-energy allocated in memory or not + * \param Sigmak_stij_host - Host stored self-energy object at a given momentum point + */ + void cleanup(bool low_memory_mode, cxx_complex* Sigmak_stij_host); + // static std::size_t size(size_t nao, size_t naux, size_t nt, size_t nt_batch, size_t ns) { return (2 * naux * nao * nao // V_Qpm+V_pmQ @@ -374,8 +402,20 @@ namespace green::gpu { // pointer to cublas handle cublasHandle_t* handle_; + + // status of data transfer / copy from Device to Host. 
+ // false: not required, stream ready for next calculation + // true: required + bool cleanup_req_; }; + /** + * \brief returns an idle qkpt stream, otherwise waits until a stream is available + * + * \tparam prec - precision for calculation + * \param qkpts - vector of qkpt workers (gw_qkpt type) + * \return gw_qkpt* - pointer to idle qkpt + */ template gw_qkpt* obtain_idle_qkpt(std::vector*>& qkpts) { static int pos = 0; @@ -387,4 +427,48 @@ namespace green::gpu { return qkpts[pos]; } + /** + * \brief returns an idle qkpt stream, otherwise waits until a stream is available + * + * \tparam prec - precision for calculation + * \param qkpts - vector of qkpt workers (gw_qkpt type) + * \param low_memory_mode - low memory mode for read/write integrals + * \param Sigmak_stij_host - cudaMallocHost buffer for transferring Sigma + * \return gw_qkpt* - pointer to idle qkpt + */ + template + gw_qkpt* obtain_idle_qkpt_for_sigma(std::vector*>& qkpts, bool low_memory_mode, + typename cu_type_map>::cxx_type* Sigmak_stij_host) { + static int pos = 0; + pos++; + if (pos >= qkpts.size()) pos = 0; + while (qkpts[pos]->is_busy()) { + pos = (pos + 1) % qkpts.size(); + } + qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host); + return qkpts[pos]; + } + + /** + * \brief waits for all qkpts to complete and cleans them up + * + * \tparam prec - precision for calculation + * \param qkpts - vector of qkpt workers (gw_qkpt type) + * \param low_memory_mode - low memory mode for read/write integrals + * \param Sigmak_stij_host - cudaMallocHost buffer for transferring Sigma + */ + template + void wait_and_clean_qkpts(std::vector*>& qkpts, bool low_memory_mode, + typename cu_type_map>::cxx_type* Sigmak_stij_host) { + static int pos = 0; + pos++; + if (pos >= qkpts.size()) pos = 0; + for (pos = 0; pos < qkpts.size(); pos++) { + while (qkpts[pos]->is_busy()) { + continue; + } + qkpts[pos]->cleanup(low_memory_mode, Sigmak_stij_host); + } + return; + } } // namespace green::gpu diff --git 
a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index 2627a6a..0eeed37 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -80,6 +80,13 @@ namespace green::gpu { virtual void gw_innerloop(G_type& g, St_type& sigma_tau) = 0; void GW_check_devices_free_space(); + /** + * \brief count and floating points operations per second achieved on GPU. + * This is not representative of the GPU capabilities, but instead, accounts for read/write overheads. + * The value is entirely in the context Green-MBPT solver. + */ + void flops_achieved(); + /* * Read a chunk of Coulomb integral with given (k[0], k[3]) k-pair */ diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 0908335..7326a9c 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -110,6 +110,7 @@ namespace green::gpu { MPI_Barrier(utils::context.global); statistics.end(); statistics.print(utils::context.global); + // flops_achieved(); clean_MPI_structure(); clean_shared_Coulomb(); @@ -119,6 +120,26 @@ namespace green::gpu { MPI_Op_free(&matrix_sum_op); } + void gw_gpu_kernel::flops_achieved() { + double gpu_time; + utils::event_t cugw_event = statistics.event("Solve cuGW"); + gpu_time = cugw_event.duration; + + if (!utils::context.global_rank) { + MPI_Reduce(MPI_IN_PLACE, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); + } else { + MPI_Reduce(&gpu_time, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); + } + + double flops = _flop_count / gpu_time; + + if (!utils::context.global_rank && _verbose > 1) { + std::cout << "############ GPU FLOPs achieved ############" << std::endl; + std::cout << "FLOPs achieved: " << flops / 1.0e9 << " Giga flops." 
<< std::endl; + std::cout << "###########################################################" << std::endl; + } + } + void scalar_gw_gpu_kernel::gw_innerloop(G_type& g, St_type& sigma_tau) { if (!_sp) { compute_gw_selfenergy(g, sigma_tau); @@ -183,22 +204,14 @@ namespace green::gpu { statistics.end(); }; - // Since all process in _devices_comm will write to the self-energy simultaneously, - // instaed of adding locks in cugw.solve(), we allocate private _Sigma_tskij_local_host - // and do MPIAllreduce on CPU later on. Since the number of processes with a GPU is very - // limited, the additional memory overhead is fairly limited. - ztensor<5> Sigma_tskij_host_local(_nts, _ns, _ink, _nao, _nao); + // Previous implementation avoided writing to the shared self-energy directly + // But the memory overhead can be costly for large applications where GPUs are a must. + // In revised implementation, cugw.solve() will overwrite the shared self-energy directly. statistics.start("Solve cuGW"); cugw.solve(_nts, _ns, _nk, _ink, _nao, _bz_utils.symmetry().reduced_to_full(), _bz_utils.symmetry().full_to_reduced(), - _Vk1k2_Qij, Sigma_tskij_host_local, _devices_rank, _devices_size, _low_device_memory, _verbose, + _Vk1k2_Qij, sigma_tau, _devices_rank, _devices_size, _low_device_memory, _verbose, irre_pos, mom_cons, r1, r2); statistics.end(); - statistics.start("Update Host Self-energy"); - // Copy back to Sigma_tskij_local_host - MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, sigma_tau.win()); - sigma_tau.object() += Sigma_tskij_host_local; - MPI_Win_unlock(0, sigma_tau.win()); - statistics.end(); } void gw_gpu_kernel::GW_check_devices_free_space() { @@ -302,15 +315,11 @@ namespace green::gpu { statistics.end(); }; - ztensor<5> Sigma_tskij_host_local(_nts, 1, _ink, _nso, _nso); + statistics.start("Solve cuGW"); cugw.solve(_nts, psuedo_ns, _nk, _ink, _nao, _bz_utils.symmetry().reduced_to_full(), _bz_utils.symmetry().full_to_reduced(), - _Vk1k2_Qij, Sigma_tskij_host_local, _devices_rank, 
_devices_size, true, _verbose, + _Vk1k2_Qij, sigma_tau, _devices_rank, _devices_size, true, _verbose, irre_pos, mom_cons, r1, r2); - // Convert Sigma_tskij_host_local to (_nts, 1, _ink, _nso, _nso) - // Copy back to Sigma_tskij_local_host - MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, sigma_tau.win()); - sigma_tau.object() += Sigma_tskij_host_local; - MPI_Win_unlock(0, sigma_tau.win()); + statistics.end(); } void x2c_gw_gpu_kernel::copy_Gk_2c(const ztensor<5> &G_tskij_host, tensor,4> &Gk_4tij, int k, bool need_minus_k, bool minus_t) {