From 123058b00ea2014d5b07e6fdaeecd185821f1e55 Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Fri, 7 Nov 2025 15:08:05 -0500
Subject: [PATCH 1/9] auto optimization of nt_batch

---
 src/green/gpu/gpu_factory.h   |  2 +-
 src/green/gpu/gw_gpu_kernel.h |  9 +++++++
 src/gw_gpu_kernel.cpp         | 50 +++++++++++++++++++++++++++++------
 test/cu_solver_test.cpp       | 49 ++++++++++++++++++++++++----------
 4 files changed, 87 insertions(+), 23 deletions(-)
diff --git a/src/green/gpu/gpu_factory.h b/src/green/gpu/gpu_factory.h
index 027b54c..81eae2b 100644
--- a/src/green/gpu/gpu_factory.h
+++ b/src/green/gpu/gpu_factory.h
@@ -98,7 +98,7 @@ namespace green::gpu {
                                LinearSolverType::LU);
     p.define<bool>("cuda_low_gpu_memory", "GPU Device has small amount of memory");
     p.define<bool>("cuda_low_cpu_memory", "Host has small amount of memory, we will read Coulomb integrals in chunks");
-    p.define<size_t>("nt_batch", "Size of tau batch in cuda GW solver", 1);
+    p.define<size_t>("nt_batch", "Size of tau batch in cuda GW solver (default: value will be determined to maximize performance)", 0);
   }
 
 }  // namespace green::mbpt
diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h
index 2d96f80..6808e97 100644
--- a/src/green/gpu/gw_gpu_kernel.h
+++ b/src/green/gpu/gw_gpu_kernel.h
@@ -97,6 +97,15 @@ namespace green::gpu {
      */
     void print_effective_flops();
 
+    /**
+     * \brief optimize the tau batch size based on available memory on GPU
+     * 
+     * \param mem_avail available memory on GPU
+     * \param qpt_size size of the qpt object (does not depend on nt_batch)
+     * \param qkpt_size size per qkpt object for nt_batch = 1
+     */
+    void optimize_ntbatch(size_t mem_avail, size_t qpt_size, size_t qkpt_size);
+
     double                      _beta;
     size_t                      _nts;
     size_t                      _nts_b;
diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp
index f59d512..f6fead2 100644
--- a/src/gw_gpu_kernel.cpp
+++ b/src/gw_gpu_kernel.cpp
@@ -167,6 +167,24 @@ namespace green::gpu {
       }
     }
 
+    void gw_gpu_kernel::optimize_ntbatch(size_t mem_avail, size_t qpt_size, size_t qkpt_size) {
+      // If user provided nt_batch size, use it directly
+      if (_nt_batch != 0) return;
+      // Estimate optimal nt_batch size
+      size_t mem_avail_for_qkpt = available_memory * 0.8 - qpt_size; // leave 20% memory for other usages
+      // qkpt_size = size_fix + size_per_t * nt_batch
+      size_t size_fix = (!_sp) ? gw_qkpt<double>::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt<float>::size(_nao, _NQ, _nts, _nt_batch, _ns);
+      size_t size_per_t = qkpt_size - size_fix;
+      // Optimize nt_batch
+      mem_avail_for_qkpt /= 2; // create at least 2 qkpt workers
+      mem_avail_for_qkpt -= size_fix;
+      _nt_batch = std::min(static_cast<size_t>(mem_avail_for_qkpt / size_per_t), static_cast<size_t>(_nts));
+      // If nt_batch is large and (nts - nt_batch) is small, then we might be better off with nt_batch = nts / 2
+      if (_nt_batch > _nts / 2 && _nts - _nt_batch < _nts / 4) {
+        _nt_batch = _nts / 2;
+      }
+    }
+
     void scalar_gw_gpu_kernel::gw_innerloop(G_type& g, St_type& sigma_tau) {
       if (!_sp) {
         compute_gw_selfenergy<double>(g, sigma_tau);
@@ -256,13 +274,22 @@ namespace green::gpu {
       std::stringstream ss;
       ss << std::setprecision(4) << std::boolalpha;
       if (!_devices_rank && _verbose > 1) ss << "Economical gpu memory mode: " << _low_device_memory << std::endl;
-      std::size_t qpt_size = (!_sp) ? gw_qpt<double>::size(_nao, _NQ, _nts, _nw_b) : gw_qpt<float>::size(_nao, _NQ, _nts, _nw_b);
-      std::size_t qkpt_size = (!_sp) ? gw_qkpt<double>::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt<float>::size(_nao, _NQ, _nts, _nt_batch, _ns);
+      // Get size per qpt and qkpt for nt_batch = 1
+      size_t nt_batch_initial = 1;
+      size_t qpt_size = (!_sp) ? gw_qpt<double>::size(_nao, _NQ, _nts, _nw_b) : gw_qpt<float>::size(_nao, _NQ, _nts, _nw_b);
+      size_t qkpt_size = (!_sp) ? gw_qkpt<double>::size(_nao, _NQ, _nts, nt_batch_initial, _ns) : gw_qkpt<float>::size(_nao, _NQ, _nts, nt_batch_initial, _ns);
+      // Get available memory on GPU
+      size_t available_memory;
+      size_t total_memory;
+      cudaMemGetInfo(&available_memory, &total_memory);
+      // Optimize nt_batch size
+      if (!_devices_rank && _verbose > 1)
+        ss << "Using user specified nt_batch value" << std::endl;
+      else
+        ss << "Optimizing nt_batch value to maximize performance" << std::endl;
+      optimize_ntbatch(available_memory, qpt_size, qkpt_size);
       if (!_devices_rank && _verbose > 1) ss << "size of tau batch: " << _nt_batch << std::endl;
       if (!_devices_rank && _verbose > 1) ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl;
-      std::size_t available_memory;
-      std::size_t total_memory;
-      cudaMemGetInfo(&available_memory, &total_memory);
       _nqkpt = std::min(std::min(size_t((available_memory * 0.8 - qpt_size) / qkpt_size), 16ul), _ink);
       if (!_devices_rank && _verbose > 1) {
         ss << "size per qkpt: " << qkpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl;
@@ -271,9 +298,16 @@ namespace green::gpu {
         ss << "can create: " << _nqkpt << " qkpts in parallel" << std::endl;
       }
       std::cout << ss.str();
-      if (_nqkpt == 0) throw std::runtime_error("not enough memory to create qkpt. Please reduce nt_batch");
-      if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank)
-        std::cerr << "WARNING: ONLY ONE QKPT CREATED. LIKELY CODE WILL BE SLOW. REDUCE NT_BATCH" << std::endl;
+      if (_nqkpt == 0 && _nt_batch == 1)
+        throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU.");
+      if (_nqkpt == 0)
+        throw std::runtime_error("not enough memory to create qkpt. Please reduce nt_batch");
+      if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) {
+        if (_nt_batch > 1)
+          std::cerr << "WARNING: ONLY ONE QKPT CREATED! Performance will be sub-optimal. Reduce nt_batch" << std::endl;
+        else
+          std::cerr << "WARNING: Calculation is too large!! GPU can only afford qkpt. Expect the code to be slow" << std::endl;
+      }
     }
 
     void scalar_gw_gpu_kernel::copy_Gk(const ztensor<5> &G_tskij_host, ztensor<4> &Gk_stij, int k, bool minus_t) {
diff --git a/test/cu_solver_test.cpp b/test/cu_solver_test.cpp
index 69f1088..ba149ab 100644
--- a/test/cu_solver_test.cpp
+++ b/test/cu_solver_test.cpp
@@ -95,7 +95,7 @@ void solve_hf(const std::string& input, const std::string& int_hf, const std::st
 }
 
 
-void solve_gw(const std::string& input, const std::string& int_f, const std::string& data, const std::string& lin, const std::string& mem, bool sp) {
+void solve_gw(const std::string& input, const std::string& int_f, const std::string& data, const std::string& lin, const std::string& mem, bool sp, const std::string& nt_batch) {
   auto        p           = green::params::params("DESCR");
   std::string input_file  = TEST_PATH + input;
   std::string df_int_path = TEST_PATH + int_f;
@@ -104,7 +104,7 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str
   std::string args =
       "test --restart 0 --itermax 1 --E_thr 1e-13 --mixing_type SIGMA_DAMPING --damping 0.8 --input_file=" + input_file +
       " --BETA 100 --grid_file=" + grid_file + " --dfintegral_file=" + df_int_path + " --verbose=5 " +
-      " --cuda_low_gpu_memory " + mem + " --cuda_low_cpu_memory " + mem + " --cuda_linear_solver=" + lin;
+      " --cuda_low_gpu_memory " + mem + " --cuda_low_cpu_memory " + mem + " --cuda_linear_solver=" + lin + " --nt_batch=" + nt_batch;
   green::grids::define_parameters(p);
   green::symmetry::define_parameters(p);
   green::gpu::custom_kernel_parameters(p);
@@ -161,22 +161,43 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str
 
 TEST_CASE("GPU Solver") {
   SECTION("GW_LU") {
-    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false);
-    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false);
-    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true);
-    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true);
+    // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch);
+    // automatically optimize nt_batch
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false, "0");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false, "0");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true, "0");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true, "0");
+    // set nt_batch = 1
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false, "1");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false, "1");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true, "1");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true, "1");
   }
   SECTION("GW_Cholesky") {
-    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false);
-    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true);
-    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false);
-    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true);
+    // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch);
+    // automatically optimize nt_batch
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false, "0");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true, "0");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false, "0");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true, "0");
+    // set nt_batch = 1
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false, "1");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true, "1");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false, "1");
+    solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true, "1");
   }
   SECTION("GW_X2C") {
-    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false);
-    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true);
-    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false);
-    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true);
+    // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch);
+    // automatically optimize nt_batch
+    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false, "0");
+    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true, "0");
+    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false, "0");
+    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true, "0");
+    // set nt_batch = 1
+    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false, "1");
+    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true, "1");
+    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false, "1");
+    solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true, "1");
   }
 
   SECTION("HF") {

From a68a6e93cd87e39e3d6f82b277bf4ce2fe103a8a Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Fri, 7 Nov 2025 15:11:45 -0500
Subject: [PATCH 2/9] fix typo in nt_batch optimize

---
 src/gw_gpu_kernel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp
index f6fead2..977575a 100644
--- a/src/gw_gpu_kernel.cpp
+++ b/src/gw_gpu_kernel.cpp
@@ -171,7 +171,7 @@ namespace green::gpu {
       // If user provided nt_batch size, use it directly
       if (_nt_batch != 0) return;
       // Estimate optimal nt_batch size
-      size_t mem_avail_for_qkpt = available_memory * 0.8 - qpt_size; // leave 20% memory for other usages
+      size_t mem_avail_for_qkpt = mem_avail * 0.8 - qpt_size; // leave 20% memory for other usages
       // qkpt_size = size_fix + size_per_t * nt_batch
       size_t size_fix = (!_sp) ? gw_qkpt<double>::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt<float>::size(_nao, _NQ, _nts, _nt_batch, _ns);
       size_t size_per_t = qkpt_size - size_fix;

From 282b5ce91b2e971eb05e334a73f840f4e74394ac Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Fri, 7 Nov 2025 15:41:31 -0500
Subject: [PATCH 3/9] Fix logic for printing correct status in nt_batch
 optimization

---
 src/gw_gpu_kernel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp
index 977575a..6de0880 100644
--- a/src/gw_gpu_kernel.cpp
+++ b/src/gw_gpu_kernel.cpp
@@ -283,7 +283,7 @@ namespace green::gpu {
       size_t total_memory;
       cudaMemGetInfo(&available_memory, &total_memory);
       // Optimize nt_batch size
-      if (!_devices_rank && _verbose > 1)
+      if (!_devices_rank && _verbose > 1 && _nt_batch != 0)
         ss << "Using user specified nt_batch value" << std::endl;
       else
         ss << "Optimizing nt_batch value to maximize performance" << std::endl;

From 6332382ddc4eeef64a74ccbceb058e23fb4603d6 Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Fri, 7 Nov 2025 16:03:05 -0500
Subject: [PATCH 4/9] Update src/gw_gpu_kernel.cpp documentation

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/gw_gpu_kernel.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp
index 6de0880..2d9871c 100644
--- a/src/gw_gpu_kernel.cpp
+++ b/src/gw_gpu_kernel.cpp
@@ -288,6 +288,8 @@ namespace green::gpu {
       else
         ss << "Optimizing nt_batch value to maximize performance" << std::endl;
       optimize_ntbatch(available_memory, qpt_size, qkpt_size);
+      // Recalculate qkpt_size with the (possibly) updated _nt_batch value
+      qkpt_size = (!_sp) ? gw_qkpt<double>::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt<float>::size(_nao, _NQ, _nts, _nt_batch, _ns);
       if (!_devices_rank && _verbose > 1) ss << "size of tau batch: " << _nt_batch << std::endl;
       if (!_devices_rank && _verbose > 1) ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl;
       _nqkpt = std::min(std::min(size_t((available_memory * 0.8 - qpt_size) / qkpt_size), 16ul), _ink);

From 72604c0b082556c5b97c845dcfd695f423c456ef Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Fri, 7 Nov 2025 16:03:43 -0500
Subject: [PATCH 5/9] address the case when nt_batch cannot be more than 1

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/gw_gpu_kernel.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp
index 2d9871c..8a8041e 100644
--- a/src/gw_gpu_kernel.cpp
+++ b/src/gw_gpu_kernel.cpp
@@ -177,6 +177,10 @@ namespace green::gpu {
       size_t size_per_t = qkpt_size - size_fix;
       // Optimize nt_batch
       mem_avail_for_qkpt /= 2; // create at least 2 qkpt workers
+      if (mem_avail_for_qkpt < size_fix) {
+        _nt_batch = 1; // Set to minimum or handle error
+        return;
+      }
       mem_avail_for_qkpt -= size_fix;
       _nt_batch = std::min(static_cast<size_t>(mem_avail_for_qkpt / size_per_t), static_cast<size_t>(_nts));
       // If nt_batch is large and (nts - nt_batch) is small, then we might be better off with nt_batch = nts / 2

From 3404e0f7b63cdaf62d3bbbae006556da3eecf4ce Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Fri, 7 Nov 2025 16:04:26 -0500
Subject: [PATCH 6/9] Update logic for print messages in src/gw_gpu_kernel.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/gw_gpu_kernel.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp
index 8a8041e..fed3118 100644
--- a/src/gw_gpu_kernel.cpp
+++ b/src/gw_gpu_kernel.cpp
@@ -287,10 +287,12 @@ namespace green::gpu {
       size_t total_memory;
       cudaMemGetInfo(&available_memory, &total_memory);
       // Optimize nt_batch size
-      if (!_devices_rank && _verbose > 1 && _nt_batch != 0)
-        ss << "Using user specified nt_batch value" << std::endl;
-      else
-        ss << "Optimizing nt_batch value to maximize performance" << std::endl;
+      if (!_devices_rank && _verbose > 1) {
+        if (_nt_batch != 0)
+          ss << "Using user specified nt_batch value" << std::endl;
+        else
+          ss << "Optimizing nt_batch value to maximize performance" << std::endl;
+      }
       optimize_ntbatch(available_memory, qpt_size, qkpt_size);
       // Recalculate qkpt_size with the (possibly) updated _nt_batch value
       qkpt_size = (!_sp) ? gw_qkpt<double>::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt<float>::size(_nao, _NQ, _nts, _nt_batch, _ns);

From 0a5e13594b1e27017b9a777f01b25304fdf0d31c Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Fri, 7 Nov 2025 16:23:14 -0500
Subject: [PATCH 7/9] copilot documentation suggestions and better handling of
 exceptions for very large jobs

---
 src/gw_gpu_kernel.cpp | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp
index fed3118..a2885d9 100644
--- a/src/gw_gpu_kernel.cpp
+++ b/src/gw_gpu_kernel.cpp
@@ -171,18 +171,25 @@ namespace green::gpu {
       // If user provided nt_batch size, use it directly
       if (_nt_batch != 0) return;
       // Estimate optimal nt_batch size
-      size_t mem_avail_for_qkpt = mem_avail * 0.8 - qpt_size; // leave 20% memory for other usages
+      mem_avail *= 0.8; // leave 20% memory for other usages
+      if (mem_avail < qkpt_size)
+        throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU.");
+      mem_avail -= qpt_size; // reserve space for qpt
       // qkpt_size = size_fix + size_per_t * nt_batch
-      size_t size_fix = (!_sp) ? gw_qkpt<double>::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt<float>::size(_nao, _NQ, _nts, _nt_batch, _ns);
+      size_t size_fix = (!_sp) ? gw_qkpt<double>::size(_nao, _NQ, _nts, 0, _ns) : gw_qkpt<float>::size(_nao, _NQ, _nts, 0, _ns);
       size_t size_per_t = qkpt_size - size_fix;
-      // Optimize nt_batch
-      mem_avail_for_qkpt /= 2; // create at least 2 qkpt workers
-      if (mem_avail_for_qkpt < size_fix) {
-        _nt_batch = 1; // Set to minimum or handle error
+      // Preclude cases with very low available memory
+      size_t n_qkpt_max = mem_avail / qkpt_size;
+      if (n_qkpt_max == 0)
+        throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU.");
+      if (n_qkpt_max == 1) {
+        _nt_batch = 1;
         return;
       }
-      mem_avail_for_qkpt -= size_fix;
-      _nt_batch = std::min(static_cast<size_t>(mem_avail_for_qkpt / size_per_t), static_cast<size_t>(_nts));
+      // Optimize nt_batch for n_streams >= 2
+      mem_avail /= 2; // create at least 2 qkpt workers
+      mem_avail -= size_fix; // reserve fixed size for each qkpt worker
+      _nt_batch = std::min(static_cast<size_t>(mem_avail / size_per_t), static_cast<size_t>(_nts));
       // If nt_batch is large and (nts - nt_batch) is small, then we might be better off with nt_batch = nts / 2
       if (_nt_batch > _nts / 2 && _nts - _nt_batch < _nts / 4) {
         _nt_batch = _nts / 2;
@@ -306,15 +313,16 @@ namespace green::gpu {
         ss << "can create: " << _nqkpt << " qkpts in parallel" << std::endl;
       }
       std::cout << ss.str();
+      // NOTE: implement checks again in case nt_batch is specified by user
       if (_nqkpt == 0 && _nt_batch == 1)
         throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU.");
       if (_nqkpt == 0)
-        throw std::runtime_error("not enough memory to create qkpt. Please reduce nt_batch");
+        throw std::runtime_error("Not enough memory to create qkpt. Please reduce nt_batch");
       if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) {
         if (_nt_batch > 1)
-          std::cerr << "WARNING: ONLY ONE QKPT CREATED! Performance will be sub-optimal. Reduce nt_batch" << std::endl;
+          std::cerr << "WARNING: Only one qkpt created! Performance will be sub-optimal. Reduce nt_batch" << std::endl;
         else
-          std::cerr << "WARNING: Calculation is too large!! GPU can only afford qkpt. Expect the code to be slow" << std::endl;
+          std::cerr << "WARNING: Calculation is too large!! GPU can only afford one qkpt. Expect the code to be slow" << std::endl;
       }
     }
 

From 7a7acfc94972f8ca522ce02fbaa0c9cf677ef6e3 Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Fri, 7 Nov 2025 17:21:00 -0500
Subject: [PATCH 8/9] update explanation for nt_batch parameter for CLI

---
 src/green/gpu/gpu_factory.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/green/gpu/gpu_factory.h b/src/green/gpu/gpu_factory.h
index 81eae2b..5d106b2 100644
--- a/src/green/gpu/gpu_factory.h
+++ b/src/green/gpu/gpu_factory.h
@@ -98,7 +98,7 @@ namespace green::gpu {
                                LinearSolverType::LU);
     p.define<bool>("cuda_low_gpu_memory", "GPU Device has small amount of memory");
     p.define<bool>("cuda_low_cpu_memory", "Host has small amount of memory, we will read Coulomb integrals in chunks");
-    p.define<size_t>("nt_batch", "Size of tau batch in cuda GW solver (default: value will be determined to maximize performance)", 0);
+    p.define<size_t>("nt_batch", "Size of tau batch in cuda GW solver; if set to 0; value will be determined to maximize performance", 0);
   }
 
 }  // namespace green::mbpt

From dc7809d63650bc4e982a84cda3428983d9cde04d Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Sat, 8 Nov 2025 10:24:01 -0500
Subject: [PATCH 9/9] clean up output statements

---
 src/gw_gpu_kernel.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp
index a2885d9..ca2dd34 100644
--- a/src/gw_gpu_kernel.cpp
+++ b/src/gw_gpu_kernel.cpp
@@ -294,19 +294,18 @@ namespace green::gpu {
       size_t total_memory;
       cudaMemGetInfo(&available_memory, &total_memory);
       // Optimize nt_batch size
-      if (!_devices_rank && _verbose > 1) {
-        if (_nt_batch != 0)
-          ss << "Using user specified nt_batch value" << std::endl;
-        else
-          ss << "Optimizing nt_batch value to maximize performance" << std::endl;
-      }
       optimize_ntbatch(available_memory, qpt_size, qkpt_size);
       // Recalculate qkpt_size with the (possibly) updated _nt_batch value
       qkpt_size = (!_sp) ? gw_qkpt<double>::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt<float>::size(_nao, _NQ, _nts, _nt_batch, _ns);
-      if (!_devices_rank && _verbose > 1) ss << "size of tau batch: " << _nt_batch << std::endl;
-      if (!_devices_rank && _verbose > 1) ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl;
       _nqkpt = std::min(std::min(size_t((available_memory * 0.8 - qpt_size) / qkpt_size), 16ul), _ink);
+      // Print memory info
       if (!_devices_rank && _verbose > 1) {
+        if (_nt_batch != 0)
+          ss << "Using user specified nt_batch value" << std::endl;
+        else
+          ss << "Optimized nt_batch value to maximize performance" << std::endl;
+        ss << "size of tau batch: " << _nt_batch << std::endl;
+        ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl;
         ss << "size per qkpt: " << qkpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl;
         ss << "available memory: " << available_memory / (1024 * 1024. * 1024.) << " GB " << " of total: "
                   << total_memory / (1024 * 1024. * 1024.) << " GB. " << std::endl;