diff --git a/src/green/gpu/gpu_factory.h b/src/green/gpu/gpu_factory.h index 027b54c..5d106b2 100644 --- a/src/green/gpu/gpu_factory.h +++ b/src/green/gpu/gpu_factory.h @@ -98,7 +98,7 @@ namespace green::gpu { LinearSolverType::LU); p.define("cuda_low_gpu_memory", "GPU Device has small amount of memory"); p.define("cuda_low_cpu_memory", "Host has small amount of memory, we will read Coulomb integrals in chunks"); - p.define("nt_batch", "Size of tau batch in cuda GW solver", 1); + p.define("nt_batch", "Size of tau batch in cuda GW solver; if set to 0, value will be determined to maximize performance", 0); } } // namespace green::mbpt
diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index 2d96f80..6808e97 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -97,6 +97,15 @@ namespace green::gpu { */ void print_effective_flops(); + /** + * \brief optimize the tau batch size based on available memory on GPU + * + * \param mem_avail available memory on GPU + * \param qpt_size size per qpt object for nt_batch = 1 + * \param qkpt_size size per qkpt object for nt_batch = 1 + */ + void optimize_ntbatch(size_t mem_avail, size_t qpt_size, size_t qkpt_size); + double _beta; size_t _nts; size_t _nts_b;
diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index f59d512..ca2dd34 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -167,6 +167,35 @@ namespace green::gpu { } } + void gw_gpu_kernel::optimize_ntbatch(size_t mem_avail, size_t qpt_size, size_t qkpt_size) { + // If user provided nt_batch size, use it directly + if (_nt_batch != 0) return; + // Estimate optimal nt_batch size + mem_avail *= 0.8; // leave 20% memory for other usages + if (mem_avail < qpt_size + qkpt_size) + throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); + mem_avail -= qpt_size; // reserve space for qpt + // qkpt_size = size_fix + size_per_t * nt_batch + size_t size_fix = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, 0, _ns) : gw_qkpt::size(_nao, _NQ, _nts, 0, _ns); + size_t size_per_t = qkpt_size - size_fix; + // Preclude cases with very low available memory + size_t n_qkpt_max = mem_avail / qkpt_size; + if (n_qkpt_max == 0) + throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); + if (n_qkpt_max == 1) { + _nt_batch = 1; + return; + } + // Optimize nt_batch for n_streams >= 2 + mem_avail /= 2; // create at least 2 qkpt workers + mem_avail -= size_fix; // reserve fixed size for each qkpt worker + _nt_batch = std::min(static_cast<size_t>(mem_avail / size_per_t), static_cast<size_t>(_nts)); + // If nt_batch is large and (nts - nt_batch) is small, then we might be better off with nt_batch = nts / 2 + if (_nt_batch > _nts / 2 && _nts - _nt_batch < _nts / 4) { + _nt_batch = _nts / 2; + } + } + void scalar_gw_gpu_kernel::gw_innerloop(G_type& g, St_type& sigma_tau) { if (!_sp) { compute_gw_selfenergy(g, sigma_tau); @@ -256,24 +285,44 @@ std::stringstream ss; ss << std::setprecision(4) << std::boolalpha; if (!_devices_rank && _verbose > 1) ss << "Economical gpu memory mode: " << _low_device_memory << std::endl; - std::size_t qpt_size = (!_sp) ? gw_qpt::size(_nao, _NQ, _nts, _nw_b) : gw_qpt::size(_nao, _NQ, _nts, _nw_b); - std::size_t qkpt_size = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); - if (!_devices_rank && _verbose > 1) ss << "size of tau batch: " << _nt_batch << std::endl; - if (!_devices_rank && _verbose > 1) ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; - std::size_t available_memory; - std::size_t total_memory; + // Get size per qpt and qkpt for nt_batch = 1 + size_t nt_batch_initial = 1; + size_t qpt_size = (!_sp) ? 
gw_qpt::size(_nao, _NQ, _nts, _nw_b) : gw_qpt::size(_nao, _NQ, _nts, _nw_b); + size_t qkpt_size = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, nt_batch_initial, _ns) : gw_qkpt::size(_nao, _NQ, _nts, nt_batch_initial, _ns); + // Get available memory on GPU + size_t available_memory; + size_t total_memory; cudaMemGetInfo(&available_memory, &total_memory); + const bool nt_batch_user = (_nt_batch != 0); // remember the user's choice before it is optimized away + optimize_ntbatch(available_memory, qpt_size, qkpt_size); + // Recalculate qkpt_size with the (possibly) updated _nt_batch value + qkpt_size = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); _nqkpt = std::min(std::min(size_t((available_memory * 0.8 - qpt_size) / qkpt_size), 16ul), _ink); + // Print memory info if (!_devices_rank && _verbose > 1) { + if (nt_batch_user) + ss << "Using user specified nt_batch value" << std::endl; + else + ss << "Optimized nt_batch value to maximize performance" << std::endl; + ss << "size of tau batch: " << _nt_batch << std::endl; + ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; ss << "size per qkpt: " << qkpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; ss << "available memory: " << available_memory / (1024 * 1024. * 1024.) << " GB " << " of total: " << total_memory / (1024 * 1024. * 1024.) << " GB. " << std::endl; ss << "can create: " << _nqkpt << " qkpts in parallel" << std::endl; } std::cout << ss.str(); - if (_nqkpt == 0) throw std::runtime_error("not enough memory to create qkpt. Please reduce nt_batch"); - if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) - std::cerr << "WARNING: ONLY ONE QKPT CREATED. LIKELY CODE WILL BE SLOW. REDUCE NT_BATCH" << std::endl; + // NOTE: implement checks again in case nt_batch is specified by user + if (_nqkpt == 0 && _nt_batch == 1) + throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); + if (_nqkpt == 0) + throw std::runtime_error("Not enough memory to create qkpt. Please reduce nt_batch"); + if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) { + if (_nt_batch > 1) + std::cerr << "WARNING: Only one qkpt created! Performance will be sub-optimal. Reduce nt_batch" << std::endl; + else + std::cerr << "WARNING: Calculation is too large!! GPU can only afford one qkpt. Expect the code to be slow" << std::endl; + } } void scalar_gw_gpu_kernel::copy_Gk(const ztensor<5> &G_tskij_host, ztensor<4> &Gk_stij, int k, bool minus_t) {
diff --git a/test/cu_solver_test.cpp b/test/cu_solver_test.cpp index 69f1088..ba149ab 100644 --- a/test/cu_solver_test.cpp +++ b/test/cu_solver_test.cpp @@ -95,7 +95,7 @@ void solve_hf(const std::string& input, const std::string& int_hf, const std::st } -void solve_gw(const std::string& input, const std::string& int_f, const std::string& data, const std::string& lin, const std::string& mem, bool sp) { +void solve_gw(const std::string& input, const std::string& int_f, const std::string& data, const std::string& lin, const std::string& mem, bool sp, const std::string& nt_batch) { auto p = green::params::params("DESCR"); std::string input_file = TEST_PATH + input; std::string df_int_path = TEST_PATH + int_f; @@ -104,7 +104,7 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str std::string args = "test --restart 0 --itermax 1 --E_thr 1e-13 --mixing_type SIGMA_DAMPING --damping 0.8 --input_file=" + input_file + " --BETA 100 --grid_file=" + grid_file + " --dfintegral_file=" + df_int_path + " --verbose=5 " + - " --cuda_low_gpu_memory " + mem + " --cuda_low_cpu_memory " + mem + " --cuda_linear_solver=" + lin; + " --cuda_low_gpu_memory " + mem + " --cuda_low_cpu_memory " + mem + " --cuda_linear_solver=" + lin + " --nt_batch=" + nt_batch; green::grids::define_parameters(p); green::symmetry::define_parameters(p); green::gpu::custom_kernel_parameters(p); @@ -161,22 +161,43 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str TEST_CASE("GPU Solver") { SECTION("GW_LU") { - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true); + // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch); + // automatically optimize nt_batch + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true, "0"); + // set nt_batch = 1 + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true, "1"); } SECTION("GW_Cholesky") { - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true); + // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch); + // automatically optimize nt_batch + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true, "0"); + // set nt_batch = 1 + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true, "1"); } SECTION("GW_X2C") { - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false); - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true); - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false); - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true); + // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch); + // automatically optimize nt_batch + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false, "0"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true, "0"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false, "0"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true, "0"); + // set nt_batch = 1 + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false, "1"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true, "1"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false, "1"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true, "1"); } SECTION("HF") {