From 123058b00ea2014d5b07e6fdaeecd185821f1e55 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Fri, 7 Nov 2025 15:08:05 -0500 Subject: [PATCH 1/9] auto optimization of nt_batch --- src/green/gpu/gpu_factory.h | 2 +- src/green/gpu/gw_gpu_kernel.h | 9 +++++++ src/gw_gpu_kernel.cpp | 50 +++++++++++++++++++++++++++++------ test/cu_solver_test.cpp | 49 ++++++++++++++++++++++++---------- 4 files changed, 87 insertions(+), 23 deletions(-) diff --git a/src/green/gpu/gpu_factory.h b/src/green/gpu/gpu_factory.h index 027b54c..81eae2b 100644 --- a/src/green/gpu/gpu_factory.h +++ b/src/green/gpu/gpu_factory.h @@ -98,7 +98,7 @@ namespace green::gpu { LinearSolverType::LU); p.define("cuda_low_gpu_memory", "GPU Device has small amount of memory"); p.define("cuda_low_cpu_memory", "Host has small amount of memory, we will read Coulomb integrals in chunks"); - p.define("nt_batch", "Size of tau batch in cuda GW solver", 1); + p.define("nt_batch", "Size of tau batch in cuda GW solver (default: value will be determined to maximize performance)", 0); } } // namespace green::mbpt diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index 2d96f80..6808e97 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -97,6 +97,15 @@ namespace green::gpu { */ void print_effective_flops(); + /** + * \brief optimize the tau batch size based on available memory on GPU + * + * \param mem_avail available memory on GPU + * \param qpt_size size per qpt object for nt_batch = 1 + * \param qkpt_size size per qkpt object for nt_batch = 1 + */ + void optimize_ntbatch(size_t mem_avail, size_t qpt_size, size_t qkpt_size); + double _beta; size_t _nts; size_t _nts_b; diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index f59d512..f6fead2 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -167,6 +167,24 @@ namespace green::gpu { } } + void gw_gpu_kernel::optimize_ntbatch(size_t mem_avail, size_t qpt_size, size_t qkpt_size) { + // If user 
provided nt_batch size, use it directly + if (_nt_batch != 0) return; + // Estimate optimal nt_batch size + size_t mem_avail_for_qkpt = available_memory * 0.8 - qpt_size; // leave 20% memory for other usages + // qkpt_size = size_fix + size_per_t * nt_batch + size_t size_fix = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); + size_t size_per_t = qkpt_size - size_fix; + // Optimize nt_batch + mem_avail_for_qkpt /= 2; // create at least 2 qkpt workers + mem_avail_for_qkpt -= size_fix; + _nt_batch = std::min(static_cast(mem_avail_for_qkpt / size_per_t), static_cast(_nts)); + // If nt_batch is large and (nts - nt_batch) is small, then we might be better off with nt_batch = nts / 2 + if (_nt_batch > _nts / 2 && _nts - _nt_batch < _nts / 4) { + _nt_batch = _nts / 2; + } + } + void scalar_gw_gpu_kernel::gw_innerloop(G_type& g, St_type& sigma_tau) { if (!_sp) { compute_gw_selfenergy(g, sigma_tau); @@ -256,13 +274,22 @@ namespace green::gpu { std::stringstream ss; ss << std::setprecision(4) << std::boolalpha; if (!_devices_rank && _verbose > 1) ss << "Economical gpu memory mode: " << _low_device_memory << std::endl; - std::size_t qpt_size = (!_sp) ? gw_qpt::size(_nao, _NQ, _nts, _nw_b) : gw_qpt::size(_nao, _NQ, _nts, _nw_b); - std::size_t qkpt_size = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); + // Get size per qpt and qkpt for nt_batch = 1 + size_t nt_batch_initial = 1; + size_t qpt_size = (!_sp) ? gw_qpt::size(_nao, _NQ, _nts, _nw_b) : gw_qpt::size(_nao, _NQ, _nts, _nw_b); + size_t qkpt_size = (!_sp) ? 
gw_qkpt::size(_nao, _NQ, _nts, nt_batch_initial, _ns) : gw_qkpt::size(_nao, _NQ, _nts, nt_batch_initial, _ns); + // Get available memory on GPU + size_t available_memory; + size_t total_memory; + cudaMemGetInfo(&available_memory, &total_memory); + // Optimize nt_batch size + if (!_devices_rank && _verbose > 1) + ss << "Using user specified nt_batch value" << std::endl; + else + ss << "Optimizing nt_batch value to maximize performance" << std::endl; + optimize_ntbatch(available_memory, qpt_size, qkpt_size); if (!_devices_rank && _verbose > 1) ss << "size of tau batch: " << _nt_batch << std::endl; if (!_devices_rank && _verbose > 1) ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; - std::size_t available_memory; - std::size_t total_memory; - cudaMemGetInfo(&available_memory, &total_memory); _nqkpt = std::min(std::min(size_t((available_memory * 0.8 - qpt_size) / qkpt_size), 16ul), _ink); if (!_devices_rank && _verbose > 1) { ss << "size per qkpt: " << qkpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; @@ -271,9 +298,16 @@ namespace green::gpu { ss << "can create: " << _nqkpt << " qkpts in parallel" << std::endl; } std::cout << ss.str(); - if (_nqkpt == 0) throw std::runtime_error("not enough memory to create qkpt. Please reduce nt_batch"); - if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) - std::cerr << "WARNING: ONLY ONE QKPT CREATED. LIKELY CODE WILL BE SLOW. REDUCE NT_BATCH" << std::endl; + if (_nqkpt == 0 && _nt_batch == 1) + throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); + if (_nqkpt == 0) + throw std::runtime_error("not enough memory to create qkpt. Please reduce nt_batch"); + if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) { + if (_nt_batch > 1) + std::cerr << "WARNING: ONLY ONE QKPT CREATED! Performance will be sub-optimal. Reduce nt_batch" << std::endl; + else + std::cerr << "WARNING: Calculation is too large!! 
GPU can only afford qkpt. Expect the code to be slow" << std::endl; + } } void scalar_gw_gpu_kernel::copy_Gk(const ztensor<5> &G_tskij_host, ztensor<4> &Gk_stij, int k, bool minus_t) { diff --git a/test/cu_solver_test.cpp b/test/cu_solver_test.cpp index 69f1088..ba149ab 100644 --- a/test/cu_solver_test.cpp +++ b/test/cu_solver_test.cpp @@ -95,7 +95,7 @@ void solve_hf(const std::string& input, const std::string& int_hf, const std::st } -void solve_gw(const std::string& input, const std::string& int_f, const std::string& data, const std::string& lin, const std::string& mem, bool sp) { +void solve_gw(const std::string& input, const std::string& int_f, const std::string& data, const std::string& lin, const std::string& mem, bool sp, const std::string& nt_batch) { auto p = green::params::params("DESCR"); std::string input_file = TEST_PATH + input; std::string df_int_path = TEST_PATH + int_f; @@ -104,7 +104,7 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str std::string args = "test --restart 0 --itermax 1 --E_thr 1e-13 --mixing_type SIGMA_DAMPING --damping 0.8 --input_file=" + input_file + " --BETA 100 --grid_file=" + grid_file + " --dfintegral_file=" + df_int_path + " --verbose=5 " + - " --cuda_low_gpu_memory " + mem + " --cuda_low_cpu_memory " + mem + " --cuda_linear_solver=" + lin; + " --cuda_low_gpu_memory " + mem + " --cuda_low_cpu_memory " + mem + " --cuda_linear_solver=" + lin + " --nt_batch=" + nt_batch; green::grids::define_parameters(p); green::symmetry::define_parameters(p); green::gpu::custom_kernel_parameters(p); @@ -161,22 +161,43 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str TEST_CASE("GPU Solver") { SECTION("GW_LU") { - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true); - solve_gw("/GW/input.h5", 
"/GW/df_int", "/GW/data.h5", "LU", "true", true); + // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch); + // automatically optimize nt_batch + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true, "0"); + // set nt_batch = 1 + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true, "1"); } SECTION("GW_Cholesky") { - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true); + // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch); + // automatically optimize nt_batch + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true, "0"); + // set nt_batch = 1 + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", 
"true", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true, "1"); } SECTION("GW_X2C") { - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false); - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true); - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false); - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true); + // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch); + // automatically optimize nt_batch + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false, "0"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true, "0"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false, "0"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true, "0"); + // set nt_batch = 1 + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false, "1"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true, "1"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false, "1"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true, "1"); } SECTION("HF") { From a68a6e93cd87e39e3d6f82b277bf4ce2fe103a8a Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Fri, 7 Nov 2025 15:11:45 -0500 Subject: [PATCH 2/9] fix typo in nt_batch optimize --- src/gw_gpu_kernel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index f6fead2..977575a 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -171,7 +171,7 @@ namespace green::gpu { // If user provided nt_batch size, use it directly if (_nt_batch != 0) return; // Estimate 
optimal nt_batch size - size_t mem_avail_for_qkpt = available_memory * 0.8 - qpt_size; // leave 20% memory for other usages + size_t mem_avail_for_qkpt = mem_avail * 0.8 - qpt_size; // leave 20% memory for other usages // qkpt_size = size_fix + size_per_t * nt_batch size_t size_fix = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); size_t size_per_t = qkpt_size - size_fix; From 282b5ce91b2e971eb05e334a73f840f4e74394ac Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Fri, 7 Nov 2025 15:41:31 -0500 Subject: [PATCH 3/9] Fix logic for printing correct status in nt_batch optimization --- src/gw_gpu_kernel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 977575a..6de0880 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -283,7 +283,7 @@ namespace green::gpu { size_t total_memory; cudaMemGetInfo(&available_memory, &total_memory); // Optimize nt_batch size - if (!_devices_rank && _verbose > 1) + if (!_devices_rank && _verbose > 1 && _nt_batch != 0) ss << "Using user specified nt_batch value" << std::endl; else ss << "Optimizing nt_batch value to maximize performance" << std::endl; optimize_ntbatch(available_memory, qpt_size, qkpt_size); From 6332382ddc4eeef64a74ccbceb058e23fb4603d6 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Fri, 7 Nov 2025 16:03:05 -0500 Subject: [PATCH 4/9] Update src/gw_gpu_kernel.cpp documentation Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/gw_gpu_kernel.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 6de0880..2d9871c 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -288,6 +288,8 @@ namespace green::gpu { else ss << "Optimizing nt_batch value to maximize performance" << std::endl; optimize_ntbatch(available_memory, qpt_size, qkpt_size); + // Recalculate qkpt_size with the (possibly) updated _nt_batch value + qkpt_size = (!_sp) ?
gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); if (!_devices_rank && _verbose > 1) ss << "size of tau batch: " << _nt_batch << std::endl; if (!_devices_rank && _verbose > 1) ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; _nqkpt = std::min(std::min(size_t((available_memory * 0.8 - qpt_size) / qkpt_size), 16ul), _ink); From 72604c0b082556c5b97c845dcfd695f423c456ef Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Fri, 7 Nov 2025 16:03:43 -0500 Subject: [PATCH 5/9] address the case when nt_batch cannot be more than 1 Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/gw_gpu_kernel.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 2d9871c..8a8041e 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -177,6 +177,10 @@ namespace green::gpu { size_t size_per_t = qkpt_size - size_fix; // Optimize nt_batch mem_avail_for_qkpt /= 2; // create at least 2 qkpt workers + if (mem_avail_for_qkpt < size_fix) { + _nt_batch = 1; // Set to minimum or handle error + return; + } mem_avail_for_qkpt -= size_fix; _nt_batch = std::min(static_cast(mem_avail_for_qkpt / size_per_t), static_cast(_nts)); // If nt_batch is large and (nts - nt_batch) is small, then we might be better off with nt_batch = nts / 2 From 3404e0f7b63cdaf62d3bbbae006556da3eecf4ce Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Fri, 7 Nov 2025 16:04:26 -0500 Subject: [PATCH 6/9] Update logic for print messages in src/gw_gpu_kernel.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/gw_gpu_kernel.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 8a8041e..fed3118 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -287,10 +287,12 @@ namespace green::gpu { size_t total_memory; cudaMemGetInfo(&available_memory, 
&total_memory); // Optimize nt_batch size - if (!_devices_rank && _verbose > 1 && _nt_batch != 0) - ss << "Using user specified nt_batch value" << std::endl; - else - ss << "Optimizing nt_batch value to maximize performance" << std::endl; + if (!_devices_rank && _verbose > 1) { + if (_nt_batch != 0) + ss << "Using user specified nt_batch value" << std::endl; + else + ss << "Optimizing nt_batch value to maximize performance" << std::endl; + } optimize_ntbatch(available_memory, qpt_size, qkpt_size); // Recalculate qkpt_size with the (possibly) updated _nt_batch value qkpt_size = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); From 0a5e13594b1e27017b9a777f01b25304fdf0d31c Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Fri, 7 Nov 2025 16:23:14 -0500 Subject: [PATCH 7/9] copilot documentation suggestions and better handling of exceptions for very large jobs --- src/gw_gpu_kernel.cpp | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index fed3118..a2885d9 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -171,18 +171,25 @@ namespace green::gpu { // If user provided nt_batch size, use it directly if (_nt_batch != 0) return; // Estimate optimal nt_batch size - size_t mem_avail_for_qkpt = mem_avail * 0.8 - qpt_size; // leave 20% memory for other usages + mem_avail *= 0.8; // leave 20% memory for other usages + if (mem_avail < qkpt_size) + throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); + mem_avail -= qpt_size; // reserve space for qpt // qkpt_size = size_fix + size_per_t * nt_batch - size_t size_fix = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); + size_t size_fix = (!_sp) ? 
gw_qkpt::size(_nao, _NQ, _nts, 0, _ns) : gw_qkpt::size(_nao, _NQ, _nts, 0, _ns); size_t size_per_t = qkpt_size - size_fix; - // Optimize nt_batch - mem_avail_for_qkpt /= 2; // create at least 2 qkpt workers - if (mem_avail_for_qkpt < size_fix) { - _nt_batch = 1; // Set to minimum or handle error + // Preclude cases with very low available memory + size_t n_qkpt_max = mem_avail / qkpt_size; + if (n_qkpt_max == 0) + throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); + if (n_qkpt_max == 1) { + _nt_batch = 1; return; } - mem_avail_for_qkpt -= size_fix; - _nt_batch = std::min(static_cast(mem_avail_for_qkpt / size_per_t), static_cast(_nts)); + // Optimize nt_batch for n_streams >= 2 + mem_avail /= 2; // create at least 2 qkpt workers + mem_avail -= size_fix; // reserve fixed size for each qkpt worker + _nt_batch = std::min(static_cast(mem_avail / size_per_t), static_cast(_nts)); // If nt_batch is large and (nts - nt_batch) is small, then we might be better off with nt_batch = nts / 2 if (_nt_batch > _nts / 2 && _nts - _nt_batch < _nts / 4) { _nt_batch = _nts / 2; @@ -306,15 +313,16 @@ namespace green::gpu { ss << "can create: " << _nqkpt << " qkpts in parallel" << std::endl; } std::cout << ss.str(); + // NOTE: implement checks again in case nt_batch is specified by user if (_nqkpt == 0 && _nt_batch == 1) throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); if (_nqkpt == 0) - throw std::runtime_error("not enough memory to create qkpt. Please reduce nt_batch"); + throw std::runtime_error("Not enough memory to create qkpt. Please reduce nt_batch"); if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) { if (_nt_batch > 1) - std::cerr << "WARNING: ONLY ONE QKPT CREATED! Performance will be sub-optimal. Reduce nt_batch" << std::endl; + std::cerr << "WARNING: Only one qkpt created! Performance will be sub-optimal. 
Reduce nt_batch" << std::endl; else - std::cerr << "WARNING: Calculation is too large!! GPU can only afford qkpt. Expect the code to be slow" << std::endl; + std::cerr << "WARNING: Calculation is too large!! GPU can only afford one qkpt. Expect the code to be slow" << std::endl; } } From 7a7acfc94972f8ca522ce02fbaa0c9cf677ef6e3 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Fri, 7 Nov 2025 17:21:00 -0500 Subject: [PATCH 8/9] update explanation for nt_batch parameter for CLI --- src/green/gpu/gpu_factory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/green/gpu/gpu_factory.h b/src/green/gpu/gpu_factory.h index 81eae2b..5d106b2 100644 --- a/src/green/gpu/gpu_factory.h +++ b/src/green/gpu/gpu_factory.h @@ -98,7 +98,7 @@ namespace green::gpu { LinearSolverType::LU); p.define("cuda_low_gpu_memory", "GPU Device has small amount of memory"); p.define("cuda_low_cpu_memory", "Host has small amount of memory, we will read Coulomb integrals in chunks"); - p.define("nt_batch", "Size of tau batch in cuda GW solver (default: value will be determined to maximize performance)", 0); + p.define("nt_batch", "Size of tau batch in cuda GW solver; if set to 0; value will be determined to maximize performance", 0); } } // namespace green::mbpt From dc7809d63650bc4e982a84cda3428983d9cde04d Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Sat, 8 Nov 2025 10:24:01 -0500 Subject: [PATCH 9/9] clean up output statements --- src/gw_gpu_kernel.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index a2885d9..ca2dd34 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -294,19 +294,18 @@ namespace green::gpu { size_t total_memory; cudaMemGetInfo(&available_memory, &total_memory); // Optimize nt_batch size - if (!_devices_rank && _verbose > 1) { - if (_nt_batch != 0) - ss << "Using user specified nt_batch value" << std::endl; - else - ss << "Optimizing nt_batch value to 
maximize performance" << std::endl; - } optimize_ntbatch(available_memory, qpt_size, qkpt_size); // Recalculate qkpt_size with the (possibly) updated _nt_batch value qkpt_size = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); - if (!_devices_rank && _verbose > 1) ss << "size of tau batch: " << _nt_batch << std::endl; - if (!_devices_rank && _verbose > 1) ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; _nqkpt = std::min(std::min(size_t((available_memory * 0.8 - qpt_size) / qkpt_size), 16ul), _ink); + // Print memory info if (!_devices_rank && _verbose > 1) { + if (_nt_batch != 0) + ss << "Using user specified nt_batch value" << std::endl; + else + ss << "Optimized nt_batch value to maximize performance" << std::endl; + ss << "size of tau batch: " << _nt_batch << std::endl; + ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; ss << "size per qkpt: " << qkpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; ss << "available memory: " << available_memory / (1024 * 1024. * 1024.) << " GB " << " of total: " << total_memory / (1024 * 1024. * 1024.) << " GB. " << std::endl;