diff --git a/src/green/gpu/gpu_factory.h b/src/green/gpu/gpu_factory.h index 027b54c..5d106b2 100644 --- a/src/green/gpu/gpu_factory.h +++ b/src/green/gpu/gpu_factory.h @@ -98,7 +98,7 @@ namespace green::gpu { LinearSolverType::LU); p.define("cuda_low_gpu_memory", "GPU Device has small amount of memory"); p.define("cuda_low_cpu_memory", "Host has small amount of memory, we will read Coulomb integrals in chunks"); - p.define("nt_batch", "Size of tau batch in cuda GW solver", 1); + p.define("nt_batch", "Size of tau batch in cuda GW solver; if set to 0, value will be determined to maximize performance", 0); } } // namespace green::mbpt
diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index 2d96f80..6808e97 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -97,6 +97,15 @@ namespace green::gpu { */ void print_effective_flops(); + /** + * \brief optimize the tau batch size based on available memory on GPU + * + * \param mem_avail available memory on GPU + * \param qpt_size size per qpt object for nt_batch = 1 + * \param qkpt_size size per qkpt object for nt_batch = 1 + */ + void optimize_ntbatch(size_t mem_avail, size_t qpt_size, size_t qkpt_size); + double _beta; size_t _nts; size_t _nts_b;
diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index f59d512..ca2dd34 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -167,6 +167,35 @@ namespace green::gpu { } } + void gw_gpu_kernel::optimize_ntbatch(size_t mem_avail, size_t qpt_size, size_t qkpt_size) { + // If user provided nt_batch size, use it directly + if (_nt_batch != 0) return; + // Estimate optimal nt_batch size + mem_avail *= 0.8; // leave 20% memory for other usages + if (mem_avail < qpt_size + qkpt_size) + throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); + mem_avail -= qpt_size; // reserve space for qpt + // qkpt_size = size_fix + size_per_t * nt_batch + size_t size_fix = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, 0, _ns) : gw_qkpt::size(_nao, _NQ, _nts, 0, _ns); + size_t size_per_t = qkpt_size - size_fix; + // Preclude cases with very low available memory + size_t n_qkpt_max = mem_avail / qkpt_size; + if (n_qkpt_max == 0) + throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); + if (n_qkpt_max == 1) { + _nt_batch = 1; + return; + } + // Optimize nt_batch for n_streams >= 2 + mem_avail /= 2; // create at least 2 qkpt workers + mem_avail -= size_fix; // reserve fixed size for each qkpt worker + _nt_batch = std::min(static_cast<size_t>(mem_avail / size_per_t), static_cast<size_t>(_nts)); + // If nt_batch is large and (nts - nt_batch) is small, then we might be better off with nt_batch = nts / 2 + if (_nt_batch > _nts / 2 && _nts - _nt_batch < _nts / 4) { + _nt_batch = _nts / 2; + } + } + void scalar_gw_gpu_kernel::gw_innerloop(G_type& g, St_type& sigma_tau) { if (!_sp) { compute_gw_selfenergy(g, sigma_tau); @@ -256,24 +285,44 @@ std::stringstream ss; ss << std::setprecision(4) << std::boolalpha; if (!_devices_rank && _verbose > 1) ss << "Economical gpu memory mode: " << _low_device_memory << std::endl; - std::size_t qpt_size = (!_sp) ? gw_qpt::size(_nao, _NQ, _nts, _nw_b) : gw_qpt::size(_nao, _NQ, _nts, _nw_b); - std::size_t qkpt_size = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); - if (!_devices_rank && _verbose > 1) ss << "size of tau batch: " << _nt_batch << std::endl; - if (!_devices_rank && _verbose > 1) ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; - std::size_t available_memory; - std::size_t total_memory; + // Get size per qpt and qkpt for nt_batch = 1 + size_t nt_batch_initial = 1; + size_t qpt_size = (!_sp) ? 
gw_qpt::size(_nao, _NQ, _nts, _nw_b) : gw_qpt::size(_nao, _NQ, _nts, _nw_b); + size_t qkpt_size = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, nt_batch_initial, _ns) : gw_qkpt::size(_nao, _NQ, _nts, nt_batch_initial, _ns); + // Get available memory on GPU + size_t available_memory; + size_t total_memory; cudaMemGetInfo(&available_memory, &total_memory); + const bool nt_batch_user = (_nt_batch != 0); // remember the user's choice before it is optimized away + optimize_ntbatch(available_memory, qpt_size, qkpt_size); + // Recalculate qkpt_size with the (possibly) updated _nt_batch value + qkpt_size = (!_sp) ? gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns) : gw_qkpt::size(_nao, _NQ, _nts, _nt_batch, _ns); _nqkpt = std::min(std::min(size_t((available_memory * 0.8 - qpt_size) / qkpt_size), 16ul), _ink); + // Print memory info if (!_devices_rank && _verbose > 1) { + if (nt_batch_user) + ss << "Using user specified nt_batch value" << std::endl; + else + ss << "Optimized nt_batch value to maximize performance" << std::endl; + ss << "size of tau batch: " << _nt_batch << std::endl; + ss << "size per qpt: " << qpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; ss << "size per qkpt: " << qkpt_size / (1024 * 1024. * 1024.) << " GB " << std::endl; ss << "available memory: " << available_memory / (1024 * 1024. * 1024.) << " GB " << " of total: " << total_memory / (1024 * 1024. * 1024.) << " GB. " << std::endl; ss << "can create: " << _nqkpt << " qkpts in parallel" << std::endl; } std::cout << ss.str(); - if (_nqkpt == 0) throw std::runtime_error("not enough memory to create qkpt. Please reduce nt_batch"); - if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) - std::cerr << "WARNING: ONLY ONE QKPT CREATED. LIKELY CODE WILL BE SLOW. REDUCE NT_BATCH" << std::endl; + // NOTE: implement checks again in case nt_batch is specified by user + if (_nqkpt == 0 && _nt_batch == 1) + throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); + if (_nqkpt == 0) + throw std::runtime_error("Not enough memory to create qkpt. Please reduce nt_batch"); + if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) { + if (_nt_batch > 1) + std::cerr << "WARNING: Only one qkpt created! Performance will be sub-optimal. Reduce nt_batch" << std::endl; + else + std::cerr << "WARNING: Calculation is too large!! GPU can only afford one qkpt. Expect the code to be slow" << std::endl; + } } void scalar_gw_gpu_kernel::copy_Gk(const ztensor<5> &G_tskij_host, ztensor<4> &Gk_stij, int k, bool minus_t) {
diff --git a/test/cu_solver_test.cpp b/test/cu_solver_test.cpp index 69f1088..ba149ab 100644 --- a/test/cu_solver_test.cpp +++ b/test/cu_solver_test.cpp @@ -95,7 +95,7 @@ void solve_hf(const std::string& input, const std::string& int_hf, const std::st } -void solve_gw(const std::string& input, const std::string& int_f, const std::string& data, const std::string& lin, const std::string& mem, bool sp) { +void solve_gw(const std::string& input, const std::string& int_f, const std::string& data, const std::string& lin, const std::string& mem, bool sp, const std::string& nt_batch) { auto p = green::params::params("DESCR"); std::string input_file = TEST_PATH + input; std::string df_int_path = TEST_PATH + int_f; @@ -104,7 +104,7 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str std::string args = "test --restart 0 --itermax 1 --E_thr 1e-13 --mixing_type SIGMA_DAMPING --damping 0.8 --input_file=" + input_file + " --BETA 100 --grid_file=" + grid_file + " --dfintegral_file=" + df_int_path + " --verbose=5 " + - " --cuda_low_gpu_memory " + mem + " --cuda_low_cpu_memory " + mem + " --cuda_linear_solver=" + lin; + " --cuda_low_gpu_memory " + mem + " --cuda_low_cpu_memory " + mem + " --cuda_linear_solver=" + lin + " --nt_batch=" + nt_batch; green::grids::define_parameters(p); green::symmetry::define_parameters(p); green::gpu::custom_kernel_parameters(p); @@ -161,22 +161,43 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str TEST_CASE("GPU Solver") { SECTION("GW_LU") { - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true); + // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch); + // automatically optimize nt_batch + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true, "0"); + // set nt_batch = 1 + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "false", true, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "LU", "true", true, "1"); } SECTION("GW_Cholesky") { - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false); - solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true); + // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch); + // automatically optimize nt_batch + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false, "0"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true, "0"); + // set nt_batch = 1 + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "false", true, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", false, "1"); + solve_gw("/GW/input.h5", "/GW/df_int", "/GW/data.h5", "Cholesky", "true", true, "1"); } SECTION("GW_X2C") { - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false); - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true); - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false); - solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true); + // solve_gw(input_file, df_int_path, test_file, linear_solver, low_mem, sp_precision, nt_batch); + // automatically optimize nt_batch + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false, "0"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true, "0"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false, "0"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true, "0"); + // set nt_batch = 1 + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", false, "1"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "false", true, "1"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", false, "1"); + solve_gw("/GW_X2C/input.h5", "/GW_X2C/df_hf_int", "/GW_X2C/data.h5", "LU", "true", true, "1"); } SECTION("HF") {