From 347e568b16fadbbcb6e4c07bdc3fdbeea713cf11 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Wed, 22 Jan 2020 00:20:45 -0500 Subject: [PATCH 01/14] first commit to revamp kernarg pool --- lib/hsa/mcwamp_hsa.cpp | 122 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 118 insertions(+), 4 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 49cd274a4a6..2dd033c1048 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -1058,8 +1058,11 @@ class HSADispatch : public HSAOp { std::vector arg_vec; uint32_t arg_count; size_t prevArgVecCapacity; +#if 0 void* kernargMemory; int kernargMemoryIndex; +#endif + std::pair kernargMemory = {nullptr, 0}; hsa_kernel_dispatch_packet_t aql; hsa_wait_state_t waitMode; @@ -3124,6 +3127,97 @@ class HSADevice final : public KalmarDevice return cpu_accessible_am; }; + + class KernargBufferPools { + // std::tuple + using BufferPool = std::tuple, std::vector>; + public: + KernargBufferPools(HSADevice& device, hsa_amd_memory_pool_t rocr_mem_pool) + : device(device), rocr_mem_pool(rocr_mem_pool) { + } + std::pair getKernargBuffer(const size_t size) { + std::lock_guard l{lock}; + std::call_once(init_flag, [this]() { this->grow_pool(512, 1024); }); + for (auto& p : pools) { + if (std::get<0>(p) >= size) { + if (std::get<1>(p).empty()) { + if (std::get<2>(p).empty()) { + // FIXME: replenish the pool + throw Kalmar::runtime_exception("Kernarg pool is empty.", -1); + } + std::get<1>(p).swap(std::get<2>(p)); + } + auto r = std::make_pair(std::get<1>(p).back(), std::get<0>(p)); + std::get<1>(p).pop_back(); + return r; + } + } + + // FIXME: allocate a larger buffer + throw Kalmar::runtime_exception("Can't find suitable kernarg buffer.", -1); + } + + void releaseKernargBuffer(const std::pair& b) { + if (b.first == nullptr) return; + std::lock_guard l{lock}; + for (auto& p : pools) { + if (std::get<0>(p) == b.second) { + std::get<2>(p).push_back(b.first); + return; + } + } + throw Kalmar::runtime_exception("Error when releasing kernarg buffer.", -1); + } + + private: + + void grow_pool(size_t buffer_size, size_t num_buffers) { + auto p = pools.begin(); + for (; p != pools.end(); ++p) { + if (std::get<0>(*p) == buffer_size) + break; + } + if (p == pools.end()) { + pools.emplace_back(std::make_tuple(buffer_size, std::vector(0), std::vector(0))); + p = pools.end() - 1; + } + std::get<1>(*p).reserve(std::get<1>(*p).capacity() + num_buffers); + std::get<2>(*p).reserve(std::get<2>(*p).capacity() + num_buffers); + + char* rocr_alloc = nullptr; + hsa_status_t status; + status = hsa_amd_memory_pool_allocate(rocr_mem_pool, buffer_size * num_buffers, 0, reinterpret_cast(&rocr_alloc)); + STATUS_CHECK(status, __LINE__); + + status = hsa_amd_agents_allow_access(1, &device.agent, NULL, rocr_alloc); + STATUS_CHECK(status, __LINE__); + + for (int i = 0; i < num_buffers; i++) { + std::get<1>(*p).push_back(rocr_alloc + i * buffer_size); + } + rocr_allocs.push_back(rocr_alloc); + } + + HSADevice& device; + hsa_amd_memory_pool_t rocr_mem_pool; + std::mutex lock; + std::once_flag init_flag; + std::vector pools; + std::vector rocr_allocs; + }; + + std::shared_ptr kernargBufferPools; + + std::pair getKernargBuffer(size_t size) { + return kernargBufferPools->getKernargBuffer(size); + } + + void releaseKernargBuffer(const std::pair& b) { + kernargBufferPools->releaseKernargBuffer(b); + } + +#if 0 + void releaseKernargBuffer(void* kernargBuffer, int kernargBufferIndex) { if ( (KERNARG_POOL_SIZE > 0) && (kernargBufferIndex >= 0) ) { kernargPoolMutex.lock(); @@ -3268,6 +3362,9 @@ class HSADevice final : public KalmarDevice return std::make_pair(ret, cursor); } +#endif + + void* getSymbolAddress(const char* symbolName) override { hsa_status_t status; @@ -4052,11 +4149,15 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : } useCoarseGrainedRegion = result; + kernargBufferPools = std::make_shared(*this, getHSAKernargRegion()); + +#if 0 /// pre-allocate a pool of kernarg buffers in case: /// - kernarg region is available /// - compile-time macro KERNARG_POOL_SIZE is larger than 0 #if KERNARG_POOL_SIZE > 0 growKernargBuffer(); +#endif #endif // Setup AM pool. @@ -4546,8 +4647,7 @@ HSADispatch::HSADispatch(Kalmar::HSADevice* _device, Kalmar::KalmarQueue *queue, HSAOp(hc::HSA_OP_ID_DISPATCH, queue, Kalmar::hcCommandKernel), device(_device), kernel(_kernel), - waitMode(HSA_WAIT_STATE_BLOCKED), - kernargMemory(nullptr) + waitMode(HSA_WAIT_STATE_BLOCKED) { if (aql) { this->aql = *aql; @@ -4683,6 +4783,9 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg //printf("hostKernargSize size: %d in bytesn", hostKernargSize); if (hostKernargSize > 0) { + + +#if 0 hsa_amd_memory_pool_t kernarg_region = device->getHSAKernargRegion(); std::pair ret = device->getKernargBuffer(hostKernargSize); kernargMemory = ret.first; @@ -4691,8 +4794,14 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg // as kernarg buffers are fine-grained, we can directly use memcpy memcpy(kernargMemory, hostKernarg, hostKernargSize); - aql.kernarg_address = kernargMemory; +#endif + + + kernargMemory = device->getKernargBuffer(hostKernargSize); + memcpy(kernargMemory.first, hostKernarg, hostKernargSize); + + aql.kernarg_address = kernargMemory.first; } else { aql.kernarg_address = nullptr; } @@ -4838,12 +4947,17 @@ HSADispatch::dispose() { // clear reference counts for signal-less ops. asyncOpsWithoutSignal.clear(); +#if 0 if (kernargMemory != nullptr) { //std::cerr << "op#" << getSeqNum() << " releasing kernal arg buffer index=" << kernargMemoryIndex<< "\n"; device->releaseKernargBuffer(kernargMemory, kernargMemoryIndex); kernargMemory = nullptr; } - +#endif + if (kernargMemory.first) { + device->releaseKernargBuffer(kernargMemory); + kernargMemory = {nullptr, 0}; + } clearArgs(); std::vector().swap(arg_vec); From c5d20d5a7fed81f6477b316fb8e438b834ef733d Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Tue, 4 Feb 2020 18:55:08 -0500 Subject: [PATCH 02/14] Support multiple kernarg pools of different buffer sizes, minor cleanup --- lib/hsa/mcwamp_hsa.cpp | 72 ++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 2dd033c1048..bdbae820be5 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -3131,29 +3131,45 @@ class HSADevice final : public KalmarDevice class KernargBufferPools { // std::tuple using BufferPool = std::tuple, std::vector>; + enum { + _buffer_size = 0, + _free_pool = 1, + _released_pool = 2 + }; public: KernargBufferPools(HSADevice& device, hsa_amd_memory_pool_t rocr_mem_pool) : device(device), rocr_mem_pool(rocr_mem_pool) { + // support kernarg buffer size up to 4k + pools.emplace_back(std::make_tuple( 512, std::vector(0), std::vector(0))); + pools.emplace_back(std::make_tuple(1024, std::vector(0), std::vector(0))); + pools.emplace_back(std::make_tuple(2048, std::vector(0), std::vector(0))); + pools.emplace_back(std::make_tuple(4096, std::vector(0), std::vector(0))); + } + ~KernargBufferPools() { + for (const auto b : rocr_allocs) { + hsa_amd_memory_pool_free(b); + } } std::pair getKernargBuffer(const size_t size) { std::lock_guard l{lock}; - std::call_once(init_flag, [this]() { this->grow_pool(512, 1024); }); for (auto& p : pools) { - if (std::get<0>(p) >= size) { - if (std::get<1>(p).empty()) { - if (std::get<2>(p).empty()) { - // FIXME: replenish the pool - throw Kalmar::runtime_exception("Kernarg pool is empty.", -1); + if (std::get<_buffer_size>(p) >= size) { + if (std::get<_free_pool>(p).empty()) { + constexpr float grow_threshold = 0.2f; + if (std::get<_released_pool>(p).size() <= + static_cast(grow_threshold * std::get<_free_pool>(p).capacity())) { + constexpr size_t grow_mem_bytes = (1024 * 1024); + grow(p, grow_mem_bytes); + } + else { + std::get<_free_pool>(p).swap(std::get<_released_pool>(p)); } - std::get<1>(p).swap(std::get<2>(p)); } - auto r = std::make_pair(std::get<1>(p).back(), std::get<0>(p)); - std::get<1>(p).pop_back(); + auto r = std::make_pair(std::get<_free_pool>(p).back(), std::get<_buffer_size>(p)); + std::get<_free_pool>(p).pop_back(); return r; } } - - // FIXME: allocate a larger buffer throw Kalmar::runtime_exception("Can't find suitable kernarg buffer.", -1); } @@ -3161,8 +3177,8 @@ class HSADevice final : public KalmarDevice if (b.first == nullptr) return; std::lock_guard l{lock}; for (auto& p : pools) { - if (std::get<0>(p) == b.second) { - std::get<2>(p).push_back(b.first); + if (std::get<_buffer_size>(p) == b.second) { + std::get<_released_pool>(p).push_back(b.first); return; } } @@ -3171,18 +3187,18 @@ class HSADevice final : public KalmarDevice private: - void grow_pool(size_t buffer_size, size_t num_buffers) { - auto p = pools.begin(); - for (; p != pools.end(); ++p) { - if (std::get<0>(*p) == buffer_size) - break; - } - if (p == pools.end()) { - pools.emplace_back(std::make_tuple(buffer_size, std::vector(0), std::vector(0))); - p = pools.end() - 1; + void grow(BufferPool& p, size_t mem_size) { + + const auto buffer_size = std::get<_buffer_size>(p); + + if (mem_size < buffer_size) { + throw Kalmar::runtime_exception("Error when growing the kernarg buffer pool", -1); } - std::get<1>(*p).reserve(std::get<1>(*p).capacity() + num_buffers); - std::get<2>(*p).reserve(std::get<2>(*p).capacity() + num_buffers); + const auto num_buffers = mem_size / buffer_size; + const auto new_capacity = std::get<_free_pool>(p).capacity() + num_buffers; + + std::get<_free_pool>(p).reserve(new_capacity); + std::get<_released_pool>(p).reserve(new_capacity); char* rocr_alloc = nullptr; hsa_status_t status; @@ -3192,12 +3208,14 @@ class HSADevice final : public KalmarDevice status = hsa_amd_agents_allow_access(1, &device.agent, NULL, rocr_alloc); STATUS_CHECK(status, __LINE__); - for (int i = 0; i < num_buffers; i++) { - std::get<1>(*p).push_back(rocr_alloc + i * buffer_size); - } rocr_allocs.push_back(rocr_alloc); + auto& fp = std::get<_free_pool>(p); + for (int i = 0; i < num_buffers; i++, rocr_alloc+=buffer_size) { + fp.push_back(rocr_alloc); + } } + HSADevice& device; hsa_amd_memory_pool_t rocr_mem_pool; std::mutex lock; From 87da59fae83b3222c6ae3b1e50c5318388e886a8 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Wed, 5 Feb 2020 15:58:20 -0500 Subject: [PATCH 03/14] Add a knob to switch to the legacy kernarg pool management --- lib/hsa/mcwamp_hsa.cpp | 70 +++++++++++++----------------------------- tests/lit.cfg | 3 ++ 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index bdbae820be5..eccbc9981c1 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -134,10 +134,10 @@ int HCC_FLUSH_ON_WAIT=1; #define HCC_PROFILE_VERBOSE_BARRIER (1 << 4) // 0x10 int HCC_PROFILE_VERBOSE=0x1F; - - char * HCC_PROFILE_FILE=nullptr; +int HCC_NEW_KERNARG_MANAGER=1; + // Profiler: // Use str::stream so output is atomic wrt other threads: #define LOG_PROFILE(op, start, end, type, tag, msg) \ @@ -1058,10 +1058,6 @@ class HSADispatch : public HSAOp { std::vector arg_vec; uint32_t arg_count; size_t prevArgVecCapacity; -#if 0 - void* kernargMemory; - int kernargMemoryIndex; -#endif std::pair kernargMemory = {nullptr, 0}; hsa_kernel_dispatch_packet_t aql; @@ -3226,16 +3222,15 @@ class HSADevice final : public KalmarDevice std::shared_ptr kernargBufferPools; - std::pair getKernargBuffer(size_t size) { - return kernargBufferPools->getKernargBuffer(size); - } - void releaseKernargBuffer(const std::pair& b) { - kernargBufferPools->releaseKernargBuffer(b); + if (HCC_NEW_KERNARG_MANAGER) { + return kernargBufferPools->releaseKernargBuffer(b); + } + else if (b.first) { + return releaseKernargBuffer(b.first, b.second); + } } -#if 0 - void releaseKernargBuffer(void* kernargBuffer, int kernargBufferIndex) { if ( (KERNARG_POOL_SIZE > 0) && (kernargBufferIndex >= 0) ) { kernargPoolMutex.lock(); @@ -3270,7 +3265,12 @@ class HSADevice final : public KalmarDevice }; } - std::pair getKernargBuffer(int size) { + std::pair getKernargBuffer(size_t size) { + + if (HCC_NEW_KERNARG_MANAGER) { + return kernargBufferPools->getKernargBuffer(size); + } + void* ret = nullptr; int cursor = 0; @@ -3380,9 +3380,6 @@ class HSADevice final : public KalmarDevice return std::make_pair(ret, cursor); } -#endif - - void* getSymbolAddress(const char* symbolName) override { hsa_status_t status; @@ -4072,6 +4069,7 @@ void HSAContext::ReadHccEnv() GET_ENV_STRING (HCC_PROFILE_FILE, "Set file name for HCC_PROFILE mode. Default=stderr"); GET_ENV_INT (HCC_FLUSH_ON_WAIT, "recover all resources on queue wait"); + GET_ENV_INT (HCC_NEW_KERNARG_MANAGER, "Enable the new kernarg pool manager. Default=1"); }; @@ -4169,14 +4167,14 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : kernargBufferPools = std::make_shared(*this, getHSAKernargRegion()); -#if 0 - /// pre-allocate a pool of kernarg buffers in case: - /// - kernarg region is available - /// - compile-time macro KERNARG_POOL_SIZE is larger than 0 -#if KERNARG_POOL_SIZE > 0 - growKernargBuffer(); -#endif + if (!HCC_NEW_KERNARG_MANAGER) { + /// pre-allocate a pool of kernarg buffers in case: + /// - kernarg region is available + /// - compile-time macro KERNARG_POOL_SIZE is larger than 0 +#if KERNARG_POOL_SIZE + growKernargBuffer(); #endif + } // Setup AM pool. ri._am_memory_pool = (ri._found_local_memory_pool) @@ -4801,30 +4799,13 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg //printf("hostKernargSize size: %d in bytesn", hostKernargSize); if (hostKernargSize > 0) { - - -#if 0 - hsa_amd_memory_pool_t kernarg_region = device->getHSAKernargRegion(); - std::pair ret = device->getKernargBuffer(hostKernargSize); - kernargMemory = ret.first; - kernargMemoryIndex = ret.second; - //std::cerr << "op #" << getSeqNum() << " allocated kernarg cursor=" << kernargMemoryIndex << "\n"; - - // as kernarg buffers are fine-grained, we can directly use memcpy - memcpy(kernargMemory, hostKernarg, hostKernargSize); - aql.kernarg_address = kernargMemory; -#endif - - kernargMemory = device->getKernargBuffer(hostKernargSize); memcpy(kernargMemory.first, hostKernarg, hostKernargSize); - aql.kernarg_address = kernargMemory.first; } else { aql.kernarg_address = nullptr; } - // write packet uint32_t queueMask = lockedHsaQueue->size - 1; // TODO: Need to check if package write is correct. @@ -4965,13 +4946,6 @@ HSADispatch::dispose() { // clear reference counts for signal-less ops. asyncOpsWithoutSignal.clear(); -#if 0 - if (kernargMemory != nullptr) { - //std::cerr << "op#" << getSeqNum() << " releasing kernal arg buffer index=" << kernargMemoryIndex<< "\n"; - device->releaseKernargBuffer(kernargMemory, kernargMemoryIndex); - kernargMemory = nullptr; - } -#endif if (kernargMemory.first) { device->releaseKernargBuffer(kernargMemory); kernargMemory = {nullptr, 0}; diff --git a/tests/lit.cfg b/tests/lit.cfg index 596e700c4d6..974836d8970 100644 --- a/tests/lit.cfg +++ b/tests/lit.cfg @@ -61,6 +61,9 @@ if os.environ.get('AMDGPU_OBJ_CODEGEN'): if os.environ.get('HCC_EXTRA_GPU_ARCH'): config.environment['HCC_EXTRA_GPU_ARCH'] = os.environ['HCC_EXTRA_GPU_ARCH'] +if os.environ.get('HCC_NEW_KERNARG_MANAGER'): + config.environment['HCC_NEW_KERNARG_MANAGER'] = os.environ['HCC_NEW_KERNARG_MANAGER'] + # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) From 4bf79a30363a596f91e8ee2b8fe60bd3e6386731 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Wed, 5 Feb 2020 16:44:16 -0500 Subject: [PATCH 04/14] Put the released kernarg buffers back to the free pool --- lib/hsa/mcwamp_hsa.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index eccbc9981c1..c450fe3a614 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -3150,19 +3150,23 @@ class HSADevice final : public KalmarDevice std::lock_guard l{lock}; for (auto& p : pools) { if (std::get<_buffer_size>(p) >= size) { - if (std::get<_free_pool>(p).empty()) { + auto& fp = std::get<_free_pool>(p); + auto& rp = std::get<_released_pool>(p); + if (fp.empty()) { constexpr float grow_threshold = 0.2f; - if (std::get<_released_pool>(p).size() <= - static_cast(grow_threshold * std::get<_free_pool>(p).capacity())) { + if (rp.size() <= + static_cast(grow_threshold * fp.capacity())) { constexpr size_t grow_mem_bytes = (1024 * 1024); grow(p, grow_mem_bytes); + fp.insert(fp.cend(), rp.begin(), rp.end()); + rp.clear(); } else { - std::get<_free_pool>(p).swap(std::get<_released_pool>(p)); + fp.swap(rp); } } - auto r = std::make_pair(std::get<_free_pool>(p).back(), std::get<_buffer_size>(p)); - std::get<_free_pool>(p).pop_back(); + auto r = std::make_pair(fp.back(), std::get<_buffer_size>(p)); + fp.pop_back(); return r; } } From 59d0fd4d02ad0bfbad2debf5e3aff33c6aff04e9 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Fri, 7 Feb 2020 18:13:30 -0500 Subject: [PATCH 05/14] Add an env var to force kernarg to allocate from coarse grained memory --- lib/hsa/mcwamp_hsa.cpp | 16 +++++++--------- tests/lit.cfg | 3 +++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index c450fe3a614..da3e714a899 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -137,6 +137,7 @@ int HCC_PROFILE_VERBOSE=0x1F; char * HCC_PROFILE_FILE=nullptr; int HCC_NEW_KERNARG_MANAGER=1; +int HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED=0; // Profiler: // Use str::stream so output is atomic wrt other threads: @@ -2545,24 +2546,20 @@ class HSADevice final : public KalmarDevice DBOUT(DB_INIT, "found coarse-grain system memory pool=" << region.handle << " size(MB) = " << size << std::endl); ri->_coarsegrained_system_memory_pool = region; ri->_found_coarsegrained_system_memory_pool = true; + if (HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED) { + ri->_kernarg_memory_pool = region; + ri->_found_kernarg_memory_pool = true; + } } // choose coarse grained system for kernarg, if not available, fall back to fine grained system. if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { - if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { - DBOUT(DB_INIT, "using coarse grained system for kernarg memory, size(MB) = " << size << std::endl); - ri->_kernarg_memory_pool = region; - ri->_found_kernarg_memory_pool = true; - } - else if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED + if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED && ri->_found_kernarg_memory_pool == false) { DBOUT(DB_INIT, "using fine grained system for kernarg memory, size(MB) = " << size << std::endl); ri->_kernarg_memory_pool = region; ri->_found_kernarg_memory_pool = true; } - else { - DBOUT(DB_INIT, "Unknown memory pool with kernarg_init flag set!!!, size(MB) = " << size << std::endl); - } } return HSA_STATUS_SUCCESS; @@ -4074,6 +4071,7 @@ void HSAContext::ReadHccEnv() GET_ENV_INT (HCC_FLUSH_ON_WAIT, "recover all resources on queue wait"); GET_ENV_INT (HCC_NEW_KERNARG_MANAGER, "Enable the new kernarg pool manager. Default=1"); + GET_ENV_INT (HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED, "Use coarse grained memory for kernarg. Default=0"); }; diff --git a/tests/lit.cfg b/tests/lit.cfg index 974836d8970..48c7e3ce830 100644 --- a/tests/lit.cfg +++ b/tests/lit.cfg @@ -64,6 +64,9 @@ if os.environ.get('HCC_EXTRA_GPU_ARCH'): if os.environ.get('HCC_NEW_KERNARG_MANAGER'): config.environment['HCC_NEW_KERNARG_MANAGER'] = os.environ['HCC_NEW_KERNARG_MANAGER'] +if os.environ.get('HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED'): + config.environment['HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED'] = os.environ['HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED'] + # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) From 3a4dddc044a79fc87d98dde26ca29e5a9109057b Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Tue, 18 Feb 2020 15:23:32 -0500 Subject: [PATCH 06/14] Several updates to new kernarg allocation strategy - fix check for host access to gpu coarse grained memory - add counter for signaling kernarg buffers recycling - rename env var to control kernarg memory allocation --- lib/hsa/mcwamp_hsa.cpp | 70 +++++++++++++++++++++++++++++++----------- tests/lit.cfg | 8 ++--- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index da3e714a899..f4c68f0ddec 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -136,8 +136,8 @@ int HCC_PROFILE_VERBOSE=0x1F; char * HCC_PROFILE_FILE=nullptr; -int HCC_NEW_KERNARG_MANAGER=1; -int HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED=0; +int HCC_KERNARG_MANAGER=1; +int HCC_KERNARG_MANAGER_COARSE_GRAINED=0; // Profiler: // Use str::stream so output is atomic wrt other threads: @@ -2546,10 +2546,6 @@ class HSADevice final : public KalmarDevice DBOUT(DB_INIT, "found coarse-grain system memory pool=" << region.handle << " size(MB) = " << size << std::endl); ri->_coarsegrained_system_memory_pool = region; ri->_found_coarsegrained_system_memory_pool = true; - if (HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED) { - ri->_kernarg_memory_pool = region; - ri->_found_kernarg_memory_pool = true; - } } // choose coarse grained system for kernarg, if not available, fall back to fine grained system. @@ -2602,13 +2598,13 @@ class HSADevice final : public KalmarDevice // Returns true if specified agent has access to the specified pool. // Typically used to detect when a CPU agent has access to GPU device memory via large-bar: - int hasAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool) + bool hasAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool) { hsa_status_t err; hsa_amd_memory_pool_access_t access; err = hsa_amd_agent_memory_pool_get_info(agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); STATUS_CHECK(err, __LINE__); - return access; + return (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED); } @@ -3130,13 +3126,19 @@ class HSADevice final : public KalmarDevice _released_pool = 2 }; public: - KernargBufferPools(HSADevice& device, hsa_amd_memory_pool_t rocr_mem_pool) - : device(device), rocr_mem_pool(rocr_mem_pool) { + KernargBufferPools(HSADevice& device) : device(device) { + rocr_mem_pool = device.ri._kernarg_memory_pool; + mem_pool_is_coarse_grained = device.ri._kernarg_memory_pool.handle == + device.ri._local_memory_pool.handle; // support kernarg buffer size up to 4k pools.emplace_back(std::make_tuple( 512, std::vector(0), std::vector(0))); pools.emplace_back(std::make_tuple(1024, std::vector(0), std::vector(0))); pools.emplace_back(std::make_tuple(2048, std::vector(0), std::vector(0))); pools.emplace_back(std::make_tuple(4096, std::vector(0), std::vector(0))); + + if (mem_pool_is_coarse_grained) { + sync_id++; + } } ~KernargBufferPools() { for (const auto b : rocr_allocs) { @@ -3159,7 +3161,10 @@ class HSADevice final : public KalmarDevice rp.clear(); } else { - fp.swap(rp); + fp.swap(rp); + } + if (mem_pool_is_coarse_grained) { + sync_id++; } } auto r = std::make_pair(fp.back(), std::get<_buffer_size>(p)); @@ -3182,6 +3187,14 @@ class HSADevice final : public KalmarDevice throw Kalmar::runtime_exception("Error when releasing kernarg buffer.", -1); } + uint32_t getSyncID() { + return sync_id; + } + + bool isKernargCoarseGrained() { + return mem_pool_is_coarse_grained; + } + private: void grow(BufferPool& p, size_t mem_size) { @@ -3215,16 +3228,25 @@ class HSADevice final : public KalmarDevice HSADevice& device; hsa_amd_memory_pool_t rocr_mem_pool; + bool mem_pool_is_coarse_grained; + uint32_t sync_id = 0; std::mutex lock; std::once_flag init_flag; std::vector pools; std::vector rocr_allocs; }; - + friend class KernargBufferPools; std::shared_ptr kernargBufferPools; + bool isKernargCoarseGrained() { + if (HCC_KERNARG_MANAGER) { + return kernargBufferPools->isKernargCoarseGrained(); + } + return false; + } + void releaseKernargBuffer(const std::pair& b) { - if (HCC_NEW_KERNARG_MANAGER) { + if (HCC_KERNARG_MANAGER) { return kernargBufferPools->releaseKernargBuffer(b); } else if (b.first) { @@ -3268,7 +3290,7 @@ class HSADevice final : public KalmarDevice std::pair getKernargBuffer(size_t size) { - if (HCC_NEW_KERNARG_MANAGER) { + if (HCC_KERNARG_MANAGER) { return kernargBufferPools->getKernargBuffer(size); } @@ -4070,8 +4092,8 @@ void HSAContext::ReadHccEnv() GET_ENV_STRING (HCC_PROFILE_FILE, "Set file name for HCC_PROFILE mode. Default=stderr"); GET_ENV_INT (HCC_FLUSH_ON_WAIT, "recover all resources on queue wait"); - GET_ENV_INT (HCC_NEW_KERNARG_MANAGER, "Enable the new kernarg pool manager. Default=1"); - GET_ENV_INT (HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED, "Use coarse grained memory for kernarg. Default=0"); + GET_ENV_INT (HCC_KERNARG_MANAGER, "Enable the new kernarg pool manager. Default=1"); + GET_ENV_INT (HCC_KERNARG_MANAGER_COARSE_GRAINED, "Use coarse grained memory for kernarg. Default=0"); }; @@ -4149,6 +4171,15 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : status = hsa_amd_agent_iterate_memory_pools(agent, &HSADevice::get_memory_pools, &ri); STATUS_CHECK(status, __LINE__); + if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED) { + if (ri._found_local_memory_pool && + hasAccess(getHostAgent(), ri._local_memory_pool)) { + DBOUT(DB_INIT, "using coarse-grained GPU local memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl); + ri._kernarg_memory_pool = ri._local_memory_pool; + ri._found_kernarg_memory_pool = true; + } + } + status = hsa_amd_agent_iterate_memory_pools(hostAgent, HSADevice::get_host_pools, &ri); STATUS_CHECK(status, __LINE__); @@ -4167,9 +4198,9 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : } useCoarseGrainedRegion = result; - kernargBufferPools = std::make_shared(*this, getHSAKernargRegion()); + kernargBufferPools = std::make_shared(*this); - if (!HCC_NEW_KERNARG_MANAGER) { + if (!HCC_KERNARG_MANAGER) { /// pre-allocate a pool of kernarg buffers in case: /// - kernarg region is available /// - compile-time macro KERNARG_POOL_SIZE is larger than 0 @@ -4803,6 +4834,9 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg if (hostKernargSize > 0) { kernargMemory = device->getKernargBuffer(hostKernargSize); memcpy(kernargMemory.first, hostKernarg, hostKernargSize); + if (device->isKernargCoarseGrained()) { + // TODO: flush HDP + } aql.kernarg_address = kernargMemory.first; } else { aql.kernarg_address = nullptr; diff --git a/tests/lit.cfg b/tests/lit.cfg index 48c7e3ce830..cbdbc1b1e1d 100644 --- a/tests/lit.cfg +++ b/tests/lit.cfg @@ -61,11 +61,11 @@ if os.environ.get('AMDGPU_OBJ_CODEGEN'): if os.environ.get('HCC_EXTRA_GPU_ARCH'): config.environment['HCC_EXTRA_GPU_ARCH'] = os.environ['HCC_EXTRA_GPU_ARCH'] -if os.environ.get('HCC_NEW_KERNARG_MANAGER'): - config.environment['HCC_NEW_KERNARG_MANAGER'] = os.environ['HCC_NEW_KERNARG_MANAGER'] +if os.environ.get('HCC_KERNARG_MANAGER'): + config.environment['HCC_KERNARG_MANAGER'] = os.environ['HCC_KERNARG_MANAGER'] -if os.environ.get('HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED'): - config.environment['HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED'] = os.environ['HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED'] +if os.environ.get('HCC_KERNARG_MANAGER_COARSE_GRAINED'): + config.environment['HCC_KERNARG_MANAGER_COARSE_GRAINED'] = os.environ['HCC_KERNARG_MANAGER_COARSE_GRAINED'] # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) From 3e0d6bc7eb083c269d1d80f8ae51a4cdf73f8786 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Tue, 18 Feb 2020 20:50:26 -0500 Subject: [PATCH 07/14] Flush HDP after writing to kernarg if the memory is gpu coarse grained --- lib/hsa/mcwamp_hsa.cpp | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index f4c68f0ddec..91a3dd2c1e6 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -2270,6 +2270,11 @@ class HSADevice final : public KalmarDevice std::map programs; hsa_agent_t agent; + + // Information related to executing an HDP flush + hsa_amd_hdp_flush_t hdp; + bool has_hdp_access; + size_t max_tile_static_size; size_t queue_size; @@ -3404,6 +3409,12 @@ class HSADevice final : public KalmarDevice return std::make_pair(ret, cursor); } + void hdp_mem_flush() { + if (!has_hdp_access) + throw Kalmar::runtime_exception("HDP flush error", 0); + __atomic_store_n(reinterpret_cast(hdp.HDP_MEM_FLUSH_CNTL), 0x1, __ATOMIC_SEQ_CST); + } + void* getSymbolAddress(const char* symbolName) override { hsa_status_t status; @@ -4171,7 +4182,13 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : status = hsa_amd_agent_iterate_memory_pools(agent, &HSADevice::get_memory_pools, &ri); STATUS_CHECK(status, __LINE__); - if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED) { + hdp.HDP_MEM_FLUSH_CNTL = nullptr; + hdp.HDP_REG_FLUSH_CNTL = nullptr; + status = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdp); + has_hdp_access = (status == HSA_STATUS_SUCCESS && + hdp.HDP_MEM_FLUSH_CNTL != nullptr); + + if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED && has_hdp_access) { if (ri._found_local_memory_pool && hasAccess(getHostAgent(), ri._local_memory_pool)) { DBOUT(DB_INIT, "using coarse-grained GPU local memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl); @@ -4835,7 +4852,9 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg kernargMemory = device->getKernargBuffer(hostKernargSize); memcpy(kernargMemory.first, hostKernarg, hostKernargSize); if (device->isKernargCoarseGrained()) { - // TODO: flush HDP + // If kernarg is in GPU coarse grained memory, flush the HDP + // content visible to the GPU + device->hdp_mem_flush(); } aql.kernarg_address = kernargMemory.first; } else { From 207935b19ab3cbae779b3653896af42bdb551b3d Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Thu, 20 Feb 2020 00:31:42 -0500 Subject: [PATCH 08/14] If kernarg is in coarse grained GPU memory, add an sys scope acq fence after recycling old kernarg buffers such that the GPU cache is flushed --- lib/hsa/mcwamp_hsa.cpp | 70 +++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 91a3dd2c1e6..eb7463e1822 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1059,7 +1060,7 @@ class HSADispatch : public HSAOp { std::vector arg_vec; uint32_t arg_count; size_t prevArgVecCapacity; - std::pair kernargMemory = {nullptr, 0}; + std::tuple kernargMemory{nullptr, 0, 0}; hsa_kernel_dispatch_packet_t aql; hsa_wait_state_t waitMode; @@ -1356,6 +1357,10 @@ class HSAQueue final : public KalmarQueue // indicate whether this is a cooperative queue bool is_cooperative; + // For kernarg buffer in coarse grained GPU memory only. + // Indicates when was the last synchronization point with the kernarg pool. + uint32_t last_kernarg_sync_id; + public: HSAQueue(KalmarDevice* pDev, hsa_agent_t agent, execute_order order, queue_priority priority, bool cooperative = false) ; @@ -2175,6 +2180,8 @@ class HSAQueue final : public KalmarQueue bool copy2d_ext(const void *src, void *dst, size_t width, size_t height, size_t srcPitch, size_t dstPitch, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, const Kalmar::KalmarDevice *copyDevice, bool forceUnpinnedCopy); + uint32_t get_last_kernarg_sync_id() { return last_kernarg_sync_id; } + void set_last_kernarg_sync_id(uint32_t sync_id) { last_kernarg_sync_id = sync_id; } }; RocrQueue::RocrQueue(hsa_agent_t agent, size_t queue_size, HSAQueue *hccQueue, queue_priority priority) @@ -3150,7 +3157,7 @@ class HSADevice final : public KalmarDevice hsa_amd_memory_pool_free(b); } } - std::pair getKernargBuffer(const size_t size) { + std::tuple getKernargBuffer(const size_t size) { std::lock_guard l{lock}; for (auto& p : pools) { if (std::get<_buffer_size>(p) >= size) { @@ -3172,7 +3179,7 @@ class HSADevice final : public KalmarDevice sync_id++; } } - auto r = std::make_pair(fp.back(), std::get<_buffer_size>(p)); + auto r = std::make_tuple(fp.back(), std::get<_buffer_size>(p), sync_id); fp.pop_back(); return r; } @@ -3180,12 +3187,14 @@ class HSADevice final : public KalmarDevice throw Kalmar::runtime_exception("Can't find suitable kernarg buffer.", -1); } - void releaseKernargBuffer(const std::pair& b) { - if (b.first == nullptr) return; + void releaseKernargBuffer(const std::tuple& b) { + const auto kernarg_ptr = std::get<0>(b); + const auto kernarg_size = std::get<1>(b); + if (kernarg_ptr == nullptr) return; std::lock_guard l{lock}; for (auto& p : pools) { - if (std::get<_buffer_size>(p) == b.second) { - std::get<_released_pool>(p).push_back(b.first); + if (std::get<_buffer_size>(p) == kernarg_size) { + std::get<_released_pool>(p).push_back(kernarg_ptr); return; } } @@ -3243,6 +3252,13 @@ class HSADevice final : public KalmarDevice friend class KernargBufferPools; std::shared_ptr kernargBufferPools; + uint32_t getSyncID() { + if (HCC_KERNARG_MANAGER) { + return kernargBufferPools->getSyncID(); + } + return 0; + } + bool isKernargCoarseGrained() { if (HCC_KERNARG_MANAGER) { return kernargBufferPools->isKernargCoarseGrained(); @@ -3250,12 +3266,12 @@ class HSADevice final : public KalmarDevice return false; } - void releaseKernargBuffer(const std::pair& b) { + void releaseKernargBuffer(const std::tuple& b) { if (HCC_KERNARG_MANAGER) { return kernargBufferPools->releaseKernargBuffer(b); } - else if (b.first) { - return releaseKernargBuffer(b.first, b.second); + else if (std::get<0>(b)) { + return releaseKernargBuffer(std::get<0>(b), std::get<1>(b)); } } @@ -3293,7 +3309,7 @@ class HSADevice final : public KalmarDevice }; } - std::pair getKernargBuffer(size_t size) { + std::tuple getKernargBuffer(size_t size) { if (HCC_KERNARG_MANAGER) { return kernargBufferPools->getKernargBuffer(size); @@ -3403,10 +3419,7 @@ class HSADevice final : public KalmarDevice cursor = -1; memset (ret, 0x00, size); } - - - - return std::make_pair(ret, cursor); + return std::make_tuple(ret, cursor, 0); } void hdp_mem_flush() { @@ -4364,6 +4377,11 @@ HSAQueue::HSAQueue(KalmarDevice* pDev, hsa_agent_t agent, execute_order order, q hsa_status_t status= hsa_signal_create(1, 1, &agent, &sync_copy_signal); STATUS_CHECK(status, __LINE__); + + auto hsa_dev_ptr = reinterpret_cast(pDev); + if (hsa_dev_ptr->isKernargCoarseGrained()) { + set_last_kernarg_sync_id(hsa_dev_ptr->getSyncID() - 1); + } } void HSAQueue::dispose() { @@ -4844,19 +4862,26 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg header |= (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE); } - // bind kernel arguments - //printf("hostKernargSize size: %d in bytesn", hostKernargSize); - if (hostKernargSize > 0) { kernargMemory = device->getKernargBuffer(hostKernargSize); - memcpy(kernargMemory.first, hostKernarg, hostKernargSize); + auto kernarg_ptr = std::get<0>(kernargMemory); + memcpy(kernarg_ptr, hostKernarg, hostKernargSize); if (device->isKernargCoarseGrained()) { // If kernarg is in GPU coarse grained memory, flush the HDP // content visible to the GPU device->hdp_mem_flush(); + auto sync_id = std::get<2>(kernargMemory); + auto q_ptr = reinterpret_cast(getQueue()); + auto queue_last_kernarg_sync_id = q_ptr->get_last_kernarg_sync_id(); + if (queue_last_kernarg_sync_id != sync_id) { + // kernarg buffers have been recycled, put a system scope acquire fence to + // clear the GPU cache to purge the content of staled kernarg buffers + header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE); + q_ptr->set_last_kernarg_sync_id(sync_id); + } } - aql.kernarg_address = kernargMemory.first; + aql.kernarg_address = kernarg_ptr; } else { aql.kernarg_address = nullptr; } @@ -5001,9 +5026,10 @@ HSADispatch::dispose() { // clear reference counts for signal-less ops. asyncOpsWithoutSignal.clear(); - if (kernargMemory.first) { + if (std::get<0>(kernargMemory)) { device->releaseKernargBuffer(kernargMemory); - kernargMemory = {nullptr, 0}; + constexpr decltype(kernargMemory) reset{nullptr, 0, 0}; + kernargMemory = reset; } clearArgs(); std::vector().swap(arg_vec); From 4b711376d6c2d2fe6802cef7dd19a2e2c4da5509 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Fri, 21 Feb 2020 17:16:15 -0500 Subject: [PATCH 09/14] Proper synchronization sequence for kernarg write, HDP and AQL --- lib/hsa/mcwamp_hsa.cpp | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index eb7463e1822..70320240ca0 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -9,6 +9,7 @@ #include "../hc2/headers/types/program_state.hpp" #include +#include #include #include #include @@ -3184,7 +3185,9 @@ class HSADevice final : public KalmarDevice return r; } } - throw Kalmar::runtime_exception("Can't find suitable kernarg buffer.", -1); + std::stringstream err_msg; + err_msg << "Can't find suitable kernarg buffer for size " << size << " bytes."; + throw Kalmar::runtime_exception(err_msg.str().c_str(), -1); } void releaseKernargBuffer(const std::tuple& b) { @@ -4868,9 +4871,23 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg auto kernarg_ptr = std::get<0>(kernargMemory); memcpy(kernarg_ptr, hostKernarg, hostKernargSize); if (device->isKernargCoarseGrained()) { - // If kernarg is in GPU coarse grained memory, flush the HDP - // content visible to the GPU + // If kernarg is in GPU coarse grained memory, we need the + // following magic sequence to avoid race conditions between + // the CPU and GPU (to ensure the kernarg is visible before + // the GPU's CP start processing the AQL packet): + // 1- write to kernarg buffer in GPU VRAM (the previous memcpy) + // 2- flush the HDP cache + // 3- Do a readback from GPU VRAM to ensure the flush has been completed + // and to prevent the GPU's CP from processing the AQL packet + // 4- Write the AQL packet + + atomic_thread_fence(std::memory_order_acq_rel); device->hdp_mem_flush(); + atomic_thread_fence(std::memory_order_acq_rel); + volatile char* read_back = reinterpret_cast(kernarg_ptr); + read_back[0]; + atomic_thread_fence(std::memory_order_acq_rel); + auto sync_id = std::get<2>(kernargMemory); auto q_ptr = reinterpret_cast(getQueue()); auto queue_last_kernarg_sync_id = q_ptr->get_last_kernarg_sync_id(); From e4d74b4fa99a2ca825dc4e73a3e8d313a35bd063 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Sun, 23 Feb 2020 18:28:03 -0500 Subject: [PATCH 10/14] Update the kernarg pool growth heuristic, add a few knobs for debugging --- lib/hsa/mcwamp_hsa.cpp | 82 ++++++++++++++++++++++++++++++------------ tests/lit.cfg | 15 ++++++++ 2 files changed, 75 insertions(+), 22 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 70320240ca0..1e4a98040cb 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -140,6 +140,11 @@ char * HCC_PROFILE_FILE=nullptr; int HCC_KERNARG_MANAGER=1; int HCC_KERNARG_MANAGER_COARSE_GRAINED=0; +int HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE=0; +int HCC_KERNARG_MANAGER_GROW_THRESHOLD=20; +int HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE=(1024 * 1024); +int HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS=64; +int HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE=0; // Profiler: // Use str::stream so output is atomic wrt other threads: @@ -3149,6 +3154,11 @@ class HSADevice final : public KalmarDevice pools.emplace_back(std::make_tuple(2048, std::vector(0), std::vector(0))); pools.emplace_back(std::make_tuple(4096, std::vector(0), std::vector(0))); + if (HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE > std::get<_buffer_size>(pools.back())) { + pools.emplace_back(std::make_tuple(HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE, + std::vector(0), std::vector(0))); + } + if (mem_pool_is_coarse_grained) { sync_id++; } @@ -3165,11 +3175,8 @@ class HSADevice final : public KalmarDevice auto& fp = std::get<_free_pool>(p); auto& rp = std::get<_released_pool>(p); if (fp.empty()) { - constexpr float grow_threshold = 0.2f; - if (rp.size() <= - static_cast(grow_threshold * fp.capacity())) { - constexpr size_t grow_mem_bytes = (1024 * 1024); - grow(p, grow_mem_bytes); + if (rp.size() < HCC_KERNARG_MANAGER_GROW_THRESHOLD) { + grow(p); fp.insert(fp.cend(), rp.begin(), rp.end()); rp.clear(); } @@ -3214,35 +3221,34 @@ class HSADevice final : public KalmarDevice private: - void grow(BufferPool& p, size_t mem_size) { + void grow(BufferPool& p) { const auto buffer_size = std::get<_buffer_size>(p); - - if (mem_size < buffer_size) { - throw Kalmar::runtime_exception("Error when growing the kernarg buffer pool", -1); - } - const auto num_buffers = mem_size / buffer_size; - const auto new_capacity = std::get<_free_pool>(p).capacity() + num_buffers; - - std::get<_free_pool>(p).reserve(new_capacity); - std::get<_released_pool>(p).reserve(new_capacity); + const int chunk_size_per_num_buffers = HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS * + buffer_size; + auto actual_chunk_size = std::max(chunk_size_per_num_buffers, + HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE); + const auto num_buffers = actual_chunk_size / buffer_size; + actual_chunk_size = buffer_size * num_buffers; char* rocr_alloc = nullptr; hsa_status_t status; - status = hsa_amd_memory_pool_allocate(rocr_mem_pool, buffer_size * num_buffers, 0, reinterpret_cast(&rocr_alloc)); + status = hsa_amd_memory_pool_allocate(rocr_mem_pool, actual_chunk_size, 0, reinterpret_cast(&rocr_alloc)); STATUS_CHECK(status, __LINE__); - status = hsa_amd_agents_allow_access(1, &device.agent, NULL, rocr_alloc); STATUS_CHECK(status, __LINE__); - rocr_allocs.push_back(rocr_alloc); + + const auto new_capacity = std::get<_free_pool>(p).capacity() + num_buffers; + std::get<_free_pool>(p).reserve(new_capacity); + std::get<_released_pool>(p).reserve(new_capacity); + auto& fp = std::get<_free_pool>(p); for (int i = 0; i < num_buffers; i++, rocr_alloc+=buffer_size) { fp.push_back(rocr_alloc); } } - HSADevice& device; hsa_amd_memory_pool_t rocr_mem_pool; bool mem_pool_is_coarse_grained; @@ -4119,8 +4125,39 @@ void HSAContext::ReadHccEnv() GET_ENV_STRING (HCC_PROFILE_FILE, "Set file name for HCC_PROFILE mode. Default=stderr"); GET_ENV_INT (HCC_FLUSH_ON_WAIT, "recover all resources on queue wait"); - GET_ENV_INT (HCC_KERNARG_MANAGER, "Enable the new kernarg pool manager. Default=1"); - GET_ENV_INT (HCC_KERNARG_MANAGER_COARSE_GRAINED, "Use coarse grained memory for kernarg. Default=0"); + { + std::stringstream ss; + ss << "Enable the new kernarg pool manager. Default=" << HCC_KERNARG_MANAGER; + GET_ENV_INT(HCC_KERNARG_MANAGER, ss.str().c_str()); + } + { + std::stringstream ss; + ss << "Use coarse grained memory for kernarg. Default=" << HCC_KERNARG_MANAGER_COARSE_GRAINED; + GET_ENV_INT(HCC_KERNARG_MANAGER_COARSE_GRAINED, ss.str().c_str()); + } + { + std::stringstream ss; + ss << "When using coarse grained memory kernarg, add a system-scope acquire fence for for every kernel dispatch. Default=" + << HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE; + GET_ENV_INT(HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE, ss.str().c_str()); + } + { + std::stringstream ss; + ss << "Grow the kernarg pool if the number of old buffers being recycled is below this threshold. Default=" << HCC_KERNARG_MANAGER_GROW_THRESHOLD; + GET_ENV_INT(HCC_KERNARG_MANAGER_GROW_THRESHOLD, ss.str().c_str()); + } + { + std::stringstream ss; + ss << "Minimum amount of memory (in bytes) to allocate when growing the kernarg pool. Default=" << HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE; + GET_ENV_INT(HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE, ss.str().c_str()); + } + { + std::stringstream ss; + ss << "Minimum number of buffers when growing the kernarg pool. Default=" << HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS; + GET_ENV_INT(HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS, ss.str().c_str()); + } + + GET_ENV_INT(HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE, "Create an exta kernarg pool for the specified buffer size (in bytes). The size must be greater than 4096"); }; @@ -4891,7 +4928,8 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg auto sync_id = std::get<2>(kernargMemory); auto q_ptr = reinterpret_cast(getQueue()); auto queue_last_kernarg_sync_id = q_ptr->get_last_kernarg_sync_id(); - if (queue_last_kernarg_sync_id != sync_id) { + if (queue_last_kernarg_sync_id != sync_id || + HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE != 0) { // kernarg buffers have been recycled, put a system scope acquire fence to // clear the GPU cache to purge the content of staled kernarg buffers header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE); diff --git a/tests/lit.cfg b/tests/lit.cfg index cbdbc1b1e1d..3ed0a9b6ef6 100644 --- a/tests/lit.cfg +++ b/tests/lit.cfg @@ -67,6 +67,21 @@ if os.environ.get('HCC_KERNARG_MANAGER'): if os.environ.get('HCC_KERNARG_MANAGER_COARSE_GRAINED'): config.environment['HCC_KERNARG_MANAGER_COARSE_GRAINED'] = os.environ['HCC_KERNARG_MANAGER_COARSE_GRAINED'] +if os.environ.get('HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE'): + config.environment['HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE'] = os.environ['HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE'] + +if os.environ.get('HCC_KERNARG_MANAGER_GROW_THRESHOLD'): + config.environment['HCC_KERNARG_MANAGER_GROW_THRESHOLD'] = os.environ['HCC_KERNARG_MANAGER_GROW_THRESHOLD'] + +if os.environ.get('HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE'): + config.environment['HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE'] = os.environ['HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE'] + +if os.environ.get('HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS'): + config.environment['HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS'] = os.environ['HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS'] + +if os.environ.get('HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE'): + config.environment['HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE'] = os.environ['HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE'] + # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) From d5657516e1f779003f195bc67e8685f94adefbd0 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Wed, 26 Feb 2020 01:18:59 -0500 Subject: [PATCH 11/14] Minor simplification to kernarg buffer recycling, add debug messages --- lib/hsa/mcwamp_hsa.cpp | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 1e4a98040cb..50e9478b14d 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -86,7 +86,7 @@ int HCC_ASYNCOPS_SIZE = (1024); int HCC_ASYNCOPS_WITHOUT_SIGNAL_SIZE = (HCC_ASYNCOPS_SIZE/2); -//--- +//-- // Environment variables: int HCC_PRINT_ENV=0; @@ -3175,13 +3175,11 @@ class HSADevice final : public KalmarDevice auto& fp = std::get<_free_pool>(p); auto& rp = std::get<_released_pool>(p); if (fp.empty()) { - if (rp.size() < HCC_KERNARG_MANAGER_GROW_THRESHOLD) { + fp.swap(rp); + DBOUT(DB_KERNARG, "recycling " << fp.size() << + " kernarg buffers of size " << std::get<_buffer_size>(p)); + if (fp.size() < HCC_KERNARG_MANAGER_GROW_THRESHOLD) { grow(p); - fp.insert(fp.cend(), rp.begin(), rp.end()); - rp.clear(); - } - else { - fp.swap(rp); } if (mem_pool_is_coarse_grained) { sync_id++; @@ -3240,6 +3238,11 @@ class HSADevice final : public KalmarDevice rocr_allocs.push_back(rocr_alloc); const auto new_capacity = std::get<_free_pool>(p).capacity() + num_buffers; + + DBOUT(DB_KERNARG, "growing kernarg pool (" << std::get<_buffer_size>(p) << ") from " << + std::get<_free_pool>(p).capacity() << " to " << + new_capacity << " buffers"); + std::get<_free_pool>(p).reserve(new_capacity); std::get<_released_pool>(p).reserve(new_capacity); @@ -4241,12 +4244,16 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : has_hdp_access = (status == HSA_STATUS_SUCCESS && hdp.HDP_MEM_FLUSH_CNTL != nullptr); - if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED && has_hdp_access) { - if (ri._found_local_memory_pool && - hasAccess(getHostAgent(), ri._local_memory_pool)) { - DBOUT(DB_INIT, "using coarse-grained GPU local memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl); - ri._kernarg_memory_pool = ri._local_memory_pool; - ri._found_kernarg_memory_pool = true; + if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED) { + if (has_hdp_access) { + if (ri._found_local_memory_pool && + hasAccess(getHostAgent(), ri._local_memory_pool)) { + DBOUT(DB_KERNARG, "Using coarse-grained GPU memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl); + ri._kernarg_memory_pool = ri._local_memory_pool; + ri._found_kernarg_memory_pool = true; + } + } else { + DBOUT(DB_KERNARG, "Not using coarse-grained GPU memory for kernarg due to no access to HDP registers."); } } @@ -4975,9 +4982,11 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg DBOUTL(DB_AQL, " dispatch_aql " << *this << "(hwq=" << lockedHsaQueue << ") kernargs=" << hostKernargSize << " " << *q_aql ); DBOUTL(DB_AQL2, rawAql(*q_aql)); +#if 0 if (DBFLAG(DB_KERNARG)) { printKernarg(q_aql->kernarg_address, hostKernargSize); } +#endif // Register signal callback. if (_activity_prof.is_enabled()) { From 3cba1970f4fac84bedc971c8d3cb2c865c1d4801 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Wed, 26 Feb 2020 14:27:11 -0500 Subject: [PATCH 12/14] add a few more debug messages for coarse grained VM kernarg --- lib/hsa/mcwamp_hsa.cpp | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 50e9478b14d..da4a5b6b392 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -3177,7 +3177,7 @@ class HSADevice final : public KalmarDevice if (fp.empty()) { fp.swap(rp); DBOUT(DB_KERNARG, "recycling " << fp.size() << - " kernarg buffers of size " << std::get<_buffer_size>(p)); + " kernarg buffers of size " << std::get<_buffer_size>(p) << std::endl); if (fp.size() < HCC_KERNARG_MANAGER_GROW_THRESHOLD) { grow(p); } @@ -3241,7 +3241,7 @@ class HSADevice final : public KalmarDevice DBOUT(DB_KERNARG, "growing kernarg pool (" << std::get<_buffer_size>(p) << ") from " << std::get<_free_pool>(p).capacity() << " to " << - new_capacity << " buffers"); + new_capacity << " buffers" << std::endl); std::get<_free_pool>(p).reserve(new_capacity); std::get<_released_pool>(p).reserve(new_capacity); @@ -4246,14 +4246,21 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) : if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED) { if (has_hdp_access) { - if (ri._found_local_memory_pool && - hasAccess(getHostAgent(), ri._local_memory_pool)) { - DBOUT(DB_KERNARG, "Using coarse-grained GPU memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl); - ri._kernarg_memory_pool = ri._local_memory_pool; - ri._found_kernarg_memory_pool = true; + if (ri._found_local_memory_pool) { + if (hasAccess(getHostAgent(), ri._local_memory_pool)) { + DBOUT(DB_KERNARG, "Using coarse-grained GPU memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl); + ri._kernarg_memory_pool = ri._local_memory_pool; + ri._found_kernarg_memory_pool = true; + } + else { + DBOUT(DB_KERNARG, "Not using coarse-grained GPU memory for kernarg since host agent has no access to this memory.\n"); + } + } + else { + DBOUT(DB_KERNARG, "Not using coarse-grained GPU memory for kernarg since this memory is unavailable.\n"); } } else { - DBOUT(DB_KERNARG, "Not using coarse-grained GPU memory for kernarg due to no access to HDP registers."); + DBOUT(DB_KERNARG, "Not using coarse-grained GPU memory for kernarg due to no access to HDP registers.\n"); } } From d4badc6b1d4f565e2292b581383071c261644559 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Mon, 2 Mar 2020 15:51:01 -0500 Subject: [PATCH 13/14] Kernarg - enable coarse grained memory support by default --- lib/hsa/mcwamp_hsa.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index da4a5b6b392..7e3fe4e6629 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -139,7 +139,7 @@ int HCC_PROFILE_VERBOSE=0x1F; char * HCC_PROFILE_FILE=nullptr; int HCC_KERNARG_MANAGER=1; -int HCC_KERNARG_MANAGER_COARSE_GRAINED=0; +int HCC_KERNARG_MANAGER_COARSE_GRAINED=1; int HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE=0; int HCC_KERNARG_MANAGER_GROW_THRESHOLD=20; int HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE=(1024 * 1024); From 3da04118b58f7ed82bcc1602033b7057b9b41f51 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Fri, 27 Mar 2020 15:56:23 -0400 Subject: [PATCH 14/14] Bump up kernarg size to take padding and implicit argument into consideration --- lib/hsa/mcwamp_hsa.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 7e3fe4e6629..eaa293079be 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -3152,7 +3152,9 @@ class HSADevice final : public KalmarDevice pools.emplace_back(std::make_tuple( 512, std::vector(0), std::vector(0))); pools.emplace_back(std::make_tuple(1024, std::vector(0), std::vector(0))); pools.emplace_back(std::make_tuple(2048, std::vector(0), std::vector(0))); - pools.emplace_back(std::make_tuple(4096, std::vector(0), std::vector(0))); + + // 4k + padding + implicit arguments + pools.emplace_back(std::make_tuple(4096 + 2 * 64, std::vector(0), std::vector(0))); if (HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE > std::get<_buffer_size>(pools.back())) { pools.emplace_back(std::make_tuple(HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE,