From 347e568b16fadbbcb6e4c07bdc3fdbeea713cf11 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Wed, 22 Jan 2020 00:20:45 -0500
Subject: [PATCH 01/14] First commit to revamp the kernarg buffer pool

---
 lib/hsa/mcwamp_hsa.cpp | 122 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 118 insertions(+), 4 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 49cd274a4a6..2dd033c1048 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -1058,8 +1058,11 @@ class HSADispatch : public HSAOp {
     std::vector<uint8_t> arg_vec;
     uint32_t arg_count;
     size_t prevArgVecCapacity;
+#if 0
     void* kernargMemory;
     int kernargMemoryIndex;
+#endif
+    std::pair<void*, size_t> kernargMemory = {nullptr, 0};
 
     hsa_kernel_dispatch_packet_t aql;
     hsa_wait_state_t waitMode;
@@ -3124,6 +3127,97 @@ class HSADevice final : public KalmarDevice
         return cpu_accessible_am;
     };
 
+
+    class KernargBufferPools {
+        // std::tuple<size_per_buffer, free_buffers, released_buffers>
+        using BufferPool = std::tuple<size_t, std::vector<void*>, std::vector<void*>>;
+    public:
+        KernargBufferPools(HSADevice& device, hsa_amd_memory_pool_t rocr_mem_pool) 
+            : device(device), rocr_mem_pool(rocr_mem_pool) {
+        }
+        std::pair<void*, size_t> getKernargBuffer(const size_t size) {
+            std::lock_guard<std::mutex> l{lock};
+            std::call_once(init_flag, [this]() { this->grow_pool(512, 1024); });
+            for (auto& p : pools) {
+                if (std::get<0>(p) >= size) {
+                    if (std::get<1>(p).empty()) {
+                        if (std::get<2>(p).empty()) {
+                            // FIXME: replenish the pool
+                            throw Kalmar::runtime_exception("Kernarg pool is empty.", -1);
+                        }
+                        std::get<1>(p).swap(std::get<2>(p));
+                    }
+                    auto r = std::make_pair(std::get<1>(p).back(), std::get<0>(p));
+                    std::get<1>(p).pop_back();
+                    return r;
+                }               
+            }
+
+            // FIXME: allocate a larger buffer
+            throw Kalmar::runtime_exception("Can't find suitable kernarg buffer.", -1);
+        }
+
+        void releaseKernargBuffer(const std::pair<void*, size_t>& b) {
+            if (b.first == nullptr) return;
+            std::lock_guard<std::mutex> l{lock};
+            for (auto& p : pools) {
+                if (std::get<0>(p) == b.second) {
+                    std::get<2>(p).push_back(b.first);
+                    return;
+                }
+            }
+            throw Kalmar::runtime_exception("Error when releasing kernarg buffer.", -1);
+        }
+
+    private:
+
+        void grow_pool(size_t buffer_size, size_t num_buffers) {
+            auto p = pools.begin();
+            for (; p != pools.end(); ++p) {
+                if (std::get<0>(*p) == buffer_size)
+                    break;
+            }
+            if (p == pools.end()) {
+                pools.emplace_back(std::make_tuple(buffer_size, std::vector<void*>(0), std::vector<void*>(0)));
+                p = pools.end() - 1;
+            }
+            std::get<1>(*p).reserve(std::get<1>(*p).capacity() + num_buffers);
+            std::get<2>(*p).reserve(std::get<2>(*p).capacity() + num_buffers);
+
+            char* rocr_alloc = nullptr;
+            hsa_status_t status;
+            status = hsa_amd_memory_pool_allocate(rocr_mem_pool, buffer_size * num_buffers, 0, reinterpret_cast<void**>(&rocr_alloc));
+            STATUS_CHECK(status, __LINE__);
+
+            status = hsa_amd_agents_allow_access(1, &device.agent, NULL, rocr_alloc);
+            STATUS_CHECK(status, __LINE__);
+
+            for (int i = 0; i < num_buffers; i++) {
+                std::get<1>(*p).push_back(rocr_alloc + i * buffer_size);
+            }
+            rocr_allocs.push_back(rocr_alloc);
+        }
+
+        HSADevice& device;
+        hsa_amd_memory_pool_t rocr_mem_pool;
+        std::mutex lock;
+        std::once_flag init_flag;
+        std::vector<BufferPool> pools;
+        std::vector<void*> rocr_allocs;
+    };
+
+    std::shared_ptr<KernargBufferPools> kernargBufferPools;
+
+    std::pair<void*, size_t> getKernargBuffer(size_t size) {
+        return kernargBufferPools->getKernargBuffer(size);
+    }
+
+    void releaseKernargBuffer(const std::pair<void*, size_t>& b) {
+        kernargBufferPools->releaseKernargBuffer(b);
+    }
+
+#if 0
+
     void releaseKernargBuffer(void* kernargBuffer, int kernargBufferIndex) {
         if ( (KERNARG_POOL_SIZE > 0) && (kernargBufferIndex >= 0) ) {
             kernargPoolMutex.lock();
@@ -3268,6 +3362,9 @@ class HSADevice final : public KalmarDevice
 
         return std::make_pair(ret, cursor);
     }
+#endif
+
+
 
     void* getSymbolAddress(const char* symbolName) override {
         hsa_status_t status;
@@ -4052,11 +4149,15 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) :
     }
     useCoarseGrainedRegion = result;
 
+    kernargBufferPools = std::make_shared<KernargBufferPools>(*this, getHSAKernargRegion());
+
+#if 0
     /// pre-allocate a pool of kernarg buffers in case:
     /// - kernarg region is available
     /// - compile-time macro KERNARG_POOL_SIZE is larger than 0
 #if KERNARG_POOL_SIZE > 0
     growKernargBuffer();
+#endif
 #endif
 
     // Setup AM pool.
@@ -4546,8 +4647,7 @@ HSADispatch::HSADispatch(Kalmar::HSADevice* _device, Kalmar::KalmarQueue *queue,
     HSAOp(hc::HSA_OP_ID_DISPATCH, queue, Kalmar::hcCommandKernel),
     device(_device),
     kernel(_kernel),
-    waitMode(HSA_WAIT_STATE_BLOCKED),
-    kernargMemory(nullptr)
+    waitMode(HSA_WAIT_STATE_BLOCKED)
 {
     if (aql) {
         this->aql = *aql;
@@ -4683,6 +4783,9 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
     //printf("hostKernargSize size: %d in bytesn", hostKernargSize);
 
     if (hostKernargSize > 0) {
+
+
+#if 0
         hsa_amd_memory_pool_t kernarg_region = device->getHSAKernargRegion();
         std::pair<void*, int> ret = device->getKernargBuffer(hostKernargSize);
         kernargMemory = ret.first;
@@ -4691,8 +4794,14 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
 
         // as kernarg buffers are fine-grained, we can directly use memcpy
         memcpy(kernargMemory, hostKernarg, hostKernargSize);
-
         aql.kernarg_address = kernargMemory;
+#endif
+
+        
+        kernargMemory = device->getKernargBuffer(hostKernargSize);
+        memcpy(kernargMemory.first, hostKernarg, hostKernargSize);
+
+        aql.kernarg_address = kernargMemory.first;
     } else {
         aql.kernarg_address = nullptr;
     }
@@ -4838,12 +4947,17 @@ HSADispatch::dispose() {
     // clear reference counts for signal-less ops.
     asyncOpsWithoutSignal.clear();
 
+#if 0
     if (kernargMemory != nullptr) {
       //std::cerr << "op#" << getSeqNum() << " releasing kernal arg buffer index=" << kernargMemoryIndex<< "\n";
       device->releaseKernargBuffer(kernargMemory, kernargMemoryIndex);
       kernargMemory = nullptr;
     }
-
+#endif
+    if (kernargMemory.first) {
+        device->releaseKernargBuffer(kernargMemory);
+        kernargMemory = {nullptr, 0};
+    }
     clearArgs();
     std::vector<uint8_t>().swap(arg_vec);
 

From c5d20d5a7fed81f6477b316fb8e438b834ef733d Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Tue, 4 Feb 2020 18:55:08 -0500
Subject: [PATCH 02/14] Support multiple kernarg pools of different buffer
 sizes, minor cleanup

---
 lib/hsa/mcwamp_hsa.cpp | 72 ++++++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 27 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 2dd033c1048..bdbae820be5 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -3131,29 +3131,45 @@ class HSADevice final : public KalmarDevice
     class KernargBufferPools {
         // std::tuple<size_per_buffer, free_buffers, released_buffers>
         using BufferPool = std::tuple<size_t, std::vector<void*>, std::vector<void*>>;
+        enum {
+            _buffer_size = 0,
+            _free_pool = 1,
+            _released_pool = 2
+        };
     public:
         KernargBufferPools(HSADevice& device, hsa_amd_memory_pool_t rocr_mem_pool) 
             : device(device), rocr_mem_pool(rocr_mem_pool) {
+            // support kernarg buffer size up to 4k
+            pools.emplace_back(std::make_tuple( 512, std::vector<void*>(0), std::vector<void*>(0)));
+            pools.emplace_back(std::make_tuple(1024, std::vector<void*>(0), std::vector<void*>(0)));
+            pools.emplace_back(std::make_tuple(2048, std::vector<void*>(0), std::vector<void*>(0)));
+            pools.emplace_back(std::make_tuple(4096, std::vector<void*>(0), std::vector<void*>(0)));
+        }
+        ~KernargBufferPools() {
+            for (const auto b : rocr_allocs) {
+                hsa_amd_memory_pool_free(b);
+            }
         }
         std::pair<void*, size_t> getKernargBuffer(const size_t size) {
             std::lock_guard<std::mutex> l{lock};
-            std::call_once(init_flag, [this]() { this->grow_pool(512, 1024); });
             for (auto& p : pools) {
-                if (std::get<0>(p) >= size) {
-                    if (std::get<1>(p).empty()) {
-                        if (std::get<2>(p).empty()) {
-                            // FIXME: replenish the pool
-                            throw Kalmar::runtime_exception("Kernarg pool is empty.", -1);
+                if (std::get<_buffer_size>(p) >= size) {
+                    if (std::get<_free_pool>(p).empty()) {
+                        constexpr float grow_threshold = 0.2f;
+                        if (std::get<_released_pool>(p).size() <=
+                                static_cast<size_t>(grow_threshold * std::get<_free_pool>(p).capacity())) {
+                            constexpr size_t grow_mem_bytes = (1024 * 1024);
+                            grow(p, grow_mem_bytes);
+                        }
+                        else {
+                           std::get<_free_pool>(p).swap(std::get<_released_pool>(p));
                         }
-                        std::get<1>(p).swap(std::get<2>(p));
                     }
-                    auto r = std::make_pair(std::get<1>(p).back(), std::get<0>(p));
-                    std::get<1>(p).pop_back();
+                    auto r = std::make_pair(std::get<_free_pool>(p).back(), std::get<_buffer_size>(p));
+                    std::get<_free_pool>(p).pop_back();
                     return r;
                 }               
             }
-
-            // FIXME: allocate a larger buffer
             throw Kalmar::runtime_exception("Can't find suitable kernarg buffer.", -1);
         }
 
@@ -3161,8 +3177,8 @@ class HSADevice final : public KalmarDevice
             if (b.first == nullptr) return;
             std::lock_guard<std::mutex> l{lock};
             for (auto& p : pools) {
-                if (std::get<0>(p) == b.second) {
-                    std::get<2>(p).push_back(b.first);
+                if (std::get<_buffer_size>(p) == b.second) {
+                    std::get<_released_pool>(p).push_back(b.first);
                     return;
                 }
             }
@@ -3171,18 +3187,18 @@ class HSADevice final : public KalmarDevice
 
     private:
 
-        void grow_pool(size_t buffer_size, size_t num_buffers) {
-            auto p = pools.begin();
-            for (; p != pools.end(); ++p) {
-                if (std::get<0>(*p) == buffer_size)
-                    break;
-            }
-            if (p == pools.end()) {
-                pools.emplace_back(std::make_tuple(buffer_size, std::vector<void*>(0), std::vector<void*>(0)));
-                p = pools.end() - 1;
+        void grow(BufferPool& p, size_t mem_size) {
+
+            const auto buffer_size = std::get<_buffer_size>(p);
+
+            if (mem_size < buffer_size) {
+                throw Kalmar::runtime_exception("Error when growing the kernarg buffer pool", -1);
             }
-            std::get<1>(*p).reserve(std::get<1>(*p).capacity() + num_buffers);
-            std::get<2>(*p).reserve(std::get<2>(*p).capacity() + num_buffers);
+            const auto num_buffers = mem_size / buffer_size;
+            const auto new_capacity = std::get<_free_pool>(p).capacity() + num_buffers;
+
+            std::get<_free_pool>(p).reserve(new_capacity);
+            std::get<_released_pool>(p).reserve(new_capacity);
 
             char* rocr_alloc = nullptr;
             hsa_status_t status;
@@ -3192,12 +3208,14 @@ class HSADevice final : public KalmarDevice
             status = hsa_amd_agents_allow_access(1, &device.agent, NULL, rocr_alloc);
             STATUS_CHECK(status, __LINE__);
 
-            for (int i = 0; i < num_buffers; i++) {
-                std::get<1>(*p).push_back(rocr_alloc + i * buffer_size);
-            }
             rocr_allocs.push_back(rocr_alloc);
+            auto& fp = std::get<_free_pool>(p);
+            for (int i = 0; i < num_buffers; i++, rocr_alloc+=buffer_size) {
+                fp.push_back(rocr_alloc);
+            }
         }
 
+
         HSADevice& device;
         hsa_amd_memory_pool_t rocr_mem_pool;
         std::mutex lock;

From 87da59fae83b3222c6ae3b1e50c5318388e886a8 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Wed, 5 Feb 2020 15:58:20 -0500
Subject: [PATCH 03/14] Add a knob to switch to the legacy kernarg pool
 management

---
 lib/hsa/mcwamp_hsa.cpp | 70 +++++++++++++-----------------------------
 tests/lit.cfg          |  3 ++
 2 files changed, 25 insertions(+), 48 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index bdbae820be5..eccbc9981c1 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -134,10 +134,10 @@ int HCC_FLUSH_ON_WAIT=1;
 #define HCC_PROFILE_VERBOSE_BARRIER                 (1 << 4)   // 0x10
 int HCC_PROFILE_VERBOSE=0x1F;
 
-
-
 char * HCC_PROFILE_FILE=nullptr;
 
+int HCC_NEW_KERNARG_MANAGER=1;
+
 // Profiler:
 // Use str::stream so output is atomic wrt other threads:
 #define LOG_PROFILE(op, start, end, type, tag, msg) \
@@ -1058,10 +1058,6 @@ class HSADispatch : public HSAOp {
     std::vector<uint8_t> arg_vec;
     uint32_t arg_count;
     size_t prevArgVecCapacity;
-#if 0
-    void* kernargMemory;
-    int kernargMemoryIndex;
-#endif
     std::pair<void*, size_t> kernargMemory = {nullptr, 0};
 
     hsa_kernel_dispatch_packet_t aql;
@@ -3226,16 +3222,15 @@ class HSADevice final : public KalmarDevice
 
     std::shared_ptr<KernargBufferPools> kernargBufferPools;
 
-    std::pair<void*, size_t> getKernargBuffer(size_t size) {
-        return kernargBufferPools->getKernargBuffer(size);
-    }
-
     void releaseKernargBuffer(const std::pair<void*, size_t>& b) {
-        kernargBufferPools->releaseKernargBuffer(b);
+        if (HCC_NEW_KERNARG_MANAGER) {
+            return kernargBufferPools->releaseKernargBuffer(b);
+        }
+        else if (b.first) {
+            return releaseKernargBuffer(b.first, b.second);
+        }
     }
 
-#if 0
-
     void releaseKernargBuffer(void* kernargBuffer, int kernargBufferIndex) {
         if ( (KERNARG_POOL_SIZE > 0) && (kernargBufferIndex >= 0) ) {
             kernargPoolMutex.lock();
@@ -3270,7 +3265,12 @@ class HSADevice final : public KalmarDevice
         };
     }
 
-    std::pair<void*, int> getKernargBuffer(int size) {
+    std::pair<void*, int> getKernargBuffer(size_t size) {
+
+        if (HCC_NEW_KERNARG_MANAGER) {
+            return kernargBufferPools->getKernargBuffer(size);
+        }
+
         void* ret = nullptr;
         int cursor = 0;
 
@@ -3380,9 +3380,6 @@ class HSADevice final : public KalmarDevice
 
         return std::make_pair(ret, cursor);
     }
-#endif
-
-
 
     void* getSymbolAddress(const char* symbolName) override {
         hsa_status_t status;
@@ -4072,6 +4069,7 @@ void HSAContext::ReadHccEnv()
     GET_ENV_STRING (HCC_PROFILE_FILE,    "Set file name for HCC_PROFILE mode.  Default=stderr");
     GET_ENV_INT    (HCC_FLUSH_ON_WAIT,   "recover all resources on queue wait");
 
+    GET_ENV_INT    (HCC_NEW_KERNARG_MANAGER, "Enable the new kernarg pool manager.  Default=1");
 };
 
 
@@ -4169,14 +4167,14 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) :
 
     kernargBufferPools = std::make_shared<KernargBufferPools>(*this, getHSAKernargRegion());
 
-#if 0
-    /// pre-allocate a pool of kernarg buffers in case:
-    /// - kernarg region is available
-    /// - compile-time macro KERNARG_POOL_SIZE is larger than 0
-#if KERNARG_POOL_SIZE > 0
-    growKernargBuffer();
-#endif
+    if (!HCC_NEW_KERNARG_MANAGER) {
+        /// pre-allocate a pool of kernarg buffers in case:
+        /// - kernarg region is available
+        /// - compile-time macro KERNARG_POOL_SIZE is larger than 0
+#if KERNARG_POOL_SIZE 
+        growKernargBuffer();
 #endif
+    }
 
     // Setup AM pool.
     ri._am_memory_pool = (ri._found_local_memory_pool)
@@ -4801,30 +4799,13 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
     //printf("hostKernargSize size: %d in bytesn", hostKernargSize);
 
     if (hostKernargSize > 0) {
-
-
-#if 0
-        hsa_amd_memory_pool_t kernarg_region = device->getHSAKernargRegion();
-        std::pair<void*, int> ret = device->getKernargBuffer(hostKernargSize);
-        kernargMemory = ret.first;
-        kernargMemoryIndex = ret.second;
-        //std::cerr << "op #" << getSeqNum() << " allocated kernarg cursor=" << kernargMemoryIndex << "\n";
-
-        // as kernarg buffers are fine-grained, we can directly use memcpy
-        memcpy(kernargMemory, hostKernarg, hostKernargSize);
-        aql.kernarg_address = kernargMemory;
-#endif
-
-        
         kernargMemory = device->getKernargBuffer(hostKernargSize);
         memcpy(kernargMemory.first, hostKernarg, hostKernargSize);
-
         aql.kernarg_address = kernargMemory.first;
     } else {
         aql.kernarg_address = nullptr;
     }
 
-
     // write packet
     uint32_t queueMask = lockedHsaQueue->size - 1;
     // TODO: Need to check if package write is correct.
@@ -4965,13 +4946,6 @@ HSADispatch::dispose() {
     // clear reference counts for signal-less ops.
     asyncOpsWithoutSignal.clear();
 
-#if 0
-    if (kernargMemory != nullptr) {
-      //std::cerr << "op#" << getSeqNum() << " releasing kernal arg buffer index=" << kernargMemoryIndex<< "\n";
-      device->releaseKernargBuffer(kernargMemory, kernargMemoryIndex);
-      kernargMemory = nullptr;
-    }
-#endif
     if (kernargMemory.first) {
         device->releaseKernargBuffer(kernargMemory);
         kernargMemory = {nullptr, 0};
diff --git a/tests/lit.cfg b/tests/lit.cfg
index 596e700c4d6..974836d8970 100644
--- a/tests/lit.cfg
+++ b/tests/lit.cfg
@@ -61,6 +61,9 @@ if os.environ.get('AMDGPU_OBJ_CODEGEN'):
 if os.environ.get('HCC_EXTRA_GPU_ARCH'):
     config.environment['HCC_EXTRA_GPU_ARCH'] = os.environ['HCC_EXTRA_GPU_ARCH']
 
+if os.environ.get('HCC_NEW_KERNARG_MANAGER'):
+    config.environment['HCC_NEW_KERNARG_MANAGER'] = os.environ['HCC_NEW_KERNARG_MANAGER']
+
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)
 

From 4bf79a30363a596f91e8ee2b8fe60bd3e6386731 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Wed, 5 Feb 2020 16:44:16 -0500
Subject: [PATCH 04/14] Put the released kernarg buffers back to the free pool

---
 lib/hsa/mcwamp_hsa.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index eccbc9981c1..c450fe3a614 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -3150,19 +3150,23 @@ class HSADevice final : public KalmarDevice
             std::lock_guard<std::mutex> l{lock};
             for (auto& p : pools) {
                 if (std::get<_buffer_size>(p) >= size) {
-                    if (std::get<_free_pool>(p).empty()) {
+                    auto& fp = std::get<_free_pool>(p);
+                    auto& rp = std::get<_released_pool>(p);
+                    if (fp.empty()) {
                         constexpr float grow_threshold = 0.2f;
-                        if (std::get<_released_pool>(p).size() <=
-                                static_cast<size_t>(grow_threshold * std::get<_free_pool>(p).capacity())) {
+                        if (rp.size() <=
+                                static_cast<size_t>(grow_threshold * fp.capacity())) {
                             constexpr size_t grow_mem_bytes = (1024 * 1024);
                             grow(p, grow_mem_bytes);
+                            fp.insert(fp.cend(), rp.begin(), rp.end());
+                            rp.clear();
                         }
                         else {
-                           std::get<_free_pool>(p).swap(std::get<_released_pool>(p));
+                           fp.swap(rp);
                         }
                     }
-                    auto r = std::make_pair(std::get<_free_pool>(p).back(), std::get<_buffer_size>(p));
-                    std::get<_free_pool>(p).pop_back();
+                    auto r = std::make_pair(fp.back(), std::get<_buffer_size>(p));
+                    fp.pop_back();
                     return r;
                 }               
             }

From 59d0fd4d02ad0bfbad2debf5e3aff33c6aff04e9 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Fri, 7 Feb 2020 18:13:30 -0500
Subject: [PATCH 05/14] Add an env var to force kernarg to allocate from coarse
 grained memory

---
 lib/hsa/mcwamp_hsa.cpp | 16 +++++++---------
 tests/lit.cfg          |  3 +++
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index c450fe3a614..da3e714a899 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -137,6 +137,7 @@ int HCC_PROFILE_VERBOSE=0x1F;
 char * HCC_PROFILE_FILE=nullptr;
 
 int HCC_NEW_KERNARG_MANAGER=1;
+int HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED=0;
 
 // Profiler:
 // Use str::stream so output is atomic wrt other threads:
@@ -2545,24 +2546,20 @@ class HSADevice final : public KalmarDevice
             DBOUT(DB_INIT, "found coarse-grain system memory pool=" << region.handle << " size(MB) = " << size << std::endl);
             ri->_coarsegrained_system_memory_pool = region;
             ri->_found_coarsegrained_system_memory_pool = true;
+            if (HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED) {
+                ri->_kernarg_memory_pool = region;
+                ri->_found_kernarg_memory_pool = true;
+            }
         }
 
         // choose coarse grained system for kernarg, if not available, fall back to fine grained system.
         if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
-          if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
-            DBOUT(DB_INIT, "using coarse grained system for kernarg memory, size(MB) = " << size << std::endl);
-            ri->_kernarg_memory_pool = region;
-            ri->_found_kernarg_memory_pool = true;
-          }
-          else if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED
+          if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED
                    && ri->_found_kernarg_memory_pool == false) {
             DBOUT(DB_INIT, "using fine grained system for kernarg memory, size(MB) = " << size << std::endl);
             ri->_kernarg_memory_pool = region;
             ri->_found_kernarg_memory_pool = true;
           }
-          else {
-            DBOUT(DB_INIT, "Unknown memory pool with kernarg_init flag set!!!, size(MB) = " << size << std::endl);
-          }
         }
 
         return HSA_STATUS_SUCCESS;
@@ -4074,6 +4071,7 @@ void HSAContext::ReadHccEnv()
     GET_ENV_INT    (HCC_FLUSH_ON_WAIT,   "recover all resources on queue wait");
 
     GET_ENV_INT    (HCC_NEW_KERNARG_MANAGER, "Enable the new kernarg pool manager.  Default=1");
+    GET_ENV_INT    (HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED, "Use coarse grained memory for kernarg.  Default=0");
 };
 
 
diff --git a/tests/lit.cfg b/tests/lit.cfg
index 974836d8970..48c7e3ce830 100644
--- a/tests/lit.cfg
+++ b/tests/lit.cfg
@@ -64,6 +64,9 @@ if os.environ.get('HCC_EXTRA_GPU_ARCH'):
 if os.environ.get('HCC_NEW_KERNARG_MANAGER'):
     config.environment['HCC_NEW_KERNARG_MANAGER'] = os.environ['HCC_NEW_KERNARG_MANAGER']
 
+if os.environ.get('HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED'):
+    config.environment['HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED'] = os.environ['HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED']
+
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)
 

From 3a4dddc044a79fc87d98dde26ca29e5a9109057b Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Tue, 18 Feb 2020 15:23:32 -0500
Subject: [PATCH 06/14] Several updates to new kernarg allocation strategy:
 fix check for host access to GPU coarse-grained memory; add counter for
 signaling kernarg buffer recycling; rename env vars that control kernarg
 memory allocation

---
 lib/hsa/mcwamp_hsa.cpp | 70 +++++++++++++++++++++++++++++++-----------
 tests/lit.cfg          |  8 ++---
 2 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index da3e714a899..f4c68f0ddec 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -136,8 +136,8 @@ int HCC_PROFILE_VERBOSE=0x1F;
 
 char * HCC_PROFILE_FILE=nullptr;
 
-int HCC_NEW_KERNARG_MANAGER=1;
-int HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED=0;
+int HCC_KERNARG_MANAGER=1;
+int HCC_KERNARG_MANAGER_COARSE_GRAINED=0;
 
 // Profiler:
 // Use str::stream so output is atomic wrt other threads:
@@ -2546,10 +2546,6 @@ class HSADevice final : public KalmarDevice
             DBOUT(DB_INIT, "found coarse-grain system memory pool=" << region.handle << " size(MB) = " << size << std::endl);
             ri->_coarsegrained_system_memory_pool = region;
             ri->_found_coarsegrained_system_memory_pool = true;
-            if (HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED) {
-                ri->_kernarg_memory_pool = region;
-                ri->_found_kernarg_memory_pool = true;
-            }
         }
 
         // choose coarse grained system for kernarg, if not available, fall back to fine grained system.
@@ -2602,13 +2598,13 @@ class HSADevice final : public KalmarDevice
 
     // Returns true if specified agent has access to the specified pool.
     // Typically used to detect when a CPU agent has access to GPU device memory via large-bar:
-    int hasAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool)
+    bool hasAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool)
     {
         hsa_status_t err;
         hsa_amd_memory_pool_access_t access;
         err = hsa_amd_agent_memory_pool_get_info(agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
         STATUS_CHECK(err, __LINE__);
-        return access;
+        return (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED);
     }
 
 
@@ -3130,13 +3126,19 @@ class HSADevice final : public KalmarDevice
             _released_pool = 2
         };
     public:
-        KernargBufferPools(HSADevice& device, hsa_amd_memory_pool_t rocr_mem_pool) 
-            : device(device), rocr_mem_pool(rocr_mem_pool) {
+        KernargBufferPools(HSADevice& device) : device(device) {
+            rocr_mem_pool = device.ri._kernarg_memory_pool;
+            mem_pool_is_coarse_grained = device.ri._kernarg_memory_pool.handle ==
+                                           device.ri._local_memory_pool.handle;
             // support kernarg buffer size up to 4k
             pools.emplace_back(std::make_tuple( 512, std::vector<void*>(0), std::vector<void*>(0)));
             pools.emplace_back(std::make_tuple(1024, std::vector<void*>(0), std::vector<void*>(0)));
             pools.emplace_back(std::make_tuple(2048, std::vector<void*>(0), std::vector<void*>(0)));
             pools.emplace_back(std::make_tuple(4096, std::vector<void*>(0), std::vector<void*>(0)));
+
+            if (mem_pool_is_coarse_grained) {
+                sync_id++;
+            }
         }
         ~KernargBufferPools() {
             for (const auto b : rocr_allocs) {
@@ -3159,7 +3161,10 @@ class HSADevice final : public KalmarDevice
                             rp.clear();
                         }
                         else {
-                           fp.swap(rp);
+                            fp.swap(rp);
+                        }
+                        if (mem_pool_is_coarse_grained) {
+                            sync_id++;
                         }
                     }
                     auto r = std::make_pair(fp.back(), std::get<_buffer_size>(p));
@@ -3182,6 +3187,14 @@ class HSADevice final : public KalmarDevice
             throw Kalmar::runtime_exception("Error when releasing kernarg buffer.", -1);
         }
 
+        uint32_t getSyncID() {
+            return sync_id;
+        }
+        
+        bool isKernargCoarseGrained() {
+            return mem_pool_is_coarse_grained;
+        }
+
     private:
 
         void grow(BufferPool& p, size_t mem_size) {
@@ -3215,16 +3228,25 @@ class HSADevice final : public KalmarDevice
 
         HSADevice& device;
         hsa_amd_memory_pool_t rocr_mem_pool;
+        bool mem_pool_is_coarse_grained;
+        uint32_t sync_id = 0;
         std::mutex lock;
         std::once_flag init_flag;
         std::vector<BufferPool> pools;
         std::vector<void*> rocr_allocs;
     };
-
+    friend class KernargBufferPools;
     std::shared_ptr<KernargBufferPools> kernargBufferPools;
 
+    bool isKernargCoarseGrained() {
+        if (HCC_KERNARG_MANAGER) {
+            return kernargBufferPools->isKernargCoarseGrained();
+        }
+        return false;
+    }
+
     void releaseKernargBuffer(const std::pair<void*, size_t>& b) {
-        if (HCC_NEW_KERNARG_MANAGER) {
+        if (HCC_KERNARG_MANAGER) {
             return kernargBufferPools->releaseKernargBuffer(b);
         }
         else if (b.first) {
@@ -3268,7 +3290,7 @@ class HSADevice final : public KalmarDevice
 
     std::pair<void*, int> getKernargBuffer(size_t size) {
 
-        if (HCC_NEW_KERNARG_MANAGER) {
+        if (HCC_KERNARG_MANAGER) {
             return kernargBufferPools->getKernargBuffer(size);
         }
 
@@ -4070,8 +4092,8 @@ void HSAContext::ReadHccEnv()
     GET_ENV_STRING (HCC_PROFILE_FILE,    "Set file name for HCC_PROFILE mode.  Default=stderr");
     GET_ENV_INT    (HCC_FLUSH_ON_WAIT,   "recover all resources on queue wait");
 
-    GET_ENV_INT    (HCC_NEW_KERNARG_MANAGER, "Enable the new kernarg pool manager.  Default=1");
-    GET_ENV_INT    (HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED, "Use coarse grained memory for kernarg.  Default=0");
+    GET_ENV_INT    (HCC_KERNARG_MANAGER, "Enable the new kernarg pool manager.  Default=1");
+    GET_ENV_INT    (HCC_KERNARG_MANAGER_COARSE_GRAINED, "Use coarse grained memory for kernarg.  Default=0");
 };
 
 
@@ -4149,6 +4171,15 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) :
     status = hsa_amd_agent_iterate_memory_pools(agent, &HSADevice::get_memory_pools, &ri);
     STATUS_CHECK(status, __LINE__);
 
+    if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED) {
+        if (ri._found_local_memory_pool &&
+            hasAccess(getHostAgent(), ri._local_memory_pool)) {
+            DBOUT(DB_INIT, "using coarse-grained GPU local memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl);
+            ri._kernarg_memory_pool = ri._local_memory_pool;
+            ri._found_kernarg_memory_pool = true;
+        }
+    }
+
     status = hsa_amd_agent_iterate_memory_pools(hostAgent, HSADevice::get_host_pools, &ri);
     STATUS_CHECK(status, __LINE__);
 
@@ -4167,9 +4198,9 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) :
     }
     useCoarseGrainedRegion = result;
 
-    kernargBufferPools = std::make_shared<KernargBufferPools>(*this, getHSAKernargRegion());
+    kernargBufferPools = std::make_shared<KernargBufferPools>(*this);
 
-    if (!HCC_NEW_KERNARG_MANAGER) {
+    if (!HCC_KERNARG_MANAGER) {
         /// pre-allocate a pool of kernarg buffers in case:
         /// - kernarg region is available
         /// - compile-time macro KERNARG_POOL_SIZE is larger than 0
@@ -4803,6 +4834,9 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
     if (hostKernargSize > 0) {
         kernargMemory = device->getKernargBuffer(hostKernargSize);
         memcpy(kernargMemory.first, hostKernarg, hostKernargSize);
+        if (device->isKernargCoarseGrained()) {
+            // TODO:  flush HDP
+        }
         aql.kernarg_address = kernargMemory.first;
     } else {
         aql.kernarg_address = nullptr;
diff --git a/tests/lit.cfg b/tests/lit.cfg
index 48c7e3ce830..cbdbc1b1e1d 100644
--- a/tests/lit.cfg
+++ b/tests/lit.cfg
@@ -61,11 +61,11 @@ if os.environ.get('AMDGPU_OBJ_CODEGEN'):
 if os.environ.get('HCC_EXTRA_GPU_ARCH'):
     config.environment['HCC_EXTRA_GPU_ARCH'] = os.environ['HCC_EXTRA_GPU_ARCH']
 
-if os.environ.get('HCC_NEW_KERNARG_MANAGER'):
-    config.environment['HCC_NEW_KERNARG_MANAGER'] = os.environ['HCC_NEW_KERNARG_MANAGER']
+if os.environ.get('HCC_KERNARG_MANAGER'):
+    config.environment['HCC_KERNARG_MANAGER'] = os.environ['HCC_KERNARG_MANAGER']
 
-if os.environ.get('HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED'):
-    config.environment['HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED'] = os.environ['HCC_NEW_KERNARG_MANAGER_COARSE_GRAINED']
+if os.environ.get('HCC_KERNARG_MANAGER_COARSE_GRAINED'):
+    config.environment['HCC_KERNARG_MANAGER_COARSE_GRAINED'] = os.environ['HCC_KERNARG_MANAGER_COARSE_GRAINED']
 
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)

From 3e0d6bc7eb083c269d1d80f8ae51a4cdf73f8786 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Tue, 18 Feb 2020 20:50:26 -0500
Subject: [PATCH 07/14] Flush HDP after writing to kernarg if the memory is gpu
 coarse grained

---
 lib/hsa/mcwamp_hsa.cpp | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index f4c68f0ddec..91a3dd2c1e6 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -2270,6 +2270,11 @@ class HSADevice final : public KalmarDevice
 
     std::map<std::string, HSAKernel *> programs;
     hsa_agent_t agent;
+
+    // Information related to executing an HDP flush
+    hsa_amd_hdp_flush_t hdp;
+    bool has_hdp_access;
+
     size_t max_tile_static_size;
 
     size_t queue_size;
@@ -3404,6 +3409,12 @@ class HSADevice final : public KalmarDevice
         return std::make_pair(ret, cursor);
     }
 
+    void hdp_mem_flush() {
+        if (!has_hdp_access)
+            throw Kalmar::runtime_exception("HDP flush error", 0);
+        __atomic_store_n(reinterpret_cast<uint32_t*>(hdp.HDP_MEM_FLUSH_CNTL), 0x1, __ATOMIC_SEQ_CST);
+    }
+
     void* getSymbolAddress(const char* symbolName) override {
         hsa_status_t status;
 
@@ -4171,7 +4182,13 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) :
     status = hsa_amd_agent_iterate_memory_pools(agent, &HSADevice::get_memory_pools, &ri);
     STATUS_CHECK(status, __LINE__);
 
-    if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED) {
+    hdp.HDP_MEM_FLUSH_CNTL = nullptr;
+    hdp.HDP_REG_FLUSH_CNTL = nullptr;
+    status = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdp);
+    has_hdp_access = (status == HSA_STATUS_SUCCESS &&
+                      hdp.HDP_MEM_FLUSH_CNTL != nullptr);
+
+    if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED && has_hdp_access) {
         if (ri._found_local_memory_pool &&
             hasAccess(getHostAgent(), ri._local_memory_pool)) {
             DBOUT(DB_INIT, "using coarse-grained GPU local memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl);
@@ -4835,7 +4852,9 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
         kernargMemory = device->getKernargBuffer(hostKernargSize);
         memcpy(kernargMemory.first, hostKernarg, hostKernargSize);
         if (device->isKernargCoarseGrained()) {
-            // TODO:  flush HDP
+            // If kernarg is in GPU coarse grained memory, flush the HDP
+            // content visible to the GPU
+            device->hdp_mem_flush();
         }
         aql.kernarg_address = kernargMemory.first;
     } else {

From 207935b19ab3cbae779b3653896af42bdb551b3d Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Thu, 20 Feb 2020 00:31:42 -0500
Subject: [PATCH 08/14] If kernarg is in coarse grained GPU memory, add a sys
 scope acq fence after recycling old kernarg buffers so that the GPU cache
 is flushed

---
 lib/hsa/mcwamp_hsa.cpp | 70 +++++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 22 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 91a3dd2c1e6..eb7463e1822 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -21,6 +21,7 @@
 #include <sstream>
 #include <string>
 #include <thread>
+#include <tuple>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -1059,7 +1060,7 @@ class HSADispatch : public HSAOp {
     std::vector<uint8_t> arg_vec;
     uint32_t arg_count;
     size_t prevArgVecCapacity;
-    std::pair<void*, size_t> kernargMemory = {nullptr, 0};
+    std::tuple<void*, size_t, uint32_t> kernargMemory{nullptr, 0, 0};
 
     hsa_kernel_dispatch_packet_t aql;
     hsa_wait_state_t waitMode;
@@ -1356,6 +1357,10 @@ class HSAQueue final : public KalmarQueue
     // indicate whether this is a cooperative queue
     bool is_cooperative;
 
+    // Only used when kernarg buffers live in coarse grained GPU memory.
+    // Holds the kernarg pool sync id of this queue's last synchronization point.
+    uint32_t last_kernarg_sync_id;
+
 public:
     HSAQueue(KalmarDevice* pDev, hsa_agent_t agent, execute_order order, queue_priority priority, bool cooperative = false) ;
 
@@ -2175,6 +2180,8 @@ class HSAQueue final : public KalmarQueue
 
     bool copy2d_ext(const void *src, void *dst, size_t width, size_t height, size_t srcPitch, size_t dstPitch, hc::hcCommandKind copyDir, const hc::AmPointerInfo &srcPtrInfo, const hc::AmPointerInfo &dstPtrInfo, const Kalmar::KalmarDevice *copyDevice, bool forceUnpinnedCopy);
 
+    uint32_t get_last_kernarg_sync_id() { return last_kernarg_sync_id; }
+    void set_last_kernarg_sync_id(uint32_t sync_id) { last_kernarg_sync_id = sync_id; }
 };
 
 RocrQueue::RocrQueue(hsa_agent_t agent, size_t queue_size, HSAQueue *hccQueue, queue_priority priority)
@@ -3150,7 +3157,7 @@ class HSADevice final : public KalmarDevice
                 hsa_amd_memory_pool_free(b);
             }
         }
-        std::pair<void*, size_t> getKernargBuffer(const size_t size) {
+        std::tuple<void*, size_t, uint32_t> getKernargBuffer(const size_t size) {
             std::lock_guard<std::mutex> l{lock};
             for (auto& p : pools) {
                 if (std::get<_buffer_size>(p) >= size) {
@@ -3172,7 +3179,7 @@ class HSADevice final : public KalmarDevice
                             sync_id++;
                         }
                     }
-                    auto r = std::make_pair(fp.back(), std::get<_buffer_size>(p));
+                    auto r = std::make_tuple(fp.back(), std::get<_buffer_size>(p), sync_id);
                     fp.pop_back();
                     return r;
                 }               
@@ -3180,12 +3187,14 @@ class HSADevice final : public KalmarDevice
             throw Kalmar::runtime_exception("Can't find suitable kernarg buffer.", -1);
         }
 
-        void releaseKernargBuffer(const std::pair<void*, size_t>& b) {
-            if (b.first == nullptr) return;
+        void releaseKernargBuffer(const std::tuple<void*, size_t, uint32_t>& b) {
+            const auto kernarg_ptr = std::get<0>(b);
+            const auto kernarg_size = std::get<1>(b);
+            if (kernarg_ptr == nullptr) return;
             std::lock_guard<std::mutex> l{lock};
             for (auto& p : pools) {
-                if (std::get<_buffer_size>(p) == b.second) {
-                    std::get<_released_pool>(p).push_back(b.first);
+                if (std::get<_buffer_size>(p) == kernarg_size) {
+                    std::get<_released_pool>(p).push_back(kernarg_ptr);
                     return;
                 }
             }
@@ -3243,6 +3252,13 @@ class HSADevice final : public KalmarDevice
     friend class KernargBufferPools;
     std::shared_ptr<KernargBufferPools> kernargBufferPools;
 
+    uint32_t getSyncID() {
+        if (HCC_KERNARG_MANAGER) {
+            return kernargBufferPools->getSyncID();
+        }
+        return 0;
+    }
+
     bool isKernargCoarseGrained() {
         if (HCC_KERNARG_MANAGER) {
             return kernargBufferPools->isKernargCoarseGrained();
@@ -3250,12 +3266,12 @@ class HSADevice final : public KalmarDevice
         return false;
     }
 
-    void releaseKernargBuffer(const std::pair<void*, size_t>& b) {
+    void releaseKernargBuffer(const std::tuple<void*, size_t, uint32_t>& b) {
         if (HCC_KERNARG_MANAGER) {
             return kernargBufferPools->releaseKernargBuffer(b);
         }
-        else if (b.first) {
-            return releaseKernargBuffer(b.first, b.second);
+        else if (std::get<0>(b)) {
+            return releaseKernargBuffer(std::get<0>(b), std::get<1>(b));
         }
     }
 
@@ -3293,7 +3309,7 @@ class HSADevice final : public KalmarDevice
         };
     }
 
-    std::pair<void*, int> getKernargBuffer(size_t size) {
+    std::tuple<void*, size_t, uint32_t> getKernargBuffer(size_t size) {
 
         if (HCC_KERNARG_MANAGER) {
             return kernargBufferPools->getKernargBuffer(size);
@@ -3403,10 +3419,7 @@ class HSADevice final : public KalmarDevice
             cursor = -1;
             memset (ret, 0x00, size);
         }
-
-
-
-        return std::make_pair(ret, cursor);
+        return std::make_tuple(ret, cursor, 0);
     }
 
     void hdp_mem_flush() {
@@ -4364,6 +4377,11 @@ HSAQueue::HSAQueue(KalmarDevice* pDev, hsa_agent_t agent, execute_order order, q
 
     hsa_status_t status= hsa_signal_create(1, 1, &agent, &sync_copy_signal);
     STATUS_CHECK(status, __LINE__);
+
+    auto hsa_dev_ptr = reinterpret_cast<Kalmar::HSADevice*>(pDev);
+    if (hsa_dev_ptr->isKernargCoarseGrained()) {
+        set_last_kernarg_sync_id(hsa_dev_ptr->getSyncID() - 1);
+    }
 }
 
 void HSAQueue::dispose() {
@@ -4844,19 +4862,26 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
         header |= (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE);
     }
 
-
     // bind kernel arguments
-    //printf("hostKernargSize size: %d in bytesn", hostKernargSize);
-
     if (hostKernargSize > 0) {
         kernargMemory = device->getKernargBuffer(hostKernargSize);
-        memcpy(kernargMemory.first, hostKernarg, hostKernargSize);
+        auto kernarg_ptr = std::get<0>(kernargMemory);
+        memcpy(kernarg_ptr, hostKernarg, hostKernargSize);
         if (device->isKernargCoarseGrained()) {
             // If kernarg is in GPU coarse grained memory, flush the HDP
             // content visible to the GPU
             device->hdp_mem_flush();
+            auto sync_id = std::get<2>(kernargMemory);
+            auto q_ptr = reinterpret_cast<Kalmar::HSAQueue*>(getQueue());
+            auto queue_last_kernarg_sync_id = q_ptr->get_last_kernarg_sync_id();
+            if (queue_last_kernarg_sync_id != sync_id) {
+                // kernarg buffers have been recycled, put a system scope acquire fence to
+                // clear the GPU cache to purge the content of staled kernarg buffers
+                header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE);
+                q_ptr->set_last_kernarg_sync_id(sync_id);
+            }
         }
-        aql.kernarg_address = kernargMemory.first;
+        aql.kernarg_address = kernarg_ptr;
     } else {
         aql.kernarg_address = nullptr;
     }
@@ -5001,9 +5026,10 @@ HSADispatch::dispose() {
     // clear reference counts for signal-less ops.
     asyncOpsWithoutSignal.clear();
 
-    if (kernargMemory.first) {
+    if (std::get<0>(kernargMemory)) {
         device->releaseKernargBuffer(kernargMemory);
-        kernargMemory = {nullptr, 0};
+        constexpr decltype(kernargMemory) reset{nullptr, 0, 0};
+        kernargMemory = reset;
     }
     clearArgs();
     std::vector<uint8_t>().swap(arg_vec);

From 4b711376d6c2d2fe6802cef7dd19a2e2c4da5509 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Fri, 21 Feb 2020 17:16:15 -0500
Subject: [PATCH 09/14] Proper synchronization sequence for kernarg write, HDP
 and AQL

---
 lib/hsa/mcwamp_hsa.cpp | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index eb7463e1822..70320240ca0 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -9,6 +9,7 @@
 #include "../hc2/headers/types/program_state.hpp"
 
 #include <algorithm>
+#include <atomic>
 #include <cassert>
 #include <chrono>
 #include <cstdio>
@@ -3184,7 +3185,9 @@ class HSADevice final : public KalmarDevice
                     return r;
                 }               
             }
-            throw Kalmar::runtime_exception("Can't find suitable kernarg buffer.", -1);
+            std::stringstream err_msg;
+            err_msg << "Can't find suitable kernarg buffer for size " << size << " bytes.";
+            throw Kalmar::runtime_exception(err_msg.str().c_str(), -1);
         }
 
         void releaseKernargBuffer(const std::tuple<void*, size_t, uint32_t>& b) {
@@ -4868,9 +4871,23 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
         auto kernarg_ptr = std::get<0>(kernargMemory);
         memcpy(kernarg_ptr, hostKernarg, hostKernargSize);
         if (device->isKernargCoarseGrained()) {
-            // If kernarg is in GPU coarse grained memory, flush the HDP
-            // content visible to the GPU
+            // If kernarg is in GPU coarse grained memory, we need the
+            // following magic sequence to avoid race conditions between
+            // the CPU and GPU (to ensure the kernarg is visible before
+            // the GPU's CP starts processing the AQL packet):
+            //   1- Write to the kernarg buffer in GPU VRAM (the previous memcpy)
+            //   2- Flush the HDP cache
+            //   3- Do a readback from GPU VRAM to ensure the flush has been completed
+            //      and to prevent the GPU's CP from processing the AQL packet
+            //   4- Write the AQL packet
+    
+            atomic_thread_fence(std::memory_order_acq_rel);
             device->hdp_mem_flush();
+            atomic_thread_fence(std::memory_order_acq_rel);
+            volatile char* read_back = reinterpret_cast<decltype(read_back)>(kernarg_ptr);
+            read_back[0];
+            atomic_thread_fence(std::memory_order_acq_rel);
+
             auto sync_id = std::get<2>(kernargMemory);
             auto q_ptr = reinterpret_cast<Kalmar::HSAQueue*>(getQueue());
             auto queue_last_kernarg_sync_id = q_ptr->get_last_kernarg_sync_id();

From e4d74b4fa99a2ca825dc4e73a3e8d313a35bd063 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Sun, 23 Feb 2020 18:28:03 -0500
Subject: [PATCH 10/14] Update the kernarg pool growth heuristic, add a few
 knobs for debugging

---
 lib/hsa/mcwamp_hsa.cpp | 82 ++++++++++++++++++++++++++++++------------
 tests/lit.cfg          | 15 ++++++++
 2 files changed, 75 insertions(+), 22 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 70320240ca0..1e4a98040cb 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -140,6 +140,11 @@ char * HCC_PROFILE_FILE=nullptr;
 
 int HCC_KERNARG_MANAGER=1;
 int HCC_KERNARG_MANAGER_COARSE_GRAINED=0;
+int HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE=0;
+int HCC_KERNARG_MANAGER_GROW_THRESHOLD=20;
+int HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE=(1024 * 1024);
+int HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS=64;
+int HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE=0;
 
 // Profiler:
 // Use str::stream so output is atomic wrt other threads:
@@ -3149,6 +3154,11 @@ class HSADevice final : public KalmarDevice
             pools.emplace_back(std::make_tuple(2048, std::vector<void*>(0), std::vector<void*>(0)));
             pools.emplace_back(std::make_tuple(4096, std::vector<void*>(0), std::vector<void*>(0)));
 
+            if (HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE > std::get<_buffer_size>(pools.back())) {
+                pools.emplace_back(std::make_tuple(HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE,
+                                                   std::vector<void*>(0), std::vector<void*>(0)));
+            }
+
             if (mem_pool_is_coarse_grained) {
                 sync_id++;
             }
@@ -3165,11 +3175,8 @@ class HSADevice final : public KalmarDevice
                     auto& fp = std::get<_free_pool>(p);
                     auto& rp = std::get<_released_pool>(p);
                     if (fp.empty()) {
-                        constexpr float grow_threshold = 0.2f;
-                        if (rp.size() <=
-                                static_cast<size_t>(grow_threshold * fp.capacity())) {
-                            constexpr size_t grow_mem_bytes = (1024 * 1024);
-                            grow(p, grow_mem_bytes);
+                        if (rp.size() < HCC_KERNARG_MANAGER_GROW_THRESHOLD) {
+                            grow(p);
                             fp.insert(fp.cend(), rp.begin(), rp.end());
                             rp.clear();
                         }
@@ -3214,35 +3221,34 @@ class HSADevice final : public KalmarDevice
 
     private:
 
-        void grow(BufferPool& p, size_t mem_size) {
+        void grow(BufferPool& p) {  // allocate one more chunk from rocr_mem_pool and split it into free buffers of p
 
             const auto buffer_size = std::get<_buffer_size>(p);
-
-            if (mem_size < buffer_size) {
-                throw Kalmar::runtime_exception("Error when growing the kernarg buffer pool", -1);
-            }
-            const auto num_buffers = mem_size / buffer_size;
-            const auto new_capacity = std::get<_free_pool>(p).capacity() + num_buffers;
-
-            std::get<_free_pool>(p).reserve(new_capacity);
-            std::get<_released_pool>(p).reserve(new_capacity);
+            // Size the chunk in size_t (int overflows for large buffers); never less than one buffer.
+            const size_t min_buffers = static_cast<size_t>(std::max(HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS, 1));
+            const size_t min_chunk = static_cast<size_t>(std::max(HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE, 0));
+            auto actual_chunk_size = std::max(min_buffers * buffer_size, min_chunk);
+            const auto num_buffers = actual_chunk_size / buffer_size;  // whole buffers only
+            actual_chunk_size = buffer_size * num_buffers;
 
             char* rocr_alloc = nullptr;
             hsa_status_t status;
-            status = hsa_amd_memory_pool_allocate(rocr_mem_pool, buffer_size * num_buffers, 0, reinterpret_cast<void**>(&rocr_alloc));
+            status = hsa_amd_memory_pool_allocate(rocr_mem_pool, actual_chunk_size, 0, reinterpret_cast<void**>(&rocr_alloc));
             STATUS_CHECK(status, __LINE__);
-
             status = hsa_amd_agents_allow_access(1, &device.agent, NULL, rocr_alloc);
             STATUS_CHECK(status, __LINE__);
-
             rocr_allocs.push_back(rocr_alloc);
+
+            const auto new_capacity = std::get<_free_pool>(p).capacity() + num_buffers;
+            std::get<_free_pool>(p).reserve(new_capacity);
+            std::get<_released_pool>(p).reserve(new_capacity);
+
             auto& fp = std::get<_free_pool>(p);
             for (int i = 0; i < num_buffers; i++, rocr_alloc+=buffer_size) {
                 fp.push_back(rocr_alloc);
             }
         }
 
-
         HSADevice& device;
         hsa_amd_memory_pool_t rocr_mem_pool;
         bool mem_pool_is_coarse_grained;
@@ -4119,8 +4125,39 @@ void HSAContext::ReadHccEnv()
     GET_ENV_STRING (HCC_PROFILE_FILE,    "Set file name for HCC_PROFILE mode.  Default=stderr");
     GET_ENV_INT    (HCC_FLUSH_ON_WAIT,   "recover all resources on queue wait");
 
-    GET_ENV_INT    (HCC_KERNARG_MANAGER, "Enable the new kernarg pool manager.  Default=1");
-    GET_ENV_INT    (HCC_KERNARG_MANAGER_COARSE_GRAINED, "Use coarse grained memory for kernarg.  Default=0");
+    {   // each help text streams the knob's value as it is before GET_ENV_INT is invoked
+        std::stringstream ss;
+        ss << "Enable the new kernarg pool manager.  Default=" << HCC_KERNARG_MANAGER;
+        GET_ENV_INT(HCC_KERNARG_MANAGER, ss.str().c_str());
+    }
+    {
+        std::stringstream ss;
+        ss << "Use coarse grained memory for kernarg.  Default=" << HCC_KERNARG_MANAGER_COARSE_GRAINED;
+        GET_ENV_INT(HCC_KERNARG_MANAGER_COARSE_GRAINED, ss.str().c_str());
+    }
+    {
+        std::stringstream ss;
+        ss << "When using coarse grained memory kernarg, add a system-scope acquire fence for every kernel dispatch.  Default="
+           << HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE;
+        GET_ENV_INT(HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE, ss.str().c_str());
+    }
+    {
+        std::stringstream ss;
+        ss << "Grow the kernarg pool if the number of old buffers being recycled is below this threshold.  Default=" << HCC_KERNARG_MANAGER_GROW_THRESHOLD;
+        GET_ENV_INT(HCC_KERNARG_MANAGER_GROW_THRESHOLD, ss.str().c_str());
+    }
+    {
+        std::stringstream ss;
+        ss << "Minimum amount of memory (in bytes) to allocate when growing the kernarg pool.  Default=" << HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE;
+        GET_ENV_INT(HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE, ss.str().c_str());
+    }
+    {
+        std::stringstream ss;
+        ss << "Minimum number of buffers when growing the kernarg pool.  Default=" << HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS;
+        GET_ENV_INT(HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS, ss.str().c_str());
+    }
+
+    GET_ENV_INT(HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE, "Create an extra kernarg pool for the specified buffer size (in bytes).  The size must be greater than 4096");
 };
 
 
@@ -4891,7 +4928,8 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
             auto sync_id = std::get<2>(kernargMemory);
             auto q_ptr = reinterpret_cast<Kalmar::HSAQueue*>(getQueue());
             auto queue_last_kernarg_sync_id = q_ptr->get_last_kernarg_sync_id();
-            if (queue_last_kernarg_sync_id != sync_id) {
+            if (queue_last_kernarg_sync_id != sync_id ||
+                HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE != 0) {
                 // kernarg buffers have been recycled, put a system scope acquire fence to
                 // clear the GPU cache to purge the content of staled kernarg buffers
                 header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE);
diff --git a/tests/lit.cfg b/tests/lit.cfg
index cbdbc1b1e1d..3ed0a9b6ef6 100644
--- a/tests/lit.cfg
+++ b/tests/lit.cfg
@@ -67,6 +67,21 @@ if os.environ.get('HCC_KERNARG_MANAGER'):
 if os.environ.get('HCC_KERNARG_MANAGER_COARSE_GRAINED'):
     config.environment['HCC_KERNARG_MANAGER_COARSE_GRAINED'] = os.environ['HCC_KERNARG_MANAGER_COARSE_GRAINED']
 
+if os.environ.get('HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE'):
+    config.environment['HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE'] = os.environ['HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE']
+
+if os.environ.get('HCC_KERNARG_MANAGER_GROW_THRESHOLD'):
+    config.environment['HCC_KERNARG_MANAGER_GROW_THRESHOLD'] = os.environ['HCC_KERNARG_MANAGER_GROW_THRESHOLD']
+
+if os.environ.get('HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE'):
+    config.environment['HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE'] = os.environ['HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE']
+
+if os.environ.get('HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS'):
+    config.environment['HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS'] = os.environ['HCC_KERNARG_MANAGER_GROW_MIN_NUM_BUFFERS']
+
+if os.environ.get('HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE'):
+    config.environment['HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE'] = os.environ['HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE']
+
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)
 

From d5657516e1f779003f195bc67e8685f94adefbd0 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Wed, 26 Feb 2020 01:18:59 -0500
Subject: [PATCH 11/14] Minor simplification to kernarg buffer recycling, add
 debug messages

---
 lib/hsa/mcwamp_hsa.cpp | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 1e4a98040cb..50e9478b14d 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -86,7 +86,7 @@
 int HCC_ASYNCOPS_SIZE = (1024);
 int HCC_ASYNCOPS_WITHOUT_SIGNAL_SIZE = (HCC_ASYNCOPS_SIZE/2);
 
-//---
+//--
 // Environment variables:
 int HCC_PRINT_ENV=0;
 
@@ -3175,13 +3175,11 @@ class HSADevice final : public KalmarDevice
                     auto& fp = std::get<_free_pool>(p);
                     auto& rp = std::get<_released_pool>(p);
                     if (fp.empty()) {
-                        if (rp.size() < HCC_KERNARG_MANAGER_GROW_THRESHOLD) {
+                        fp.swap(rp);
+                        DBOUT(DB_KERNARG, "recycling " << fp.size() << 
+                                          " kernarg buffers of size " << std::get<_buffer_size>(p));
+                        if (fp.size() < HCC_KERNARG_MANAGER_GROW_THRESHOLD) {
                             grow(p);
-                            fp.insert(fp.cend(), rp.begin(), rp.end());
-                            rp.clear();
-                        }
-                        else {
-                            fp.swap(rp);
                         }
                         if (mem_pool_is_coarse_grained) {
                             sync_id++;
@@ -3240,6 +3238,11 @@ class HSADevice final : public KalmarDevice
             rocr_allocs.push_back(rocr_alloc);
 
             const auto new_capacity = std::get<_free_pool>(p).capacity() + num_buffers;
+
+            DBOUT(DB_KERNARG, "growing kernarg pool (" << std::get<_buffer_size>(p) << ") from " << 
+                              std::get<_free_pool>(p).capacity() << " to " << 
+                              new_capacity << " buffers");
+
             std::get<_free_pool>(p).reserve(new_capacity);
             std::get<_released_pool>(p).reserve(new_capacity);
 
@@ -4241,12 +4244,16 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) :
     has_hdp_access = (status == HSA_STATUS_SUCCESS &&
                       hdp.HDP_MEM_FLUSH_CNTL != nullptr);
 
-    if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED && has_hdp_access) {
-        if (ri._found_local_memory_pool &&
-            hasAccess(getHostAgent(), ri._local_memory_pool)) {
-            DBOUT(DB_INIT, "using coarse-grained GPU local memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl);
-            ri._kernarg_memory_pool = ri._local_memory_pool;
-            ri._found_kernarg_memory_pool = true;
+    if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED) {
+        if (has_hdp_access) {
+            if (ri._found_local_memory_pool &&
+                hasAccess(getHostAgent(), ri._local_memory_pool)) {
+                DBOUT(DB_KERNARG, "Using coarse-grained GPU memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl);
+                ri._kernarg_memory_pool = ri._local_memory_pool;
+                ri._found_kernarg_memory_pool = true;
+            }
+        } else {
+            DBOUT(DB_KERNARG, "Not using coarse-grained GPU memory for kernarg due to no access to HDP registers.");
         }
     }
 
@@ -4975,9 +4982,11 @@ HSADispatch::dispatchKernel(hsa_queue_t* lockedHsaQueue, const void *hostKernarg
     DBOUTL(DB_AQL, " dispatch_aql " << *this << "(hwq=" << lockedHsaQueue << ") kernargs=" << hostKernargSize << " " << *q_aql );
     DBOUTL(DB_AQL2, rawAql(*q_aql));
 
+#if 0
     if (DBFLAG(DB_KERNARG)) {
         printKernarg(q_aql->kernarg_address, hostKernargSize);
     }
+#endif
 
     // Register signal callback.
     if (_activity_prof.is_enabled()) {

From 3cba1970f4fac84bedc971c8d3cb2c865c1d4801 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Wed, 26 Feb 2020 14:27:11 -0500
Subject: [PATCH 12/14] add a few more debug messages for coarse grained VM
 kernarg

---
 lib/hsa/mcwamp_hsa.cpp | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 50e9478b14d..da4a5b6b392 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -3177,7 +3177,7 @@ class HSADevice final : public KalmarDevice
                     if (fp.empty()) {
                         fp.swap(rp);
                         DBOUT(DB_KERNARG, "recycling " << fp.size() << 
-                                          " kernarg buffers of size " << std::get<_buffer_size>(p));
+                                          " kernarg buffers of size " << std::get<_buffer_size>(p) << std::endl);
                         if (fp.size() < HCC_KERNARG_MANAGER_GROW_THRESHOLD) {
                             grow(p);
                         }
@@ -3241,7 +3241,7 @@ class HSADevice final : public KalmarDevice
 
             DBOUT(DB_KERNARG, "growing kernarg pool (" << std::get<_buffer_size>(p) << ") from " << 
                               std::get<_free_pool>(p).capacity() << " to " << 
-                              new_capacity << " buffers");
+                              new_capacity << " buffers" << std::endl);
 
             std::get<_free_pool>(p).reserve(new_capacity);
             std::get<_released_pool>(p).reserve(new_capacity);
@@ -4246,14 +4246,21 @@ HSADevice::HSADevice(hsa_agent_t a, hsa_agent_t host, int x_accSeqNum) :
 
     if (HCC_KERNARG_MANAGER && HCC_KERNARG_MANAGER_COARSE_GRAINED) {
         if (has_hdp_access) {
-            if (ri._found_local_memory_pool &&
-                hasAccess(getHostAgent(), ri._local_memory_pool)) {
-                DBOUT(DB_KERNARG, "Using coarse-grained GPU memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl);
-                ri._kernarg_memory_pool = ri._local_memory_pool;
-                ri._found_kernarg_memory_pool = true;
+            if (ri._found_local_memory_pool) {
+                if (hasAccess(getHostAgent(), ri._local_memory_pool)) {
+                    DBOUT(DB_KERNARG, "Using coarse-grained GPU memory for kernarg, size(MB) = " << ri._local_memory_pool_size << std::endl);
+                    ri._kernarg_memory_pool = ri._local_memory_pool;
+                    ri._found_kernarg_memory_pool = true;
+                }
+                else {
+                    DBOUT(DB_KERNARG, "Not using coarse-grained GPU memory for kernarg since host agent has no access to this memory.\n");
+                }
+            }
+            else {
+                DBOUT(DB_KERNARG, "Not using coarse-grained GPU memory for kernarg since this memory is unavailable.\n");
             }
         } else {
-            DBOUT(DB_KERNARG, "Not using coarse-grained GPU memory for kernarg due to no access to HDP registers.");
+            DBOUT(DB_KERNARG, "Not using coarse-grained GPU memory for kernarg due to no access to HDP registers.\n");
         }
     }
 

From d4badc6b1d4f565e2292b581383071c261644559 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Mon, 2 Mar 2020 15:51:01 -0500
Subject: [PATCH 13/14] Kernarg - enable coarse grained memory support by
 default

---
 lib/hsa/mcwamp_hsa.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index da4a5b6b392..7e3fe4e6629 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -139,7 +139,7 @@ int HCC_PROFILE_VERBOSE=0x1F;
 char * HCC_PROFILE_FILE=nullptr;
 
 int HCC_KERNARG_MANAGER=1;
-int HCC_KERNARG_MANAGER_COARSE_GRAINED=0;
+int HCC_KERNARG_MANAGER_COARSE_GRAINED=1;
 int HCC_KERNARG_MANAGER_COARSE_GRAINED_FORCED_SYS_ACQ_FENCE=0;
 int HCC_KERNARG_MANAGER_GROW_THRESHOLD=20;
 int HCC_KERNARG_MANAGER_GROW_MIN_CHUNK_SIZE=(1024 * 1024);

From 3da04118b58f7ed82bcc1602033b7057b9b41f51 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Fri, 27 Mar 2020 15:56:23 -0400
Subject: [PATCH 14/14] Bump up kernarg size to take padding and implicit
 argument into consideration

---
 lib/hsa/mcwamp_hsa.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 7e3fe4e6629..eaa293079be 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -3152,7 +3152,9 @@ class HSADevice final : public KalmarDevice
             pools.emplace_back(std::make_tuple( 512, std::vector<void*>(0), std::vector<void*>(0)));
             pools.emplace_back(std::make_tuple(1024, std::vector<void*>(0), std::vector<void*>(0)));
             pools.emplace_back(std::make_tuple(2048, std::vector<void*>(0), std::vector<void*>(0)));
-            pools.emplace_back(std::make_tuple(4096, std::vector<void*>(0), std::vector<void*>(0)));
+
+            // 4k + padding + implicit arguments
+            pools.emplace_back(std::make_tuple(4096 + 2 * 64, std::vector<void*>(0), std::vector<void*>(0)));
 
             if (HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE > std::get<_buffer_size>(pools.back())) {
                 pools.emplace_back(std::make_tuple(HCC_KERNARG_MANAGER_EXTRA_BUFFER_SIZE,