From 7c29487ce7da4bec7f8654e03d79d4a955d3e1e2 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Thu, 7 Jan 2021 03:00:05 -0500 Subject: [PATCH 1/9] SWDEV-2 - Change OpenCL version number from 3240 to 3241 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index d801c3ddd..b81507443 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3240 +#define AMD_PLATFORM_BUILD_NUMBER 3241 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From f201c070c315bb3aa2ba532973618e1ce7528f6e Mon Sep 17 00:00:00 2001 From: Brian Sumner Date: Sat, 23 Apr 2022 09:24:44 -0700 Subject: [PATCH 2/9] SWDEV-307184 - Make sure runtime passes CO version into comgr without cherrypicking it Change-Id: Ia9fd607abd560082d4cdb7eaa42a22c67bfd5f8e --- compiler/lib/utils/OPTIONS.def | 8 ++++++++ device/devprogram.cpp | 29 ++++++++++++++++++----------- device/devprogram.hpp | 3 ++- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/compiler/lib/utils/OPTIONS.def b/compiler/lib/utils/OPTIONS.def index 005465c58..77569fe68 100644 --- a/compiler/lib/utils/OPTIONS.def +++ b/compiler/lib/utils/OPTIONS.def @@ -1267,6 +1267,14 @@ OPTION(OT_BOOL, \ false, 0, 0, NULL, \ "Enable the xnack feature for Finalizer/SC") +// -code-object-version= : code object version +OPTION(OT_UINT32, \ + OA_RUNTIME|OVA_OPTIONAL|OA_SEPARATOR_EQUAL, \ + "code-object-version", NULL, \ + LCCodeObjectVersion, \ + 4, 4, 5, NULL, \ + "Specify code object ABI version. Allowed values are 4, and 5. Defaults to 4. (COMGR only)") + /* Do not remove the following line. Any option should be added above this line. diff --git a/device/devprogram.cpp b/device/devprogram.cpp index f5f3249bd..48d138edd 100644 --- a/device/devprogram.cpp +++ b/device/devprogram.cpp @@ -346,7 +346,7 @@ amd_comgr_status_t Program::createAction(const amd_comgr_language_t oclver, bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, const std::vector& options, const bool requiredDump, amd::option::Options* amdOptions, amd_comgr_data_set_t* output, - char* binaryData[], size_t* binarySize) { + char* binaryData[], size_t* binarySize, const bool link_dev_libs) { amd_comgr_language_t langver; setLanguage(amdOptions->oVariables->CLStd, &langver); @@ -364,19 +364,22 @@ bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, amd_comgr_status_t status = createAction(langver, options, &action, &hasAction); - if (status == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::create_data_set(&dataSetDevLibs); - } + if (link_dev_libs) { + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd::Comgr::create_data_set(&dataSetDevLibs); + } - if (status == AMD_COMGR_STATUS_SUCCESS) { - hasDataSetDevLibs = true; - status = amd::Comgr::do_action(AMD_COMGR_ACTION_ADD_DEVICE_LIBRARIES, action, inputs, - dataSetDevLibs); - extractBuildLog(dataSetDevLibs); + if (status == AMD_COMGR_STATUS_SUCCESS) { + hasDataSetDevLibs = true; + status = amd::Comgr::do_action(AMD_COMGR_ACTION_ADD_DEVICE_LIBRARIES, action, inputs, + dataSetDevLibs); + extractBuildLog(dataSetDevLibs); + } } if (status == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, dataSetDevLibs, *output); + status = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, + (link_dev_libs) ? dataSetDevLibs : inputs, *output); extractBuildLog(*output); } @@ -689,6 +692,7 @@ bool Program::compileImplLC(const std::string& sourceCode, if (device().settings().lcWavefrontSize64_) { driverOptions.push_back("-mwavefrontsize64"); } + driverOptions.push_back("-mcode-object-version=" + std::to_string(options->oVariables->LCCodeObjectVersion)); // Iterate through each source code and dump it into tmp std::fstream f; @@ -967,8 +971,9 @@ bool Program::linkImplLC(const std::vector& inputPrograms, char* binaryData = nullptr; size_t binarySize = 0; std::vector linkOptions; + constexpr bool kLinkDevLibs = false; bool ret = linkLLVMBitcode(inputs, linkOptions, false, options, &output, &binaryData, - &binarySize); + &binarySize, kLinkDevLibs); amd::Comgr::destroy_data_set(output); amd::Comgr::destroy_data_set(inputs); @@ -1202,6 +1207,7 @@ bool Program::linkImplLC(amd::option::Options* options) { if (device().settings().lcWavefrontSize64_) { linkOptions.push_back("wavefrontsize64"); } + linkOptions.push_back("code_object_v" + std::to_string(options->oVariables->LCCodeObjectVersion)); amd_comgr_status_t status = addCodeObjData(llvmBinary_.data(), llvmBinary_.size(), AMD_COMGR_DATA_KIND_BC, @@ -1277,6 +1283,7 @@ bool Program::linkImplLC(amd::option::Options* options) { if (device().settings().lcWavefrontSize64_) { codegenOptions.push_back("-mwavefrontsize64"); } + codegenOptions.push_back("-mcode-object-version=" + std::to_string(options->oVariables->LCCodeObjectVersion)); // NOTE: The params is also used to identy cached code object. This parameter // should not contain any dyanamically generated filename. diff --git a/device/devprogram.hpp b/device/devprogram.hpp index f9c1af19b..099d6a37c 100644 --- a/device/devprogram.hpp +++ b/device/devprogram.hpp @@ -456,7 +456,8 @@ class Program : public amd::HeapObject { bool linkLLVMBitcode(const amd_comgr_data_set_t inputs, const std::vector& options, const bool requiredDump, amd::option::Options* amdOptions, amd_comgr_data_set_t* output, - char* binaryData[] = nullptr, size_t* binarySize = nullptr); + char* binaryData[] = nullptr, size_t* binarySize = nullptr, + const bool link_dev_libs = true); //! Create the bitcode of the compiled input dataset bool compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, From 041c00465b7adcee78085dc42253d42d1bb1f250 Mon Sep 17 00:00:00 2001 From: Konstantin Zhuravlyov Date: Tue, 29 Nov 2022 13:18:15 -0500 Subject: [PATCH 3/9] SWDEV-325538 - Enable code object v5 by default Change-Id: Ifbb782e1177ff51b2de84a1ca0286b401833cb65 --- compiler/lib/utils/OPTIONS.def | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/lib/utils/OPTIONS.def b/compiler/lib/utils/OPTIONS.def index 9b71f52e8..a8a906365 100644 --- a/compiler/lib/utils/OPTIONS.def +++ b/compiler/lib/utils/OPTIONS.def @@ -1272,8 +1272,8 @@ OPTION(OT_UINT32, \ OA_RUNTIME|OVA_OPTIONAL|OA_SEPARATOR_EQUAL, \ "code-object-version", NULL, \ LCCodeObjectVersion, \ - 4, 4, 5, NULL, \ - "Specify code object ABI version. Allowed values are 4, and 5. Defaults to 4. (COMGR only)") + 5, 4, 5, NULL, \ + "Specify code object ABI version. Allowed values are 4, and 5. Defaults to 5. (COMGR only)") /* Do not remove the following line. Any option should be From a983fc1b9161c83bb6879e926226c35e361d8d80 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Mon, 23 Jan 2023 17:40:30 -0500 Subject: [PATCH 4/9] SWDEV-372790 - Copy AQL packet from runtime setup Scheduler in device queue requires relaunching itself. Make sure scheduler uses exactly the same AQL packet as the host launch. Change-Id: I4eb03c4c91bf2408a6d4607731f081a2e2c2c8ae --- device/rocm/rocblit.cpp | 22 ++-------------------- device/rocm/rocvirtual.cpp | 13 ++++++++++++- device/rocm/rocvirtual.hpp | 3 ++- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp index f6b447ca0..913b26b62 100644 --- a/device/rocm/rocblit.cpp +++ b/device/rocm/rocblit.cpp @@ -2682,31 +2682,13 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, Memory* schedulerMem = dev().getRocMemory(schedulerParam); sp->kernarg_address = reinterpret_cast(schedulerMem->getDeviceMemory()); - - sp->hidden_global_offset_x = 0; - sp->hidden_global_offset_y = 0; - sp->hidden_global_offset_z = 0; sp->thread_counter = 0; sp->child_queue = reinterpret_cast(schedulerQueue); sp->complete_signal = schedulerSignal; hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne); - sp->scheduler_aql.header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - sp->scheduler_aql.setup = 1; - sp->scheduler_aql.workgroup_size_x = 1; - sp->scheduler_aql.workgroup_size_y = 1; - sp->scheduler_aql.workgroup_size_z = 1; - sp->scheduler_aql.grid_size_x = threads; - sp->scheduler_aql.grid_size_y = 1; - sp->scheduler_aql.grid_size_z = 1; - sp->scheduler_aql.kernel_object = gpuKernel.KernelCodeHandle(); - sp->scheduler_aql.kernarg_address = (void*)sp->kernarg_address; - sp->scheduler_aql.private_segment_size = 0; - sp->scheduler_aql.group_segment_size = 0; + sp->vqueue_header = vqVM; sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); @@ -2721,7 +2703,7 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, address parameters = captureArguments(kernels_[Scheduler]); if (!gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], - parameters, nullptr)) { + parameters, nullptr, 0, nullptr, &sp->scheduler_aql)) { return false; } releaseArguments(parameters); diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 56b6022f6..dbf7d35ba 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -2781,7 +2781,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) // ================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, void* eventHandle, - uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) { + uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd, + hsa_kernel_dispatch_packet_t* aql_packet) { device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); @@ -3108,6 +3109,16 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_; } + // Copy scheduler's AQL packet for possible relaunch from the scheduler itself + if (aql_packet != nullptr) { + *aql_packet = dispatchPacket; + aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + } + // Dispatch the packet if (!dispatchAqlPacket( &dispatchPacket, aqlHeaderWithOrder, diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index 11a3670ec..18cc34ec3 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -312,7 +312,8 @@ class VirtualGPU : public device::VirtualDevice { const_address parameters, //!< Parameters for the kernel void* event_handle, //!< Handle to OCL event for debugging uint32_t sharedMemBytes = 0, //!< Shared memory size - amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command + amd::NDRangeKernelCommand* vcmd = nullptr, //!< Original launch command + hsa_kernel_dispatch_packet_t* aql_packet = nullptr //!< Scheduler launch ); void submitNativeFn(amd::NativeFnCommand& cmd); void submitMarker(amd::Marker& cmd); From 6d8fc0b8e2b91ec9f38f0c434b0b6163b5af9a15 Mon Sep 17 00:00:00 2001 From: Jiabao Xie Date: Wed, 25 Jan 2023 15:54:46 -0500 Subject: [PATCH 5/9] SWDEV-379991, SWDEV-366886 - Revert "SWDEV-366886 - force svm alloc for rocm" This reverts commit 275f4ddd209ecb39baedc7e127184c728479b510. Reason for revert: performance drop in Quicksilver app Change-Id: I2bdf42ad0a235a74e2bb4d38f86471e9affbd7a6 (cherry picked from commit cdd151dcd4b62d0d13a1273c669f005bd89be7fe) --- device/rocm/rocdevice.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 961227bb7..f5d0f9659 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -2331,7 +2331,6 @@ bool Device::IpcDetach (void* dev_ptr) const { // ================================================================================================ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { - constexpr bool kForceAllocation = true; amd::Memory* mem = nullptr; if (nullptr == svmPtr) { @@ -2343,7 +2342,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_ return nullptr; } - if (!mem->create(nullptr, false, false, kForceAllocation)) { + if (!mem->create(nullptr)) { LogError("failed to create a svm hidden buffer!"); mem->release(); return nullptr; From 1cf8f19822da14b6a439004c74d15dc4ad04a25b Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Thu, 1 Dec 2022 13:32:35 +0000 Subject: [PATCH 6/9] SWDEV-380024 - Fix performance drop in TF-RCCL models Change-Id: Idc845bb0dab858b94b9d2720cae8308cac2e7328 --- device/device.hpp | 11 ++++++++--- device/rocm/rocdevice.cpp | 16 ++++++++++++++-- device/rocm/rocdevice.hpp | 2 ++ device/rocm/rocvirtual.hpp | 6 +++--- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/device/device.hpp b/device/device.hpp index 64d674659..ee9ac0eec 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -1796,9 +1796,14 @@ class Device : public RuntimeObject { // Returns the status of HW event, associated with amd::Event virtual bool IsHwEventReady( - const amd::Event& event, //!< AMD event for HW status validation - bool wait = false //!< If true then forces the event completion - ) const { + const amd::Event& event, //!< AMD event for HW status validation + bool wait = false) const { //!< If true then forces the event completion + return false; + }; + + // Returns the status of HW event, associated with amd::Event + virtual bool IsHwEventReadyForcedWait( + const amd::Event& event) const { //!< AMD event for HW status validation return false; }; diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index f5d0f9659..34395a245 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -2729,10 +2729,22 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI return result; } +// ================================================================================================ +bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const { + void* hw_event = + (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent(); + if (hw_event == nullptr) { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); + return false; + } + static constexpr bool Timeout = true; + return WaitForSignal(reinterpret_cast(hw_event)->signal_, false, true); +} + // ================================================================================================ bool Device::IsHwEventReady(const amd::Event& event, bool wait) const { - void* hw_event = (event.NotifyEvent() != nullptr) ? - event.NotifyEvent()->HwEvent() : event.HwEvent(); + void* hw_event = + (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent(); if (hw_event == nullptr) { ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); return false; diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index b3da3783d..9619abe6a 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -258,6 +258,7 @@ class NullDevice : public amd::Device { cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; } virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; } + virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const { return false; } virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; virtual void ReleaseGlobalSignal(void* signal) const {} @@ -443,6 +444,7 @@ class Device : public NullDevice { cl_set_device_clock_mode_output_amd* pSetClockModeOutput); virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const; + virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; virtual void ReleaseGlobalSignal(void* signal) const; diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index 18cc34ec3..af597ef2e 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -46,10 +46,10 @@ constexpr static uint64_t kUnlimitedWait = std::numeric_limits::max(); // Active wait time out incase same sdma engine is used again, // then just wait instead of adding dependency wait signal. -constexpr static uint64_t kSDMAEngineTimeout = 10; +constexpr static uint64_t kForcedTimeout = 10; template -inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sdma_wait = false) { +inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) { if (hsa_signal_load_relaxed(signal) > 0) { uint64_t timeout = kTimeout100us; if (active_wait) { @@ -57,7 +57,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sd } if (active_wait_timeout) { // If diff engine, wait to 10 ms. Otherwise no wait - timeout = (sdma_wait ? kSDMAEngineTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K; + timeout = (forced_wait ? kForcedTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K; if (timeout == 0) { return false; } From dd5f3d27cce3924c1c3303b8f5e223be93ab252f Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Wed, 8 Mar 2023 04:34:50 +0000 Subject: [PATCH 7/9] SWDEV-386749 - Update stack size limit Change-Id: Id0cf66820e76e1bbd7f6c17ceb110782cdb5f978 --- device/device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/device/device.cpp b/device/device.cpp index 2709febd8..8c86b86e1 100644 --- a/device/device.cpp +++ b/device/device.cpp @@ -751,7 +751,7 @@ bool Device::disableP2P(amd::Device* ptrDev) { } bool Device::UpdateStackSize(uint64_t stackSize) { - if (stackSize > 16 * Ki) { + if (stackSize > ((128 * Ki) - 16)) { return false; } stack_size_ = stackSize; From e047204465e75a62904f95152732c30026534111 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 10 Mar 2023 09:03:34 +0000 Subject: [PATCH 8/9] SWDEV-380035 - Check for agent and ptr match for hsa LOCKED ptr Also do not create Arena Memobj for pinned memory Change-Id: Ibecfe90c62cfa252e3da45408041f3d1cb3acbbb --- device/rocm/rocdevice.cpp | 32 ++++++++++++++++++++------------ device/rocm/rocdevice.hpp | 2 +- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 34395a245..69d5ff058 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -3221,7 +3221,9 @@ device::Signal* Device::createSignal() const { amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) { // Only create arena_mem_object if CPU memory is accessible from HMM // or if runtime received an interop from another ROCr's client - if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size)) { + hsa_amd_pointer_info_t ptr_info = {}; + ptr_info.size = sizeof(hsa_amd_pointer_info_t); + if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size, &ptr_info)) { return nullptr; } @@ -3238,8 +3240,9 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size } // Calculate the offset of the pointer. - const void* dev_ptr = reinterpret_cast(arena_mem_obj_->getDeviceMemory( - *arena_mem_obj_->getContext().devices()[0])->virtualAddress()); + const void* dev_ptr = reinterpret_cast( + arena_mem_obj_->getDeviceMemory(*arena_mem_obj_->getContext().devices()[0]) + ->virtualAddress()); offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); return arena_mem_obj_; @@ -3253,20 +3256,25 @@ void Device::ReleaseGlobalSignal(void* signal) const { } // ================================================================================================ -bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const { - hsa_amd_pointer_info_t ptr_info = {}; - ptr_info.size = sizeof(hsa_amd_pointer_info_t); +bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) { // Query ptr type to see if it's a HMM allocation - hsa_status_t status = hsa_amd_pointer_info( - const_cast(dev_ptr), &ptr_info, nullptr, nullptr, nullptr); + hsa_status_t status = + hsa_amd_pointer_info(const_cast(dev_ptr), ptr_info, nullptr, nullptr, nullptr); // The call should never fail in ROCR, but just check for an error and continue if (status != HSA_STATUS_SUCCESS) { LogError("hsa_amd_pointer_info() failed"); } - // Check if it's a legacy non-HMM allocation in ROCr - if (ptr_info.type != HSA_EXT_POINTER_TYPE_UNKNOWN) { - if ((size != 0) && ((reinterpret_cast(dev_ptr) - - reinterpret_cast(ptr_info.agentBaseAddress)) > size)) { + + // Return false for pinned memory. A true return may result in a race because + // ROCclr may attempt to do a pin/copy/unpin underneath in a multithreaded environment + if (ptr_info->type == HSA_EXT_POINTER_TYPE_LOCKED) { + return false; + } + + if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) { + if ((size != 0) && + ((reinterpret_cast(dev_ptr) - + reinterpret_cast(ptr_info->agentBaseAddress)) > size)) { return false; } return true; diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index 9619abe6a..43898c8e0 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -551,7 +551,7 @@ class Device : public NullDevice { const bool isFineGrainSupported() const; //! Returns True if memory pointer is known to ROCr (excludes HMM allocations) - bool IsValidAllocation(const void* dev_ptr, size_t size) const; + bool IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info); //! Allocates hidden heap for device memory allocations void HiddenHeapAlloc(); From d7491f0e6824cfc0836c6c24f432afd4da7888e4 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Wed, 26 Apr 2023 15:47:26 -0700 Subject: [PATCH 9/9] SWDEV-394243 - Invalidate Barrier Value AQL header Change-Id: Id8e04ffe44da58641361468957d397af128443bb --- device/rocm/rocvirtual.cpp | 2 +- device/rocm/rocvirtual.hpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index dbf7d35ba..451fdcd59 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -1041,7 +1041,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD hsa_signal_t signal, hsa_signal_value_t value, hsa_signal_value_t mask, hsa_signal_condition32_t cond, bool skipTs, hsa_signal_t completionSignal) { - hsa_amd_barrier_value_packet_t barrier_value_packet_ = {0}; uint16_t rest = HSA_AMD_PACKET_TYPE_BARRIER_VALUE; const uint32_t queueSize = gpu_queue_->size; const uint32_t queueMask = queueSize - 1; @@ -1274,6 +1273,7 @@ bool VirtualGPU::create() { // Initialize barrier and barrier value packets memset(&barrier_packet_, 0, sizeof(barrier_packet_)); barrier_packet_.header = kInvalidAql; + barrier_value_packet_.header.header = kInvalidAql; // Create a object of PrintfDbg printfdbg_ = new PrintfDbg(roc_device_); diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index af597ef2e..d113a2899 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -503,6 +503,7 @@ class VirtualGPU : public device::VirtualDevice { hsa_agent_t gpu_device_; //!< Physical device hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu hsa_barrier_and_packet_t barrier_packet_; + hsa_amd_barrier_value_packet_t barrier_value_packet_; uint32_t dispatch_id_; //!< This variable must be updated atomically. Device& roc_device_; //!< roc device object