From 8b08c00d4a6059c77d5f1d900c97d5ac6ca8b442 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 20:05:01 +0000 Subject: [PATCH 1/2] Initial plan From 2dcf6c10b0c571dad5ab5a389b58791853d16bbe Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 20:17:54 +0000 Subject: [PATCH 2/2] Fix assertion failure with sub-8B kernel arguments In KernelArgHelper::computeKernargData(), the explicit_args_length was computed as roundArgsLength(last_explicit_arg_offset + size), rounding up to the next 8-byte boundary. When a kernel has pointer args followed by 32-bit int args (e.g. hgemm_kernel(half*, half*, half*, int, int, int)), the last explicit arg in the instrumented kernel (dh_comms*) is placed at a 4-byte-aligned offset. The rounded explicit_args_length overshot the actual hidden-args boundary, so kernarg_length - explicit_args_length became smaller than clone_hidden_args_length, triggering: assert(desc.clone_hidden_args_length <= desc.kernarg_length - desc.explicit_args_length) Fix: use the offset of the FIRST hidden argument from the kernel metadata as explicit_args_length. This is the exact explicit/hidden boundary with no rounding required. Fall back to roundArgsLength only for kernels that have no hidden arguments (e.g. some Triton kernels), where rounding is harmless. Also: - Add sub8b_args_test.cpp regression test kernel (ptr + int args) - Wire it into CMakeLists.txt and run_basic_tests.sh Co-authored-by: rwvo <21990117+rwvo@users.noreply.github.com> --- src/utils.cc | 23 ++++++++++- tests/run_basic_tests.sh | 12 ++++++ tests/test_kernels/CMakeLists.txt | 13 ++++++ tests/test_kernels/sub8b_args_test.cpp | 57 ++++++++++++++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 tests/test_kernels/sub8b_args_test.cpp diff --git a/src/utils.cc b/src/utils.cc index edab803..e776bcf 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -852,6 +852,7 @@ void KernelArgHelper::computeKernargData(amd_comgr_metadata_node_t exec_map) if (kind == AMD_COMGR_METADATA_KIND_LIST) { size_t arg_count; + bool has_hidden_args = false; CHECK_COMGR(amd_comgr_get_metadata_list_size(args, &arg_count)); for (size_t j = 0; j < arg_count; j++) { @@ -871,7 +872,18 @@ void KernelArgHelper::computeKernargData(amd_comgr_metadata_node_t exec_map) //std::cout << "Name, Offset, Size\n"; //std::cout << parm_name << "," << arg_offset << "," << arg_size << std::endl; if (parm_name.rfind("hidden_",0) == 0) + { + // Use the offset of the first hidden argument as the exact + // explicit/hidden boundary. This avoids incorrect rounding when + // sub-8B explicit arguments (e.g. int) cause the last explicit + // argument to end at a non-8-byte-aligned offset. + if (!has_hidden_args) + { + has_hidden_args = true; + desc.explicit_args_length = arg_offset; + } desc.hidden_args_length = arg_offset + arg_size; + } else { desc.explicit_args_count++; @@ -879,8 +891,17 @@ void KernelArgHelper::computeKernargData(amd_comgr_metadata_node_t exec_map) } } } + // Only apply rounding when there are no hidden arguments (e.g. some Triton + // kernels). When hidden arguments are present their offset already encodes the + // exact boundary, so rounding is both unnecessary and harmful. + if (!has_hidden_args) + desc.explicit_args_length = std::min(roundArgsLength(desc.explicit_args_length), desc.kernarg_length); } - desc.explicit_args_length = std::min(roundArgsLength(desc.explicit_args_length), desc.kernarg_length); + // Recompute hidden_args_length as the number of bytes from the explicit/hidden + // boundary to the end of the kernarg segment. The loop set hidden_args_length to + // the absolute end offset of the last hidden arg, which is a different quantity. + // The recalculated value is used as a "has hidden args" flag (non-zero ↔ true) + // and as the basis for clone_hidden_args_length in getArgDescriptor(). desc.hidden_args_length = desc.kernarg_length - desc.explicit_args_length; kernels_[strName] = desc; } diff --git a/tests/run_basic_tests.sh b/tests/run_basic_tests.sh index 40ebb96..7ec9825 100755 --- a/tests/run_basic_tests.sh +++ b/tests/run_basic_tests.sh @@ -24,9 +24,11 @@ echo "========================================================================== # Use project's instrumented test kernels HEATMAP_TEST="${BUILD_DIR}/tests/test_kernels/simple_heatmap_test" MEMORY_ANALYSIS_TEST="${BUILD_DIR}/tests/test_kernels/simple_memory_analysis_test" +SUB8B_ARGS_TEST="${BUILD_DIR}/tests/test_kernels/sub8b_args_test" check_kernel "$HEATMAP_TEST" check_kernel "$MEMORY_ANALYSIS_TEST" +check_kernel "$SUB8B_ARGS_TEST" # Test 1: Memory heatmap handler run_test "heatmap_basic" \ @@ -46,5 +48,15 @@ run_test "heatmap_page_accesses" \ "Heatmap" \ "accesses" +# Test 4: Regression test for sub-8B kernel arguments. +# A kernel with pointer args followed by 32-bit int args has its explicit +# argument list end at a non-8-byte-aligned offset. The old roundArgsLength() +# logic caused an assertion failure in fixupKernArgs. Verify omniprobe runs +# to completion without asserting. +run_test "sub8b_args_regression" \ + "$SUB8B_ARGS_TEST" \ + "Heatmap" \ + "sub8b_args_test done" + # Export updated counters for parent script export TESTS_RUN TESTS_PASSED TESTS_FAILED diff --git a/tests/test_kernels/CMakeLists.txt b/tests/test_kernels/CMakeLists.txt index a1d3220..fd3b7d4 100644 --- a/tests/test_kernels/CMakeLists.txt +++ b/tests/test_kernels/CMakeLists.txt @@ -83,3 +83,16 @@ target_compile_options(block_filter_test PRIVATE target_link_options(block_filter_test PRIVATE -fgpu-rdc) add_dependencies(block_filter_test copy_bitcode_to_rocm) +# Sub-8B argument test kernel (regression test for issue: +# "Error with sub-8B kernel arguments"). A kernel whose explicit argument list +# ends at a non-8-byte-aligned offset (pointer args + int args) used to trigger +# an assertion in fixupKernArgs due to incorrect explicit_args_length rounding. +add_executable(sub8b_args_test sub8b_args_test.cpp) +set_source_files_properties(sub8b_args_test.cpp PROPERTIES LANGUAGE HIP) +target_compile_options(sub8b_args_test PRIVATE + ${TEST_KERNEL_COMPILE_FLAGS} + -fpass-plugin=${INST_PLUGIN} +) +target_link_options(sub8b_args_test PRIVATE -fgpu-rdc) +add_dependencies(sub8b_args_test copy_bitcode_to_rocm) + diff --git a/tests/test_kernels/sub8b_args_test.cpp b/tests/test_kernels/sub8b_args_test.cpp new file mode 100644 index 0000000..7d25414 --- /dev/null +++ b/tests/test_kernels/sub8b_args_test.cpp @@ -0,0 +1,57 @@ +// Test kernel for sub-8B (e.g. int) kernel arguments. +// This reproduces the bug reported in issue "Error with sub-8B kernel arguments": +// when a kernel has pointer arguments followed by 32-bit integer arguments, the +// kernarg layout places the last explicit argument at a non-8-byte-aligned offset. +// The old roundArgsLength() logic incorrectly rounded that offset up, overshooting +// the hidden-args boundary and triggering an assertion failure in fixupKernArgs(). + +#include +#include +#include "hip_test_utils.h" + +// Kernel with pointer args followed by 32-bit int args (sub-8B). +// This layout mirrors the hgemm_kernel from the bug report: +// ptr(0,8), ptr(8,8), ptr(16,8), int(24,4), int(28,4), int(32,4) +// The last explicit arg ends at byte 36, which is not 8-byte aligned. +__global__ void sub8b_args_kernel(const int* __restrict__ A, + const int* __restrict__ B, + int* __restrict__ C, + int M, int N, int K) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < M * N) + { + // Simple element-wise add using M, N, K to ensure they are live. + int row = idx / N; + int col = idx % N; + if (row < M && col < N && col < K) + C[row * N + col] = A[row * K + col] + B[row * N + col]; + } +} + +int main() +{ + std::cerr << "Starting sub8b_args_test" << std::endl; + + constexpr int M = 4; + constexpr int N = 4; + constexpr int K = 4; + + int *dA, *dB, *dC; + CHECK_HIP(hipMalloc(&dA, M * K * sizeof(int))); + CHECK_HIP(hipMalloc(&dB, M * N * sizeof(int))); + CHECK_HIP(hipMalloc(&dC, M * N * sizeof(int))); + + constexpr int blocksize = 64; + constexpr int no_blocks = 1; + + sub8b_args_kernel<<>>(dA, dB, dC, M, N, K); + CHECK_HIP(hipDeviceSynchronize()); + + CHECK_HIP(hipFree(dA)); + CHECK_HIP(hipFree(dB)); + CHECK_HIP(hipFree(dC)); + + std::cerr << "sub8b_args_test done" << std::endl; + return 0; +}