Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion src/utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -852,6 +852,7 @@ void KernelArgHelper::computeKernargData(amd_comgr_metadata_node_t exec_map)
if (kind == AMD_COMGR_METADATA_KIND_LIST)
{
size_t arg_count;
bool has_hidden_args = false;
CHECK_COMGR(amd_comgr_get_metadata_list_size(args, &arg_count));
for (size_t j = 0; j < arg_count; j++)
{
Expand All @@ -871,16 +872,36 @@ void KernelArgHelper::computeKernargData(amd_comgr_metadata_node_t exec_map)
//std::cout << "Name, Offset, Size\n";
//std::cout << parm_name << "," << arg_offset << "," << arg_size << std::endl;
if (parm_name.rfind("hidden_",0) == 0)
{
// Use the offset of the first hidden argument as the exact
// explicit/hidden boundary. This avoids incorrect rounding when
// sub-8B explicit arguments (e.g. int) cause the last explicit
// argument to end at a non-8-byte-aligned offset.
if (!has_hidden_args)
{
has_hidden_args = true;
desc.explicit_args_length = arg_offset;
}
desc.hidden_args_length = arg_offset + arg_size;
}
else
{
desc.explicit_args_count++;
desc.explicit_args_length = arg_offset + arg_size;
}
}
}
// Only apply rounding when there are no hidden arguments (e.g. some Triton
// kernels). When hidden arguments are present their offset already encodes the
// exact boundary, so rounding is both unnecessary and harmful.
if (!has_hidden_args)
desc.explicit_args_length = std::min(roundArgsLength(desc.explicit_args_length), desc.kernarg_length);
}
desc.explicit_args_length = std::min(roundArgsLength(desc.explicit_args_length), desc.kernarg_length);
// Recompute hidden_args_length as the number of bytes from the explicit/hidden
// boundary to the end of the kernarg segment. The loop set hidden_args_length to
// the absolute end offset of the last hidden arg, which is a different quantity.
// The recalculated value is used as a "has hidden args" flag (non-zero ↔ true)
// and as the basis for clone_hidden_args_length in getArgDescriptor().
desc.hidden_args_length = desc.kernarg_length - desc.explicit_args_length;
kernels_[strName] = desc;
}
Expand Down
12 changes: 12 additions & 0 deletions tests/run_basic_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@ echo "==========================================================================
# Use project's instrumented test kernels
HEATMAP_TEST="${BUILD_DIR}/tests/test_kernels/simple_heatmap_test"
MEMORY_ANALYSIS_TEST="${BUILD_DIR}/tests/test_kernels/simple_memory_analysis_test"
SUB8B_ARGS_TEST="${BUILD_DIR}/tests/test_kernels/sub8b_args_test"

check_kernel "$HEATMAP_TEST"
check_kernel "$MEMORY_ANALYSIS_TEST"
check_kernel "$SUB8B_ARGS_TEST"

# Test 1: Memory heatmap handler
run_test "heatmap_basic" \
Expand All @@ -46,5 +48,15 @@ run_test "heatmap_page_accesses" \
"Heatmap" \
"accesses"

# Test 4: Regression test for sub-8B kernel arguments.
# A kernel with pointer args followed by 32-bit int args has its explicit
# argument list end at a non-8-byte-aligned offset. The old roundArgsLength()
# logic caused an assertion failure in fixupKernArgs. Verify omniprobe runs
# to completion without asserting.
run_test "sub8b_args_regression" \
"$SUB8B_ARGS_TEST" \
"Heatmap" \
"sub8b_args_test done"

# Export updated counters for parent script
export TESTS_RUN TESTS_PASSED TESTS_FAILED
13 changes: 13 additions & 0 deletions tests/test_kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,16 @@ target_compile_options(block_filter_test PRIVATE
target_link_options(block_filter_test PRIVATE -fgpu-rdc)
add_dependencies(block_filter_test copy_bitcode_to_rocm)

# Sub-8B argument test kernel (regression test for issue:
# "Error with sub-8B kernel arguments"). A kernel whose explicit argument list
# ends at a non-8-byte-aligned offset (pointer args + int args) used to trigger
# an assertion in fixupKernArgs due to incorrect explicit_args_length rounding.
add_executable(sub8b_args_test sub8b_args_test.cpp)
set_source_files_properties(sub8b_args_test.cpp PROPERTIES LANGUAGE HIP)
target_compile_options(sub8b_args_test PRIVATE
${TEST_KERNEL_COMPILE_FLAGS}
-fpass-plugin=${INST_PLUGIN}
)
target_link_options(sub8b_args_test PRIVATE -fgpu-rdc)
add_dependencies(sub8b_args_test copy_bitcode_to_rocm)

57 changes: 57 additions & 0 deletions tests/test_kernels/sub8b_args_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Test kernel for sub-8B (e.g. int) kernel arguments.
// This reproduces the bug reported in issue "Error with sub-8B kernel arguments":
// when a kernel has pointer arguments followed by 32-bit integer arguments, the
// kernarg layout places the last explicit argument at a non-8-byte-aligned offset.
// The old roundArgsLength() logic incorrectly rounded that offset up, overshooting
// the hidden-args boundary and triggering an assertion failure in fixupKernArgs().

#include <hip/hip_runtime.h>
#include <iostream>
#include "hip_test_utils.h"

// Kernel with pointer args followed by 32-bit int args (sub-8B).
// This layout mirrors the hgemm_kernel from the bug report:
// ptr(0,8), ptr(8,8), ptr(16,8), int(24,4), int(28,4), int(32,4)
// The last explicit arg ends at byte 36, which is not 8-byte aligned.
__global__ void sub8b_args_kernel(const int* __restrict__ A,
const int* __restrict__ B,
int* __restrict__ C,
int M, int N, int K)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < M * N)
{
// Simple element-wise add using M, N, K to ensure they are live.
int row = idx / N;
int col = idx % N;
if (row < M && col < N && col < K)
C[row * N + col] = A[row * K + col] + B[row * N + col];
}
}

int main()
{
std::cerr << "Starting sub8b_args_test" << std::endl;

constexpr int M = 4;
constexpr int N = 4;
constexpr int K = 4;

int *dA, *dB, *dC;
CHECK_HIP(hipMalloc(&dA, M * K * sizeof(int)));
CHECK_HIP(hipMalloc(&dB, M * N * sizeof(int)));
CHECK_HIP(hipMalloc(&dC, M * N * sizeof(int)));

constexpr int blocksize = 64;
constexpr int no_blocks = 1;

sub8b_args_kernel<<<no_blocks, blocksize>>>(dA, dB, dC, M, N, K);
CHECK_HIP(hipDeviceSynchronize());

CHECK_HIP(hipFree(dA));
CHECK_HIP(hipFree(dB));
CHECK_HIP(hipFree(dC));

std::cerr << "sub8b_args_test done" << std::endl;
return 0;
}