16 changes: 8 additions & 8 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -273,14 +273,14 @@ Measure the memory bandwidth of GPU using the STREAM benchmark. The benchmark te

| Metric Name | Unit | Description |
|------------------------------------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
-| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the copy operation with specified buffer size and block size. |
-| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the scale operation with specified buffer size and block size. |
-| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the add operation with specified buffer size and block size. |
-| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the triad operation with specified buffer size and block size. |
-| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the copy operation with specified buffer size and block size. |
-| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the scale operation with specified buffer size and block size. |
-| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the add operation with specified buffer size and block size. |
-| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the triad operation with specified buffer size and block size. |
+| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp32 or fp64 memory bandwidth of the GPU for the copy operation with the specified data type, buffer size, and block size. |
+| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp32 or fp64 memory bandwidth of the GPU for the scale operation with the specified data type, buffer size, and block size. |
+| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp32 or fp64 memory bandwidth of the GPU for the add operation with the specified data type, buffer size, and block size. |
+| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp32 or fp64 memory bandwidth of the GPU for the triad operation with the specified data type, buffer size, and block size. |
+| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp32 or fp64 memory bandwidth efficiency of the GPU for the copy operation with the specified data type, buffer size, and block size. |
+| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp32 or fp64 memory bandwidth efficiency of the GPU for the scale operation with the specified data type, buffer size, and block size. |
+| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp32 or fp64 memory bandwidth efficiency of the GPU for the add operation with the specified data type, buffer size, and block size. |
+| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp32 or fp64 memory bandwidth efficiency of the GPU for the triad operation with the specified data type, buffer size, and block size. |

### `ib-loopback`

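For a concrete sense of the renamed metrics: the `_gpu_[0-9]_` segment is gone, so a result name now carries only the data type, buffer size, and block size. A minimal sketch that checks a sample name against the documented pattern (the sample values are illustrative, not real output):

```cpp
// Validate a sample metric name against the documented naming pattern.
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::regex pattern(
        R"(STREAM_(COPY|SCALE|ADD|TRIAD)_(double|float)_buffer_[0-9]+_block_[0-9]+_(bw|ratio))");
    const std::string sample = "STREAM_COPY_float_buffer_268435456_block_128_bw"; // illustrative
    std::cout << sample << (std::regex_match(sample, pattern) ? " matches" : " does not match")
              << std::endl;
    return 0;
}
```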
2 changes: 1 addition & 1 deletion examples/benchmarks/gpu_stream.py
@@ -12,7 +12,7 @@

if __name__ == '__main__':
context = BenchmarkRegistry.create_benchmark_context(
-        'gpu-stream', platform=Platform.CUDA, parameters='--num_warm_up 1 --num_loops 10'
+        'gpu-stream', platform=Platform.CUDA, parameters='--num_warm_up 1 --num_loops 10 --data_type double'
)
# For ROCm environment, please specify the benchmark name and the platform as the following.
# context = BenchmarkRegistry.create_benchmark_context(
13 changes: 11 additions & 2 deletions superbench/benchmarks/micro_benchmarks/gpu_stream.py
@@ -51,6 +51,15 @@ def add_parser_arguments(self):
help='Number of data buffer copies performed.',
)

+        self._parser.add_argument(
+            '--data_type',
+            type=str,
+            default='double',
+            choices=['float', 'double'],
+            required=False,
+            help='Data type of the buffer elements.',
+        )
+
self._parser.add_argument(
'--check_data',
action='store_true',
@@ -68,8 +77,8 @@ def _preprocess(self):

self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)

-        args = '--size %d --num_warm_up %d --num_loops %d ' % (
-            self._args.size, self._args.num_warm_up, self._args.num_loops
+        args = '--size %d --num_warm_up %d --num_loops %d --data_type %s' % (
+            self._args.size, self._args.num_warm_up, self._args.num_loops, self._args.data_type
)

if self._args.check_data:
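The wrapper now appends `--data_type float|double` to the command line it builds for the `gpu_stream` binary. The binary's actual option handling lives in `gpu_stream_utils.cpp`, which this diff does not show; a hypothetical sketch of how such a flag could be consumed:

```cpp
// Hypothetical sketch of consuming --data_type on the binary side; the real
// parser in gpu_stream_utils.cpp is not part of this diff.
#include <cstring>
#include <iostream>
#include <string>

int main(int argc, char **argv) {
    std::string data_type = "double"; // mirrors the Python-side default
    for (int i = 1; i + 1 < argc; i++) {
        if (std::strcmp(argv[i], "--data_type") == 0) {
            data_type = argv[i + 1];
        }
    }
    if (data_type != "float" && data_type != "double") {
        std::cerr << "Invalid --data_type: " << data_type << std::endl;
        return 1;
    }
    std::cout << "data_type=" << data_type << std::endl;
    return 0;
}
```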
superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt (file header not captured in this view; path inferred from the source list below)
@@ -15,7 +15,7 @@ find_package(CUDAToolkit QUIET)

# Source files
set(SOURCES
-    gpu_stream_test.cpp
+    gpu_stream_main.cpp
gpu_stream_utils.cpp
gpu_stream.cu
gpu_stream_kernels.cu
89 changes: 45 additions & 44 deletions superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -235,15 +235,15 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchArgs<T>> &args)
cudaError_t cuda_err = cudaSuccess;

if (args->check_data) {
-        // Generate data to copy
-        args->sub.data_buf = static_cast<T *>(numa_alloc_onnode(args->size * sizeof(T), args->numa_id));
+        // Generate data to copy - use local NUMA node for best CPU access
+        args->sub.data_buf = static_cast<T *>(numa_alloc_local(args->size * sizeof(T)));

for (int j = 0; j < args->size / sizeof(T); j++) {
args->sub.data_buf[j] = static_cast<T>(j % kUInt8Mod);
}

-        // Allocate check buffer
-        args->sub.check_buf = static_cast<T *>(numa_alloc_onnode(args->size * sizeof(T), args->numa_id));
+        // Allocate check buffer on local NUMA node
+        args->sub.check_buf = static_cast<T *>(numa_alloc_local(args->size * sizeof(T)));
}

// Allocate buffers
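The allocation change above swaps `numa_alloc_onnode` (pages pinned to an explicit node id) for `numa_alloc_local` (pages on the node of the calling thread), matching the removal of `numa_run_on_node` later in this file. A minimal libnuma sketch contrasting the two calls, assuming the program links against `-lnuma`:

```cpp
// Contrast of the two libnuma allocation styles used before/after this change.
#include <cstdio>
#include <numa.h>

int main() {
    if (numa_available() < 0) {
        std::fprintf(stderr, "libnuma not available\n");
        return 1;
    }
    const size_t size = 1 << 20;
    // Before: pages pinned to an explicit NUMA node id.
    double *pinned = static_cast<double *>(numa_alloc_onnode(size, /*node=*/0));
    // After: pages placed on whichever node the current thread runs on.
    double *local = static_cast<double *>(numa_alloc_local(size));
    numa_free(pinned, size);
    numa_free(local, size);
    return 0;
}
```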
@@ -420,8 +420,10 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kernel
int size_factor = 2;

// Validate data size
-    uint64_t num_elements_in_thread_block = kNumLoopUnroll * num_threads_per_block;
-    uint64_t num_bytes_in_thread_block = num_elements_in_thread_block * sizeof(T);
+    // Each thread processes 128 bits (16 bytes) for optimal memory bandwidth.
+    // For double: uses double2 (16 bytes). For float: would use float4 (16 bytes).
+    constexpr uint64_t kBytesPerThread = 16; // 128-bit aligned access
+    uint64_t num_bytes_in_thread_block = num_threads_per_block * kBytesPerThread;
if (args->size % num_bytes_in_thread_block) {
std::cerr << "RunCopy: Data size should be multiple of " << num_bytes_in_thread_block << std::endl;
return -1;
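The kernels below now take `VecT<T> *` arguments so each thread moves one 128-bit vector: `double2` for `double` and, per the comment above, `float4` for `float`. The `VecT` definition itself is not visible in this diff; one plausible shape, sketched as a trait plus a copy kernel written in its terms:

```cuda
// Plausible sketch of a 128-bit vector-type trait; the real VecT definition
// lives elsewhere in the benchmark sources and may differ.
#include <cuda_runtime.h>

template <typename T> struct VecTrait;
template <> struct VecTrait<double> { using type = double2; }; // 2 x 8 B = 16 B
template <> struct VecTrait<float> { using type = float4; };   // 4 x 4 B = 16 B
template <typename T> using VecT = typename VecTrait<T>::type;

// One 16-byte vector per thread, so a block of N threads covers N * 16 bytes;
// hence the "size must be a multiple of num_bytes_in_thread_block" check above.
template <typename T> __global__ void CopyKernel(VecT<T> *dst, const VecT<T> *src) {
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    dst[i] = src[i];
}
```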
@@ -448,30 +450,30 @@

switch (kernel) {
case Kernel::kCopy:
-        CopyKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-            reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-            reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()));
+        CopyKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+            reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+            reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()));
args->sub.kernel_name = "COPY";
break;
case Kernel::kScale:
-        ScaleKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-            reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-            reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
+        ScaleKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+            reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+            reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
args->sub.kernel_name = "SCALE";
break;
case Kernel::kAdd:
-        AddKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-            reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-            reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
-            reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()));
+        AddKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+            reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+            reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()),
+            reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[1].get()));
size_factor = 3;
args->sub.kernel_name = "ADD";
break;
case Kernel::kTriad:
-        TriadKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-            reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-            reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
-            reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
+        TriadKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+            reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+            reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()),
+            reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
size_factor = 3;
args->sub.kernel_name = "TRIAD";
break;
@@ -583,10 +585,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string

// output formatted results to stdout
// Tags are of format:
-    // STREAM_<Kernelname>_datatype_gpu_<gpu_id>_buffer_<buffer_size>_block_<block_size>
+    // STREAM_<Kernelname>_datatype_buffer_<buffer_size>_block_<block_size>
for (int i = 0; i < args->sub.times_in_ms.size(); i++) {
std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_gpu_" + std::to_string(args->gpu_id) +
"_buffer_" + std::to_string(args->size);
std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_buffer_" + std::to_string(args->size);
for (int j = 0; j < args->sub.times_in_ms[i].size(); j++) {
// Calculate and display bandwidth
double bw = args->size * args->num_loops / args->sub.times_in_ms[i][j] / 1e6;
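On units: `args->size` is in bytes and `times_in_ms` in milliseconds, so bytes/ms divided by 1e6 yields GB/s (1 byte/ms = 1e3 bytes/s, and 1 GB = 1e9 bytes). The `size_factor` set in the kernel switch above (2 for COPY/SCALE, 3 for ADD/TRIAD) scales the bytes actually moved; whether it multiplies into the line shown here is cut off in this diff. A worked example under those assumptions:

```cpp
// Worked bandwidth arithmetic: bytes * loops / ms / 1e6 == GB/s.
#include <iostream>

int main() {
    const double size_bytes = 1e9;  // 1 GB buffer
    const int num_loops = 10;       // measured iterations
    const double elapsed_ms = 10.0; // total time across the loops
    const int size_factor = 3;      // ADD/TRIAD: two reads + one write

    double bw = size_factor * size_bytes * num_loops / elapsed_ms / 1e6;
    std::cout << bw << " GB/s" << std::endl; // 3 * 1e9 * 10 / 10 / 1e6 = 3000 GB/s
    return 0;
}
```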
@@ -608,9 +609,9 @@
/**
* @brief Runs the Stream benchmark.
*
-* @details This function processes the input args, validates and composes the BenchArgs structure for the
-availavble
-* GPUs, and runs the benchmark.
+* @details This function processes the input args, validates and composes the BenchArgs structure for
+* the first visible GPU (CUDA device 0). When running under Superbench's default_local_mode,
+* CUDA_VISIBLE_DEVICES is set per process, so device 0 maps to the assigned physical GPU.
*
* @return int The status code indicating success or failure of the benchmark execution.
* */
@@ -631,21 +632,29 @@ int GpuStream::Run() {
return ret;
}

-    // find all GPUs and compose the Benchmarking data structure
-    for (int j = 0; j < gpu_count; j++) {
-        auto args = std::make_unique<BenchArgs<double>>();
-        args->numa_id = 0;
-        args->gpu_id = j;
-        cudaGetDeviceProperties(&args->gpu_device_prop, j);
+    if (gpu_count < 1) {
+        std::cerr << "Run::No GPU available" << std::endl;
+        return -1;
+    }
+
+    // Run on CUDA device 0 (the visible GPU assigned by CUDA_VISIBLE_DEVICES).
+    if (opts_.data_type == "float") {
+        auto args = std::make_unique<BenchArgs<float>>();
+        args->gpu_id = 0;
+        cudaGetDeviceProperties(&args->gpu_device_prop, 0);
args->num_warm_up = opts_.num_warm_up;
args->num_loops = opts_.num_loops;
args->size = opts_.size;
args->check_data = opts_.check_data;
+        bench_args_.emplace_back(std::move(args));
+    } else {
+        auto args = std::make_unique<BenchArgs<double>>();
+        args->gpu_id = 0;
+        cudaGetDeviceProperties(&args->gpu_device_prop, 0);
+        args->num_warm_up = opts_.num_warm_up;
+        args->num_loops = opts_.num_loops;
+        args->size = opts_.size;
+        args->check_data = opts_.check_data;
-        args->numa_id = 0;
-        args->gpu_id = j;

-        // add data to vector
bench_args_.emplace_back(std::move(args));
}
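The two branches above differ only in the element type, so a small templated helper could build `BenchArgs<T>` once. A sketch with minimal stand-ins (`Opts` and `BenchArgs` here are hypothetical reductions of the real structs in the benchmark headers):

```cuda
// Sketch: factor the duplicated float/double setup into one template.
#include <cuda_runtime.h>
#include <cstdint>
#include <memory>

struct Opts { uint64_t num_warm_up, num_loops, size; bool check_data; };
template <typename T> struct BenchArgs {
    int gpu_id = 0;
    cudaDeviceProp gpu_device_prop{};
    uint64_t num_warm_up = 0, num_loops = 0, size = 0;
    bool check_data = false;
};

template <typename T> std::unique_ptr<BenchArgs<T>> MakeBenchArgs(const Opts &opts) {
    auto args = std::make_unique<BenchArgs<T>>();
    args->gpu_id = 0; // CUDA device 0, remapped per process by CUDA_VISIBLE_DEVICES
    cudaGetDeviceProperties(&args->gpu_device_prop, 0);
    args->num_warm_up = opts.num_warm_up;
    args->num_loops = opts.num_loops;
    args->size = opts.size;
    args->check_data = opts.check_data;
    return args;
}
```

Usage would then collapse each branch body to `bench_args_.emplace_back(MakeBenchArgs<float>(opts_))` and the `double` equivalent.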

@@ -668,14 +677,6 @@
// Print device info with both the memory clock and peak bandwidth
PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop, memory_clock_mhz, peak_bw);

-        // Set the NUMA node
-        ret = numa_run_on_node(curr_args->numa_id);
-        if (ret != 0) {
-            std::cerr << "Run::numa_run_on_node error: " << errno << std::endl;
-            has_error = true;
-            return;
-        }
-
// Run the stream benchmark for the configured data, passing the peak bandwidth
if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<float>>) {
ret = RunStream<float>(curr_args, "float", peak_bw);
superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.h (file header not captured in this view; path inferred from the class definition below)
@@ -34,7 +34,7 @@ class GpuStream {
int Run();

private:
-    using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<double>>>;
+    using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<float>>, std::unique_ptr<BenchArgs<double>>>;
std::vector<BenchArgsVariant> bench_args_;
Opts opts_;

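The widened variant is what lets `Run()` hold either instantiation in one `bench_args_` vector and dispatch with `std::visit` plus `if constexpr`, as seen in the `.cu` changes above. A self-contained sketch of that pattern (the `BenchArgs` stand-in is illustrative):

```cpp
// Variant-based dispatch over BenchArgs<float> / BenchArgs<double>.
#include <iostream>
#include <memory>
#include <type_traits>
#include <variant>
#include <vector>

template <typename T> struct BenchArgs { T scalar{}; }; // illustrative stand-in

using BenchArgsVariant =
    std::variant<std::unique_ptr<BenchArgs<float>>, std::unique_ptr<BenchArgs<double>>>;

int main() {
    std::vector<BenchArgsVariant> bench_args;
    bench_args.emplace_back(std::make_unique<BenchArgs<float>>());
    bench_args.emplace_back(std::make_unique<BenchArgs<double>>());

    for (auto &entry : bench_args) {
        std::visit(
            [](auto &args) {
                using A = std::decay_t<decltype(*args)>;
                if constexpr (std::is_same_v<A, BenchArgs<float>>) {
                    std::cout << "run fp32 stream" << std::endl;
                } else {
                    std::cout << "run fp64 stream" << std::endl;
                }
            },
            entry);
    }
    return 0;
}
```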