From ae80498dcee6117c934d639457dd2abeaca5bbe1 Mon Sep 17 00:00:00 2001 From: 454314380 <454314380@qq.com> Date: Mon, 8 Sep 2025 17:57:04 +0800 Subject: [PATCH 01/13] gpu-burn: collect per-snapshot per-GPU flops/temp and add summary metrics --- .../micro_benchmarks/gpu_burn_test.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py index fba4ad2b3..66284704e 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py +++ b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py @@ -4,6 +4,7 @@ """Module of the GPU-Burn Test.""" import os +import re from superbench.common.utils import logger from superbench.benchmarks import BenchmarkRegistry, Platform @@ -81,6 +82,7 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901 raw_output (str): raw output string of the micro-benchmark. Return: + True if the raw output string is valid and result can be extracted. """ content = raw_output.splitlines() @@ -88,7 +90,9 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901 abort = False failure_msg = 'unknown failure' index = -1 + try: + # detect fatal failure lines for idx, line in enumerate(content): if 'No clients are alive!' in line or "Couldn't init a GPU" \ in line or 'Failure during compute' in line or 'Low mem for result' in line: @@ -124,6 +128,57 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901 self._result.add_raw_data('GPU Burn Failure: ', failure_msg, self._args.log_raw_data) self._result.add_result('abort', 1) return False + + # Parse and emit metrics for every perf snapshot + # Find all performance snapshot lines containing Gflop/s + perf_lines = [line for line in raw_output.splitlines() if 'Gflop/s' in line] + per_gpu_flops, per_gpu_temps = {}, {} + for snap_idx, perf_line in enumerate(perf_lines): + # extract per-GPU Gflops values like '(581623 Gflop/s)' + gflops = re.findall(r'\(([0-9]+(?:\.[0-9]+)?)\s*Gflop/s\)', perf_line) + gflops = [float(x) for x in gflops] + # extract temps: 'temps: 48 C - 49 C - 49 C - 49 C' + temps = [] + m = re.search(r'temps:\s*(.+)$', perf_line) + if m: + temps = [] + for t in m.group(1).split(' - '): + match = re.search(r'(\d+)', t) + if match: + temps.append(int(match.group(1))) + + # Save snapshot raw line + self._result.add_raw_data(f'GPU-Burn_perf_snapshot_{snap_idx}', perf_line, self._args.log_raw_data) + + # Emit per-GPU metrics for this snapshot + num_gpus = max(len(gflops), len(temps), len(gpu_res)) + for i in range(num_gpus): + if i not in per_gpu_flops: + per_gpu_flops[i] = [] + if i not in per_gpu_temps: + per_gpu_temps[i] = [] + if i < len(gflops) and gflops[i] > 0: + self._result.add_result(f'gpu_{snap_idx}_gflops:{i}', gflops[i]) + per_gpu_flops[i].append(gflops[i]) + else: + self._result.add_result(f'gpu_{snap_idx}_gflops:{i}', 0.0) + if i < len(temps): + self._result.add_result(f'gpu_{snap_idx}_temp:{i}', temps[i]) + per_gpu_temps[i].append(temps[i]) + else: + self._result.add_result(f'gpu_{snap_idx}_temp:{i}', -1) + for i in per_gpu_flops: + if len(per_gpu_flops[i]) > 0: + avg_flops = sum(per_gpu_flops[i]) / len(per_gpu_flops[i]) + self._result.add_result(f'gpu_avg_gflops:{i}', avg_flops) + if avg_flops != 0: + self._result.add_result(f'gpu_var_gflops:{i}', (max(per_gpu_flops[i]) - min(per_gpu_flops[i]))/avg_flops) + else: + self._result.add_result(f'gpu_var_gflops:{i}', 0.0) + for i in per_gpu_temps: + if len(per_gpu_temps[i]) > 0: + 
self._result.add_result(f'gpu_max_temp:{i}', max(per_gpu_temps[i]))
+
         except BaseException as e:
             logger.error(
                 'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(

From fe8e8d6117b613e6ffd2370b41375fa2ba189112 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Mon, 8 Sep 2025 19:15:37 +0800
Subject: [PATCH 02/13] add doc

---
 .../benchmarks/micro-benchmarks.md | 169 +++++++++---------
 1 file changed, 87 insertions(+), 82 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index aa3aa965b..539c05f0a 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -166,11 +166,17 @@ Supports the use of double unit types and the use of tensor cores.
 
 #### Metrics
 
-| Name | Unit | Description |
-|-------------------------|----------|------------------------------------------------------------------------------------|
-| gpu-burn/time | time (s) | The runtime for gpu-burn test. |
-| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
-| gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
+| Name | Unit | Description |
+|-----------------------------------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------|
+| gpu-burn/time | time (s) | The runtime for gpu-burn test. |
+| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
+| gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
+| gpu_<snap_idx>_gflops:<gpu_index> | FLOPS (GFLOPS) | Per-snapshot measured GFLOPS for `gpu_index` at snapshot `snap_idx` (snapshot index increments for each performance summary line). |
+| gpu_<snap_idx>_temp:<gpu_index> | temperature (C) | Per-snapshot temperature for `gpu_index` at snapshot `snap_idx`. |
+| gpu_avg_gflops:<gpu_index> | FLOPS (GFLOPS) | Average GFLOPS across all snapshots for `gpu_index`. |
+| gpu_var_gflops:<gpu_index> | | Flops variability metric for `gpu_index` across snapshots using (max-min)/avg. |
+| gpu_max_temp:<gpu_index> | temperature (C) | Maximum observed temperature for `gpu_index` across all snapshots. |
+
 
 ### `cpu-hpl`
@@ -271,16 +277,16 @@ Measure the memory bandwidth of GPU using the STREAM benchmark. The benchmark te
 
 #### Metrics
 
-| Metric Name | Unit | Description |
-|------------------------------------------------------------|------------------|-------------------------------------------------------------------------------------------------------------------------------------------|
-| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the copy operation with specified buffer size and block size. |
-| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the scale operation with specified buffer size and block size. |
-| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the add operation with specified buffer size and block size. |
-| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the triad operation with specified buffer size and block size.
| -| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the copy operation with specified buffer size and block size. | -| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the scale operation with specified buffer size and block size. | -| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the add operation with specified buffer size and block size. | -| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the triad operation with specified buffer size and block size. | +| Metric Name | Unit | Description | +|-------------------------------------------------------------------------|------------------|--------------------------------------------------------------------------------------------------------------------| +| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the copy operation with specified buffer size and block size. | +| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the scale operation with specified buffer size and block size. | +| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the add operation with specified buffer size and block size. | +| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the triad operation with specified buffer size and block size. | +| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the copy operation with specified buffer size and block size. | +| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the scale operation with specified buffer size and block size. | +| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the add operation with specified buffer size and block size. | +| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the triad operation with specified buffer size and block size. 
| ### `ib-loopback` @@ -413,72 +419,72 @@ performed by [nvbandwidth](https://github.com/NVIDIA/nvbandwidth) #### Metrics -| Metrics | Unit | Description | -|---------------------------------------------------------|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| host_to_device_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Host to device CE memcpy using cuMemcpyAsync | -| host_to_device_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | -| device_to_host_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Device to host CE memcpy using cuMemcpyAsync | -| device_to_host_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | -| host_to_device_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A host to device copy is measured while a device to host copy is run simultaneously. Only the host to device copy bandwidth is reported. | -| host_to_device_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | -| device_to_host_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A device to host copy is measured while a host to device copy is run simultaneously. Only the device to host copy bandwidth is reported. | -| device_to_host_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | -| device_to_device_memcpy_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. Read tests launch a copy from the peer device to the target using the target's context. | -| device_to_device_memcpy_read_ce_sum_bw | GB/s | Sum of the output matrix | -| device_to_device_memcpy_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. Write tests launch a copy from the target device to the peer using the target's context. | -| device_to_device_memcpy_write_ce_sum_bw | GB/s | Sum of the output matrix | -| device_to_device_bidirectional_memcpy_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. A copy in the opposite direction of the measured copy is run simultaneously but not measured. Read tests launch a copy from the peer device to the target using the target's context. | -| device_to_device_bidirectional_memcpy_read_ce_sum_bw | GB/s | Sum of the output matrix | -| device_to_device_bidirectional_memcpy_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. A copy in the opposite direction of the measured copy is run simultaneously but not measured. Write tests launch a copy from the target device to the peer using the target's context. | -| device_to_device_bidirectional_memcpy_write_ce_sum_bw | GB/s | Sum of the output matrix | -| all_to_host_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between a single device and the host while simultaneously running copies from all other devices to the host. | -| all_to_host_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | -| all_to_host_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A device to host copy is measured while a host to device copy is run simultaneously. Only the device to host copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic. 
| -| all_to_host_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | -| host_to_all_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between the host to a single device while simultaneously running copies from the host to all other devices. | -| host_to_all_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | -| host_to_all_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A host to device copy is measured while a device to host copy is run simultaneously. Only the host to device copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic. | -| host_to_all_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | -| all_to_one_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. | -| all_to_one_write_ce_sum_bw | GB/s | Sum of the output matrix | -| all_to_one_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. | -| all_to_one_read_ce_sum_bw | GB/s | Sum of the output matrix | -| one_to_all_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. | -| one_to_all_write_ce_sum_bw | GB/s | Sum of the output matrix | -| one_to_all_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. | -| one_to_all_read_ce_sum_bw | GB/s | Sum of the output matrix | -| host_to_device_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Host to device SM memcpy using a copy kernel | -| host_to_device_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | -| device_to_host_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Device to host SM memcpy using a copy kernel | -| device_to_host_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | -| device_to_device_memcpy_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Read tests launch a copy from the peer device to the target using the target's context. | -| device_to_device_memcpy_read_sm_sum_bw | GB/s | Sum of the output matrix | -| device_to_device_memcpy_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Write tests launch a copy from the target device to the peer using the target's context. | -| device_to_device_memcpy_write_sm_sum_bw | GB/s | Sum of the output matrix | -| device_to_device_bidirectional_memcpy_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Copies are run in both directions between each pair, and the sum is reported. Read tests launch a copy from the peer device to the target using the target's context. 
| -| device_to_device_bidirectional_memcpy_read_sm_sum_bw | GB/s | Sum of the output matrix | -| device_to_device_bidirectional_memcpy_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Copies are run in both directions between each pair, and the sum is reported. Write tests launch a copy from the target device to the peer using the target's context. | -| device_to_device_bidirectional_memcpy_write_sm_sum_bw | GB/s | Sum of the output matrix | -| all_to_host_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between a single device and the host while simultaneously running copies from all other devices to the host. | -| all_to_host_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | -| all_to_host_bidirectional_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | A device to host bandwidth of a copy kernel is measured while a host to device copy is run simultaneously. Only the device to host copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic using copy kernels. | -| all_to_host_bidirectional_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | -| host_to_all_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between the host to a single device while simultaneously running copies from the host to all other devices. | -| host_to_all_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | -| host_to_all_bidirectional_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | A host to device bandwidth of a copy kernel is measured while a device to host copy is run simultaneously. Only the host to device copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic using copy kernels. | -| host_to_all_bidirectional_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | -| all_to_one_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. | -| all_to_one_write_sm_sum_bw | GB/s | Sum of the output matrix | -| all_to_one_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. | -| all_to_one_read_sm_sum_bw | GB/s | Sum of the output matrix | -| one_to_all_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. | -| one_to_all_write_sm_sum_bw | GB/s | Sum of the output matrix | -| one_to_all_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. 
| -| one_to_all_read_sm_sum_bw | GB/s | Sum of the output matrix | -| host_device_latency_sm_cpu[0-9]_gpu[0-9]_lat | µs | Host - device SM copy latency using a ptr chase kernel | -| host_device_latency_sm_sum_lat | µs | Sum of the output matrix | -| device_to_device_latency_sm_gpu[0-9]_gpu[0-9]_lat | µs | Measures latency of a pointer dereference operation between each pair of accessible peers. Memory is allocated on a GPU and is accessed by the peer GPU to determine latency. | -| device_to_device_latency_sm_sum_lat | µs | Sum of the output matrix | +| Metrics | Unit | Description | +|---------------------------------------------------------------------|------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| host_to_device_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Host to device CE memcpy using cuMemcpyAsync | +| host_to_device_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | +| device_to_host_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Device to host CE memcpy using cuMemcpyAsync | +| device_to_host_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | +| host_to_device_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A host to device copy is measured while a device to host copy is run simultaneously. Only the host to device copy bandwidth is reported. | +| host_to_device_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | +| device_to_host_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A device to host copy is measured while a host to device copy is run simultaneously. Only the device to host copy bandwidth is reported. | +| device_to_host_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | +| device_to_device_memcpy_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. Read tests launch a copy from the peer device to the target using the target's context. | +| device_to_device_memcpy_read_ce_sum_bw | GB/s | Sum of the output matrix | +| device_to_device_memcpy_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. Write tests launch a copy from the target device to the peer using the target's context. | +| device_to_device_memcpy_write_ce_sum_bw | GB/s | Sum of the output matrix | +| device_to_device_bidirectional_memcpy_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. A copy in the opposite direction of the measured copy is run simultaneously but not measured. Read tests launch a copy from the peer device to the target using the target's context. | +| device_to_device_bidirectional_memcpy_read_ce_sum_bw | GB/s | Sum of the output matrix | +| device_to_device_bidirectional_memcpy_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. A copy in the opposite direction of the measured copy is run simultaneously but not measured. Write tests launch a copy from the target device to the peer using the target's context. 
| +| device_to_device_bidirectional_memcpy_write_ce_sum_bw | GB/s | Sum of the output matrix | +| all_to_host_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between a single device and the host while simultaneously running copies from all other devices to the host. | +| all_to_host_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | +| all_to_host_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A device to host copy is measured while a host to device copy is run simultaneously. Only the device to host copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic. | +| all_to_host_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | +| host_to_all_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between the host to a single device while simultaneously running copies from the host to all other devices. | +| host_to_all_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | +| host_to_all_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A host to device copy is measured while a device to host copy is run simultaneously. Only the host to device copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic. | +| host_to_all_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix | +| all_to_one_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. | +| all_to_one_write_ce_sum_bw | GB/s | Sum of the output matrix | +| all_to_one_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. | +| all_to_one_read_ce_sum_bw | GB/s | Sum of the output matrix | +| one_to_all_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. | +| one_to_all_write_ce_sum_bw | GB/s | Sum of the output matrix | +| one_to_all_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. | +| one_to_all_read_ce_sum_bw | GB/s | Sum of the output matrix | +| host_to_device_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Host to device SM memcpy using a copy kernel | +| host_to_device_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | +| device_to_host_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Device to host SM memcpy using a copy kernel | +| device_to_host_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | +| device_to_device_memcpy_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Read tests launch a copy from the peer device to the target using the target's context. 
| +| device_to_device_memcpy_read_sm_sum_bw | GB/s | Sum of the output matrix | +| device_to_device_memcpy_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Write tests launch a copy from the target device to the peer using the target's context. | +| device_to_device_memcpy_write_sm_sum_bw | GB/s | Sum of the output matrix | +| device_to_device_bidirectional_memcpy_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Copies are run in both directions between each pair, and the sum is reported. Read tests launch a copy from the peer device to the target using the target's context. | +| device_to_device_bidirectional_memcpy_read_sm_sum_bw | GB/s | Sum of the output matrix | +| device_to_device_bidirectional_memcpy_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Copies are run in both directions between each pair, and the sum is reported. Write tests launch a copy from the target device to the peer using the target's context. | +| device_to_device_bidirectional_memcpy_write_sm_sum_bw | GB/s | Sum of the output matrix | +| all_to_host_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between a single device and the host while simultaneously running copies from all other devices to the host. | +| all_to_host_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | +| all_to_host_bidirectional_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | A device to host bandwidth of a copy kernel is measured while a host to device copy is run simultaneously. Only the device to host copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic using copy kernels. | +| all_to_host_bidirectional_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | +| host_to_all_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between the host to a single device while simultaneously running copies from the host to all other devices. | +| host_to_all_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | +| host_to_all_bidirectional_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | A host to device bandwidth of a copy kernel is measured while a device to host copy is run simultaneously. Only the host to device copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic using copy kernels. | +| host_to_all_bidirectional_memcpy_sm_sum_bw | GB/s | Sum of the output matrix | +| all_to_one_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. | +| all_to_one_write_sm_sum_bw | GB/s | Sum of the output matrix | +| all_to_one_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. | +| all_to_one_read_sm_sum_bw | GB/s | Sum of the output matrix | +| one_to_all_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. 
Bandwidth is reported as the total outbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. | +| one_to_all_write_sm_sum_bw | GB/s | Sum of the output matrix | +| one_to_all_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. | +| one_to_all_read_sm_sum_bw | GB/s | Sum of the output matrix | +| host_device_latency_sm_cpu[0-9]_gpu[0-9]_lat | µs | Host - device SM copy latency using a ptr chase kernel | +| host_device_latency_sm_sum_lat | µs | Sum of the output matrix | +| device_to_device_latency_sm_gpu[0-9]_gpu[0-9]_lat | µs | Measures latency of a pointer dereference operation between each pair of accessible peers. Memory is allocated on a GPU and is accessed by the peer GPU to determine latency. | +| device_to_device_latency_sm_sum_lat | µs | Sum of the output matrix | ## Computation-communication Benchmarks @@ -546,4 +552,3 @@ Measure the disk performance through [FIO](https://github.com/axboe/fio/tree/031 | disk-benchmark/${disk_name}_rand_read_write_write_iops | IOPS | Disk random read write write IOPS. | | disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_95.0 | time (ns) | Disk random read write write latency in 95.0 percentile. | | disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_99.0 | time (ns) | Disk random read write write latency in 99.0 percentile. | -| disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_99.9 | time (ns) | Disk random read write write latency in 99.9 percentile. | From 8d8adb39282ffceb0aa64c1d49f0d8cee62b6347 Mon Sep 17 00:00:00 2001 From: 454314380 <454314380@qq.com> Date: Wed, 10 Sep 2025 01:01:50 +0800 Subject: [PATCH 03/13] fix lint issue and update test --- superbench/benchmarks/micro_benchmarks/gpu_burn_test.py | 4 +++- tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py index 66284704e..1e09531d2 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py +++ b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py @@ -172,7 +172,9 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901 avg_flops = sum(per_gpu_flops[i]) / len(per_gpu_flops[i]) self._result.add_result(f'gpu_avg_gflops:{i}', avg_flops) if avg_flops != 0: - self._result.add_result(f'gpu_var_gflops:{i}', (max(per_gpu_flops[i]) - min(per_gpu_flops[i]))/avg_flops) + self._result.add_result( + f'gpu_var_gflops:{i}', (max(per_gpu_flops[i]) - min(per_gpu_flops[i])) / avg_flops + ) else: self._result.add_result(f'gpu_var_gflops:{i}', 0.0) for i in per_gpu_temps: diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py index 3ec352c4d..29ba5c7d7 100644 --- a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py +++ b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py @@ -57,4 +57,7 @@ def test_gpu_burn(self, results): assert (benchmark.result['time'][0] == time) for device in range(8): assert (benchmark.result['gpu_' + str(device) + '_pass'][0] == 1) + assert ('gpu_max_temp:' + str(device) in benchmark.result) + assert ('gpu_avg_gflops:' + str(device) in benchmark.result) + assert 
('gpu_var_gflops:' + str(device) in benchmark.result)
         assert (benchmark.result['abort'][0] == 0)

From 271308127940a8fcbaabb883b0a742b52e208002 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Wed, 10 Sep 2025 01:15:15 +0800
Subject: [PATCH 04/13] fix lint issue

---
 superbench/benchmarks/micro_benchmarks/gpu_burn_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
index 1e09531d2..396f60f56 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
@@ -82,7 +82,6 @@ def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
             raw_output (str): raw output string of the micro-benchmark.
 
         Return:
-
             True if the raw output string is valid and result can be extracted.
         """
         content = raw_output.splitlines()

From cef8747b7737036927da8b0094d0146d61c2ed9e Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Wed, 10 Sep 2025 11:17:10 +0800
Subject: [PATCH 05/13] restore

---
 docs/user-tutorial/benchmarks/micro-benchmarks.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index 539c05f0a..f88d163e5 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -552,3 +552,4 @@ Measure the disk performance through [FIO](https://github.com/axboe/fio/tree/031
 | disk-benchmark/${disk_name}_rand_read_write_write_iops | IOPS | Disk random read write write IOPS. |
 | disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_95.0 | time (ns) | Disk random read write write latency in 95.0 percentile. |
 | disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_99.0 | time (ns) | Disk random read write write latency in 99.0 percentile. |
+| disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_99.9 | time (ns) | Disk random read write write latency in 99.9 percentile. |

From bd6d7be40453ba4a99696cf40a888804927c6da8 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Thu, 11 Sep 2025 16:12:56 +0800
Subject: [PATCH 06/13] update

---
 docs/user-tutorial/benchmarks/micro-benchmarks.md |  2 +-
 .../benchmarks/micro_benchmarks/gpu_burn_test.py  | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index f88d163e5..1adf84a45 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -174,7 +174,7 @@ Supports the use of double unit types and the use of tensor cores.
 | gpu_<snap_idx>_gflops:<gpu_index> | FLOPS (GFLOPS) | Per-snapshot measured GFLOPS for `gpu_index` at snapshot `snap_idx` (snapshot index increments for each performance summary line). |
 | gpu_<snap_idx>_temp:<gpu_index> | temperature (C) | Per-snapshot temperature for `gpu_index` at snapshot `snap_idx`. |
 | gpu_avg_gflops:<gpu_index> | FLOPS (GFLOPS) | Average GFLOPS across all snapshots for `gpu_index`. |
-| gpu_var_gflops:<gpu_index> | | Flops variability metric for `gpu_index` across snapshots using (max-min)/avg. |
+| gpu_var_gflops:<gpu_index> | | Flops variance metric for `gpu_index` across snapshots using (max-min)/avg - 1. |
 | gpu_max_temp:<gpu_index> | temperature (C) | Maximum observed temperature for `gpu_index` across all snapshots. |
 

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
index 396f60f56..2147ef5cf 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
@@ -46,6 +46,12 @@ def add_parser_arguments(self):
             default=10,
             help='Length of time to run GPU-Burn for(in seconds)',
         )
+        self._parser.add_argument(
+            '--warmup_iters',
+            type=int,
+            default=512,
+            help='Number of warmup iterations before performance measurement',
+        )
 
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -158,7 +164,8 @@ def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
                     per_gpu_temps[i] = []
                 if i < len(gflops) and gflops[i] > 0:
                     self._result.add_result(f'gpu_{snap_idx}_gflops:{i}', gflops[i])
-                    per_gpu_flops[i].append(gflops[i])
+                    if snap_idx > self._args.warmup_iters:
+                        per_gpu_flops[i].append(gflops[i])
                 else:
                     self._result.add_result(f'gpu_{snap_idx}_gflops:{i}', 0.0)
                 if i < len(temps):
@@ -172,7 +179,7 @@ def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
                         avg_flops = sum(per_gpu_flops[i]) / len(per_gpu_flops[i])
                         self._result.add_result(f'gpu_avg_gflops:{i}', avg_flops)
                         if avg_flops != 0:
                             self._result.add_result(
-                                f'gpu_var_gflops:{i}', (max(per_gpu_flops[i]) - min(per_gpu_flops[i])) / avg_flops
+                                f'gpu_var_gflops:{i}', (max(per_gpu_flops[i]) - min(per_gpu_flops[i])) / avg_flops - 1
                             )
                         else:
                             self._result.add_result(f'gpu_var_gflops:{i}', 0.0)

From dfab68bcfec4fe76816f8d06b42ac2fe57223784 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Thu, 11 Sep 2025 16:17:25 +0800
Subject: [PATCH 07/13] fix test

---
 superbench/benchmarks/micro_benchmarks/gpu_burn_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
index 2147ef5cf..a30e54197 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
@@ -49,7 +49,7 @@ def add_parser_arguments(self):
         self._parser.add_argument(
             '--warmup_iters',
             type=int,
-            default=512,
+            default=0,
             help='Number of warmup iterations before performance measurement',
         )

From 665299f9deda5e50809258d5cc5d504d0acb0642 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Thu, 11 Sep 2025 19:50:42 +0800
Subject: [PATCH 08/13] fix bug

---
 docs/user-tutorial/benchmarks/micro-benchmarks.md       | 2 +-
 superbench/benchmarks/micro_benchmarks/gpu_burn_test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index 1adf84a45..a4f2982ea 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -174,7 +174,7 @@ Supports the use of double unit types and the use of tensor cores.
 | gpu_<snap_idx>_gflops:<gpu_index> | FLOPS (GFLOPS) | Per-snapshot measured GFLOPS for `gpu_index` at snapshot `snap_idx` (snapshot index increments for each performance summary line). |
 | gpu_<snap_idx>_temp:<gpu_index> | temperature (C) | Per-snapshot temperature for `gpu_index` at snapshot `snap_idx`. |
 | gpu_avg_gflops:<gpu_index> | FLOPS (GFLOPS) | Average GFLOPS across all snapshots for `gpu_index`. |
-| gpu_var_gflops:<gpu_index> | | Flops variance metric for `gpu_index` across snapshots using (max-min)/avg - 1. |
+| gpu_var_gflops:<gpu_index> | | Flops variance metric for `gpu_index` across snapshots using (max-min)/avg. |
 | gpu_max_temp:<gpu_index> | temperature (C) | Maximum observed temperature for `gpu_index` across all snapshots. |

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
index a30e54197..6e8b739c7 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
@@ -179,7 +179,7 @@ def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
                         self._result.add_result(f'gpu_avg_gflops:{i}', avg_flops)
                         if avg_flops != 0:
                             self._result.add_result(
-                                f'gpu_var_gflops:{i}', (max(per_gpu_flops[i]) - min(per_gpu_flops[i])) / avg_flops - 1
+                                f'gpu_var_gflops:{i}', (max(per_gpu_flops[i]) - min(per_gpu_flops[i])) / avg_flops
                             )
                         else:
                             self._result.add_result(f'gpu_var_gflops:{i}', 0.0)

From d97bc222f6f17d1d9508992642cd5c6522be01b7 Mon Sep 17 00:00:00 2001
From: Yuting Jiang
Date: Thu, 18 Sep 2025 17:53:44 +0800
Subject: [PATCH 09/13] bugfix

---
 superbench/benchmarks/micro_benchmarks/gpu_burn_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
index 6e8b739c7..a69615e0f 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
@@ -138,6 +138,7 @@ def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
         # Find all performance snapshot lines containing Gflop/s
         perf_lines = [line for line in raw_output.splitlines() if 'Gflop/s' in line]
         per_gpu_flops, per_gpu_temps = {}, {}
+        num_gpus = 0
        for snap_idx, perf_line in enumerate(perf_lines):
             # extract per-GPU Gflops values like '(581623 Gflop/s)'
             gflops = re.findall(r'\(([0-9]+(?:\.[0-9]+)?)\s*Gflop/s\)', perf_line)
@@ -156,7 +157,7 @@ def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
             self._result.add_raw_data(f'GPU-Burn_perf_snapshot_{snap_idx}', perf_line, self._args.log_raw_data)
 
             # Emit per-GPU metrics for this snapshot
-            num_gpus = max(len(gflops), len(temps), len(gpu_res))
+            num_gpus = max(len(gflops), len(temps), num_gpus)
             for i in range(num_gpus):
                 if i not in per_gpu_flops:
                     per_gpu_flops[i] = []

From 9821e85fdc2ad3a2d191fa7780807bcc727001b4 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Thu, 18 Sep 2025 19:52:06 +0800
Subject: [PATCH 10/13] update test

---
 tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py
index 29ba5c7d7..a567bf44b 100644
--- a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py
@@ -29,7 +29,7 @@ def test_gpu_burn(self, results):
 
         time = 10
 
-        parameters = '--doubles --tensor_core --time ' + str(time)
+        parameters = '--doubles --tensor_core --warmup_iters 128 --time ' + str(time)
         benchmark = benchmark_class(benchmark_name, parameters=parameters)
 
         # Check basic information
@@ -58,6 +58,9 @@ def test_gpu_burn(self, results):
         for device in range(8):
             assert (benchmark.result['gpu_' + str(device) + '_pass'][0] == 1)
             assert ('gpu_max_temp:' + str(device) in benchmark.result)
+            assert (benchmark.result['gpu_max_temp:'
+ str(device)][0] >= 50 ) assert ('gpu_avg_gflops:' + str(device) in benchmark.result) + assert (benchmark.result['gpu_avg_gflops:' + str(device)][0] >= 16000) assert ('gpu_var_gflops:' + str(device) in benchmark.result) + assert (benchmark.result['gpu_var_gflops:' + str(device)][0] <= 0.01) assert (benchmark.result['abort'][0] == 0) From a29000348cbe0038d461c61de75877f9696c6047 Mon Sep 17 00:00:00 2001 From: 454314380 <454314380@qq.com> Date: Thu, 18 Sep 2025 19:57:49 +0800 Subject: [PATCH 11/13] update test --- tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py index a567bf44b..80c8d12cc 100644 --- a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py +++ b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py @@ -63,4 +63,6 @@ def test_gpu_burn(self, results): assert (benchmark.result['gpu_avg_gflops:' + str(device)][0] >= 16000) assert ('gpu_var_gflops:' + str(device) in benchmark.result) assert (benchmark.result['gpu_var_gflops:' + str(device)][0] <= 0.01) + assert ('gpu_195_gflops:' + str(device) in benchmark.result) + assert ('gpu_195_temp:' + str(device) in benchmark.result) assert (benchmark.result['abort'][0] == 0) From 055409b349fd88fe1ce6d9b0c8f7b7dffb4a0cf1 Mon Sep 17 00:00:00 2001 From: 454314380 <454314380@qq.com> Date: Sun, 28 Sep 2025 17:41:10 +0800 Subject: [PATCH 12/13] fix lint issue --- tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py index 80c8d12cc..82454ee53 100644 --- a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py +++ b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py @@ -58,7 +58,7 @@ def test_gpu_burn(self, results): for device in range(8): assert (benchmark.result['gpu_' + str(device) + '_pass'][0] == 1) assert ('gpu_max_temp:' + str(device) in benchmark.result) - assert (benchmark.result['gpu_max_temp:' + str(device)][0] >= 50 ) + assert (benchmark.result['gpu_max_temp:' + str(device)][0] >= 50) assert ('gpu_avg_gflops:' + str(device) in benchmark.result) assert (benchmark.result['gpu_avg_gflops:' + str(device)][0] >= 16000) assert ('gpu_var_gflops:' + str(device) in benchmark.result) From 80eecae3912b71dfb948c17c1c3778430732ef53 Mon Sep 17 00:00:00 2001 From: 454314380 <454314380@qq.com> Date: Tue, 2 Dec 2025 11:50:42 +0800 Subject: [PATCH 13/13] update according to comments --- docs/user-tutorial/benchmarks/micro-benchmarks.md | 10 +++++----- .../benchmarks/micro_benchmarks/gpu_burn_test.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md index a4f2982ea..6ed8585d5 100644 --- a/docs/user-tutorial/benchmarks/micro-benchmarks.md +++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md @@ -171,11 +171,11 @@ Supports the use of double unit types and the use of tensor cores. | gpu-burn/time | time (s) | The runtime for gpu-burn test. | | gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). | | gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). 
|
-| gpu_<snap_idx>_gflops:<gpu_index> | FLOPS (GFLOPS) | Per-snapshot measured GFLOPS for `gpu_index` at snapshot `snap_idx` (snapshot index increments for each performance summary line). |
-| gpu_<snap_idx>_temp:<gpu_index> | temperature (C) | Per-snapshot temperature for `gpu_index` at snapshot `snap_idx`. |
-| gpu_avg_gflops:<gpu_index> | FLOPS (GFLOPS) | Average GFLOPS across all snapshots for `gpu_index`. |
-| gpu_var_gflops:<gpu_index> | | Flops variance metric for `gpu_index` across snapshots using (max-min)/avg. |
-| gpu_max_temp:<gpu_index> | temperature (C) | Maximum observed temperature for `gpu_index` across all snapshots. |
+| gpu_<snap_idx>_gflops | FLOPS (GFLOPS) | Per-snapshot measured GFLOPS for each GPU at snapshot `snap_idx` (snapshot index increments for each performance summary line). |
+| gpu_<snap_idx>_temp | temperature (C) | Per-snapshot temperature for each GPU at snapshot `snap_idx`. |
+| gpu_avg_gflops | FLOPS (GFLOPS) | Average GFLOPS across all snapshots for each GPU. |
+| gpu_var_gflops | | Flops variance metric for each GPU across snapshots using (max-min)/avg. |
+| gpu_max_temp | temperature (C) | Maximum observed temperature for each GPU across all snapshots. |
 
 
 ### `cpu-hpl`

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
index a69615e0f..7e7ca6378 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
@@ -145,10 +145,10 @@ def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
             gflops = [float(x) for x in gflops]
             # extract temps: 'temps: 48 C - 49 C - 49 C - 49 C'
             temps = []
-            m = re.search(r'temps:\s*(.+)$', perf_line)
-            if m:
+            temp_match = re.search(r'temps:\s*(.+)$', perf_line)
+            if temp_match:
                 temps = []
-                for t in m.group(1).split(' - '):
+                for t in temp_match.group(1).split(' - '):
                     match = re.search(r'(\d+)', t)
                     if match:
                         temps.append(int(match.group(1)))
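
Note on the parsing added in this series: the snippet below is a minimal, self-contained sketch of how the regexes from [PATCH 01/13] (with the `temp_match` rename from [PATCH 13/13]) behave on a gpu-burn progress line, and how the summary values behind `gpu_avg_gflops`, `gpu_var_gflops`, and `gpu_max_temp` fall out of them. The sample line is illustrative only; its exact format is assumed from the regexes and the 'temps: 48 C - ...' comment in the patch, and real gpu-burn output may differ slightly between versions.

```python
import re

# Hypothetical gpu-burn progress line for a box with four GPUs.
sample = (
    '10.0%  proc/s: 4 (581623 Gflop/s) - 4 (580102 Gflop/s) - '
    '4 (579988 Gflop/s) - 4 (581001 Gflop/s)   '
    'errors: 0 - 0 - 0 - 0   temps: 48 C - 49 C - 49 C - 49 C'
)

# Per-GPU Gflop/s values, captured from the '(<number> Gflop/s)' groups.
gflops = [float(x) for x in re.findall(r'\(([0-9]+(?:\.[0-9]+)?)\s*Gflop/s\)', sample)]

# Temperatures: everything after 'temps:', split on ' - ', first integer kept.
temps = []
temp_match = re.search(r'temps:\s*(.+)$', sample)
if temp_match:
    for t in temp_match.group(1).split(' - '):
        match = re.search(r'(\d+)', t)
        if match:
            temps.append(int(match.group(1)))

print(gflops)    # [581623.0, 580102.0, 579988.0, 581001.0]
print(temps)     # [48, 49, 49, 49]

# Summary metrics as computed per GPU across snapshots by the patch.
avg_gflops = sum(gflops) / len(gflops)                 # gpu_avg_gflops
var_gflops = (max(gflops) - min(gflops)) / avg_gflops  # gpu_var_gflops
max_temp = max(temps)                                  # gpu_max_temp
print(avg_gflops, var_gflops, max_temp)
```

In the benchmark itself these values are emitted with the snapshot index and device rank in the metric name, e.g. `gpu_0_gflops:1` for GPU 1 at snapshot 0, while the summary metrics carry only the device rank suffix.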