From 32e6df70862ad1c832d09848bf07b1776d53827c Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 8 Mar 2026 02:56:21 +0000 Subject: [PATCH 1/2] feat: use amd-smi native watch mode with built-in timestamps Replace the while-true loop with amd-smi metric -w (native watch mode) which already includes timestamps in CSV output, removing the need for manual timestamp prepending. Pipe through awk to deduplicate headers. Co-authored-by: functionstackx --- benchmarks/benchmark_lib.sh | 24 +++++++++++++++-------- benchmarks/single_node/dsr1_fp8_mi325x.sh | 6 ++++++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 7b2d05dcf..6ed60e4f2 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -9,7 +9,8 @@ GPU_MONITOR_PID="" GPU_METRICS_CSV="/workspace/gpu_metrics.csv" -# Start background nvidia-smi monitoring that logs GPU metrics every second to CSV. +# Start background GPU monitoring that logs metrics every second to CSV. +# Auto-detects NVIDIA (nvidia-smi) or AMD (amd-smi) GPUs. # Usage: start_gpu_monitor [--output /path/to/output.csv] [--interval 1] start_gpu_monitor() { local output="$GPU_METRICS_CSV" @@ -25,15 +26,22 @@ start_gpu_monitor() { GPU_METRICS_CSV="$output" - if ! command -v nvidia-smi &>/dev/null; then - echo "[GPU Monitor] nvidia-smi not found, skipping GPU monitoring" + if command -v nvidia-smi &>/dev/null; then + nvidia-smi --query-gpu=timestamp,index,power.draw,temperature.gpu,clocks.current.sm,clocks.current.memory,utilization.gpu,utilization.memory \ + --format=csv -l "$interval" > "$output" 2>/dev/null & + GPU_MONITOR_PID=$! + echo "[GPU Monitor] Started NVIDIA (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" + elif command -v amd-smi &>/dev/null; then + # Use amd-smi native watch mode (-w) which includes timestamps automatically. + # Pipe through awk to deduplicate CSV headers (only keep the first one). + amd-smi metric -p -c -t -u -w "$interval" --csv 2>/dev/null \ + | awk 'NR==1 || !/^timestamp,/' > "$output" & + GPU_MONITOR_PID=$! + echo "[GPU Monitor] Started AMD (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" + else + echo "[GPU Monitor] No GPU monitoring tool found (nvidia-smi or amd-smi), skipping" return 0 fi - - nvidia-smi --query-gpu=timestamp,index,power.draw,temperature.gpu,clocks.current.sm,clocks.current.memory,utilization.gpu,utilization.memory \ - --format=csv -l "$interval" > "$output" 2>/dev/null & - GPU_MONITOR_PID=$! - echo "[GPU Monitor] Started (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" } # Stop the background GPU monitor and report file size. diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index c2e1ddf6c..6870fe060 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -26,6 +26,9 @@ hf download $MODEL export SGLANG_USE_AITER=1 export SGLANG_AITER_MLA_PERSIST=1 +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + set -x python3 -m sglang.launch_server \ --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ @@ -62,4 +65,7 @@ if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC append_lm_eval_summary fi + +# Stop GPU monitoring +stop_gpu_monitor set +x From b056f54a1a483fb6b89cce687473a2339ac61e72 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 8 Mar 2026 03:12:29 +0000 Subject: [PATCH 2/2] fix: handle amd-smi preamble line in CSV output amd-smi -w prints a 'CTRL+C to stop' preamble before CSV data. Fix awk filter to: skip preamble, keep first CSV header, skip repeated headers. Co-authored-by: functionstackx --- benchmarks/benchmark_lib.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 6ed60e4f2..f69d3c418 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -33,9 +33,9 @@ start_gpu_monitor() { echo "[GPU Monitor] Started NVIDIA (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" elif command -v amd-smi &>/dev/null; then # Use amd-smi native watch mode (-w) which includes timestamps automatically. - # Pipe through awk to deduplicate CSV headers (only keep the first one). + # Pipe through awk to: skip preamble lines, keep first CSV header, skip repeated headers. amd-smi metric -p -c -t -u -w "$interval" --csv 2>/dev/null \ - | awk 'NR==1 || !/^timestamp,/' > "$output" & + | awk '/^timestamp,/{if(!h){print;h=1};next} h{print}' > "$output" & GPU_MONITOR_PID=$! echo "[GPU Monitor] Started AMD (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" else