diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 7b2d05dcf..f69d3c418 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -9,7 +9,8 @@ GPU_MONITOR_PID="" GPU_METRICS_CSV="/workspace/gpu_metrics.csv" -# Start background nvidia-smi monitoring that logs GPU metrics every second to CSV. +# Start background GPU monitoring that logs metrics every second to CSV. +# Auto-detects NVIDIA (nvidia-smi) or AMD (amd-smi) GPUs. # Usage: start_gpu_monitor [--output /path/to/output.csv] [--interval 1] start_gpu_monitor() { local output="$GPU_METRICS_CSV" @@ -25,15 +26,22 @@ start_gpu_monitor() { GPU_METRICS_CSV="$output" - if ! command -v nvidia-smi &>/dev/null; then - echo "[GPU Monitor] nvidia-smi not found, skipping GPU monitoring" + if command -v nvidia-smi &>/dev/null; then + nvidia-smi --query-gpu=timestamp,index,power.draw,temperature.gpu,clocks.current.sm,clocks.current.memory,utilization.gpu,utilization.memory \ + --format=csv -l "$interval" > "$output" 2>/dev/null & + GPU_MONITOR_PID=$! + echo "[GPU Monitor] Started NVIDIA (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" + elif command -v amd-smi &>/dev/null; then + # Use amd-smi native watch mode (-w) which includes timestamps automatically. + # Pipe through awk to: skip preamble lines, keep first CSV header, skip repeated headers. awk calls fflush() after every print because its stdout is a regular file (block-buffered) and $! below is awk's PID (last stage of the pipeline): without the flush, rows still in awk's buffer are lost if that PID is terminated by a signal. + amd-smi metric -p -c -t -u -w "$interval" --csv 2>/dev/null \ + | awk '/^timestamp,/{if(!h){print;fflush();h=1};next} h{print;fflush()}' > "$output" & + GPU_MONITOR_PID=$! + echo "[GPU Monitor] Started AMD (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" + else + echo "[GPU Monitor] No GPU monitoring tool found (nvidia-smi or amd-smi), skipping" return 0 fi - - nvidia-smi --query-gpu=timestamp,index,power.draw,temperature.gpu,clocks.current.sm,clocks.current.memory,utilization.gpu,utilization.memory \ - --format=csv -l "$interval" > "$output" 2>/dev/null & - GPU_MONITOR_PID=$! 
- echo "[GPU Monitor] Started (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" } # Stop the background GPU monitor and report file size. diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index c2e1ddf6c..6870fe060 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -26,6 +26,9 @@ hf download $MODEL export SGLANG_USE_AITER=1 export SGLANG_AITER_MLA_PERSIST=1 +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + set -x python3 -m sglang.launch_server \ --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ @@ -62,4 +65,7 @@ if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC append_lm_eval_summary fi + +# Stop GPU monitoring +stop_gpu_monitor set +x