Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
GPU_MONITOR_PID=""
GPU_METRICS_CSV="/workspace/gpu_metrics.csv"

# Start background nvidia-smi monitoring that logs GPU metrics every second to CSV.
# Start background GPU monitoring that logs metrics every second to CSV.
# Auto-detects NVIDIA (nvidia-smi) or AMD (amd-smi) GPUs.
# Usage: start_gpu_monitor [--output /path/to/output.csv] [--interval 1]
start_gpu_monitor() {
local output="$GPU_METRICS_CSV"
Expand All @@ -25,15 +26,22 @@ start_gpu_monitor() {

GPU_METRICS_CSV="$output"

if ! command -v nvidia-smi &>/dev/null; then
echo "[GPU Monitor] nvidia-smi not found, skipping GPU monitoring"
if command -v nvidia-smi &>/dev/null; then
nvidia-smi --query-gpu=timestamp,index,power.draw,temperature.gpu,clocks.current.sm,clocks.current.memory,utilization.gpu,utilization.memory \
--format=csv -l "$interval" > "$output" 2>/dev/null &
GPU_MONITOR_PID=$!
echo "[GPU Monitor] Started NVIDIA (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)"
elif command -v amd-smi &>/dev/null; then
# Use amd-smi native watch mode (-w) which includes timestamps automatically.
# Pipe through awk to: skip preamble lines, keep first CSV header, skip repeated headers.
amd-smi metric -p -c -t -u -w "$interval" --csv 2>/dev/null \
| awk '/^timestamp,/{if(!h){print;h=1};next} h{print}' > "$output" &
GPU_MONITOR_PID=$!
echo "[GPU Monitor] Started AMD (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)"
else
echo "[GPU Monitor] No GPU monitoring tool found (nvidia-smi or amd-smi), skipping"
return 0
fi

nvidia-smi --query-gpu=timestamp,index,power.draw,temperature.gpu,clocks.current.sm,clocks.current.memory,utilization.gpu,utilization.memory \
--format=csv -l "$interval" > "$output" 2>/dev/null &
GPU_MONITOR_PID=$!
echo "[GPU Monitor] Started (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)"
}

# Stop the background GPU monitor and report file size.
Expand Down
6 changes: 6 additions & 0 deletions benchmarks/single_node/dsr1_fp8_mi325x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ hf download $MODEL
export SGLANG_USE_AITER=1
export SGLANG_AITER_MLA_PERSIST=1

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
python3 -m sglang.launch_server \
--model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \
Expand Down Expand Up @@ -62,4 +65,7 @@ if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x