Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 13 additions & 19 deletions benchmark/kernel/rccl/benchmark_allreduce.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
###############################################################################
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################
Expand Down Expand Up @@ -29,19 +29,18 @@


def test_allreduce(mbs, seq, hidden, dtype, rank, local_rank, world_size, dry_run=False):
if local_rank == 0:
if rank == 0:
size = mbs * seq * hidden * torch.tensor([], dtype=dtype).element_size()
print("AllReduce with input size(Byte): ", size)
print(
f"Rccl-test command: \n$ mpirun -np {world_size} -N $NNODES ./build/all_reduce_perf -b {size} -e {size} -g 1"
)
if dry_run:
print(
f"Rccl-test command: \n$ mpirun -np {world_size} -N $NNODES ./build/all_reduce_perf -b {size} -e {size} -g 1"
)
if dry_run:
return 0, 0
shape = (mbs, seq, hidden)
device = torch.device(f"cuda:{local_rank}")
tensor = torch.ones(shape, dtype=dtype, device=device)
if local_rank == 0:
print("AllReduce with input size(Byte): ", tensor.nelement() * tensor.element_size())
# Warm-up
for _ in range(5):
dist.all_reduce(tensor)
Expand All @@ -66,7 +65,7 @@ def test_allreduce(mbs, seq, hidden, dtype, rank, local_rank, world_size, dry_ru
def test_allgather(mbs, seq, hidden, dtype, rank, local_rank, world_size, dry_run=False):
local_seq = seq // world_size

if local_rank == 0:
if rank == 0:
element_size = torch.tensor([], dtype=dtype).element_size()
nelement = mbs * local_seq * hidden
print(
Expand All @@ -84,13 +83,7 @@ def test_allgather(mbs, seq, hidden, dtype, rank, local_rank, world_size, dry_ru

# Gather buffer
output = [torch.randn_like(tensor) for _ in range(world_size)]
if local_rank == 0:
print(
"AllGather with input size(Byte): ",
tensor.nelement() * tensor.element_size(),
" Output size ",
world_size * tensor.nelement() * tensor.element_size(),
)

for _ in range(5):
dist.all_gather(output, tensor)
dist.barrier()
Expand All @@ -114,7 +107,7 @@ def test_reducescatter(mbs, seq, hidden, dtype, rank, local_rank, world_size, dr
chunk_seq = seq // world_size
chunk_shape = (mbs, chunk_seq, hidden)

if local_rank == 0:
if rank == 0:
print(
"ReduceScatter with each output chunk size(Byte): ",
mbs * chunk_seq * hidden * torch.tensor([], dtype=dtype).element_size(),
Expand All @@ -126,8 +119,7 @@ def test_reducescatter(mbs, seq, hidden, dtype, rank, local_rank, world_size, dr
device = torch.device(f"cuda:{local_rank}")
tensor = torch.ones(full_shape, dtype=dtype, device=device)
output = torch.empty(chunk_shape, dtype=dtype, device=device)
if local_rank == 0:
print("ReduceScatter with each output chunk size(Byte): ", output.nelement() * output.element_size())

for _ in range(5):
dist.reduce_scatter(output, list(tensor.chunk(world_size, dim=1)))
dist.barrier()
Expand All @@ -151,7 +143,8 @@ def benchmark(test_func, output_csv_path, rank, local_rank, world_size, dry_run=

for model_name, (seq, hidden) in MODEL_PARAMS_TABLE.items():
for mbs in MBS_LIST:
print(f"\nModel Name {model_name}, mbs {mbs}")
if rank == 0:
print(f"\nModel Name {model_name}, mbs {mbs}")
for dtype in [torch.float16]:
avg_time, bandwidth = test_func(
mbs, seq, hidden, dtype, rank, local_rank, world_size, dry_run
Expand All @@ -167,6 +160,7 @@ def benchmark(test_func, output_csv_path, rank, local_rank, world_size, dry_run=
"Time(s)": avg_time,
"Bandwidth(GB/s)": bandwidth,
}
# print(result)
benchmark_results.append(result)

if rank == 0 and not dry_run:
Expand Down
223 changes: 223 additions & 0 deletions benchmark/kernel/rccl/run_slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
#!/bin/bash
###############################################################################
# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################
#
# Slurm launcher for RCCL benchmarks inside a Docker container.
#
# Usage:
# DOCKER_IMAGE=<image> sbatch run_slurm.sh
# DOCKER_IMAGE=<image> NNODES=2 PARTITION=my-gpu sbatch run_slurm.sh
# DOCKER_IMAGE=rocm/primus:v26.1 NNODES=2 sbatch -N2 -w smci355-ccs-aus-n04-[25,29] -p Compute-DCPT ./run_slurm.sh
#
# Environment variables (all optional except DOCKER_IMAGE):
# DOCKER_IMAGE Docker image to use (required)
# NNODES Number of nodes [default: 1]
# PARTITION Slurm partition [default: unset]
# GPUS_PER_NODE GPUs per node [default: 8]
# MASTER_PORT Port for torchrun rendezvous [default: 1234]
# EXTRA_DOCKER_ARGS Extra arguments passed to docker run
#
###############################################################################

# Slurm directives: sbatch reads these before the first command below.
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --job-name=rccl-bench

set -euo pipefail

# Env overrides win; otherwise fall back to the Slurm allocation, then 1 node.
NNODES="${NNODES:-${SLURM_NNODES:-1}}"
GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
MASTER_PORT="${MASTER_PORT:-1234}"

# All paths are resolved relative to the directory the job was submitted from
# (sbatch sets SLURM_SUBMIT_DIR); CSV results are written back there.
SCRIPT_DIR="${SLURM_SUBMIT_DIR}"
OUTPUT_DIR="${SCRIPT_DIR}"
echo "SCRIPT_DIR: ${SCRIPT_DIR}"

if [[ -z "${DOCKER_IMAGE:-}" ]]; then
echo "[ERROR] DOCKER_IMAGE is not set. Export it before submitting the job."
exit 1
fi

# Keep your original behavior (may be aggressive on shared nodes)
docker stop $(docker ps -q) >/dev/null 2>&1 || true

# Build sbatch overrides from env vars
SBATCH_OVERRIDES=()
if [[ -n "${PARTITION:-}" ]]; then
SBATCH_OVERRIDES+=(-p "$PARTITION")
fi

echo "============================================"
echo " RCCL Benchmark - Slurm + Docker launcher"
echo "============================================"
echo " DOCKER_IMAGE : ${DOCKER_IMAGE}"
echo " NNODES : ${NNODES}"
echo " GPUS_PER_NODE : ${GPUS_PER_NODE}"
echo " MASTER_PORT : ${MASTER_PORT}"
echo " OUTPUT_DIR : ${OUTPUT_DIR}"
echo "============================================"

# Pre-pull image + stop containers on allocated nodes
# (single-quoted script: DOCKER_IMAGE comes from the exported environment
# on each node via --export=ALL, not from this submitting shell)
srun -N "${NNODES}" \
--exclusive \
--export=ALL \
--ntasks-per-node=1 \
"${SBATCH_OVERRIDES[@]}" \
bash -c 'docker pull "${DOCKER_IMAGE}"; docker stop $(docker ps -q) >/dev/null 2>&1 || true'

# Run workload on allocated nodes
srun -N "${NNODES}" \
--exclusive \
--export=ALL \
--ntasks-per-node=1 \
"${SBATCH_OVERRIDES[@]}" \
bash -c '

set -euo pipefail

# ---- Resolve master address from Slurm node list ----
readarray -t NODE_ARRAY < <(scontrol show hostnames "$SLURM_JOB_NODELIST")
MASTER_ADDR="${NODE_ARRAY[0]}"
NODE_RANK="${SLURM_NODEID}"

# ---- Build short NODE_TAG like: n05-29_n05-33 ----
# NOTE(review): assumes hostnames end in <n..>-<id>; other naming schemes
# will produce a less meaningful tag but not fail.
short_node() {
# input: smci355-ccs-aus-n05-29 -> output: n05-29
local h="$1"
local n_part id_part
n_part="$(echo "$h" | awk -F"-" "{print \$(NF-1)}")" # n05
id_part="$(echo "$h" | awk -F"-" "{print \$NF}")" # 29
echo "${n_part}-${id_part}"
}

NODE_TAG=""
for h in "${NODE_ARRAY[@]}"; do
NODE_TAG+="$(short_node "$h")_"
done
NODE_TAG="${NODE_TAG%_}" # trim trailing _

if [[ "$NODE_RANK" == "0" ]]; then
echo "========== Slurm cluster info =========="
echo "SLURM_NODELIST : ${NODE_ARRAY[*]}"
echo "SLURM_NNODES : ${SLURM_NNODES}"
echo "MASTER_ADDR : ${MASTER_ADDR}"
echo "NODE_RANK : ${NODE_RANK}"
echo "NODE_TAG : ${NODE_TAG}"
echo ""
fi

# Splice launcher-side values into this node-side script: each quoted
# fragment below closes this quoted block, expands the variable in the
# submitting shell, then reopens the quoted block, so the values arrive
# here as literals.
SCRIPT_DIR='"${SCRIPT_DIR}"'
OUTPUT_DIR='"${OUTPUT_DIR}"'
DOCKER_IMAGE='"${DOCKER_IMAGE}"'
DOCKER_LOGIN='"${DOCKER_LOGIN:-}"'
NNODES='"${NNODES}"'
GPUS_PER_NODE='"${GPUS_PER_NODE}"'
MASTER_PORT='"${MASTER_PORT}"'
EXTRA_DOCKER_ARGS='"${EXTRA_DOCKER_ARGS:-}"'

# Launch the benchmark container. EXTRA_DOCKER_ARGS is intentionally left
# unquoted below so that multiple extra arguments word-split.
docker run --rm \
--network=host \
--ipc=host \
--device=/dev/kfd \
--device=/dev/dri \
--privileged --device=/dev/infiniband \
--cap-add=SYS_PTRACE \
--cap-add=CAP_SYS_ADMIN \
--security-opt seccomp=unconfined \
--group-add video \
-v "${SCRIPT_DIR}:${SCRIPT_DIR}" \
-v "${HOME}:${HOME}" \
-w "${SCRIPT_DIR}" \
-e MASTER_ADDR="${MASTER_ADDR}" \
-e MASTER_PORT="${MASTER_PORT}" \
-e NNODES="${NNODES}" \
-e NODE_RANK="${NODE_RANK}" \
-e GPUS_PER_NODE="${GPUS_PER_NODE}" \
-e NODE_TAG="${NODE_TAG}" \
${EXTRA_DOCKER_ARGS} \
"${DOCKER_IMAGE}" \
bash -cx "
set -euo pipefail

# Variables written with a backslash before the dollar sign expand inside
# the container; bare ones were expanded on the host when this string was
# built.
cd ${SCRIPT_DIR}
ifconfig || true
ibv_devices || true
rocm-smi || true

# NOTE(review): the NIC list and socket interface below look site-specific
# to an AINIC fabric; confirm the ionic device names and fenic interface
# for each cluster.
export TORCH_NCCL_HIGH_PRIORITY=1
export NCCL_IB_HCA=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
export NCCL_SOCKET_IFNAME=fenic
# NOTE: TRACE is the most verbose NCCL debug level; use INFO for routine runs.
export NCCL_DEBUG=TRACE
export USING_AINIC=1

# AINIC lib paths for different docker image
# NOTE(review): assumes the rccl-net plugin was built at this path inside
# the image; verify per image.
export NCCL_NET_PLUGIN=/workspace/amd-anp/build/librccl-net.so

# Set InfiniBand GID index for NCCL communication
export NCCL_IB_GID_INDEX=1
export NCCL_MAX_P2P_CHANNELS=56
export NCCL_IB_TC=104
export NCCL_IB_FIFO_TC=192
export NET_OPTIONAL_RECV_COMPLETION=1
export NCCL_IB_USE_INLINE=1
export RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0
export NCCL_GDR_FLUSH_DISABLE=1
export NCCL_DMABUF_ENABLE=0
export NCCL_IGNORE_CPU_AFFINITY=1
export NCCL_IB_QPS_PER_CONNECTION=1
export NCCL_IB_RETRY_CNT=20
export NCCL_IB_TIMEOUT=300

# SCRIPT_DIR is benchmark/kernel/rccl, so the repo root is three levels up.
PRIMUS_ROOT_PATH=\"${SCRIPT_DIR}/../../..\"
MEGATRON_PATH=\"\${PRIMUS_ROOT_PATH}/third_party/Megatron-LM\"
export PYTHONPATH=\"\${MEGATRON_PATH}:\${PYTHONPATH:-}\"

# Final CSV names you want (no smci prefix, include both nodes)
FINAL_ALLREDUCE=\"${OUTPUT_DIR}/allreduce_\${NODE_TAG}.csv\"
FINAL_ALLGATHER=\"${OUTPUT_DIR}/allgather_\${NODE_TAG}.csv\"
FINAL_REDUCESCATTER=\"${OUTPUT_DIR}/reducescatter_\${NODE_TAG}.csv\"

# To avoid concurrent overwrite, only rank0 writes the final CSV.
# Other ranks write to temporary per-rank files.
if [[ \"\${NODE_RANK}\" == \"0\" ]]; then
ALLREDUCE_CSV=\"\${FINAL_ALLREDUCE}\"
ALLGATHER_CSV=\"\${FINAL_ALLGATHER}\"
REDUCESCATTER_CSV=\"\${FINAL_REDUCESCATTER}\"
else
ALLREDUCE_CSV=\"/tmp/allreduce_\${NODE_TAG}_rank\${NODE_RANK}.csv\"
ALLGATHER_CSV=\"/tmp/allgather_\${NODE_TAG}_rank\${NODE_RANK}.csv\"
REDUCESCATTER_CSV=\"/tmp/reducescatter_\${NODE_TAG}_rank\${NODE_RANK}.csv\"
fi

echo \"[Node \${NODE_RANK}] CSV outputs:\"
echo \" allreduce -> \${ALLREDUCE_CSV}\"
echo \" allgather -> \${ALLGATHER_CSV}\"
echo \" reducescatter -> \${REDUCESCATTER_CSV}\"

echo \"[Node \${NODE_RANK}] Starting RCCL benchmarks...\"
torchrun --master_addr \"\${MASTER_ADDR}\" \
--master_port \"\${MASTER_PORT}\" \
--nnodes=\"\${NNODES}\" \
--node_rank=\"\${NODE_RANK}\" \
--nproc_per_node=\"\${GPUS_PER_NODE}\" \
./benchmark_allreduce.py \
--allreduce-report-csv-path \"\${ALLREDUCE_CSV}\" \
--allgather-report-csv-path \"\${ALLGATHER_CSV}\" \
--reducescatter-report-csv-path \"\${REDUCESCATTER_CSV}\"

if [[ \"\${NODE_RANK}\" == \"0\" ]]; then
echo \"[Rank0] Final CSV written:\"
echo \" \${FINAL_ALLREDUCE}\"
echo \" \${FINAL_ALLGATHER}\"
echo \" \${FINAL_REDUCESCATTER}\"
fi

echo \"[Node \${NODE_RANK}] RCCL benchmarks complete.\"
"
'

echo " Results written to: ${OUTPUT_DIR}/"