From 46c1a302506b24e9ac65368c4fc0deb2d31d74fd Mon Sep 17 00:00:00 2001 From: Lorri Rao Date: Wed, 25 Feb 2026 14:36:24 -0800 Subject: [PATCH 1/3] add slurm script for multi-node rccl test --- benchmark/kernel/rccl/benchmark_allreduce.py | 30 ++-- benchmark/kernel/rccl/run_slurm.sh | 173 +++++++++++++++++++ 2 files changed, 185 insertions(+), 18 deletions(-) create mode 100644 benchmark/kernel/rccl/run_slurm.sh diff --git a/benchmark/kernel/rccl/benchmark_allreduce.py b/benchmark/kernel/rccl/benchmark_allreduce.py index 718c89690..00087673b 100644 --- a/benchmark/kernel/rccl/benchmark_allreduce.py +++ b/benchmark/kernel/rccl/benchmark_allreduce.py @@ -29,19 +29,18 @@ def test_allreduce(mbs, seq, hidden, dtype, rank, local_rank, world_size, dry_run=False): - if local_rank == 0: + if rank == 0: size = mbs * seq * hidden * torch.tensor([], dtype=dtype).element_size() print("AllReduce with input size(Byte): ", size) - print( - f"Rccl-test command: \n$ mpirun -np {world_size} -N $NNODES ./build/all_reduce_perf -b {size} -e {size} -g 1" - ) + if dry_run: + print( + f"Rccl-test command: \n$ mpirun -np {world_size} -N $NNODES ./build/all_reduce_perf -b {size} -e {size} -g 1" + ) if dry_run: return 0, 0 shape = (mbs, seq, hidden) device = torch.device(f"cuda:{local_rank}") tensor = torch.ones(shape, dtype=dtype, device=device) - if local_rank == 0: - print("AllReduce with input size(Byte): ", tensor.nelement() * tensor.element_size()) # Warm-up for _ in range(5): dist.all_reduce(tensor) @@ -66,7 +65,7 @@ def test_allreduce(mbs, seq, hidden, dtype, rank, local_rank, world_size, dry_ru def test_allgather(mbs, seq, hidden, dtype, rank, local_rank, world_size, dry_run=False): local_seq = seq // world_size - if local_rank == 0: + if rank == 0: element_size = torch.tensor([], dtype=dtype).element_size() nelement = mbs * local_seq * hidden print( @@ -84,13 +83,7 @@ def test_allgather(mbs, seq, hidden, dtype, rank, local_rank, world_size, dry_ru # Gather buffer output = [torch.randn_like(tensor) for _ in range(world_size)] - if local_rank == 0: - print( - "AllGather with input size(Byte): ", - tensor.nelement() * tensor.element_size(), - " Output size ", - world_size * tensor.nelement() * tensor.element_size(), - ) + for _ in range(5): dist.all_gather(output, tensor) dist.barrier() @@ -114,7 +107,7 @@ def test_reducescatter(mbs, seq, hidden, dtype, rank, local_rank, world_size, dr chunk_seq = seq // world_size chunk_shape = (mbs, chunk_seq, hidden) - if local_rank == 0: + if rank == 0: print( "ReduceScatter with each output chunk size(Byte): ", mbs * chunk_seq * hidden * torch.tensor([], dtype=dtype).element_size(), @@ -126,8 +119,7 @@ def test_reducescatter(mbs, seq, hidden, dtype, rank, local_rank, world_size, dr device = torch.device(f"cuda:{local_rank}") tensor = torch.ones(full_shape, dtype=dtype, device=device) output = torch.empty(chunk_shape, dtype=dtype, device=device) - if local_rank == 0: - print("ReduceScatter with each output chunk size(Byte): ", output.nelement() * output.element_size()) + for _ in range(5): dist.reduce_scatter(output, list(tensor.chunk(world_size, dim=1))) dist.barrier() @@ -151,7 +143,8 @@ def benchmark(test_func, output_csv_path, rank, local_rank, world_size, dry_run= for model_name, (seq, hidden) in MODEL_PARAMS_TABLE.items(): for mbs in MBS_LIST: - print(f"\nModel Name {model_name}, mbs {mbs}") + if rank == 0: + print(f"\nModel Name {model_name}, mbs {mbs}") for dtype in [torch.float16]: avg_time, bandwidth = test_func( mbs, seq, hidden, dtype, rank, local_rank, world_size, dry_run @@ -167,6 +160,7 @@ def benchmark(test_func, output_csv_path, rank, local_rank, world_size, dry_run= "Time(s)": avg_time, "Bandwidth(GB/s)": bandwidth, } + print(result) benchmark_results.append(result) if rank == 0 and not dry_run: diff --git a/benchmark/kernel/rccl/run_slurm.sh b/benchmark/kernel/rccl/run_slurm.sh new file mode 100644 index 000000000..4c9fe30ff --- /dev/null +++ b/benchmark/kernel/rccl/run_slurm.sh @@ -0,0 +1,173 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +# +# Slurm launcher for RCCL benchmarks inside a Docker container. +# +# Usage: +# DOCKER_IMAGE= sbatch run_slurm.sh +# DOCKER_IMAGE= NNODES=2 PARTITION=my-gpu sbatch run_slurm.sh +# DOCKER_IMAGE=rocm/primus:v26.1 NNODES=2 sbatch -N2 -w smci355-ccs-aus-n04-[25,29] -p Compute-DCPT ./run_slurm.sh +# Environment variables (all optional except DOCKER_IMAGE): +# DOCKER_IMAGE Docker image to use (required) +# NNODES Number of nodes [default: 1] +# PARTITION Slurm partition [default: unset] +# GPUS_PER_NODE GPUs per node [default: 8] +# MASTER_PORT Port for torchrun rendezvous [default: 1234] +# EXTRA_DOCKER_ARGS Extra arguments passed to docker run +# +############################################################################### + + +#SBATCH --exclusive +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-node=8 +#SBATCH --job-name=rccl-bench + +NNODES="${NNODES:-${SLURM_NNODES:-1}}" +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" +MASTER_PORT="${MASTER_PORT:-1234}" + +SCRIPT_DIR=$SLURM_SUBMIT_DIR +OUTPUT_DIR="${SCRIPT_DIR}" +echo "SCRIPT_DIR: ${SCRIPT_DIR}" + +if [[ -z "${DOCKER_IMAGE:-}" ]]; then + echo "[ERROR] DOCKER_IMAGE is not set. Export it before submitting the job." + exit 1 +fi +docker stop $(docker ps -q) + +# Build sbatch overrides from env vars +SBATCH_OVERRIDES=() +if [[ -n "${PARTITION:-}" ]]; then + SBATCH_OVERRIDES+=(-p "$PARTITION") +fi + +echo "============================================" +echo " RCCL Benchmark - Slurm + Docker launcher" +echo "============================================" +echo " DOCKER_IMAGE : ${DOCKER_IMAGE}" +echo " NNODES : ${NNODES}" +echo " GPUS_PER_NODE : ${GPUS_PER_NODE}" +echo " MASTER_PORT : ${MASTER_PORT}" +echo " OUTPUT_DIR : ${OUTPUT_DIR}" +echo "============================================" +srun -N "${NNODES}" \ + --exclusive \ + --export=ALL \ + --ntasks-per-node=1 \ + "${SBATCH_OVERRIDES[@]}" \ + bash -c 'docker pull "${DOCKER_IMAGE}";docker stop $(docker ps -q)' + +srun -N "${NNODES}" \ + --exclusive \ + --export=ALL \ + --ntasks-per-node=1 \ + "${SBATCH_OVERRIDES[@]}" \ + bash -c ' + +# ---- Resolve master address from Slurm node list ---- +readarray -t NODE_ARRAY < <(scontrol show hostnames "$SLURM_JOB_NODELIST") +MASTER_ADDR="${NODE_ARRAY[0]}" +NODE_RANK="${SLURM_NODEID}" + +if [[ "$NODE_RANK" == "0" ]]; then + echo "========== Slurm cluster info ==========" + echo "SLURM_NODELIST : ${NODE_ARRAY[*]}" + echo "SLURM_NNODES : ${SLURM_NNODES}" + echo "MASTER_ADDR : ${MASTER_ADDR}" + echo "NODE_RANK : ${NODE_RANK}" + echo "" +fi + +SCRIPT_DIR='"${SCRIPT_DIR}"' +OUTPUT_DIR='"${OUTPUT_DIR}"' +DOCKER_IMAGE='"${DOCKER_IMAGE}"' +DOCKER_LOGIN='"${DOCKER_LOGIN}"' +NNODES='"${NNODES}"' +GPUS_PER_NODE='"${GPUS_PER_NODE}"' +MASTER_PORT='"${MASTER_PORT}"' +EXTRA_DOCKER_ARGS='"${EXTRA_DOCKER_ARGS:-}"' +docker run --rm \ + --network=host \ + --ipc=host \ + --device=/dev/kfd \ + --device=/dev/dri \ + --privileged --device=/dev/infiniband \ + --cap-add=SYS_PTRACE \ + --cap-add=CAP_SYS_ADMIN \ + --security-opt seccomp=unconfined \ + --group-add video \ + -v "${SCRIPT_DIR}:${SCRIPT_DIR}" \ + -v "${HOME}:${HOME}" \ + -w "${SCRIPT_DIR}" \ + -e MASTER_ADDR="${MASTER_ADDR}" \ + -e MASTER_PORT="${MASTER_PORT}" \ + -e NNODES="${NNODES}" \ + -e NODE_RANK="${NODE_RANK}" \ + -e GPUS_PER_NODE="${GPUS_PER_NODE}" \ + ${EXTRA_DOCKER_ARGS} \ + "${DOCKER_IMAGE}" \ + bash -cx " + # set -euo pipefail + cd ${SCRIPT_DIR} + ifconfig + ibv_devices + rocm-smi + + export TORCH_NCCL_HIGH_PRIORITY=1 + export NCCL_IB_HCA=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 + export NCCL_SOCKET_IFNAME=fenic + export NCCL_DEBUG=TRACE + export USING_AINIC=1 + + # AINIC lib paths for different docker image + # /workspace/amd-anp/build/librccl-net.so + # /opt/amd-anp/build/librccl-anp.so + export NCCL_NET_PLUGIN=/workspace/amd-anp/build/librccl-net.so + + # Set InfiniBand GID index for NCCL communication + # if [ $USING_AINIC -eq 1 ]; then + # unset NCCL_IB_GID_INDEX + export NCCL_IB_GID_INDEX=1 + # export NCCL_IB_ROCE_VERSION_NUM=2 + export NCCL_MAX_P2P_CHANNELS=56 + export NCCL_IB_TC=104 + export NCCL_IB_FIFO_TC=192 + export NET_OPTIONAL_RECV_COMPLETION=1 + export NCCL_IB_USE_INLINE=1 + export RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0 + export NCCL_GDR_FLUSH_DISABLE=1 + export NCCL_DMABUF_ENABLE=0 + export NCCL_IGNORE_CPU_AFFINITY=1 + export NCCL_IB_QPS_PER_CONNECTION=1 + # copy from Joyce script + export NCCL_IB_RETRY_CNT=20 + export NCCL_IB_TIMEOUT=300 + # fi + + + PRIMUS_ROOT_PATH=\"${SCRIPT_DIR}/../../..\" + MEGATRON_PATH=\"\${PRIMUS_ROOT_PATH}/third_party/Megatron-LM\" + export PYTHONPATH=\"\${MEGATRON_PATH}:\${PYTHONPATH:-}\" + + echo \"[Node \${NODE_RANK}] Starting RCCL benchmarks...\" + torchrun --master_addr \"\${MASTER_ADDR}\" \ + --master_port \"\${MASTER_PORT}\" \ + --nnodes=\"\${NNODES}\" \ + --node_rank=\"\${NODE_RANK}\" \ + --nproc_per_node=\"\${GPUS_PER_NODE}\" \ + ./benchmark_allreduce.py \ + --allreduce-report-csv-path ${OUTPUT_DIR}/allreduce_benchmark.csv \ + --allgather-report-csv-path ${OUTPUT_DIR}/allgather_benchmark.csv \ + --reducescatter-report-csv-path ${OUTPUT_DIR}/reducescatter_benchmark.csv + + echo \"[Node \${NODE_RANK}] RCCL benchmarks complete.\" + " +' + +echo " Results written to: ${OUTPUT_DIR}/" From 9dfe4bed09b9301f3f69e917f495d1524c6b8637 Mon Sep 17 00:00:00 2001 From: Lorri Rao Date: Wed, 25 Feb 2026 15:06:57 -0800 Subject: [PATCH 2/3] less outputs --- benchmark/kernel/rccl/benchmark_allreduce.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/kernel/rccl/benchmark_allreduce.py b/benchmark/kernel/rccl/benchmark_allreduce.py index 00087673b..51ad20c08 100644 --- a/benchmark/kernel/rccl/benchmark_allreduce.py +++ b/benchmark/kernel/rccl/benchmark_allreduce.py @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2025-2026, Advanced Micro Devices, Inc. All rights reserved. # # See LICENSE for license information. ############################################################################### @@ -160,7 +160,7 @@ def benchmark(test_func, output_csv_path, rank, local_rank, world_size, dry_run= "Time(s)": avg_time, "Bandwidth(GB/s)": bandwidth, } - print(result) + # print(result) benchmark_results.append(result) if rank == 0 and not dry_run: From 985a8275f2560f28e832db5a8fc874b4ed9a1fd7 Mon Sep 17 00:00:00 2001 From: Lorri Rao Date: Thu, 26 Feb 2026 12:55:52 -0800 Subject: [PATCH 3/3] Add output filename customization Created by Joyce for their DCGPU cluster --- benchmark/kernel/rccl/run_slurm.sh | 116 +++++++++++++++++++++-------- 1 file changed, 83 insertions(+), 33 deletions(-) diff --git a/benchmark/kernel/rccl/run_slurm.sh b/benchmark/kernel/rccl/run_slurm.sh index 4c9fe30ff..0b29160f4 100644 --- a/benchmark/kernel/rccl/run_slurm.sh +++ b/benchmark/kernel/rccl/run_slurm.sh @@ -11,6 +11,7 @@ # DOCKER_IMAGE= sbatch run_slurm.sh # DOCKER_IMAGE= NNODES=2 PARTITION=my-gpu sbatch run_slurm.sh # DOCKER_IMAGE=rocm/primus:v26.1 NNODES=2 sbatch -N2 -w smci355-ccs-aus-n04-[25,29] -p Compute-DCPT ./run_slurm.sh +# # Environment variables (all optional except DOCKER_IMAGE): # DOCKER_IMAGE Docker image to use (required) # NNODES Number of nodes [default: 1] @@ -21,17 +22,18 @@ # ############################################################################### - #SBATCH --exclusive #SBATCH --ntasks-per-node=1 #SBATCH --gpus-per-node=8 #SBATCH --job-name=rccl-bench +set -euo pipefail + NNODES="${NNODES:-${SLURM_NNODES:-1}}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" MASTER_PORT="${MASTER_PORT:-1234}" -SCRIPT_DIR=$SLURM_SUBMIT_DIR +SCRIPT_DIR="${SLURM_SUBMIT_DIR}" OUTPUT_DIR="${SCRIPT_DIR}" echo "SCRIPT_DIR: ${SCRIPT_DIR}" @@ -39,7 +41,9 @@ if [[ -z "${DOCKER_IMAGE:-}" ]]; then echo "[ERROR] DOCKER_IMAGE is not set. Export it before submitting the job." exit 1 fi -docker stop $(docker ps -q) + +# Keep your original behavior (may be aggressive on shared nodes) +docker stop $(docker ps -q) >/dev/null 2>&1 || true # Build sbatch overrides from env vars SBATCH_OVERRIDES=() @@ -56,13 +60,16 @@ echo " GPUS_PER_NODE : ${GPUS_PER_NODE}" echo " MASTER_PORT : ${MASTER_PORT}" echo " OUTPUT_DIR : ${OUTPUT_DIR}" echo "============================================" + +# Pre-pull image + stop containers on allocated nodes srun -N "${NNODES}" \ --exclusive \ --export=ALL \ --ntasks-per-node=1 \ "${SBATCH_OVERRIDES[@]}" \ - bash -c 'docker pull "${DOCKER_IMAGE}";docker stop $(docker ps -q)' + bash -c 'docker pull "${DOCKER_IMAGE}"; docker stop $(docker ps -q) >/dev/null 2>&1 || true' +# Run workload on allocated nodes srun -N "${NNODES}" \ --exclusive \ --export=ALL \ @@ -70,28 +77,48 @@ srun -N "${NNODES}" \ "${SBATCH_OVERRIDES[@]}" \ bash -c ' +set -euo pipefail + # ---- Resolve master address from Slurm node list ---- readarray -t NODE_ARRAY < <(scontrol show hostnames "$SLURM_JOB_NODELIST") MASTER_ADDR="${NODE_ARRAY[0]}" NODE_RANK="${SLURM_NODEID}" +# ---- Build short NODE_TAG like: n05-29_n05-33 ---- +short_node() { + # input: smci355-ccs-aus-n05-29 -> output: n05-29 + local h="$1" + local n_part id_part + n_part="$(echo "$h" | awk -F"-" "{print \$(NF-1)}")" # n05 + id_part="$(echo "$h" | awk -F"-" "{print \$NF}")" # 29 + echo "${n_part}-${id_part}" +} + +NODE_TAG="" +for h in "${NODE_ARRAY[@]}"; do + NODE_TAG+="$(short_node "$h")_" +done +NODE_TAG="${NODE_TAG%_}" # trim trailing _ + if [[ "$NODE_RANK" == "0" ]]; then echo "========== Slurm cluster info ==========" echo "SLURM_NODELIST : ${NODE_ARRAY[*]}" echo "SLURM_NNODES : ${SLURM_NNODES}" echo "MASTER_ADDR : ${MASTER_ADDR}" echo "NODE_RANK : ${NODE_RANK}" + echo "NODE_TAG : ${NODE_TAG}" echo "" fi SCRIPT_DIR='"${SCRIPT_DIR}"' OUTPUT_DIR='"${OUTPUT_DIR}"' DOCKER_IMAGE='"${DOCKER_IMAGE}"' -DOCKER_LOGIN='"${DOCKER_LOGIN}"' +DOCKER_LOGIN='"${DOCKER_LOGIN:-}"' NNODES='"${NNODES}"' GPUS_PER_NODE='"${GPUS_PER_NODE}"' MASTER_PORT='"${MASTER_PORT}"' EXTRA_DOCKER_ARGS='"${EXTRA_DOCKER_ARGS:-}"' + docker run --rm \ --network=host \ --ipc=host \ @@ -110,14 +137,16 @@ docker run --rm \ -e NNODES="${NNODES}" \ -e NODE_RANK="${NODE_RANK}" \ -e GPUS_PER_NODE="${GPUS_PER_NODE}" \ + -e NODE_TAG="${NODE_TAG}" \ ${EXTRA_DOCKER_ARGS} \ "${DOCKER_IMAGE}" \ bash -cx " - # set -euo pipefail + set -euo pipefail + cd ${SCRIPT_DIR} - ifconfig - ibv_devices - rocm-smi + ifconfig || true + ibv_devices || true + rocm-smi || true export TORCH_NCCL_HIGH_PRIORITY=1 export NCCL_IB_HCA=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 @@ -126,35 +155,49 @@ docker run --rm \ export USING_AINIC=1 # AINIC lib paths for different docker image - # /workspace/amd-anp/build/librccl-net.so - # /opt/amd-anp/build/librccl-anp.so export NCCL_NET_PLUGIN=/workspace/amd-anp/build/librccl-net.so # Set InfiniBand GID index for NCCL communication - # if [ $USING_AINIC -eq 1 ]; then - # unset NCCL_IB_GID_INDEX - export NCCL_IB_GID_INDEX=1 - # export NCCL_IB_ROCE_VERSION_NUM=2 - export NCCL_MAX_P2P_CHANNELS=56 - export NCCL_IB_TC=104 - export NCCL_IB_FIFO_TC=192 - export NET_OPTIONAL_RECV_COMPLETION=1 - export NCCL_IB_USE_INLINE=1 - export RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0 - export NCCL_GDR_FLUSH_DISABLE=1 - export NCCL_DMABUF_ENABLE=0 - export NCCL_IGNORE_CPU_AFFINITY=1 - export NCCL_IB_QPS_PER_CONNECTION=1 - # copy from Joyce script - export NCCL_IB_RETRY_CNT=20 - export NCCL_IB_TIMEOUT=300 - # fi - + export NCCL_IB_GID_INDEX=1 + export NCCL_MAX_P2P_CHANNELS=56 + export NCCL_IB_TC=104 + export NCCL_IB_FIFO_TC=192 + export NET_OPTIONAL_RECV_COMPLETION=1 + export NCCL_IB_USE_INLINE=1 + export RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0 + export NCCL_GDR_FLUSH_DISABLE=1 + export NCCL_DMABUF_ENABLE=0 + export NCCL_IGNORE_CPU_AFFINITY=1 + export NCCL_IB_QPS_PER_CONNECTION=1 + export NCCL_IB_RETRY_CNT=20 + export NCCL_IB_TIMEOUT=300 PRIMUS_ROOT_PATH=\"${SCRIPT_DIR}/../../..\" MEGATRON_PATH=\"\${PRIMUS_ROOT_PATH}/third_party/Megatron-LM\" export PYTHONPATH=\"\${MEGATRON_PATH}:\${PYTHONPATH:-}\" + # Final CSV names you want (no smci prefix, include both nodes) + FINAL_ALLREDUCE=\"${OUTPUT_DIR}/allreduce_\${NODE_TAG}.csv\" + FINAL_ALLGATHER=\"${OUTPUT_DIR}/allgather_\${NODE_TAG}.csv\" + FINAL_REDUCESCATTER=\"${OUTPUT_DIR}/reducescatter_\${NODE_TAG}.csv\" + + # To avoid concurrent overwrite, only rank0 writes the final CSV. + # Other ranks write to temporary per-rank files. + if [[ \"\${NODE_RANK}\" == \"0\" ]]; then + ALLREDUCE_CSV=\"\${FINAL_ALLREDUCE}\" + ALLGATHER_CSV=\"\${FINAL_ALLGATHER}\" + REDUCESCATTER_CSV=\"\${FINAL_REDUCESCATTER}\" + else + ALLREDUCE_CSV=\"/tmp/allreduce_\${NODE_TAG}_rank\${NODE_RANK}.csv\" + ALLGATHER_CSV=\"/tmp/allgather_\${NODE_TAG}_rank\${NODE_RANK}.csv\" + REDUCESCATTER_CSV=\"/tmp/reducescatter_\${NODE_TAG}_rank\${NODE_RANK}.csv\" + fi + + echo \"[Node \${NODE_RANK}] CSV outputs:\" + echo \" allreduce -> \${ALLREDUCE_CSV}\" + echo \" allgather -> \${ALLGATHER_CSV}\" + echo \" reducescatter -> \${REDUCESCATTER_CSV}\" + echo \"[Node \${NODE_RANK}] Starting RCCL benchmarks...\" torchrun --master_addr \"\${MASTER_ADDR}\" \ --master_port \"\${MASTER_PORT}\" \ @@ -162,9 +205,16 @@ docker run --rm \ --node_rank=\"\${NODE_RANK}\" \ --nproc_per_node=\"\${GPUS_PER_NODE}\" \ ./benchmark_allreduce.py \ - --allreduce-report-csv-path ${OUTPUT_DIR}/allreduce_benchmark.csv \ - --allgather-report-csv-path ${OUTPUT_DIR}/allgather_benchmark.csv \ - --reducescatter-report-csv-path ${OUTPUT_DIR}/reducescatter_benchmark.csv + --allreduce-report-csv-path \"\${ALLREDUCE_CSV}\" \ + --allgather-report-csv-path \"\${ALLGATHER_CSV}\" \ + --reducescatter-report-csv-path \"\${REDUCESCATTER_CSV}\" + + if [[ \"\${NODE_RANK}\" == \"0\" ]]; then + echo \"[Rank0] Final CSV written:\" + echo \" \${FINAL_ALLREDUCE}\" + echo \" \${FINAL_ALLGATHER}\" + echo \" \${FINAL_REDUCESCATTER}\" + fi echo \"[Node \${NODE_RANK}] RCCL benchmarks complete.\" "