From 8cb3dc4e82cbb3ac799ed206b6c3287e5fa11f15 Mon Sep 17 00:00:00 2001
From: Pavel Belevich
Date: Thu, 27 Nov 2025 00:38:11 +0000
Subject: [PATCH] Remove AWS_OFI_NCCL_VERSION

The AWS OFI NCCL plugin is now bundled with the EFA installer, so the
separate AWS_OFI_NCCL_VERSION build argument is no longer needed.

README:
- Mark AWS_OFI_NCCL_VERSION as deprecated in the versions table.
- Drop it from the example variables, the image tag and the
  `docker build` command.
- Pass GDRCOPY_VERSION to `docker build` as a build argument.

Dockerfile:
- Keep AWS_OFI_NCCL_VERSION as an ignored ARG for backward compatibility.
- Verify that the bundled plugin library exists after the EFA installer
  has run (x86_64 or aarch64 library directory).
---
Review notes:
- The explanatory comment on the AWS_OFI_NCCL_VERSION ARG now sits on its
  own line: Dockerfile syntax only treats '#' as a comment at the start of
  a line, so a trailing comment is parsed as extra ARG arguments.
- Fixed the "exerpt" typo in the rewritten README sentence.
- Diffstat and Dockerfile hunk headers account for the extra line. The
  commit id and blob ids in the "From"/"index" lines are carried over from
  the original commit and do not describe this revised content.

 micro-benchmarks/nccl-tests/README.md             | 9 ++++-----
 micro-benchmarks/nccl-tests/nccl-tests.Dockerfile | 6 ++++++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/micro-benchmarks/nccl-tests/README.md b/micro-benchmarks/nccl-tests/README.md
index 8d82c1469..1a5f254b5 100644
--- a/micro-benchmarks/nccl-tests/README.md
+++ b/micro-benchmarks/nccl-tests/README.md
@@ -38,7 +38,7 @@ The NCCL tests are packaged in a container.
 > |`CUDA_VERSION` | `12.8.1` | |
 > |`GDRCOPY_VERSION` | `v2.5.1` | [link](https://github.com/NVIDIA/gdrcopy) |
 > |`EFA_INSTALLER_VERSION`| `1.43.2` | [link](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-enable) |
-> |`AWS_OFI_NCCL_VERSION` | `v1.16.3` | [link](https://github.com/aws/aws-ofi-nccl) |
+> |`AWS_OFI_NCCL_VERSION` | *(deprecated)* | AWS OFI NCCL plugin is now bundled with EFA installer |
 > |`NCCL_VERSION` | `v2.27.7-1` | [link](https://github.com/NVIDIA/nccl) |
 > |`NCCL_TESTS_VERSION` | `v2.16.9` | [link](https://github.com/NVIDIA/nccl-tests) |
 
@@ -47,10 +47,9 @@ You must pick each version of the library and set them as variables before proce
 ```bash
 GDRCOPY_VERSION=v2.5.1
 EFA_INSTALLER_VERSION=1.43.2
-AWS_OFI_NCCL_VERSION=v1.16.3
 NCCL_VERSION=v2.27.7-1
 NCCL_TESTS_VERSION=v2.16.9
-TAG="efa${EFA_INSTALLER_VERSION}-ofi${AWS_OFI_NCCL_VERSION}-nccl${NCCL_VERSION}-tests${NCCL_TESTS_VERSION}"
+TAG="efa${EFA_INSTALLER_VERSION}-nccl${NCCL_VERSION}-tests${NCCL_TESTS_VERSION}"
 CONTAINER_IMAGE_NAME_TAG="nccl-tests:${TAG}"
 ```
 
@@ -61,8 +60,8 @@ If you wish to build the containar image by yourself, follow this section. Alter
 1. Build the container image with the command below:
 ```bash
 docker build -f nccl-tests.Dockerfile \
+ --build-arg="GDRCOPY_VERSION=${GDRCOPY_VERSION}" \
 --build-arg="EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION}" \
- --build-arg="AWS_OFI_NCCL_VERSION=${AWS_OFI_NCCL_VERSION}" \
 --build-arg="NCCL_VERSION=${NCCL_VERSION}" \
 --build-arg="NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}" \
 -t ${CONTAINER_IMAGE_NAME_TAG} \
@@ -262,7 +261,7 @@ To change the type of collective to test, modify the line with `srun` in the fil
 kubectl logs -f $(kubectl get pods | grep launcher | cut -d ' ' -f 1)
 ```
 
- The following is an example exerpt from the logs of a NCCL all_reduce_perf test, executed on a cluster with two p5.48xlarge instances (using EFA_INSTALLER_VERSION=1.28.0, AWS_OFI_NCCL_VERSION=v1.7.3-aws, NCCL_TESTS_VERSION=master, ARG NCCL_VERSION=2.18.5):
+ The following is an example excerpt from the logs of a NCCL all_reduce_perf test, executed on a cluster with two p5.48xlarge instances (using EFA_INSTALLER_VERSION=1.28.0, NCCL_TESTS_VERSION=master, NCCL_VERSION=2.18.5):
 
 ```log
 [1,0]:# out-of-place in-place
diff --git a/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile b/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile
index 35e18f189..ae2328fd0 100644
--- a/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile
+++ b/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile
@@ -5,6 +5,8 @@ FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 
 ARG GDRCOPY_VERSION=v2.5.1
 ARG EFA_INSTALLER_VERSION=1.47.0
+# Kept for backward compatibility - value is ignored as plugin is bundled with EFA
+ARG AWS_OFI_NCCL_VERSION=""
 ARG NCCL_VERSION=v2.29.3-1
 ARG NCCL_TESTS_VERSION=v2.17.9
 
@@ -83,6 +85,10 @@ RUN cd $HOME \
 && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
 && rm -rf $HOME/aws-efa-installer
 
+RUN echo "Verifying AWS OFI NCCL plugin installation..." && \
+    (ls -la /opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi*.so || \
+     ls -la /opt/amazon/ofi-nccl/lib/aarch64-linux-gnu/libnccl-ofi*.so)
+
 ###################################################
 ## Install NCCL
 RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \