Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 104 additions & 73 deletions 3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

FROM nvcr.io/nvidia/pytorch:25.03-py3
# ============================================================
# Base image: PyTorch 25.04 with CUDA 12.9.0 (required for NCCL 2.29.x)
# Supports Blackwell (sm_100), Hopper, Ampere architectures
# ============================================================
FROM nvcr.io/nvidia/pytorch:25.04-py3

ARG GDRCOPY_VERSION=v2.4.1
ARG EFA_INSTALLER_VERSION=1.37.0
ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws
ARG TRANSFORMERS_VERSION=4.44.2
ARG MEGATRON_LM_VERSION=core_r0.8.0

ARG OPEN_MPI_PATH=/opt/amazon/openmpi

######################
# Update and remove the IB libverbs
######################
ENV DEBIAN_FRONTEND=noninteractive

# ============================================================
# 1. System packages and SSH setup (needed for multi-node training)
# ============================================================
RUN apt-get update -y && apt-get upgrade -y
RUN apt-get remove -y --allow-change-held-packages \
ibverbs-utils \
Expand All @@ -26,8 +27,7 @@ RUN rm -rf /opt/hpcx/ompi \
&& rm -rf /usr/local/ucx \
&& ldconfig

RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \
apt-utils \
RUN apt-get install -y --no-install-recommends \
autoconf \
automake \
build-essential \
Expand All @@ -36,6 +36,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \
gcc \
gdb \
git \
gnupg \
kmod \
libtool \
openssh-client \
Expand All @@ -55,69 +56,99 @@ RUN rm -rf /root/.ssh/ \
&& cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

#################################################
## Install NVIDIA GDRCopy
##
## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure
## that the cuda-compat-xx-x package is the latest.
RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
&& cd /tmp/gdrcopy \
&& make prefix=/opt/gdrcopy install

ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH
ENV LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH
ENV CPATH /opt/gdrcopy/include:$CPATH
ENV PATH /opt/gdrcopy/bin:$PATH

#################################################
## Install EFA installer
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
# ============================================================
# 2. Install EFA Installer 1.47.0
# This bundles libfabric, rdma-core, and pre-built aws-ofi-nccl
# No source build of aws-ofi-nccl needed (unlike EFA < 1.40)
# ============================================================
ENV EFA_INSTALLER_VERSION=1.47.0
WORKDIR /tmp
RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& rm -rf $HOME/aws-efa-installer


###################################################
## Install AWS-OFI-NCCL plugin
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
#Switch from sh to bash to allow parameter expansion
SHELL ["/bin/bash", "-c"]
RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
&& tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
&& cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-mpi=/opt/amazon/openmpi \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-platform-aws \
&& make -j $(nproc) \
&& make install \
&& cd .. \
&& rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
&& rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz

SHELL ["/bin/sh", "-c"]

###################################################
RUN rm -rf /var/lib/apt/lists/*

RUN echo "hwloc_base_binding_policy = none" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf \
&& echo "rmaps_base_mapping_policy = slot" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf
&& cd / && rm -rf /tmp/aws-efa-installer

# ============================================================
# 3. Remove old aws-ofi-nccl and create NCCL plugin symlinks
# NCCL_NET_PLUGIN=aws-ofi looks for libnccl-net-aws-ofi.so
# EFA installer names it libnccl-net-ofi.so
# Without this symlink NCCL falls back to TCP sockets silently
# ============================================================
RUN rm -rf /opt/amazon/aws-ofi-nccl

RUN ln -sf /opt/amazon/ofi-nccl/lib/libnccl-net-ofi.so \
/opt/amazon/ofi-nccl/lib/libnccl-net-aws-ofi.so && \
ln -sf /opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \
/opt/amazon/ofi-nccl/lib/libnccl-tuner-aws-ofi.so

# ============================================================
# 4. Upgrade NCCL to 2.29.3 (matches B200 host version)
# Requires CUDA >= 12.9 (which pytorch:25.04-py3 provides)
# Must add NVIDIA CUDA apt repo first since base image may not have it
# ============================================================
ENV NCCL_VERSION=2.29.3-1
RUN apt-get update && \
apt-get install -y --no-install-recommends wget && \
wget -qO /tmp/cuda-keyring.deb \
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \
dpkg -i /tmp/cuda-keyring.deb && \
rm /tmp/cuda-keyring.deb && \
apt-get update && \
apt-get install -y --allow-downgrades --allow-change-held-packages \
libnccl2=${NCCL_VERSION}+cuda12.9 \
libnccl-dev=${NCCL_VERSION}+cuda12.9 && \
rm -rf /var/lib/apt/lists/*

# ============================================================
# 5. Install GDRCopy v2.5.1 (lib-only, no binaries needed)
# ============================================================
# Shallow-clone the v2.5.1 tag, run only the `lib` and `lib_install` make
# targets, then delete the checkout in the same layer so no sources remain.
# NOTE(review): no install prefix is passed to make, so the destination is
# whatever the GDRCopy Makefile defaults to -- confirm that directory is on
# the dynamic linker's search path.
RUN cd /tmp && \
git clone --branch v2.5.1 --depth 1 https://github.com/NVIDIA/gdrcopy.git && \
cd gdrcopy && \
make -j$(nproc) lib lib_install && \
cd / && rm -rf /tmp/gdrcopy

# ============================================================
# 6. Fix library path references
# Use ld.so.conf.d for system-wide discovery (more robust
# than relying solely on LD_LIBRARY_PATH)
# ============================================================
RUN echo "/opt/amazon/ofi-nccl/lib" > /etc/ld.so.conf.d/aws-ofi-nccl.conf && \
echo "/opt/amazon/efa/lib" > /etc/ld.so.conf.d/efa.conf

RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/environment 2>/dev/null || true
RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/shinit_v2 2>/dev/null || true

# Rebuild ldconfig cache
RUN rm -f /etc/ld.so.cache && ldconfig

# ============================================================
# 7. Environment variables
# ============================================================
ENV LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}"
ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}"
ENV FI_PROVIDER=efa

# ============================================================
# 8. OpenMPI tuning for EFA (needed for multi-node training)
# ============================================================
# Append both MCA parameter lines to the Open MPI params file in one write
# (printf emits each argument on its own line).
RUN printf '%s\n' \
        "hwloc_base_binding_policy = none" \
        "rmaps_base_mapping_policy = slot" \
    >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf

# Replace mpirun with a small bash wrapper:
#   - the original binary is renamed to mpirun.real,
#   - a new two-line script is written in its place that forwards every
#     argument it receives ("$@") to mpirun.real,
#   - the script is made executable for all users.
# The wrapper adds no options of its own. `\$@` is escaped so the literal
# text "$@" is written into the script, whereas ${OPEN_MPI_PATH} is expanded
# by the build shell and ends up as an absolute path in the wrapper.
RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \
&& echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \
&& echo "${OPEN_MPI_PATH}/bin/mpirun.real \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \
&& chmod a+x ${OPEN_MPI_PATH}/bin/mpirun

# ============================================================
# 9. Python packages for DeepSpeed training
# ============================================================
# Install the Python dependencies in a single layer without keeping pip's
# download cache.
#
# Requirement specifiers that contain `<` or `>` MUST be quoted: RUN uses the
# shell form, so an unquoted `deepspeed>=0.16,<1.0` is parsed as the bare
# package name `deepspeed` plus an output redirection to a file named
# "=0.16," and an input redirection from a file named "1.0". The version
# bounds would never reach pip and the step fails on the missing input file.
RUN pip3 install --no-cache-dir \
        awscli pynvml \
        "transformers==${TRANSFORMERS_VERSION}" \
        sentencepiece python-etcd \
        "deepspeed>=0.16,<1.0" "accelerate>=1.0,<2.0"

RUN pip3 install awscli pynvml

RUN mv $OPEN_MPI_PATH/bin/mpirun $OPEN_MPI_PATH/bin/mpirun.real \
&& echo '#!/bin/bash' > $OPEN_MPI_PATH/bin/mpirun \
&& echo '/opt/amazon/openmpi/bin/mpirun.real "$@"' >> $OPEN_MPI_PATH/bin/mpirun \
&& chmod a+x $OPEN_MPI_PATH/bin/mpirun

######################
# DeepSpeed dependencies
######################
RUN pip install transformers==${TRANSFORMERS_VERSION} sentencepiece python-etcd deepspeed accelerate
RUN rm -rf /var/lib/apt/lists/*

WORKDIR /workspace
16 changes: 10 additions & 6 deletions 3.test_cases/pytorch/deepspeed/1.build-image.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# SPDX-License-Identifier: MIT-0

#SBATCH -N 1 # number of nodes to use
#SBATCH --job-name=build-neox-image # name of your job
#SBATCH --job-name=build-deepspeed-image # name of your job
#SBATCH --output=logs/%x_%j.out # logfile for stdout
#SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs

Expand All @@ -14,11 +14,15 @@ set -euxo pipefail
: "${APPS_PATH:=/fsx/apps}"
: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"

# Ensure the default destination directory for the squash file exists
# (IMAGE defaults to a path under APPS_PATH).
mkdir -p "${APPS_PATH}"
# NOTE(review): the #SBATCH --output/--error paths above point into logs/,
# and Slurm presumably opens those files before this script body runs --
# confirm whether logs/ must already exist at submission time, in which case
# this mkdir comes too late for the first run.
mkdir -p logs

ENROOT_IMAGE=deepspeed
docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile .
docker build -t "${ENROOT_IMAGE}" -f 0.deepspeed.dockerfile .
# Remove old sqsh file if exists
if [ -f ${ENROOT_IMAGE}.sqsh ] ; then
rm ${ENROOT_IMAGE}.sqsh
if [ -f "${ENROOT_IMAGE}.sqsh" ] ; then
rm "${ENROOT_IMAGE}.sqsh"
fi
enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest
mv ${ENROOT_IMAGE}.sqsh ${IMAGE}
enroot import -o "${ENROOT_IMAGE}.sqsh" "dockerd://${ENROOT_IMAGE}:latest"
mv "${ENROOT_IMAGE}.sqsh" "${IMAGE}"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing final newline

The diff shows \ No newline at end of file here. Per .editorconfig (insert_final_newline = true), this should end with a trailing newline.

52 changes: 46 additions & 6 deletions 3.test_cases/pytorch/deepspeed/Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,52 @@
ENROOT_IMAGE=deepspeed
# User-tunable settings. Every assignment uses `?=`, so a value supplied via
# the environment or the make command line takes precedence.

# Docker image tag; also the base name of the default squash file.
ENROOT_IMAGE ?= deepspeed
# Directory that holds the default squash file.
APPS_PATH ?= /fsx/apps
# Full path of the Enroot squash file written by `import` and removed by `clean`.
SQUASH_FILE ?= $(APPS_PATH)/$(ENROOT_IMAGE).sqsh
# Slurm partition and node count handed to sbatch by `train`.
PARTITION ?= dev
NODES ?= 8
# Log and result directories handed to gpt/parse_results.py by `parse`.
LOGS_DIR ?= logs
RESULTS_DIR ?= sweep_results

all: build clean import
.PHONY: all build clean import build-remote train parse help

all: build import

# Print a short description of every user-facing target.
# The `clean` line shows the expanded $(SQUASH_FILE) because that is the file
# the target deletes; by default it lives under $(APPS_PATH), not in the
# current directory.
help:
	@echo "Container targets:"
	@echo " build - Build Docker image locally"
	@echo " import - Convert Docker image to Enroot squash file"
	@echo " build-remote - Build image on a compute node via sbatch"
	@echo " clean - Remove squash file $(SQUASH_FILE)"
	@echo ""
	@echo "Training targets:"
	@echo " train - Submit 103B GPT pretraining (best config: TP=8, PP=8, fusions)"
	@echo ""
	@echo "Results targets:"
	@echo " parse - Parse training logs into benchmark JSON"

# ---- Container ----

build:
docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile .
docker build -t $(ENROOT_IMAGE) -f 0.deepspeed.dockerfile .

# Convert the locally built Docker image into an Enroot squash file at
# $(SQUASH_FILE).
#  - The parent directory is derived from SQUASH_FILE itself, so it is still
#    created when SQUASH_FILE is overridden to live outside APPS_PATH.
#  - Any previous squash file is removed first: `all` no longer runs `clean`,
#    and enroot import is expected to refuse an existing output file (the
#    sbatch build script deletes the old file for the same reason).
import:
	mkdir -p "$(dir $(SQUASH_FILE))"
	rm -f "$(SQUASH_FILE)"
	enroot import -o "$(SQUASH_FILE)" dockerd://$(ENROOT_IMAGE):latest

build-remote:
sbatch 1.build-image.sbatch

clean:
-rm ${ENROOT_IMAGE}.sqsh
-rm -f $(SQUASH_FILE)

import:
enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest
# ---- Training (best config: TP=8, PP=8, ZeRO=0, fusions enabled) ----

train:
sbatch --partition=$(PARTITION) --nodes=$(NODES) \
--export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \
gpt/slurm/pretrain_gpt_103b.sbatch

# ---- Results ----

parse:
python3 gpt/parse_results.py --jobs-csv $(RESULTS_DIR)/sweep_jobs.csv \
--logs-dir $(LOGS_DIR) --output-dir $(RESULTS_DIR)
Loading