diff --git a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile index 472edc476..1874ca3d5 100644 --- a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile +++ b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile @@ -1,19 +1,20 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: MIT-0 -FROM nvcr.io/nvidia/pytorch:25.03-py3 +# ============================================================ +# Base image: PyTorch 25.04 with CUDA 12.9.0 (required for NCCL 2.29.x) +# Supports Blackwell (sm_100), Hopper, Ampere architectures +# ============================================================ +FROM nvcr.io/nvidia/pytorch:25.04-py3 -ARG GDRCOPY_VERSION=v2.4.1 -ARG EFA_INSTALLER_VERSION=1.37.0 -ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws ARG TRANSFORMERS_VERSION=4.44.2 -ARG MEGATRON_LM_VERSION=core_r0.8.0 - ARG OPEN_MPI_PATH=/opt/amazon/openmpi -###################### -# Update and remove the IB libverbs -###################### +ENV DEBIAN_FRONTEND=noninteractive + +# ============================================================ +# 1. 
System packages and SSH setup (needed for multi-node training) +# ============================================================ RUN apt-get update -y && apt-get upgrade -y RUN apt-get remove -y --allow-change-held-packages \ ibverbs-utils \ @@ -26,8 +27,7 @@ RUN rm -rf /opt/hpcx/ompi \ && rm -rf /usr/local/ucx \ && ldconfig -RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \ - apt-utils \ +RUN apt-get install -y --no-install-recommends \ autoconf \ automake \ build-essential \ @@ -36,6 +36,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \ gcc \ gdb \ git \ + gnupg \ kmod \ libtool \ openssh-client \ @@ -55,69 +56,99 @@ RUN rm -rf /root/.ssh/ \ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config -ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH -ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH - -################################################# -## Install NVIDIA GDRCopy -## -## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure -## that the cuda-compat-xx-x package is the latest. 
-RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ - && cd /tmp/gdrcopy \ - && make prefix=/opt/gdrcopy install - -ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH -ENV CPATH /opt/gdrcopy/include:$CPATH -ENV PATH /opt/gdrcopy/bin:$PATH - -################################################# -## Install EFA installer -RUN cd $HOME \ - && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ - && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ +# ============================================================ +# 2. Install EFA Installer 1.47.0 +# This bundles libfabric, rdma-core, and pre-built aws-ofi-nccl +# No source build of aws-ofi-nccl needed (unlike EFA < 1.40) +# ============================================================ +ENV EFA_INSTALLER_VERSION=1.47.0 +WORKDIR /tmp +RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xz \ && cd aws-efa-installer \ && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ - && rm -rf $HOME/aws-efa-installer - - -################################################### -## Install AWS-OFI-NCCL plugin -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev -#Switch from sh to bash to allow parameter expansion -SHELL ["/bin/bash", "-c"] -RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ - && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ - && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ - && ./configure --prefix=/opt/aws-ofi-nccl/install \ - --with-mpi=/opt/amazon/openmpi \ - --with-libfabric=/opt/amazon/efa \ - --with-cuda=/usr/local/cuda \ - --enable-platform-aws \ - && make -j $(nproc) \ - && make install \ - && cd .. 
\ - && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ - && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz - -SHELL ["/bin/sh", "-c"] - -################################################### -RUN rm -rf /var/lib/apt/lists/* - -RUN echo "hwloc_base_binding_policy = none" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf \ - && echo "rmaps_base_mapping_policy = slot" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf + && cd / && rm -rf /tmp/aws-efa-installer + +# ============================================================ +# 3. Remove old aws-ofi-nccl and create NCCL plugin symlinks +# NCCL_NET_PLUGIN=aws-ofi looks for libnccl-net-aws-ofi.so +# EFA installer names it libnccl-net-ofi.so +# Without this symlink NCCL falls back to TCP sockets silently +# ============================================================ +RUN rm -rf /opt/amazon/aws-ofi-nccl + +RUN ln -sf /opt/amazon/ofi-nccl/lib/libnccl-net-ofi.so \ + /opt/amazon/ofi-nccl/lib/libnccl-net-aws-ofi.so && \ + ln -sf /opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \ + /opt/amazon/ofi-nccl/lib/libnccl-tuner-aws-ofi.so + +# ============================================================ +# 4. 
Upgrade NCCL to 2.29.3 (matches B200 host version) +# Requires CUDA >= 12.9 (which pytorch:25.04-py3 provides) +# Must add NVIDIA CUDA apt repo first since base image may not have it +# ============================================================ +ENV NCCL_VERSION=2.29.3-1 +RUN apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget -qO /tmp/cuda-keyring.deb \ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i /tmp/cuda-keyring.deb && \ + rm /tmp/cuda-keyring.deb && \ + apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages \ + libnccl2=${NCCL_VERSION}+cuda12.9 \ + libnccl-dev=${NCCL_VERSION}+cuda12.9 && \ + rm -rf /var/lib/apt/lists/* + +# ============================================================ +# 5. Install GDRCopy v2.5.1 (lib-only, no binaries needed) +# ============================================================ +RUN cd /tmp && \ + git clone --branch v2.5.1 --depth 1 https://github.com/NVIDIA/gdrcopy.git && \ + cd gdrcopy && \ + make -j$(nproc) lib lib_install && \ + cd / && rm -rf /tmp/gdrcopy + +# ============================================================ +# 6. Fix library path references +# Use ld.so.conf.d for system-wide discovery (more robust +# than relying solely on LD_LIBRARY_PATH) +# ============================================================ +RUN echo "/opt/amazon/ofi-nccl/lib" > /etc/ld.so.conf.d/aws-ofi-nccl.conf && \ + echo "/opt/amazon/efa/lib" > /etc/ld.so.conf.d/efa.conf + +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/environment 2>/dev/null || true +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/shinit_v2 2>/dev/null || true + +# Rebuild ldconfig cache +RUN rm -f /etc/ld.so.cache && ldconfig + +# ============================================================ +# 7. 
Environment variables +# ============================================================ +ENV LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}" +ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}" +ENV FI_PROVIDER=efa + +# ============================================================ +# 8. OpenMPI tuning for EFA (needed for multi-node training) +# ============================================================ +RUN echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf + +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun + +# ============================================================ +# 9. 
Python packages for DeepSpeed training +# ============================================================ +RUN pip3 install --no-cache-dir \ + awscli pynvml \ + transformers==${TRANSFORMERS_VERSION} \ + sentencepiece python-etcd \ + "deepspeed>=0.16,<1.0" "accelerate>=1.0,<2.0" -RUN pip3 install awscli pynvml - -RUN mv $OPEN_MPI_PATH/bin/mpirun $OPEN_MPI_PATH/bin/mpirun.real \ - && echo '#!/bin/bash' > $OPEN_MPI_PATH/bin/mpirun \ - && echo '/opt/amazon/openmpi/bin/mpirun.real "$@"' >> $OPEN_MPI_PATH/bin/mpirun \ - && chmod a+x $OPEN_MPI_PATH/bin/mpirun - -###################### -# DeepSpeed dependencies -###################### -RUN pip install transformers==${TRANSFORMERS_VERSION} sentencepiece python-etcd deepspeed accelerate +RUN rm -rf /var/lib/apt/lists/* +WORKDIR /workspace diff --git a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch index ebd7b0b04..f999b926b 100644 --- a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch +++ b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MIT-0 #SBATCH -N 1 # number of nodes to use -#SBATCH --job-name=build-neox-image # name of your job +#SBATCH --job-name=build-deepspeed-image # name of your job #SBATCH --output=logs/%x_%j.out # logfile for stdout #SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs @@ -14,11 +14,15 @@ set -euxo pipefail : "${APPS_PATH:=/fsx/apps}" : "${IMAGE:=$APPS_PATH/deepspeed.sqsh}" +# Ensure output directory exists +mkdir -p "${APPS_PATH}" +mkdir -p logs + ENROOT_IMAGE=deepspeed -docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile . +docker build -t "${ENROOT_IMAGE}" -f 0.deepspeed.dockerfile . 
# Remove old sqsh file if exists -if [ -f ${ENROOT_IMAGE}.sqsh ] ; then - rm ${ENROOT_IMAGE}.sqsh +if [ -f "${ENROOT_IMAGE}.sqsh" ] ; then + rm "${ENROOT_IMAGE}.sqsh" fi -enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest -mv ${ENROOT_IMAGE}.sqsh ${IMAGE} \ No newline at end of file +enroot import -o "${ENROOT_IMAGE}.sqsh" "dockerd://${ENROOT_IMAGE}:latest" +mv "${ENROOT_IMAGE}.sqsh" "${IMAGE}" \ No newline at end of file diff --git a/3.test_cases/pytorch/deepspeed/Makefile b/3.test_cases/pytorch/deepspeed/Makefile index e4615c60f..88b3d2262 100644 --- a/3.test_cases/pytorch/deepspeed/Makefile +++ b/3.test_cases/pytorch/deepspeed/Makefile @@ -1,12 +1,52 @@ -ENROOT_IMAGE=deepspeed +ENROOT_IMAGE ?= deepspeed +APPS_PATH ?= /fsx/apps +SQUASH_FILE ?= $(APPS_PATH)/$(ENROOT_IMAGE).sqsh +PARTITION ?= dev +NODES ?= 8 +LOGS_DIR ?= logs +RESULTS_DIR ?= sweep_results -all: build clean import +.PHONY: all build clean import build-remote train parse help + +all: build import + +help: + @echo "Container targets:" + @echo " build - Build Docker image locally" + @echo " import - Convert Docker image to Enroot squash file" + @echo " build-remote - Build image on a compute node via sbatch" + @echo " clean - Remove local squash file" + @echo "" + @echo "Training targets:" + @echo " train - Submit 103B GPT pretraining (best config: TP=8, PP=8, fusions)" + @echo "" + @echo "Results targets:" + @echo " parse - Parse training logs into benchmark JSON" + +# ---- Container ---- build: - docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile . + docker build -t $(ENROOT_IMAGE) -f 0.deepspeed.dockerfile . 
+ +import: + mkdir -p $(APPS_PATH) + enroot import -o $(SQUASH_FILE) dockerd://$(ENROOT_IMAGE):latest + +build-remote: + sbatch 1.build-image.sbatch clean: - -rm ${ENROOT_IMAGE}.sqsh + -rm -f $(SQUASH_FILE) -import: - enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest +# ---- Training (best config: TP=8, PP=8, ZeRO=0, fusions enabled) ---- + +train: + sbatch --partition=$(PARTITION) --nodes=$(NODES) \ + --export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \ + gpt/slurm/pretrain_gpt_103b.sbatch + +# ---- Results ---- + +parse: + python3 gpt/parse_results.py --jobs-csv $(RESULTS_DIR)/sweep_jobs.csv \ + --logs-dir $(LOGS_DIR) --output-dir $(RESULTS_DIR) diff --git a/3.test_cases/pytorch/deepspeed/README.md b/3.test_cases/pytorch/deepspeed/README.md index fd2ef7524..82e34d46a 100644 --- a/3.test_cases/pytorch/deepspeed/README.md +++ b/3.test_cases/pytorch/deepspeed/README.md @@ -1,87 +1,179 @@ -# DeepSpeed Test Cases +# DeepSpeed on AWS -[DeepSpeed](https://github.com/microsoft/DeepSpeed) enables world's most powerful language models like MT-530B and BLOOM. It is an easy-to-use deep learning optimization software suite that powers unprecedented scale and speed for both training and inference. `deepspeed` illustrates several example test cases for DeepSpeed training on AWS. +[DeepSpeed](https://github.com/microsoft/DeepSpeed) is a deep learning optimization library that enables efficient distributed training at scale. This directory contains test cases for running DeepSpeed workloads on AWS GPU clusters, covering large-scale pretraining and parameter-efficient fine-tuning. -## 1. 
Preparation +## Use Cases -This guide assumes that you have the following: +| Use Case | Description | Location | +|----------|-------------|----------| +| GPT-103B Pretraining | Large-scale GPT pretraining benchmark using Megatron-DeepSpeed with 3D parallelism (TP/PP/DP) and ZeRO optimization | [`gpt/`](gpt/) | +| QLoRA Fine-tuning | Qwen3-8B fine-tuning with QLoRA (4-bit) + DeepSpeed ZeRO-2/3, supports EKS and Slurm | [`qlora/`](qlora/) | +| Llama2 Fine-tuning | Llama2 fine-tuning from HuggingFace weights using Megatron-DeepSpeed | [`examples_megatron_deepspeed/finetune_hf_llama/`](examples_megatron_deepspeed/finetune_hf_llama/) | -* A functional Slurm cluster on AWS. -* Docker, [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) installed. -* An FSx for Lustre filesystem mounted on `/fsx`. +## Prerequisites -We recommend that you set up a Slurm cluster using the templates in the architectures [directory](../../1.architectures). You need to set the following environment variables to run these test cases: +- A functional Slurm cluster on AWS. We recommend [SageMaker HyperPod](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html) or the templates in the [architectures directory](../../1.architectures). +- [Docker](https://docs.docker.com/engine/install/), [Pyxis](https://github.com/NVIDIA/pyxis), and [Enroot](https://github.com/NVIDIA/enroot) installed on compute nodes. +- An [FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html) filesystem mounted on `/fsx`. +- NVIDIA GPU instances with [EFA networking](https://aws.amazon.com/hpc/efa/) (B200, H100, A100, etc.). 
-```bash -export APPS_PATH=/fsx/apps -export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh -export FSX_PATH=/fsx -export MODEL_PATH=$FSX_PATH/deepspeed -export TEST_CASE_PATH=${HOME}/18.deepspeed # where you copy the test case or set to your test case path -cd $TEST_CASE_PATH # Note that we assume that you are here during the following command executions -``` +## 1. GPT-103B Pretraining Benchmark + +A ~103B-parameter GPT model (80 layers, hidden=12288, heads=96, FFN=49152) trained with [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) using 3D parallelism (tensor, pipeline, data) and DeepSpeed ZeRO optimization. Designed for benchmarking multi-node GPU clusters. +### Container setup +The container image (`0.deepspeed.dockerfile`) is built on `nvcr.io/nvidia/pytorch:25.04-py3` and includes: -## 2. Build the container +- **EFA 1.47.0** with the bundled aws-ofi-nccl plugin and NCCL tuner +- **NCCL 2.29.3** (upgraded to match B200 host driver) +- **GDRCopy v2.5.1** for GPU-direct RDMA +- **DeepSpeed**, **Transformers 4.44.2**, and multi-node SSH configuration -Before running training jobs, you need to use a build docker container image. [Enroot](https://github.com/NVIDIA/enroot) will be used to turn the image into unprivileged sandbox for Slurm but build step may exceed the storage available on the head node so we reccomend building it on a compute node following instructions below (option 2) +Build the container on a compute node (recommended, avoids head node storage limits): -### Option 1: build image on a head node +```bash +sbatch 1.build-image.sbatch +``` + +Or build locally and convert to a squash file: + +```bash +make build # docker build +make import # enroot import to /fsx/apps/deepspeed.sqsh +``` -Below are the steps you need to follow: +### Data preparation +The benchmark uses preprocessed data in Megatron format with the GPT-2 tokenizer. -1. Build the Docker image with the command below in this directory. +1. 
Download the GPT-2 tokenizer: ```bash - docker build -t deepspeed -f 0.deepspeed.dockerfile . + mkdir -p /fsx/deepspeed/data && cd /fsx/deepspeed/data + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt ``` - -2. Once the Docker image is built, you can check if it is present with `docker images`. You should see an output similar to this one: +2. Prepare training data (any text corpus works; for benchmarking, synthetic data is sufficient): ```bash - REPOSITORY TAG IMAGE ID CREATED SIZE - deepspeed latest b6c49033c424 9 minutes ago 23.3GB - ... + python3 -c " + import json + with open('synthetic_corpus.json', 'w') as f: + for i in range(50000): + json.dump({'text': 'The quick brown fox ' * 100}, f) + f.write('\n') + " ``` -3. Convert the Docker image to a squash file with the command below. +3. Clone Megatron-DeepSpeed and preprocess: ```bash - enroot import -o ${ENROOT_IMAGE} dockerd://deepspeed:latest + git clone https://github.com/microsoft/Megatron-DeepSpeed /fsx/deepspeed/Megatron-DeepSpeed + + python3 /fsx/deepspeed/Megatron-DeepSpeed/tools/preprocess_data.py \ + --input synthetic_corpus.json \ + --output-prefix BookCorpusDataset_text_document \ + --vocab-file gpt2-vocab.json \ + --merge-file gpt2-merges.txt \ + --tokenizer-type GPT2BPETokenizer \ + --workers 16 --append-eod ``` - The file will be stored in the `/apps` directory (by default). The output should look as below. 
+### Running + +Submit the best-performing configuration (TP=8, PP=8, ZeRO-0, fusions enabled): + +```bash +make train +# or equivalently: +sbatch --partition=dev --nodes=8 \ + --export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \ + gpt/slurm/pretrain_gpt_103b.sbatch +``` + +Override parallelism settings for custom configurations: + +```bash +sbatch --nodes=8 \ + --export=ALL,TP=8,PP=4,ZERO_STAGE=1,MICRO_BATCH_SIZE=2,CONFIG_NAME=my_config \ + gpt/slurm/pretrain_gpt_103b.sbatch +``` + +#### Environment variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `TP` | 8 | Tensor parallel size | +| `PP` | 2 | Pipeline parallel size (best throughput with PP=8, see `make train`) | +| `ZERO_STAGE` | 1 | DeepSpeed ZeRO stage (0, 1, 2, or 3) | +| `MICRO_BATCH_SIZE` | 1 | Per-GPU micro batch size | +| `GLOBAL_BATCH_SIZE` | 64 | Global batch size | +| `SEQ_LENGTH` | 2048 | Sequence length | +| `ENABLE_FUSIONS` | 0 | Set to 1 to enable kernel fusion ops | +| `USE_ACTIVATION_CHECKPOINTING` | 0 | Set to 1 for activation checkpointing | +| `USE_OVERLAP_COMM` | 0 | Set to 1 to overlap communication with compute | +| `TRAIN_ITERS` | 50 | Number of training iterations | +| `CONFIG_NAME` | baseline | Label for this configuration | + +### Best practices + +The following recommendations are based on extensive parameter sweeps across parallelism strategies, ZeRO stages, NCCL flags, and memory optimizations: + +**Parallelism strategy:** + +- **Maximize pipeline parallelism** (PP) alongside tensor parallelism (TP) for best throughput. For an 8-node cluster with 8 GPUs per node, TP=8 with PP=8 is optimal. +- **Enable kernel fusion ops** (`ENABLE_FUSIONS=1`) for a significant throughput improvement over the non-fused baseline. This enables masked-softmax, bias-gelu, bias-dropout, and gradient-accumulation fusions. +- **ZeRO-0 outperforms ZeRO-1** when the data-parallel group size is small (e.g., DP=1 with TP=8/PP=8). 
ZeRO-1's allreduce overhead is not amortized. + +**ZeRO-2 and ZeRO-3:** + +- ZeRO-2 and ZeRO-3 are **incompatible with pipeline parallelism** in Megatron-DeepSpeed. The sbatch script automatically sets `PP=1` and adds `--no-pipeline-parallel` when `ZERO_STAGE >= 2`. +- ZeRO-3's parameter partitioning **enables lower TP values** that ZeRO-2 cannot fit in memory (e.g., TP=4 works with ZeRO-3 but OOMs with ZeRO-2). +- **Increasing micro-batch size** (e.g., `MICRO_BATCH_SIZE=2`) substantially improves throughput for ZeRO-2 and ZeRO-3 configurations. +- `overlap_comm` provides only marginal improvement (~2%) with ZeRO-3. - ```bash - [INFO] Fetching image +**NCCL and networking:** - 36a8c752c28a2db543d2a632a3fc1fcbd5789a6f3d45b9d3a24632420dedcfa8 +- NCCL environment flag variations (buffer sizes, chunk sizes, min channels) have **negligible impact** on throughput (~1% range). The defaults in the sbatch script are well-tuned. +- **Do not set `NCCL_ALGO=Tree`** on EFA-based clusters -- it causes hangs. Let the NCCL tuner plugin (`libnccl-ofi-tuner.so`) choose the algorithm automatically. +- **Do not set `NCCL_PROTO` or `FI_EFA_FORK_SAFE`** -- these are not needed and can cause issues. + +**Memory:** + +- `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` is set by default in the sbatch script. Note the **capital T** is required in pytorch:25.04 containers; lowercase `true` causes a `RuntimeError`. +- Sequence length 4096 exceeds available HBM even with TP=8/PP=2 on B200 (178GB per GPU). Use seq=2048 for this model size. + +### Parsing results + +After training completes, parse the Slurm logs into benchmark JSON using `gpt/parse_results.py`: + +```bash +# Single log file +python3 gpt/parse_results.py --log-file logs/deepspeed-pretrain-103b_123.out --config-name my_config + +# Multiple jobs tracked in a CSV +python3 gpt/parse_results.py --jobs-csv sweep_results/sweep_jobs.csv --output-dir sweep_results +``` - [INFO] Extracting image content... 
- [INFO] Creating squashfs filesystem... +### Known issues - Parallel mksquashfs: Using 32 processors - Creating 4.0 filesystem on /apps/deepspeed.sqsh, block size 131072. - [========================================================================================================================================================================================================================-] 291068/291068 100% +- **torchrun shebang**: The container's `torchrun` may have a shebang pointing to the wrong Python version. The sbatch script uses `python3 -m torch.distributed.run` as a workaround. +- **`expandable_segments` case sensitivity**: Must use `expandable_segments:True` (capital T) in pytorch:25.04-py3. Lowercase causes a `RuntimeError`. +- **NCCL Tree algorithm**: Incompatible with EFA topology -- causes hangs. Do not set `NCCL_ALGO=Tree`. +- **Sequence parallelism**: Incompatible with pipeline parallelism (PP>1) in this Megatron-DeepSpeed version. - Exportable Squashfs 4.0 filesystem, gzip compressed, data block size 131072 - uncompressed data, uncompressed metadata, uncompressed fragments, uncompressed xattrs - duplicates are not removed - ... - ``` +## 2. QLoRA Fine-tuning (Qwen3-8B) -Once done proceed to the next stage. +Fine-tune [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) using QLoRA (4-bit quantization + LoRA adapters) with DeepSpeed ZeRO-2 or ZeRO-3. Supports deployment on SageMaker HyperPod with both EKS and Slurm orchestrators, including MIG GPU partitioning and automatic checkpoint resume. -### Option 2: Build image on a compute node +The QLoRA use case has its own container (`qlora/Dockerfile`) optimized for the same infrastructure best practices (EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1). -In this option, you will use a compute node to build the image. Submit the job as: +See [`qlora/README.md`](qlora/README.md) for full instructions. - ```bash - sbatch 1.build-image.sbatch - ``` +## 3. 
Llama2 Fine-tuning (Megatron-DeepSpeed) +Fine-tune Llama2 from HuggingFace weights using Megatron-DeepSpeed. Includes weight conversion from HuggingFace to Megatron format and fine-tuning on the Stanford Alpaca dataset. Uses the shared container image (`0.deepspeed.dockerfile`). -Once the image is prepared, you can proceed to `examples_*` directory for various deepspeed test cases. \ No newline at end of file +See [`examples_megatron_deepspeed/finetune_hf_llama/README.md`](examples_megatron_deepspeed/finetune_hf_llama/README.md) for full instructions. diff --git a/3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json b/3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json new file mode 100644 index 000000000..6197eaf78 --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json @@ -0,0 +1,20 @@ +{ + "train_batch_size": 64, + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 10, + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true + }, + "wall_clock_breakdown": false +} diff --git a/3.test_cases/pytorch/deepspeed/gpt/parse_results.py b/3.test_cases/pytorch/deepspeed/gpt/parse_results.py new file mode 100755 index 000000000..8b2815396 --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/gpt/parse_results.py @@ -0,0 +1,423 @@ +#!/usr/bin/env python3 +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +""" +parse_results.py - Parse Megatron-DeepSpeed training logs into benchmark JSON. 
+ +Reads Slurm log files, extracts per-step metrics, and produces JSON files +matching the existing benchmark-results schema at: + s3:///benchmark-results// + +Usage: + python parse_results.py [--logs-dir logs] [--output-dir sweep_results] + python parse_results.py --log-file logs/sweep_01_baseline_123.out --config-name 01_baseline +""" + +import argparse +import csv +import json +import os +import re +import statistics +import sys +from datetime import datetime, timezone + + +# ============================================================ +# Megatron-DeepSpeed log line patterns +# ============================================================ +# Example: " iteration 10/ 50 | consumed samples: ..." +# Example: "elapsed time per iteration (ms): 4725.7 | ..." +# Example: "lm loss: 1.3389E+01 | ..." +# Example: "learning rate: 3.000E-05 | ..." +# Example: "global batch size: 128 | ..." +# Example: "loss scale: 1.0 | ..." +# Example: "grad norm: 74.776 | ..." +# Example: "TFLOPs: 125.4 | ..." + +ITER_PATTERN = re.compile(r"iteration\s+(\d+)/\s*(\d+)") +ELAPSED_PATTERN = re.compile(r"elapsed time per iteration \(ms\):\s*([\d.]+)") +LOSS_PATTERN = re.compile(r"lm loss:\s*([\d.eE+\-]+)") +LR_PATTERN = re.compile(r"learning rate:\s*([\d.eE+\-]+)") +GBS_PATTERN = re.compile(r"global batch size:\s*(\d+)") +LOSS_SCALE_PATTERN = re.compile(r"loss scale:\s*([\d.eE+\-]+)") +GRAD_NORM_PATTERN = re.compile(r"grad norm:\s*([\d.eE+\-]+)") +TFLOPS_PATTERN = re.compile(r"TFLOPs:\s*([\d.]+)") + + +def parse_log_file(log_path): + """Parse a single Megatron-DeepSpeed log file and extract per-step metrics.""" + steps = [] + current_step = {} + + with open(log_path, "r") as f: + for line in f: + # Check for iteration marker + m = ITER_PATTERN.search(line) + if m: + if current_step: + steps.append(current_step) + current_step = { + "step": int(m.group(1)), + "total_steps": int(m.group(2)), + } + + if not current_step: + continue + + # Extract metrics from the same log block + m = 
ELAPSED_PATTERN.search(line) + if m: + elapsed_ms = float(m.group(1)) + current_step["elapsed_ms"] = elapsed_ms + current_step["step_time_s"] = round(elapsed_ms / 1000.0, 2) + + m = LOSS_PATTERN.search(line) + if m: + current_step["lm_loss"] = float(m.group(1)) + + m = LR_PATTERN.search(line) + if m: + current_step["learning_rate"] = float(m.group(1)) + + m = GBS_PATTERN.search(line) + if m: + current_step["global_batch_size"] = int(m.group(1)) + + m = LOSS_SCALE_PATTERN.search(line) + if m: + current_step["loss_scale"] = float(m.group(1)) + + m = GRAD_NORM_PATTERN.search(line) + if m: + current_step["grad_norm"] = float(m.group(1)) + + m = TFLOPS_PATTERN.search(line) + if m: + current_step["tflops_per_gpu"] = float(m.group(1)) + + # Don't forget the last step + if current_step: + steps.append(current_step) + + return steps + + +def compute_tflops_from_step_time( + step_time_s, + global_batch_size, + seq_length=2048, + hidden_size=12288, + num_layers=80, + num_heads=96, + total_gpus=64, +): + """ + Estimate TFLOPS/GPU for a GPT model using the standard formula: + FLOPs per iteration = 8 * seq * hidden^2 * layers * (1 + seq/(6*hidden) + vocab/(12*hidden*layers)) + Simplified: ~= 8 * B * s * h^2 * L * (1 + s/(6h)) + where B = global_batch_size + """ + vocab_size = 50257 # GPT-2 vocab + s = seq_length + h = hidden_size + L = num_layers + B = global_batch_size + + # Standard approximation for GPT FLOP count + flops_per_iter = ( + 8 * B * s * h * h * L * (1 + s / (6 * h) + vocab_size / (12 * h * L)) + ) + tflops_per_gpu = flops_per_iter / (step_time_s * total_gpus * 1e12) + return round(tflops_per_gpu, 1) + + +def build_result_json( + steps, + config_name, + job_id, + nodes=8, + gpus_per_node=8, + tp=8, + pp=2, + zero_stage=1, + mbs=1, + gbs=64, + seq_length=2048, + precision="bf16", + cluster="unknown", + instance_type="unknown", +): + """Build the benchmark JSON matching the existing schema.""" + total_gpus = nodes * gpus_per_node + warmup_steps = 5 + total_steps = 
len(steps) + + # Ensure TFLOPS values exist (compute if not in logs) + for step in steps: + if "tflops_per_gpu" not in step and "step_time_s" in step: + step["tflops_per_gpu"] = compute_tflops_from_step_time( + step["step_time_s"], + step.get("global_batch_size", gbs), + seq_length=seq_length, + total_gpus=total_gpus, + ) + + # Steady-state metrics (skip warmup) + steady_steps = [s for s in steps if s.get("step", 0) > warmup_steps] + + if not steady_steps: + print(f"Warning: No steady-state steps found for {config_name}") + steady_steps = steps + + steady_tflops = [s["tflops_per_gpu"] for s in steady_steps if "tflops_per_gpu" in s] + steady_times = [s["step_time_s"] for s in steady_steps if "step_time_s" in s] + + summary = { + "total_steps": total_steps, + "warmup_steps": warmup_steps, + "steady_state_steps": len(steady_steps), + } + + if steady_tflops: + summary.update( + { + "steady_state_avg_tflops_per_gpu": round( + statistics.mean(steady_tflops), 2 + ), + "steady_state_median_tflops_per_gpu": round( + statistics.median(steady_tflops), 1 + ), + "steady_state_min_tflops_per_gpu": round(min(steady_tflops), 1), + "steady_state_max_tflops_per_gpu": round(max(steady_tflops), 1), + "steady_state_stdev_tflops_per_gpu": round( + statistics.stdev(steady_tflops), 2 + ) + if len(steady_tflops) > 1 + else 0.0, + "peak_tflops_per_gpu": round(max(steady_tflops), 1), + } + ) + + if steady_times: + summary.update( + { + "steady_state_avg_step_time_s": round(statistics.mean(steady_times), 4), + "steady_state_median_step_time_s": round( + statistics.median(steady_times), 2 + ), + "steady_state_min_step_time_s": round(min(steady_times), 2), + "steady_state_max_step_time_s": round(max(steady_times), 2), + } + ) + + if steps: + summary["final_loss"] = steps[-1].get("lm_loss", None) + summary["initial_loss"] = steps[0].get("lm_loss", None) + + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + result = { + "metadata": { + "timestamp": timestamp, + "job_id": 
str(job_id), + "cluster": cluster, + "instance_type": instance_type, + "nodes": nodes, + "gpus_per_node": gpus_per_node, + "total_gpus": total_gpus, + "model": "deepspeed-gpt-103b", + "precision": precision, + "framework": "megatron-deepspeed", + "sweep_config": { + "config_name": config_name, + "tp": tp, + "pp": pp, + "zero_stage": zero_stage, + "micro_batch_size": mbs, + "global_batch_size": gbs, + "seq_length": seq_length, + }, + }, + "summary": summary, + "steps": steps, + } + + return result + + +def parse_sweep_jobs( + jobs_csv, logs_dir, output_dir, cluster="unknown", instance_type="unknown" +): + """Parse all jobs from the sweep tracking CSV.""" + os.makedirs(output_dir, exist_ok=True) + results = [] + + with open(jobs_csv, "r") as f: + reader = csv.DictReader(f) + for row in reader: + job_id = row["job_id"] + config_name = row["config_name"] + + # Find the log file for this job + log_pattern = f"sweep_{config_name}_{job_id}.out" + log_path = os.path.join(logs_dir, log_pattern) + + if not os.path.exists(log_path): + # Try alternate pattern + log_candidates = [ + f + for f in os.listdir(logs_dir) + if job_id in f and f.endswith(".out") + ] + if log_candidates: + log_path = os.path.join(logs_dir, log_candidates[0]) + else: + print( + f"Warning: No log file found for job {job_id} ({config_name})" + ) + continue + + print(f"Parsing {config_name} (job {job_id}): {log_path}") + steps = parse_log_file(log_path) + + if not steps: + print(f" Warning: No steps found in log file") + continue + + result = build_result_json( + steps=steps, + config_name=config_name, + job_id=job_id, + tp=int(row.get("tp", 8)), + pp=int(row.get("pp", 2)), + zero_stage=int(row.get("zero", 1)), + mbs=int(row.get("mbs", 1)), + gbs=int(row.get("gbs", 64)), + seq_length=int(row.get("seq_length", 2048)), + cluster=cluster, + instance_type=instance_type, + ) + + # Write individual JSON file + now = datetime.now(timezone.utc) + filename = ( + f"training_bench_deepspeed-gpt-103b_bf16_" + 
f"{now.strftime('%Y-%m-%d_%H%M')}_job{job_id}.json" + ) + filepath = os.path.join(output_dir, filename) + with open(filepath, "w") as jf: + json.dump(result, jf, indent=2) + print(f" Wrote: {filepath}") + + results.append(result) + + # Write combined summary + summary_path = os.path.join(output_dir, "sweep_summary.json") + with open(summary_path, "w") as f: + json.dump(results, f, indent=2) + print(f"\nWrote combined summary: {summary_path}") + + return results + + +def parse_single_log( + log_file, config_name, output_dir, cluster="unknown", instance_type="unknown" +): + """Parse a single log file.""" + os.makedirs(output_dir, exist_ok=True) + + # Extract job ID from filename + job_id_match = re.search(r"_(\d+)\.out", log_file) + job_id = job_id_match.group(1) if job_id_match else "unknown" + + print(f"Parsing {config_name} (job {job_id}): {log_file}") + steps = parse_log_file(log_file) + + if not steps: + print("Error: No steps found in log file") + sys.exit(1) + + result = build_result_json( + steps=steps, + config_name=config_name, + job_id=job_id, + cluster=cluster, + instance_type=instance_type, + ) + + now = datetime.now(timezone.utc) + filename = ( + f"training_bench_deepspeed-gpt-103b_bf16_" + f"{now.strftime('%Y-%m-%d_%H%M')}_job{job_id}.json" + ) + filepath = os.path.join(output_dir, filename) + with open(filepath, "w") as f: + json.dump(result, f, indent=2) + print(f"Wrote: {filepath}") + + return result + + +def main(): + parser = argparse.ArgumentParser( + description="Parse Megatron-DeepSpeed logs into benchmark JSON" + ) + parser.add_argument( + "--logs-dir", default="logs", help="Directory containing Slurm log files" + ) + parser.add_argument( + "--output-dir", default="sweep_results", help="Directory to write JSON results" + ) + parser.add_argument( + "--jobs-csv", + default="sweep_results/sweep_jobs.csv", + help="CSV file tracking sweep job IDs", + ) + parser.add_argument( + "--log-file", default=None, help="Parse a single log file instead of 
sweep CSV" + ) + parser.add_argument( + "--config-name", + default="single_run", + help="Config name for single log file parsing", + ) + parser.add_argument( + "--cluster", + default=os.environ.get("CLUSTER_NAME", "unknown"), + help="Cluster name for metadata (default: $CLUSTER_NAME or 'unknown')", + ) + parser.add_argument( + "--instance-type", + default=os.environ.get("INSTANCE_TYPE", "unknown"), + help="Instance type for metadata (default: $INSTANCE_TYPE or 'unknown')", + ) + + args = parser.parse_args() + + if args.log_file: + parse_single_log( + args.log_file, + args.config_name, + args.output_dir, + cluster=args.cluster, + instance_type=args.instance_type, + ) + else: + if not os.path.exists(args.jobs_csv): + print(f"Error: Jobs CSV not found: {args.jobs_csv}") + print( + "Run sweep_runner.sh first, or use --log-file for single file parsing" + ) + sys.exit(1) + parse_sweep_jobs( + args.jobs_csv, + args.logs_dir, + args.output_dir, + cluster=args.cluster, + instance_type=args.instance_type, + ) + + +if __name__ == "__main__": + main() diff --git a/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch b/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch new file mode 100755 index 000000000..a68585528 --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch @@ -0,0 +1,282 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#SBATCH --exclusive +#SBATCH --job-name=deepspeed-pretrain-103b +#SBATCH --output=logs/%x_%j.out +#SBATCH --error=logs/%x_%j.err + +set -euxo pipefail + +# ============================================================ +# Environment defaults +# ============================================================ +: "${APPS_PATH:=/fsx/apps}" +: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}" +: "${FSX_PATH:=/fsx}" +: "${DATA_DIR:=$FSX_PATH/deepspeed/data}" +: "${MEGATRON_DS_PATH:=$FSX_PATH/deepspeed/Megatron-DeepSpeed}" + +# ============================================================ +# Parallelism config (overridable via env vars from sweep_runner.sh) +# ============================================================ +: "${TP:=8}" +: "${PP:=2}" +: "${ZERO_STAGE:=1}" +: "${MICRO_BATCH_SIZE:=1}" +: "${GLOBAL_BATCH_SIZE:=64}" +: "${TRAIN_ITERS:=50}" + +# ============================================================ +# ~103B GPT model architecture +# Layers=80, Hidden=12288, Heads=96, FFN=49152 +# Estimated parameters: ~103B +# ============================================================ +: "${NUM_LAYERS:=80}" +: "${HIDDEN_SIZE:=12288}" +: "${NUM_HEADS:=96}" +: "${FFN_HIDDEN_SIZE:=49152}" +: "${SEQ_LENGTH:=2048}" + +# ============================================================ +# Optional features (set to 1 to enable) +# ============================================================ +: "${USE_ACTIVATION_CHECKPOINTING:=0}" +: "${USE_SEQUENCE_PARALLEL:=0}" +: "${USE_OVERLAP_COMM:=0}" +: "${ENABLE_FUSIONS:=0}" +: "${CONFIG_NAME:=baseline}" + +# ============================================================ +# PyTorch memory allocator optimisation +# ============================================================ +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +# ============================================================ +# Cluster topology +# ============================================================ +export NODES=( $( scontrol show hostnames 
"$SLURM_JOB_NODELIST" ) ) +export NODES_ARRAY=($NODES) +export HEAD_NODE=${NODES_ARRAY[0]} +export MASTER_ADDR=$(hostname --ip-address) +export MASTER_PORT=$((RANDOM + 10000)) +export NNODES=$SLURM_JOB_NUM_NODES +export NUM_GPUS_PER_NODE=8 + +# ============================================================ +# Network settings for EFA + NCCL +# ============================================================ +export FI_LOG_LEVEL=1 +export FI_PROVIDER=efa +export FI_EFA_USE_HUGE_PAGE=0 +export NCCL_SOCKET_IFNAME=^docker,lo,veth +export NCCL_P2P_NET_CHUNKSIZE=2097152 +export NCCL_BUFFERSIZE=8388608 +export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so +export NCCL_ASYNC_ERROR_HANDLING=1 +export OMPI_MCA_plm=^slurm + +# ============================================================ +# Generate DeepSpeed config dynamically +# ============================================================ +mkdir -p configs logs + +PRESCALE_GRAD="false" +if [ "${ZERO_STAGE}" -eq 0 ]; then + PRESCALE_GRAD="true" +fi + +OVERLAP_COMM_BOOL="false" +if [ "${USE_OVERLAP_COMM}" -eq 1 ]; then + OVERLAP_COMM_BOOL="true" +fi + +# Build ZeRO optimisation block depending on stage +if [ "${ZERO_STAGE}" -eq 3 ]; then + ZERO_BLOCK=$(cat < configs/ds_config_run.json +{ + "train_batch_size": ${GLOBAL_BATCH_SIZE}, + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 10, +${ZERO_BLOCK}, + "gradient_clipping": 1.0, + "prescale_gradients": ${PRESCALE_GRAD}, + "bf16": { + "enabled": true + }, + "wall_clock_breakdown": false +} +EOF + +# ============================================================ +# Hostfile for DeepSpeed +# ============================================================ +export HOSTFILE="/fsx/hostfile_${SLURM_JOB_ID}" +function makehostfile() { +perl -e '$slots=split /,/, $ENV{"SLURM_STEP_GPUS"}; +$slots=8 if $slots==0; +@nodes = split /\n/, qx[scontrol show hostnames $ENV{"SLURM_JOB_NODELIST"}]; +print map { "$b$_ slots=$slots\n" } @nodes' +} +makehostfile > 
"${HOSTFILE}" + +# ============================================================ +# Container + distributed launch args +# ============================================================ +declare -a SRUN_ARGS=( + --container-image "${IMAGE}" + --container-mounts /fsx,/opt/slurm/bin +) + +declare -a DIST_ARGS=( + --nnodes ${NNODES} + --nproc-per-node ${NUM_GPUS_PER_NODE} + --master_addr ${MASTER_ADDR} + --master_port ${MASTER_PORT} + --rdzv_id $RANDOM + --rdzv_backend c10d + --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} +) + +# ============================================================ +# Model + training args +# ============================================================ +declare -a MODEL_ARGS=( + --num-layers ${NUM_LAYERS} + --hidden-size ${HIDDEN_SIZE} + --num-attention-heads ${NUM_HEADS} + --ffn-hidden-size ${FFN_HIDDEN_SIZE} + --seq-length ${SEQ_LENGTH} + --max-position-embeddings ${SEQ_LENGTH} + --micro-batch-size ${MICRO_BATCH_SIZE} + --global-batch-size ${GLOBAL_BATCH_SIZE} + --train-iters ${TRAIN_ITERS} + --lr 1.0e-4 + --min-lr 1.0e-6 + --lr-decay-style cosine + --lr-warmup-iters 5 + --lr-decay-iters 50 + --weight-decay 0.1 + --clip-grad 1.0 + --adam-beta1 0.9 + --adam-beta2 0.95 + --init-method-std 0.006 + --log-interval 1 + --eval-iters 0 + --eval-interval 1000 + --bf16 + --data-path ${DATA_DIR}/BookCorpusDataset_text_document + --vocab-file ${DATA_DIR}/gpt2-vocab.json + --merge-file ${DATA_DIR}/gpt2-merges.txt + --split 100,0,0 + --data-impl mmap + --num-workers 0 +) + +# By default disable fusions (matches sweep v1 behaviour). +# Set ENABLE_FUSIONS=1 to enable them. 
+if [ "${ENABLE_FUSIONS}" -eq 0 ]; then + MODEL_ARGS+=( + --no-masked-softmax-fusion + --no-bias-gelu-fusion + --no-bias-dropout-fusion + --no-gradient-accumulation-fusion + ) +fi + +declare -a DS_ARGS=( + --tensor-model-parallel-size ${TP} + --zero-stage ${ZERO_STAGE} + --deepspeed_config ${PWD}/configs/ds_config_run.json + --deepspeed + --distributed-backend nccl +) + +# Megatron-DeepSpeed wraps the model in PipelineModule by default. +# DeepSpeed's PipelineEngine asserts that ZeRO stage < 2. +# When using ZeRO-2 or ZeRO-3 we must disable pipeline parallel entirely +# via --no-pipeline-parallel (which sets ds_pipeline_enabled=False). +if [ "${ZERO_STAGE}" -ge 2 ]; then + DS_ARGS+=( + --pipeline-model-parallel-size 1 + --no-pipeline-parallel + ) +else + DS_ARGS+=(--pipeline-model-parallel-size ${PP}) +fi + +# ============================================================ +# Optional features +# ============================================================ +if [ "${USE_ACTIVATION_CHECKPOINTING}" -eq 1 ]; then + DS_ARGS+=( + --checkpoint-activations + --deepspeed-activation-checkpointing + ) +fi + +if [ "${USE_SEQUENCE_PARALLEL}" -eq 1 ]; then + DS_ARGS+=(--sequence-parallel) +fi + +# ============================================================ +# Launch training +# Note: Using python3 -m torch.distributed.run instead of torchrun +# because the container's Python version may differ from the host +# ============================================================ +echo "=== DeepSpeed 103B GPT Pretraining ===" +echo "Config: ${CONFIG_NAME}" +echo "Nodes: ${NNODES}, GPUs/node: ${NUM_GPUS_PER_NODE}, Total GPUs: $((NNODES * NUM_GPUS_PER_NODE))" +echo "TP=${TP}, PP=${PP}, ZeRO=${ZERO_STAGE}" +echo "MBS=${MICRO_BATCH_SIZE}, GBS=${GLOBAL_BATCH_SIZE}" +echo "Model: layers=${NUM_LAYERS}, hidden=${HIDDEN_SIZE}, heads=${NUM_HEADS}, ffn=${FFN_HIDDEN_SIZE}" +echo "Seq length: ${SEQ_LENGTH}, Fusions: ${ENABLE_FUSIONS}" +echo "Activation ckpt: ${USE_ACTIVATION_CHECKPOINTING}, Seq 
parallel: ${USE_SEQUENCE_PARALLEL}" +echo "PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-}" +echo "=======================================" + +# Convert arrays to strings for bash -c invocation +DIST_ARGS_STR="${DIST_ARGS[*]}" +MODEL_ARGS_STR="${MODEL_ARGS[*]}" +DS_ARGS_STR="${DS_ARGS[*]}" + +# Note: Variables inside the bash -c string are expanded on the host side before +# being passed to the container. This is intentional — the container does not have +# access to these env vars at shell expansion time. +srun -l "${SRUN_ARGS[@]}" bash -c "export PYTHONPATH=${MEGATRON_DS_PATH} && cd ${MEGATRON_DS_PATH} && python3 -m torch.distributed.run ${DIST_ARGS_STR} pretrain_gpt.py ${MODEL_ARGS_STR} ${DS_ARGS_STR}" + +# Cleanup hostfile +rm -f "${HOSTFILE}" diff --git a/3.test_cases/pytorch/deepspeed/qlora/Dockerfile b/3.test_cases/pytorch/deepspeed/qlora/Dockerfile index 27f9bcd8c..32f02d35e 100644 --- a/3.test_cases/pytorch/deepspeed/qlora/Dockerfile +++ b/3.test_cases/pytorch/deepspeed/qlora/Dockerfile @@ -1,61 +1,154 @@ # Dockerfile for QLoRA Fine-tuning of Qwen3-8B # ============================================= -# Base Image: NVIDIA CUDA 12.8 with cuDNN 9 -# Python: 3.10 +# Base Image: PyTorch 25.04 with CUDA 12.9.0 (supports Blackwell, Hopper, Ampere) +# Python: 3.12 (bundled with pytorch:25.04-py3) # Key Libraries: PyTorch, Transformers, PEFT, BitsAndBytes, DeepSpeed +# Networking: EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1 # # Build: # docker build -t qwen3-qlora-training:latest . # # If you encounter CUBLAS errors at runtime (typically caused by CUDA -# library conflicts on the host), switch the torch index URL below to -# cu126 as a fallback — see docs/TROUBLESHOOTING.md. +# library conflicts on the host), see docs/TROUBLESHOOTING.md. 
-# Stage 1: Base image with CUDA -FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 AS base +# ============================================================ +# Base image: PyTorch 25.04 with CUDA 12.9.0 +# ============================================================ +FROM nvcr.io/nvidia/pytorch:25.04-py3 -# Prevent interactive prompts during build ENV DEBIAN_FRONTEND=noninteractive -# Install system dependencies -RUN apt-get update && apt-get install -y \ - python3.10 \ - python3.10-dev \ - python3-pip \ - python3.10-venv \ +# ============================================================ +# 1. System packages and SSH setup (needed for multi-node training) +# ============================================================ +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + && rm -rf /opt/hpcx/ompi \ + && rm -rf /usr/local/ucx \ + && ldconfig + +RUN apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + build-essential \ + cmake \ + curl \ + gcc \ + gdb \ git \ git-lfs \ + gnupg \ + kmod \ + libtool \ + openssh-client \ + openssh-server \ wget \ - curl \ && rm -rf /var/lib/apt/lists/* -# Set Python 3.10 as default -RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \ - ln -sf /usr/bin/pip3 /usr/bin/pip +# SSH configuration for multi-node +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config -# Upgrade pip -RUN pip install --upgrade pip setuptools wheel +# ============================================================ +# 2. 
Install EFA Installer 1.47.0 +# ============================================================ +ENV EFA_INSTALLER_VERSION=1.47.0 +WORKDIR /tmp +RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && cd / && rm -rf /tmp/aws-efa-installer -# Set working directory -WORKDIR /app +# ============================================================ +# 3. NCCL plugin symlinks +# EFA installer names the plugin libnccl-net-ofi.so but NCCL +# looks for libnccl-net-aws-ofi.so. Without this symlink NCCL +# falls back to TCP sockets silently. +# ============================================================ +RUN rm -rf /opt/amazon/aws-ofi-nccl + +RUN ln -sf /opt/amazon/ofi-nccl/lib/libnccl-net-ofi.so \ + /opt/amazon/ofi-nccl/lib/libnccl-net-aws-ofi.so && \ + ln -sf /opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \ + /opt/amazon/ofi-nccl/lib/libnccl-tuner-aws-ofi.so + +# ============================================================ +# 4. Upgrade NCCL to 2.29.3 (requires CUDA >= 12.9) +# ============================================================ +ENV NCCL_VERSION=2.29.3-1 +RUN apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget -qO /tmp/cuda-keyring.deb \ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i /tmp/cuda-keyring.deb && \ + rm /tmp/cuda-keyring.deb && \ + apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages \ + libnccl2=${NCCL_VERSION}+cuda12.9 \ + libnccl-dev=${NCCL_VERSION}+cuda12.9 && \ + rm -rf /var/lib/apt/lists/* + +# ============================================================ +# 5. 
Install GDRCopy v2.5.1 (lib-only) +# ============================================================ +RUN cd /tmp && \ + git clone --branch v2.5.1 --depth 1 https://github.com/NVIDIA/gdrcopy.git && \ + cd gdrcopy && \ + make -j$(nproc) lib lib_install && \ + cd / && rm -rf /tmp/gdrcopy + +# ============================================================ +# 6. Library path configuration +# ============================================================ +RUN echo "/opt/amazon/ofi-nccl/lib" > /etc/ld.so.conf.d/aws-ofi-nccl.conf && \ + echo "/opt/amazon/efa/lib" > /etc/ld.so.conf.d/efa.conf -# Stage 2: Install Python dependencies -FROM base AS dependencies +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/environment 2>/dev/null || true +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/shinit_v2 2>/dev/null || true + +RUN rm -f /etc/ld.so.cache && ldconfig + +ENV LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}" +ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}" + +# ============================================================ +# 7. OpenMPI tuning for EFA +# ============================================================ +ARG OPEN_MPI_PATH=/opt/amazon/openmpi +RUN echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf + +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun + +# ============================================================ +# 8. 
Python dependencies for QLoRA training +# ============================================================ +WORKDIR /app # Copy requirements first for better caching COPY requirements.txt . -# Install PyTorch with CUDA 12.8 support +# Install PyTorch with CUDA 12.9 support # Note: torch 2.10+ has a breaking LR scheduler change (strict zip) that is # incompatible with some DeepSpeed/transformers versions. Pin to <2.10 until # upstream libraries catch up. -RUN pip install --no-cache-dir 'torch>=2.7.0,<2.10.0' --index-url https://download.pytorch.org/whl/cu128 +RUN pip install --no-cache-dir 'torch>=2.7.0,<2.10.0' --index-url https://download.pytorch.org/whl/cu129 # Install other dependencies RUN pip install --no-cache-dir -r requirements.txt -# Stage 3: Final image with application code -FROM dependencies AS final +# ============================================================ +# 9. Application code +# ============================================================ # Copy source code and entrypoint COPY entrypoint.sh /app/entrypoint.sh @@ -66,14 +159,23 @@ COPY configs/ /app/configs/ # Create directories for outputs and cache RUN mkdir -p /workspace/outputs /workspace/hf_cache -# Set environment variables +# ============================================================ +# 10. 
Environment variables +# ============================================================ ENV PYTHONPATH=/app ENV HF_HOME=/workspace/hf_cache ENV PYTHONUNBUFFERED=1 -# Do NOT set CUDA_VISIBLE_DEVICES here — let torchrun / K8s manage GPU visibility -# DeepSpeed / NCCL settings for multi-GPU communication +# Do NOT set CUDA_VISIBLE_DEVICES here -- let torchrun / K8s manage GPU visibility + +# NCCL / EFA settings for multi-GPU and multi-node communication ENV NCCL_DEBUG=INFO -ENV NCCL_SOCKET_IFNAME=^lo +ENV NCCL_SOCKET_IFNAME=^docker,lo,veth +ENV FI_PROVIDER=efa +ENV NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so + +# PyTorch memory allocator -- expandable segments reduces fragmentation +# Note: capital T is required in pytorch:25.04 containers +ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # Entrypoint reads PET_* env vars set by the Kubeflow Training Operator # and launches torchrun with the correct number of processes per node. diff --git a/3.test_cases/pytorch/deepspeed/qlora/requirements.txt b/3.test_cases/pytorch/deepspeed/qlora/requirements.txt index 893e443a7..9bdf0eb7a 100644 --- a/3.test_cases/pytorch/deepspeed/qlora/requirements.txt +++ b/3.test_cases/pytorch/deepspeed/qlora/requirements.txt @@ -1,6 +1,6 @@ # Core ML Libraries # NOTE: torch is installed separately with the correct CUDA index URL. -# - Docker (see Dockerfile): torch>=2.7.0,<2.10.0 with cu128 +# - Docker (see Dockerfile): torch>=2.7.0,<2.10.0 with cu129 (pytorch:25.04-py3 base) # - Slurm venv (see slurm/README): torch==2.6.0 with cu126 # See docs/TROUBLESHOOTING.md if you encounter CUBLAS errors (typically caused # by environment-level CUDA library conflicts, not a library bug).