diff --git a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile index 472edc476..1874ca3d5 100644 --- a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile +++ b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile @@ -1,19 +1,20 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: MIT-0 -FROM nvcr.io/nvidia/pytorch:25.03-py3 +# ============================================================ +# Base image: PyTorch 25.04 with CUDA 12.9.0 (required for NCCL 2.29.x) +# Supports Blackwell (sm_100), Hopper, Ampere architectures +# ============================================================ +FROM nvcr.io/nvidia/pytorch:25.04-py3 -ARG GDRCOPY_VERSION=v2.4.1 -ARG EFA_INSTALLER_VERSION=1.37.0 -ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws ARG TRANSFORMERS_VERSION=4.44.2 -ARG MEGATRON_LM_VERSION=core_r0.8.0 - ARG OPEN_MPI_PATH=/opt/amazon/openmpi -###################### -# Update and remove the IB libverbs -###################### +ENV DEBIAN_FRONTEND=noninteractive + +# ============================================================ +# 1. 
System packages and SSH setup (needed for multi-node training) +# ============================================================ RUN apt-get update -y && apt-get upgrade -y RUN apt-get remove -y --allow-change-held-packages \ ibverbs-utils \ @@ -26,8 +27,7 @@ RUN rm -rf /opt/hpcx/ompi \ && rm -rf /usr/local/ucx \ && ldconfig -RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \ - apt-utils \ +RUN apt-get install -y --no-install-recommends \ autoconf \ automake \ build-essential \ @@ -36,6 +36,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \ gcc \ gdb \ git \ + gnupg \ kmod \ libtool \ openssh-client \ @@ -55,69 +56,99 @@ RUN rm -rf /root/.ssh/ \ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config -ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH -ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH - -################################################# -## Install NVIDIA GDRCopy -## -## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure -## that the cuda-compat-xx-x package is the latest. 
-RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ - && cd /tmp/gdrcopy \ - && make prefix=/opt/gdrcopy install - -ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH -ENV CPATH /opt/gdrcopy/include:$CPATH -ENV PATH /opt/gdrcopy/bin:$PATH - -################################################# -## Install EFA installer -RUN cd $HOME \ - && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ - && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ +# ============================================================ +# 2. Install EFA Installer 1.47.0 +# This bundles libfabric, rdma-core, and pre-built aws-ofi-nccl +# No source build of aws-ofi-nccl needed (unlike EFA < 1.40) +# ============================================================ +ENV EFA_INSTALLER_VERSION=1.47.0 +WORKDIR /tmp +RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xz \ && cd aws-efa-installer \ && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ - && rm -rf $HOME/aws-efa-installer - - -################################################### -## Install AWS-OFI-NCCL plugin -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev -#Switch from sh to bash to allow parameter expansion -SHELL ["/bin/bash", "-c"] -RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ - && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ - && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ - && ./configure --prefix=/opt/aws-ofi-nccl/install \ - --with-mpi=/opt/amazon/openmpi \ - --with-libfabric=/opt/amazon/efa \ - --with-cuda=/usr/local/cuda \ - --enable-platform-aws \ - && make -j $(nproc) \ - && make install \ - && cd .. 
\ - && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ - && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz - -SHELL ["/bin/sh", "-c"] - -################################################### -RUN rm -rf /var/lib/apt/lists/* - -RUN echo "hwloc_base_binding_policy = none" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf \ - && echo "rmaps_base_mapping_policy = slot" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf + && cd / && rm -rf /tmp/aws-efa-installer + +# ============================================================ +# 3. Remove old aws-ofi-nccl and create NCCL plugin symlinks +# NCCL_NET_PLUGIN=aws-ofi looks for libnccl-net-aws-ofi.so +# EFA installer names it libnccl-net-ofi.so +# Without this symlink NCCL falls back to TCP sockets silently +# ============================================================ +RUN rm -rf /opt/amazon/aws-ofi-nccl + +RUN ln -sf /opt/amazon/ofi-nccl/lib/libnccl-net-ofi.so \ + /opt/amazon/ofi-nccl/lib/libnccl-net-aws-ofi.so && \ + ln -sf /opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \ + /opt/amazon/ofi-nccl/lib/libnccl-tuner-aws-ofi.so + +# ============================================================ +# 4. 
Upgrade NCCL to 2.29.3 (matches B200 host version) +# Requires CUDA >= 12.9 (which pytorch:25.04-py3 provides) +# Must add NVIDIA CUDA apt repo first since base image may not have it +# ============================================================ +ENV NCCL_VERSION=2.29.3-1 +RUN apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget -qO /tmp/cuda-keyring.deb \ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i /tmp/cuda-keyring.deb && \ + rm /tmp/cuda-keyring.deb && \ + apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages \ + libnccl2=${NCCL_VERSION}+cuda12.9 \ + libnccl-dev=${NCCL_VERSION}+cuda12.9 && \ + rm -rf /var/lib/apt/lists/* + +# ============================================================ +# 5. Install GDRCopy v2.5.1 (lib-only, no binaries needed) +# ============================================================ +RUN cd /tmp && \ + git clone --branch v2.5.1 --depth 1 https://github.com/NVIDIA/gdrcopy.git && \ + cd gdrcopy && \ + make -j$(nproc) lib lib_install && \ + cd / && rm -rf /tmp/gdrcopy + +# ============================================================ +# 6. Fix library path references +# Use ld.so.conf.d for system-wide discovery (more robust +# than relying solely on LD_LIBRARY_PATH) +# ============================================================ +RUN echo "/opt/amazon/ofi-nccl/lib" > /etc/ld.so.conf.d/aws-ofi-nccl.conf && \ + echo "/opt/amazon/efa/lib" > /etc/ld.so.conf.d/efa.conf + +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/environment 2>/dev/null || true +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/shinit_v2 2>/dev/null || true + +# Rebuild ldconfig cache +RUN rm -f /etc/ld.so.cache && ldconfig + +# ============================================================ +# 7. 
Environment variables +# ============================================================ +ENV LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}" +ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}" +ENV FI_PROVIDER=efa + +# ============================================================ +# 8. OpenMPI tuning for EFA (needed for multi-node training) +# ============================================================ +RUN echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf + +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun + +# ============================================================ +# 9. 
Python packages for DeepSpeed training +# ============================================================ +RUN pip3 install --no-cache-dir \ + awscli pynvml \ + transformers==${TRANSFORMERS_VERSION} \ + sentencepiece python-etcd \ + "deepspeed>=0.16,<1.0" "accelerate>=1.0,<2.0" -RUN pip3 install awscli pynvml - -RUN mv $OPEN_MPI_PATH/bin/mpirun $OPEN_MPI_PATH/bin/mpirun.real \ - && echo '#!/bin/bash' > $OPEN_MPI_PATH/bin/mpirun \ - && echo '/opt/amazon/openmpi/bin/mpirun.real "$@"' >> $OPEN_MPI_PATH/bin/mpirun \ - && chmod a+x $OPEN_MPI_PATH/bin/mpirun - -###################### -# DeepSpeed dependencies -###################### -RUN pip install transformers==${TRANSFORMERS_VERSION} sentencepiece python-etcd deepspeed accelerate +RUN rm -rf /var/lib/apt/lists/* +WORKDIR /workspace diff --git a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch index ebd7b0b04..f999b926b 100644 --- a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch +++ b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MIT-0 #SBATCH -N 1 # number of nodes to use -#SBATCH --job-name=build-neox-image # name of your job +#SBATCH --job-name=build-deepspeed-image # name of your job #SBATCH --output=logs/%x_%j.out # logfile for stdout #SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs @@ -14,11 +14,15 @@ set -euxo pipefail : "${APPS_PATH:=/fsx/apps}" : "${IMAGE:=$APPS_PATH/deepspeed.sqsh}" +# Ensure output directory exists +mkdir -p "${APPS_PATH}" +mkdir -p logs + ENROOT_IMAGE=deepspeed -docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile . +docker build -t "${ENROOT_IMAGE}" -f 0.deepspeed.dockerfile . 
# Remove old sqsh file if exists -if [ -f ${ENROOT_IMAGE}.sqsh ] ; then - rm ${ENROOT_IMAGE}.sqsh +if [ -f "${ENROOT_IMAGE}.sqsh" ] ; then + rm "${ENROOT_IMAGE}.sqsh" fi -enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest -mv ${ENROOT_IMAGE}.sqsh ${IMAGE} \ No newline at end of file +enroot import -o "${ENROOT_IMAGE}.sqsh" "dockerd://${ENROOT_IMAGE}:latest" +mv "${ENROOT_IMAGE}.sqsh" "${IMAGE}" \ No newline at end of file diff --git a/3.test_cases/pytorch/deepspeed/Makefile b/3.test_cases/pytorch/deepspeed/Makefile index e4615c60f..88b3d2262 100644 --- a/3.test_cases/pytorch/deepspeed/Makefile +++ b/3.test_cases/pytorch/deepspeed/Makefile @@ -1,12 +1,52 @@ -ENROOT_IMAGE=deepspeed +ENROOT_IMAGE ?= deepspeed +APPS_PATH ?= /fsx/apps +SQUASH_FILE ?= $(APPS_PATH)/$(ENROOT_IMAGE).sqsh +PARTITION ?= dev +NODES ?= 8 +LOGS_DIR ?= logs +RESULTS_DIR ?= sweep_results -all: build clean import +.PHONY: all build clean import build-remote train parse help + +all: build import + +help: + @echo "Container targets:" + @echo " build - Build Docker image locally" + @echo " import - Convert Docker image to Enroot squash file" + @echo " build-remote - Build image on a compute node via sbatch" + @echo " clean - Remove local squash file" + @echo "" + @echo "Training targets:" + @echo " train - Submit 103B GPT pretraining (best config: TP=8, PP=8, fusions)" + @echo "" + @echo "Results targets:" + @echo " parse - Parse training logs into benchmark JSON" + +# ---- Container ---- build: - docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile . + docker build -t $(ENROOT_IMAGE) -f 0.deepspeed.dockerfile . 
+ +import: + mkdir -p $(APPS_PATH) + enroot import -o $(SQUASH_FILE) dockerd://$(ENROOT_IMAGE):latest + +build-remote: + sbatch 1.build-image.sbatch clean: - -rm ${ENROOT_IMAGE}.sqsh + -rm -f $(SQUASH_FILE) -import: - enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest +# ---- Training (best config: TP=8, PP=8, ZeRO=0, fusions enabled) ---- + +train: + sbatch --partition=$(PARTITION) --nodes=$(NODES) \ + --export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \ + gpt/slurm/pretrain_gpt_103b.sbatch + +# ---- Results ---- + +parse: + python3 gpt/parse_results.py --jobs-csv $(RESULTS_DIR)/sweep_jobs.csv \ + --logs-dir $(LOGS_DIR) --output-dir $(RESULTS_DIR) diff --git a/3.test_cases/pytorch/deepspeed/README.md b/3.test_cases/pytorch/deepspeed/README.md index fd2ef7524..82e34d46a 100644 --- a/3.test_cases/pytorch/deepspeed/README.md +++ b/3.test_cases/pytorch/deepspeed/README.md @@ -1,87 +1,179 @@ -# DeepSpeed Test Cases +# DeepSpeed on AWS -[DeepSpeed](https://github.com/microsoft/DeepSpeed) enables world's most powerful language models like MT-530B and BLOOM. It is an easy-to-use deep learning optimization software suite that powers unprecedented scale and speed for both training and inference. `deepspeed` illustrates several example test cases for DeepSpeed training on AWS. +[DeepSpeed](https://github.com/microsoft/DeepSpeed) is a deep learning optimization library that enables efficient distributed training at scale. This directory contains test cases for running DeepSpeed workloads on AWS GPU clusters, covering large-scale pretraining and parameter-efficient fine-tuning. -## 1. 
Preparation +## Use Cases -This guide assumes that you have the following: +| Use Case | Description | Location | +|----------|-------------|----------| +| GPT-103B Pretraining | Large-scale GPT pretraining benchmark using Megatron-DeepSpeed with 3D parallelism (TP/PP/DP) and ZeRO optimization | [`gpt/`](gpt/) | +| QLoRA Fine-tuning | Qwen3-8B fine-tuning with QLoRA (4-bit) + DeepSpeed ZeRO-2/3, supports EKS and Slurm | [`qlora/`](qlora/) | +| Llama2 Fine-tuning | Llama2 fine-tuning from HuggingFace weights using Megatron-DeepSpeed | [`examples_megatron_deepspeed/finetune_hf_llama/`](examples_megatron_deepspeed/finetune_hf_llama/) | -* A functional Slurm cluster on AWS. -* Docker, [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) installed. -* An FSx for Lustre filesystem mounted on `/fsx`. +## Prerequisites -We recommend that you set up a Slurm cluster using the templates in the architectures [directory](../../1.architectures). You need to set the following environment variables to run these test cases: +- A functional Slurm cluster on AWS. We recommend [SageMaker HyperPod](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html) or the templates in the [architectures directory](../../1.architectures). +- [Docker](https://docs.docker.com/engine/install/), [Pyxis](https://github.com/NVIDIA/pyxis), and [Enroot](https://github.com/NVIDIA/enroot) installed on compute nodes. +- An [FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html) filesystem mounted on `/fsx`. +- NVIDIA GPU instances with [EFA networking](https://aws.amazon.com/hpc/efa/) (B200, H100, A100, etc.). 
-```bash -export APPS_PATH=/fsx/apps -export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh -export FSX_PATH=/fsx -export MODEL_PATH=$FSX_PATH/deepspeed -export TEST_CASE_PATH=${HOME}/18.deepspeed # where you copy the test case or set to your test case path -cd $TEST_CASE_PATH # Note that we assume that you are here during the following command executions -``` +## 1. GPT-103B Pretraining Benchmark + +A ~103B-parameter GPT model (80 layers, hidden=12288, heads=96, FFN=49152) trained with [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) using 3D parallelism (tensor, pipeline, data) and DeepSpeed ZeRO optimization. Designed for benchmarking multi-node GPU clusters. +### Container setup +The container image (`0.deepspeed.dockerfile`) is built on `nvcr.io/nvidia/pytorch:25.04-py3` and includes: -## 2. Build the container +- **EFA 1.47.0** with the bundled aws-ofi-nccl plugin and NCCL tuner +- **NCCL 2.29.3** (upgraded to match B200 host driver) +- **GDRCopy v2.5.1** for GPU-direct RDMA +- **DeepSpeed**, **Transformers 4.44.2**, and multi-node SSH configuration -Before running training jobs, you need to use a build docker container image. [Enroot](https://github.com/NVIDIA/enroot) will be used to turn the image into unprivileged sandbox for Slurm but build step may exceed the storage available on the head node so we reccomend building it on a compute node following instructions below (option 2) +Build the container on a compute node (recommended, avoids head node storage limits): -### Option 1: build image on a head node +```bash +sbatch 1.build-image.sbatch +``` + +Or build locally and convert to a squash file: + +```bash +make build # docker build +make import # enroot import to /fsx/apps/deepspeed.sqsh +``` -Below are the steps you need to follow: +### Data preparation +The benchmark uses preprocessed data in Megatron format with the GPT-2 tokenizer. -1. Build the Docker image with the command below in this directory. +1. 
Download the GPT-2 tokenizer: ```bash - docker build -t deepspeed -f 0.deepspeed.dockerfile . + mkdir -p /fsx/deepspeed/data && cd /fsx/deepspeed/data + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt ``` - -2. Once the Docker image is built, you can check if it is present with `docker images`. You should see an output similar to this one: +2. Prepare training data (any text corpus works; for benchmarking, synthetic data is sufficient): ```bash - REPOSITORY TAG IMAGE ID CREATED SIZE - deepspeed latest b6c49033c424 9 minutes ago 23.3GB - ... + python3 -c " + import json + with open('synthetic_corpus.json', 'w') as f: + for i in range(50000): + json.dump({'text': 'The quick brown fox ' * 100}, f) + f.write('\n') + " ``` -3. Convert the Docker image to a squash file with the command below. +3. Clone Megatron-DeepSpeed and preprocess: ```bash - enroot import -o ${ENROOT_IMAGE} dockerd://deepspeed:latest + git clone https://github.com/microsoft/Megatron-DeepSpeed /fsx/deepspeed/Megatron-DeepSpeed + + python3 /fsx/deepspeed/Megatron-DeepSpeed/tools/preprocess_data.py \ + --input synthetic_corpus.json \ + --output-prefix BookCorpusDataset_text_document \ + --vocab-file gpt2-vocab.json \ + --merge-file gpt2-merges.txt \ + --tokenizer-type GPT2BPETokenizer \ + --workers 16 --append-eod ``` - The file will be stored in the `/apps` directory (by default). The output should look as below. 
+### Running + +Submit the best-performing configuration (TP=8, PP=8, ZeRO-0, fusions enabled): + +```bash +make train +# or equivalently: +sbatch --partition=dev --nodes=8 \ + --export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \ + gpt/slurm/pretrain_gpt_103b.sbatch +``` + +Override parallelism settings for custom configurations: + +```bash +sbatch --nodes=8 \ + --export=ALL,TP=8,PP=4,ZERO_STAGE=1,MICRO_BATCH_SIZE=2,CONFIG_NAME=my_config \ + gpt/slurm/pretrain_gpt_103b.sbatch +``` + +#### Environment variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `TP` | 8 | Tensor parallel size | +| `PP` | 2 | Pipeline parallel size (best throughput with PP=8, see `make train`) | +| `ZERO_STAGE` | 1 | DeepSpeed ZeRO stage (0, 1, 2, or 3) | +| `MICRO_BATCH_SIZE` | 1 | Per-GPU micro batch size | +| `GLOBAL_BATCH_SIZE` | 64 | Global batch size | +| `SEQ_LENGTH` | 2048 | Sequence length | +| `ENABLE_FUSIONS` | 0 | Set to 1 to enable kernel fusion ops | +| `USE_ACTIVATION_CHECKPOINTING` | 0 | Set to 1 for activation checkpointing | +| `USE_OVERLAP_COMM` | 0 | Set to 1 to overlap communication with compute | +| `TRAIN_ITERS` | 50 | Number of training iterations | +| `CONFIG_NAME` | baseline | Label for this configuration | + +### Best practices + +The following recommendations are based on extensive parameter sweeps across parallelism strategies, ZeRO stages, NCCL flags, and memory optimizations: + +**Parallelism strategy:** + +- **Maximize pipeline parallelism** (PP) alongside tensor parallelism (TP) for best throughput. For an 8-node cluster with 8 GPUs per node, TP=8 with PP=8 is optimal. +- **Enable kernel fusion ops** (`ENABLE_FUSIONS=1`) for a significant throughput improvement over the non-fused baseline. This enables masked-softmax, bias-gelu, bias-dropout, and gradient-accumulation fusions. +- **ZeRO-0 outperforms ZeRO-1** when the data-parallel group size is small (e.g., DP=1 with TP=8/PP=8). 
ZeRO-1's allreduce overhead is not amortized. + +**ZeRO-2 and ZeRO-3:** + +- ZeRO-2 and ZeRO-3 are **incompatible with pipeline parallelism** in Megatron-DeepSpeed. The sbatch script automatically sets `PP=1` and adds `--no-pipeline-parallel` when `ZERO_STAGE >= 2`. +- ZeRO-3's parameter partitioning **enables lower TP values** that ZeRO-2 cannot fit in memory (e.g., TP=4 works with ZeRO-3 but OOMs with ZeRO-2). +- **Increasing micro-batch size** (e.g., `MICRO_BATCH_SIZE=2`) substantially improves throughput for ZeRO-2 and ZeRO-3 configurations. +- `overlap_comm` provides only marginal improvement (~2%) with ZeRO-3. - ```bash - [INFO] Fetching image +**NCCL and networking:** - 36a8c752c28a2db543d2a632a3fc1fcbd5789a6f3d45b9d3a24632420dedcfa8 +- NCCL environment flag variations (buffer sizes, chunk sizes, min channels) have **negligible impact** on throughput (~1% range). The defaults in the sbatch script are well-tuned. +- **Do not set `NCCL_ALGO=Tree`** on EFA-based clusters -- it causes hangs. Let the NCCL tuner plugin (`libnccl-ofi-tuner.so`) choose the algorithm automatically. +- **Do not set `NCCL_PROTO` or `FI_EFA_FORK_SAFE`** -- these are not needed and can cause issues. + +**Memory:** + +- `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` is set by default in the sbatch script. Note the **capital T** is required in pytorch:25.04 containers; lowercase `true` causes a `RuntimeError`. +- Sequence length 4096 exceeds available HBM even with TP=8/PP=2 on B200 (178GB per GPU). Use seq=2048 for this model size. + +### Parsing results + +After training completes, parse the Slurm logs into benchmark JSON using `gpt/parse_results.py`: + +```bash +# Single log file +python3 gpt/parse_results.py --log-file logs/deepspeed-pretrain-103b_123.out --config-name my_config + +# Multiple jobs tracked in a CSV +python3 gpt/parse_results.py --jobs-csv sweep_results/sweep_jobs.csv --output-dir sweep_results +``` - [INFO] Extracting image content... 
- [INFO] Creating squashfs filesystem... +### Known issues - Parallel mksquashfs: Using 32 processors - Creating 4.0 filesystem on /apps/deepspeed.sqsh, block size 131072. - [========================================================================================================================================================================================================================-] 291068/291068 100% +- **torchrun shebang**: The container's `torchrun` may have a shebang pointing to the wrong Python version. The sbatch script uses `python3 -m torch.distributed.run` as a workaround. +- **`expandable_segments` case sensitivity**: Must use `expandable_segments:True` (capital T) in pytorch:25.04-py3. Lowercase causes a `RuntimeError`. +- **NCCL Tree algorithm**: Incompatible with EFA topology -- causes hangs. Do not set `NCCL_ALGO=Tree`. +- **Sequence parallelism**: Incompatible with pipeline parallelism (PP>1) in this Megatron-DeepSpeed version. - Exportable Squashfs 4.0 filesystem, gzip compressed, data block size 131072 - uncompressed data, uncompressed metadata, uncompressed fragments, uncompressed xattrs - duplicates are not removed - ... - ``` +## 2. QLoRA Fine-tuning (Qwen3-8B) -Once done proceed to the next stage. +Fine-tune [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) using QLoRA (4-bit quantization + LoRA adapters) with DeepSpeed ZeRO-2 or ZeRO-3. Supports deployment on SageMaker HyperPod with both EKS and Slurm orchestrators, including MIG GPU partitioning and automatic checkpoint resume. -### Option 2: Build image on a compute node +The QLoRA use case has its own container (`qlora/Dockerfile`) optimized for the same infrastructure best practices (EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1). -In this option, you will use a compute node to build the image. Submit the job as: +See [`qlora/README.md`](qlora/README.md) for full instructions. - ```bash - sbatch 1.build-image.sbatch - ``` +## 3. 
Llama2 Fine-tuning (Megatron-DeepSpeed) +Fine-tune Llama2 from HuggingFace weights using Megatron-DeepSpeed. Includes weight conversion from HuggingFace to Megatron format and fine-tuning on the Stanford Alpaca dataset. Uses the shared container image (`0.deepspeed.dockerfile`). -Once the image is prepared, you can proceed to `examples_*` directory for various deepspeed test cases. \ No newline at end of file +See [`examples_megatron_deepspeed/finetune_hf_llama/README.md`](examples_megatron_deepspeed/finetune_hf_llama/README.md) for full instructions. diff --git a/3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json b/3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json new file mode 100644 index 000000000..6197eaf78 --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json @@ -0,0 +1,20 @@ +{ + "train_batch_size": 64, + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 10, + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true + }, + "wall_clock_breakdown": false +} diff --git a/3.test_cases/pytorch/deepspeed/gpt/parse_results.py b/3.test_cases/pytorch/deepspeed/gpt/parse_results.py new file mode 100755 index 000000000..8b2815396 --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/gpt/parse_results.py @@ -0,0 +1,423 @@ +#!/usr/bin/env python3 +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +""" +parse_results.py - Parse Megatron-DeepSpeed training logs into benchmark JSON. 
+ +Reads Slurm log files, extracts per-step metrics, and produces JSON files +matching the existing benchmark-results schema at: + s3:///benchmark-results// + +Usage: + python parse_results.py [--logs-dir logs] [--output-dir sweep_results] + python parse_results.py --log-file logs/sweep_01_baseline_123.out --config-name 01_baseline +""" + +import argparse +import csv +import json +import os +import re +import statistics +import sys +from datetime import datetime, timezone + + +# ============================================================ +# Megatron-DeepSpeed log line patterns +# ============================================================ +# Example: " iteration 10/ 50 | consumed samples: ..." +# Example: "elapsed time per iteration (ms): 4725.7 | ..." +# Example: "lm loss: 1.3389E+01 | ..." +# Example: "learning rate: 3.000E-05 | ..." +# Example: "global batch size: 128 | ..." +# Example: "loss scale: 1.0 | ..." +# Example: "grad norm: 74.776 | ..." +# Example: "TFLOPs: 125.4 | ..." + +ITER_PATTERN = re.compile(r"iteration\s+(\d+)/\s*(\d+)") +ELAPSED_PATTERN = re.compile(r"elapsed time per iteration \(ms\):\s*([\d.]+)") +LOSS_PATTERN = re.compile(r"lm loss:\s*([\d.eE+\-]+)") +LR_PATTERN = re.compile(r"learning rate:\s*([\d.eE+\-]+)") +GBS_PATTERN = re.compile(r"global batch size:\s*(\d+)") +LOSS_SCALE_PATTERN = re.compile(r"loss scale:\s*([\d.eE+\-]+)") +GRAD_NORM_PATTERN = re.compile(r"grad norm:\s*([\d.eE+\-]+)") +TFLOPS_PATTERN = re.compile(r"TFLOPs:\s*([\d.]+)") + + +def parse_log_file(log_path): + """Parse a single Megatron-DeepSpeed log file and extract per-step metrics.""" + steps = [] + current_step = {} + + with open(log_path, "r") as f: + for line in f: + # Check for iteration marker + m = ITER_PATTERN.search(line) + if m: + if current_step: + steps.append(current_step) + current_step = { + "step": int(m.group(1)), + "total_steps": int(m.group(2)), + } + + if not current_step: + continue + + # Extract metrics from the same log block + m = 
ELAPSED_PATTERN.search(line) + if m: + elapsed_ms = float(m.group(1)) + current_step["elapsed_ms"] = elapsed_ms + current_step["step_time_s"] = round(elapsed_ms / 1000.0, 2) + + m = LOSS_PATTERN.search(line) + if m: + current_step["lm_loss"] = float(m.group(1)) + + m = LR_PATTERN.search(line) + if m: + current_step["learning_rate"] = float(m.group(1)) + + m = GBS_PATTERN.search(line) + if m: + current_step["global_batch_size"] = int(m.group(1)) + + m = LOSS_SCALE_PATTERN.search(line) + if m: + current_step["loss_scale"] = float(m.group(1)) + + m = GRAD_NORM_PATTERN.search(line) + if m: + current_step["grad_norm"] = float(m.group(1)) + + m = TFLOPS_PATTERN.search(line) + if m: + current_step["tflops_per_gpu"] = float(m.group(1)) + + # Don't forget the last step + if current_step: + steps.append(current_step) + + return steps + + +def compute_tflops_from_step_time( + step_time_s, + global_batch_size, + seq_length=2048, + hidden_size=12288, + num_layers=80, + num_heads=96, + total_gpus=64, +): + """ + Estimate TFLOPS/GPU for a GPT model using the standard formula: + FLOPs per iteration = 8 * seq * hidden^2 * layers * (1 + seq/(6*hidden) + vocab/(12*hidden*layers)) + Simplified: ~= 8 * B * s * h^2 * L * (1 + s/(6h)) + where B = global_batch_size + """ + vocab_size = 50257 # GPT-2 vocab + s = seq_length + h = hidden_size + L = num_layers + B = global_batch_size + + # Standard approximation for GPT FLOP count + flops_per_iter = ( + 8 * B * s * h * h * L * (1 + s / (6 * h) + vocab_size / (12 * h * L)) + ) + tflops_per_gpu = flops_per_iter / (step_time_s * total_gpus * 1e12) + return round(tflops_per_gpu, 1) + + +def build_result_json( + steps, + config_name, + job_id, + nodes=8, + gpus_per_node=8, + tp=8, + pp=2, + zero_stage=1, + mbs=1, + gbs=64, + seq_length=2048, + precision="bf16", + cluster="unknown", + instance_type="unknown", +): + """Build the benchmark JSON matching the existing schema.""" + total_gpus = nodes * gpus_per_node + warmup_steps = 5 + total_steps = 
len(steps) + + # Ensure TFLOPS values exist (compute if not in logs) + for step in steps: + if "tflops_per_gpu" not in step and "step_time_s" in step: + step["tflops_per_gpu"] = compute_tflops_from_step_time( + step["step_time_s"], + step.get("global_batch_size", gbs), + seq_length=seq_length, + total_gpus=total_gpus, + ) + + # Steady-state metrics (skip warmup) + steady_steps = [s for s in steps if s.get("step", 0) > warmup_steps] + + if not steady_steps: + print(f"Warning: No steady-state steps found for {config_name}") + steady_steps = steps + + steady_tflops = [s["tflops_per_gpu"] for s in steady_steps if "tflops_per_gpu" in s] + steady_times = [s["step_time_s"] for s in steady_steps if "step_time_s" in s] + + summary = { + "total_steps": total_steps, + "warmup_steps": warmup_steps, + "steady_state_steps": len(steady_steps), + } + + if steady_tflops: + summary.update( + { + "steady_state_avg_tflops_per_gpu": round( + statistics.mean(steady_tflops), 2 + ), + "steady_state_median_tflops_per_gpu": round( + statistics.median(steady_tflops), 1 + ), + "steady_state_min_tflops_per_gpu": round(min(steady_tflops), 1), + "steady_state_max_tflops_per_gpu": round(max(steady_tflops), 1), + "steady_state_stdev_tflops_per_gpu": round( + statistics.stdev(steady_tflops), 2 + ) + if len(steady_tflops) > 1 + else 0.0, + "peak_tflops_per_gpu": round(max(steady_tflops), 1), + } + ) + + if steady_times: + summary.update( + { + "steady_state_avg_step_time_s": round(statistics.mean(steady_times), 4), + "steady_state_median_step_time_s": round( + statistics.median(steady_times), 2 + ), + "steady_state_min_step_time_s": round(min(steady_times), 2), + "steady_state_max_step_time_s": round(max(steady_times), 2), + } + ) + + if steps: + summary["final_loss"] = steps[-1].get("lm_loss", None) + summary["initial_loss"] = steps[0].get("lm_loss", None) + + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + result = { + "metadata": { + "timestamp": timestamp, + "job_id": 
str(job_id), + "cluster": cluster, + "instance_type": instance_type, + "nodes": nodes, + "gpus_per_node": gpus_per_node, + "total_gpus": total_gpus, + "model": "deepspeed-gpt-103b", + "precision": precision, + "framework": "megatron-deepspeed", + "sweep_config": { + "config_name": config_name, + "tp": tp, + "pp": pp, + "zero_stage": zero_stage, + "micro_batch_size": mbs, + "global_batch_size": gbs, + "seq_length": seq_length, + }, + }, + "summary": summary, + "steps": steps, + } + + return result + + +def parse_sweep_jobs( + jobs_csv, logs_dir, output_dir, cluster="unknown", instance_type="unknown" +): + """Parse all jobs from the sweep tracking CSV.""" + os.makedirs(output_dir, exist_ok=True) + results = [] + + with open(jobs_csv, "r") as f: + reader = csv.DictReader(f) + for row in reader: + job_id = row["job_id"] + config_name = row["config_name"] + + # Find the log file for this job + log_pattern = f"sweep_{config_name}_{job_id}.out" + log_path = os.path.join(logs_dir, log_pattern) + + if not os.path.exists(log_path): + # Try alternate pattern + log_candidates = [ + f + for f in os.listdir(logs_dir) + if job_id in f and f.endswith(".out") + ] + if log_candidates: + log_path = os.path.join(logs_dir, log_candidates[0]) + else: + print( + f"Warning: No log file found for job {job_id} ({config_name})" + ) + continue + + print(f"Parsing {config_name} (job {job_id}): {log_path}") + steps = parse_log_file(log_path) + + if not steps: + print(f" Warning: No steps found in log file") + continue + + result = build_result_json( + steps=steps, + config_name=config_name, + job_id=job_id, + tp=int(row.get("tp", 8)), + pp=int(row.get("pp", 2)), + zero_stage=int(row.get("zero", 1)), + mbs=int(row.get("mbs", 1)), + gbs=int(row.get("gbs", 64)), + seq_length=int(row.get("seq_length", 2048)), + cluster=cluster, + instance_type=instance_type, + ) + + # Write individual JSON file + now = datetime.now(timezone.utc) + filename = ( + f"training_bench_deepspeed-gpt-103b_bf16_" + 
f"{now.strftime('%Y-%m-%d_%H%M')}_job{job_id}.json" + ) + filepath = os.path.join(output_dir, filename) + with open(filepath, "w") as jf: + json.dump(result, jf, indent=2) + print(f" Wrote: {filepath}") + + results.append(result) + + # Write combined summary + summary_path = os.path.join(output_dir, "sweep_summary.json") + with open(summary_path, "w") as f: + json.dump(results, f, indent=2) + print(f"\nWrote combined summary: {summary_path}") + + return results + + +def parse_single_log( + log_file, config_name, output_dir, cluster="unknown", instance_type="unknown" +): + """Parse a single log file.""" + os.makedirs(output_dir, exist_ok=True) + + # Extract job ID from filename + job_id_match = re.search(r"_(\d+)\.out", log_file) + job_id = job_id_match.group(1) if job_id_match else "unknown" + + print(f"Parsing {config_name} (job {job_id}): {log_file}") + steps = parse_log_file(log_file) + + if not steps: + print("Error: No steps found in log file") + sys.exit(1) + + result = build_result_json( + steps=steps, + config_name=config_name, + job_id=job_id, + cluster=cluster, + instance_type=instance_type, + ) + + now = datetime.now(timezone.utc) + filename = ( + f"training_bench_deepspeed-gpt-103b_bf16_" + f"{now.strftime('%Y-%m-%d_%H%M')}_job{job_id}.json" + ) + filepath = os.path.join(output_dir, filename) + with open(filepath, "w") as f: + json.dump(result, f, indent=2) + print(f"Wrote: {filepath}") + + return result + + +def main(): + parser = argparse.ArgumentParser( + description="Parse Megatron-DeepSpeed logs into benchmark JSON" + ) + parser.add_argument( + "--logs-dir", default="logs", help="Directory containing Slurm log files" + ) + parser.add_argument( + "--output-dir", default="sweep_results", help="Directory to write JSON results" + ) + parser.add_argument( + "--jobs-csv", + default="sweep_results/sweep_jobs.csv", + help="CSV file tracking sweep job IDs", + ) + parser.add_argument( + "--log-file", default=None, help="Parse a single log file instead of 
sweep CSV" + ) + parser.add_argument( + "--config-name", + default="single_run", + help="Config name for single log file parsing", + ) + parser.add_argument( + "--cluster", + default=os.environ.get("CLUSTER_NAME", "unknown"), + help="Cluster name for metadata (default: $CLUSTER_NAME or 'unknown')", + ) + parser.add_argument( + "--instance-type", + default=os.environ.get("INSTANCE_TYPE", "unknown"), + help="Instance type for metadata (default: $INSTANCE_TYPE or 'unknown')", + ) + + args = parser.parse_args() + + if args.log_file: + parse_single_log( + args.log_file, + args.config_name, + args.output_dir, + cluster=args.cluster, + instance_type=args.instance_type, + ) + else: + if not os.path.exists(args.jobs_csv): + print(f"Error: Jobs CSV not found: {args.jobs_csv}") + print( + "Run sweep_runner.sh first, or use --log-file for single file parsing" + ) + sys.exit(1) + parse_sweep_jobs( + args.jobs_csv, + args.logs_dir, + args.output_dir, + cluster=args.cluster, + instance_type=args.instance_type, + ) + + +if __name__ == "__main__": + main() diff --git a/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch b/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch new file mode 100755 index 000000000..a68585528 --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch @@ -0,0 +1,282 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#SBATCH --exclusive +#SBATCH --job-name=deepspeed-pretrain-103b +#SBATCH --output=logs/%x_%j.out +#SBATCH --error=logs/%x_%j.err + +set -euxo pipefail + +# ============================================================ +# Environment defaults +# ============================================================ +: "${APPS_PATH:=/fsx/apps}" +: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}" +: "${FSX_PATH:=/fsx}" +: "${DATA_DIR:=$FSX_PATH/deepspeed/data}" +: "${MEGATRON_DS_PATH:=$FSX_PATH/deepspeed/Megatron-DeepSpeed}" + +# ============================================================ +# Parallelism config (overridable via env vars from sweep_runner.sh) +# ============================================================ +: "${TP:=8}" +: "${PP:=2}" +: "${ZERO_STAGE:=1}" +: "${MICRO_BATCH_SIZE:=1}" +: "${GLOBAL_BATCH_SIZE:=64}" +: "${TRAIN_ITERS:=50}" + +# ============================================================ +# ~103B GPT model architecture +# Layers=80, Hidden=12288, Heads=96, FFN=49152 +# Estimated parameters: ~103B +# ============================================================ +: "${NUM_LAYERS:=80}" +: "${HIDDEN_SIZE:=12288}" +: "${NUM_HEADS:=96}" +: "${FFN_HIDDEN_SIZE:=49152}" +: "${SEQ_LENGTH:=2048}" + +# ============================================================ +# Optional features (set to 1 to enable) +# ============================================================ +: "${USE_ACTIVATION_CHECKPOINTING:=0}" +: "${USE_SEQUENCE_PARALLEL:=0}" +: "${USE_OVERLAP_COMM:=0}" +: "${ENABLE_FUSIONS:=0}" +: "${CONFIG_NAME:=baseline}" + +# ============================================================ +# PyTorch memory allocator optimisation +# ============================================================ +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +# ============================================================ +# Cluster topology +# ============================================================ +export NODES=( $( scontrol show hostnames 
"$SLURM_JOB_NODELIST" ) ) +export NODES_ARRAY=($NODES) +export HEAD_NODE=${NODES_ARRAY[0]} +export MASTER_ADDR=$(hostname --ip-address) +export MASTER_PORT=$((RANDOM + 10000)) +export NNODES=$SLURM_JOB_NUM_NODES +export NUM_GPUS_PER_NODE=8 + +# ============================================================ +# Network settings for EFA + NCCL +# ============================================================ +export FI_LOG_LEVEL=1 +export FI_PROVIDER=efa +export FI_EFA_USE_HUGE_PAGE=0 +export NCCL_SOCKET_IFNAME=^docker,lo,veth +export NCCL_P2P_NET_CHUNKSIZE=2097152 +export NCCL_BUFFERSIZE=8388608 +export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so +export NCCL_ASYNC_ERROR_HANDLING=1 +export OMPI_MCA_plm=^slurm + +# ============================================================ +# Generate DeepSpeed config dynamically +# ============================================================ +mkdir -p configs logs + +PRESCALE_GRAD="false" +if [ "${ZERO_STAGE}" -eq 0 ]; then + PRESCALE_GRAD="true" +fi + +OVERLAP_COMM_BOOL="false" +if [ "${USE_OVERLAP_COMM}" -eq 1 ]; then + OVERLAP_COMM_BOOL="true" +fi + +# Build ZeRO optimisation block depending on stage +if [ "${ZERO_STAGE}" -eq 3 ]; then + ZERO_BLOCK=$(cat < configs/ds_config_run.json +{ + "train_batch_size": ${GLOBAL_BATCH_SIZE}, + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 10, +${ZERO_BLOCK}, + "gradient_clipping": 1.0, + "prescale_gradients": ${PRESCALE_GRAD}, + "bf16": { + "enabled": true + }, + "wall_clock_breakdown": false +} +EOF + +# ============================================================ +# Hostfile for DeepSpeed +# ============================================================ +export HOSTFILE="/fsx/hostfile_${SLURM_JOB_ID}" +function makehostfile() { +perl -e '$slots=split /,/, $ENV{"SLURM_STEP_GPUS"}; +$slots=8 if $slots==0; +@nodes = split /\n/, qx[scontrol show hostnames $ENV{"SLURM_JOB_NODELIST"}]; +print map { "$b$_ slots=$slots\n" } @nodes' +} +makehostfile > 
"${HOSTFILE}" + +# ============================================================ +# Container + distributed launch args +# ============================================================ +declare -a SRUN_ARGS=( + --container-image "${IMAGE}" + --container-mounts /fsx,/opt/slurm/bin +) + +declare -a DIST_ARGS=( + --nnodes ${NNODES} + --nproc-per-node ${NUM_GPUS_PER_NODE} + --master_addr ${MASTER_ADDR} + --master_port ${MASTER_PORT} + --rdzv_id $RANDOM + --rdzv_backend c10d + --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} +) + +# ============================================================ +# Model + training args +# ============================================================ +declare -a MODEL_ARGS=( + --num-layers ${NUM_LAYERS} + --hidden-size ${HIDDEN_SIZE} + --num-attention-heads ${NUM_HEADS} + --ffn-hidden-size ${FFN_HIDDEN_SIZE} + --seq-length ${SEQ_LENGTH} + --max-position-embeddings ${SEQ_LENGTH} + --micro-batch-size ${MICRO_BATCH_SIZE} + --global-batch-size ${GLOBAL_BATCH_SIZE} + --train-iters ${TRAIN_ITERS} + --lr 1.0e-4 + --min-lr 1.0e-6 + --lr-decay-style cosine + --lr-warmup-iters 5 + --lr-decay-iters 50 + --weight-decay 0.1 + --clip-grad 1.0 + --adam-beta1 0.9 + --adam-beta2 0.95 + --init-method-std 0.006 + --log-interval 1 + --eval-iters 0 + --eval-interval 1000 + --bf16 + --data-path ${DATA_DIR}/BookCorpusDataset_text_document + --vocab-file ${DATA_DIR}/gpt2-vocab.json + --merge-file ${DATA_DIR}/gpt2-merges.txt + --split 100,0,0 + --data-impl mmap + --num-workers 0 +) + +# By default disable fusions (matches sweep v1 behaviour). +# Set ENABLE_FUSIONS=1 to enable them. 
+if [ "${ENABLE_FUSIONS}" -eq 0 ]; then + MODEL_ARGS+=( + --no-masked-softmax-fusion + --no-bias-gelu-fusion + --no-bias-dropout-fusion + --no-gradient-accumulation-fusion + ) +fi + +declare -a DS_ARGS=( + --tensor-model-parallel-size ${TP} + --zero-stage ${ZERO_STAGE} + --deepspeed_config ${PWD}/configs/ds_config_run.json + --deepspeed + --distributed-backend nccl +) + +# Megatron-DeepSpeed wraps the model in PipelineModule by default. +# DeepSpeed's PipelineEngine asserts that ZeRO stage < 2. +# When using ZeRO-2 or ZeRO-3 we must disable pipeline parallel entirely +# via --no-pipeline-parallel (which sets ds_pipeline_enabled=False). +if [ "${ZERO_STAGE}" -ge 2 ]; then + DS_ARGS+=( + --pipeline-model-parallel-size 1 + --no-pipeline-parallel + ) +else + DS_ARGS+=(--pipeline-model-parallel-size ${PP}) +fi + +# ============================================================ +# Optional features +# ============================================================ +if [ "${USE_ACTIVATION_CHECKPOINTING}" -eq 1 ]; then + DS_ARGS+=( + --checkpoint-activations + --deepspeed-activation-checkpointing + ) +fi + +if [ "${USE_SEQUENCE_PARALLEL}" -eq 1 ]; then + DS_ARGS+=(--sequence-parallel) +fi + +# ============================================================ +# Launch training +# Note: Using python3 -m torch.distributed.run instead of torchrun +# because the container's Python version may differ from the host +# ============================================================ +echo "=== DeepSpeed 103B GPT Pretraining ===" +echo "Config: ${CONFIG_NAME}" +echo "Nodes: ${NNODES}, GPUs/node: ${NUM_GPUS_PER_NODE}, Total GPUs: $((NNODES * NUM_GPUS_PER_NODE))" +echo "TP=${TP}, PP=${PP}, ZeRO=${ZERO_STAGE}" +echo "MBS=${MICRO_BATCH_SIZE}, GBS=${GLOBAL_BATCH_SIZE}" +echo "Model: layers=${NUM_LAYERS}, hidden=${HIDDEN_SIZE}, heads=${NUM_HEADS}, ffn=${FFN_HIDDEN_SIZE}" +echo "Seq length: ${SEQ_LENGTH}, Fusions: ${ENABLE_FUSIONS}" +echo "Activation ckpt: ${USE_ACTIVATION_CHECKPOINTING}, Seq 
parallel: ${USE_SEQUENCE_PARALLEL}" +echo "PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-}" +echo "=======================================" + +# Convert arrays to strings for bash -c invocation +DIST_ARGS_STR="${DIST_ARGS[*]}" +MODEL_ARGS_STR="${MODEL_ARGS[*]}" +DS_ARGS_STR="${DS_ARGS[*]}" + +# Note: Variables inside the bash -c string are expanded on the host side before +# being passed to the container. This is intentional — the container does not have +# access to these env vars at shell expansion time. +srun -l "${SRUN_ARGS[@]}" bash -c "export PYTHONPATH=${MEGATRON_DS_PATH} && cd ${MEGATRON_DS_PATH} && python3 -m torch.distributed.run ${DIST_ARGS_STR} pretrain_gpt.py ${MODEL_ARGS_STR} ${DS_ARGS_STR}" + +# Cleanup hostfile +rm -f "${HOSTFILE}" diff --git a/3.test_cases/pytorch/deepspeed/qlora/Dockerfile b/3.test_cases/pytorch/deepspeed/qlora/Dockerfile index 27f9bcd8c..32f02d35e 100644 --- a/3.test_cases/pytorch/deepspeed/qlora/Dockerfile +++ b/3.test_cases/pytorch/deepspeed/qlora/Dockerfile @@ -1,61 +1,154 @@ # Dockerfile for QLoRA Fine-tuning of Qwen3-8B # ============================================= -# Base Image: NVIDIA CUDA 12.8 with cuDNN 9 -# Python: 3.10 +# Base Image: PyTorch 25.04 with CUDA 12.9.0 (supports Blackwell, Hopper, Ampere) +# Python: 3.12 (bundled with pytorch:25.04-py3) # Key Libraries: PyTorch, Transformers, PEFT, BitsAndBytes, DeepSpeed +# Networking: EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1 # # Build: # docker build -t qwen3-qlora-training:latest . # # If you encounter CUBLAS errors at runtime (typically caused by CUDA -# library conflicts on the host), switch the torch index URL below to -# cu126 as a fallback — see docs/TROUBLESHOOTING.md. +# library conflicts on the host), see docs/TROUBLESHOOTING.md. 
-# Stage 1: Base image with CUDA -FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 AS base +# ============================================================ +# Base image: PyTorch 25.04 with CUDA 12.9.0 +# ============================================================ +FROM nvcr.io/nvidia/pytorch:25.04-py3 -# Prevent interactive prompts during build ENV DEBIAN_FRONTEND=noninteractive -# Install system dependencies -RUN apt-get update && apt-get install -y \ - python3.10 \ - python3.10-dev \ - python3-pip \ - python3.10-venv \ +# ============================================================ +# 1. System packages and SSH setup (needed for multi-node training) +# ============================================================ +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + && rm -rf /opt/hpcx/ompi \ + && rm -rf /usr/local/ucx \ + && ldconfig + +RUN apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + build-essential \ + cmake \ + curl \ + gcc \ + gdb \ git \ git-lfs \ + gnupg \ + kmod \ + libtool \ + openssh-client \ + openssh-server \ wget \ - curl \ && rm -rf /var/lib/apt/lists/* -# Set Python 3.10 as default -RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \ - ln -sf /usr/bin/pip3 /usr/bin/pip +# SSH configuration for multi-node +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config -# Upgrade pip -RUN pip install --upgrade pip setuptools wheel +# ============================================================ +# 2. 
Install EFA Installer 1.47.0 +# ============================================================ +ENV EFA_INSTALLER_VERSION=1.47.0 +WORKDIR /tmp +RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && cd / && rm -rf /tmp/aws-efa-installer -# Set working directory -WORKDIR /app +# ============================================================ +# 3. NCCL plugin symlinks +# EFA installer names the plugin libnccl-net-ofi.so but NCCL +# looks for libnccl-net-aws-ofi.so. Without this symlink NCCL +# falls back to TCP sockets silently. +# ============================================================ +RUN rm -rf /opt/amazon/aws-ofi-nccl + +RUN ln -sf /opt/amazon/ofi-nccl/lib/libnccl-net-ofi.so \ + /opt/amazon/ofi-nccl/lib/libnccl-net-aws-ofi.so && \ + ln -sf /opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \ + /opt/amazon/ofi-nccl/lib/libnccl-tuner-aws-ofi.so + +# ============================================================ +# 4. Upgrade NCCL to 2.29.3 (requires CUDA >= 12.9) +# ============================================================ +ENV NCCL_VERSION=2.29.3-1 +RUN apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget -qO /tmp/cuda-keyring.deb \ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i /tmp/cuda-keyring.deb && \ + rm /tmp/cuda-keyring.deb && \ + apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages \ + libnccl2=${NCCL_VERSION}+cuda12.9 \ + libnccl-dev=${NCCL_VERSION}+cuda12.9 && \ + rm -rf /var/lib/apt/lists/* + +# ============================================================ +# 5. 
Install GDRCopy v2.5.1 (lib-only) +# ============================================================ +RUN cd /tmp && \ + git clone --branch v2.5.1 --depth 1 https://github.com/NVIDIA/gdrcopy.git && \ + cd gdrcopy && \ + make -j$(nproc) lib lib_install && \ + cd / && rm -rf /tmp/gdrcopy + +# ============================================================ +# 6. Library path configuration +# ============================================================ +RUN echo "/opt/amazon/ofi-nccl/lib" > /etc/ld.so.conf.d/aws-ofi-nccl.conf && \ + echo "/opt/amazon/efa/lib" > /etc/ld.so.conf.d/efa.conf -# Stage 2: Install Python dependencies -FROM base AS dependencies +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/environment 2>/dev/null || true +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/shinit_v2 2>/dev/null || true + +RUN rm -f /etc/ld.so.cache && ldconfig + +ENV LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}" +ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}" + +# ============================================================ +# 7. OpenMPI tuning for EFA +# ============================================================ +ARG OPEN_MPI_PATH=/opt/amazon/openmpi +RUN echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf + +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun + +# ============================================================ +# 8. 
Python dependencies for QLoRA training +# ============================================================ +WORKDIR /app # Copy requirements first for better caching COPY requirements.txt . -# Install PyTorch with CUDA 12.8 support +# Install PyTorch with CUDA 12.9 support # Note: torch 2.10+ has a breaking LR scheduler change (strict zip) that is # incompatible with some DeepSpeed/transformers versions. Pin to <2.10 until # upstream libraries catch up. -RUN pip install --no-cache-dir 'torch>=2.7.0,<2.10.0' --index-url https://download.pytorch.org/whl/cu128 +RUN pip install --no-cache-dir 'torch>=2.7.0,<2.10.0' --index-url https://download.pytorch.org/whl/cu129 # Install other dependencies RUN pip install --no-cache-dir -r requirements.txt -# Stage 3: Final image with application code -FROM dependencies AS final +# ============================================================ +# 9. Application code +# ============================================================ # Copy source code and entrypoint COPY entrypoint.sh /app/entrypoint.sh @@ -66,14 +159,23 @@ COPY configs/ /app/configs/ # Create directories for outputs and cache RUN mkdir -p /workspace/outputs /workspace/hf_cache -# Set environment variables +# ============================================================ +# 10. 
Environment variables +# ============================================================ ENV PYTHONPATH=/app ENV HF_HOME=/workspace/hf_cache ENV PYTHONUNBUFFERED=1 -# Do NOT set CUDA_VISIBLE_DEVICES here — let torchrun / K8s manage GPU visibility -# DeepSpeed / NCCL settings for multi-GPU communication +# Do NOT set CUDA_VISIBLE_DEVICES here -- let torchrun / K8s manage GPU visibility + +# NCCL / EFA settings for multi-GPU and multi-node communication ENV NCCL_DEBUG=INFO -ENV NCCL_SOCKET_IFNAME=^lo +ENV NCCL_SOCKET_IFNAME=^docker,lo,veth +ENV FI_PROVIDER=efa +ENV NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so + +# PyTorch memory allocator -- expandable segments reduces fragmentation +# Note: capital T is required in pytorch:25.04 containers +ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # Entrypoint reads PET_* env vars set by the Kubeflow Training Operator # and launches torchrun with the correct number of processes per node. diff --git a/3.test_cases/pytorch/deepspeed/qlora/requirements.txt b/3.test_cases/pytorch/deepspeed/qlora/requirements.txt index 893e443a7..9bdf0eb7a 100644 --- a/3.test_cases/pytorch/deepspeed/qlora/requirements.txt +++ b/3.test_cases/pytorch/deepspeed/qlora/requirements.txt @@ -1,6 +1,6 @@ # Core ML Libraries # NOTE: torch is installed separately with the correct CUDA index URL. -# - Docker (see Dockerfile): torch>=2.7.0,<2.10.0 with cu128 +# - Docker (see Dockerfile): torch>=2.7.0,<2.10.0 with cu129 (pytorch:25.04-py3 base) # - Slurm venv (see slurm/README): torch==2.6.0 with cu126 # See docs/TROUBLESHOOTING.md if you encounter CUBLAS errors (typically caused # by environment-level CUDA library conflicts, not a library bug).