From c87bc69bf4efd9153f504e078932acc3114daa42 Mon Sep 17 00:00:00 2001
From: Paulo Aragao <aragao.paulo@gmail.com>
Date: Thu, 5 Mar 2026 15:04:02 +0000
Subject: [PATCH 1/5] Add DeepSpeed 103B GPT pretraining benchmark for B200
 cluster

- Update Dockerfile: pytorch:25.04-py3 base, EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1
- Add 103B GPT pretraining sbatch script with parameterized parallelism, ZeRO stages, fusion ops, and correct NCCL/EFA flags
- Add sweep runners (v1: 20 configs, v2: 10 configs) covering TP/PP/ZeRO/fusion/memory variations
- Add results parser and S3 upload script with CloudWatch metric publishing
- Best result: 476.6 TFLOPS/GPU (TP=8, PP=8, ZeRO=0, fusions enabled) on 8x B200 nodes
---
 .../pytorch/deepspeed/0.deepspeed.dockerfile  | 177 ++++----
 .../pytorch/deepspeed/1.build-image.sbatch    |   6 +-
 .../configs/ds_config_103b_template.json      |  20 +
 .../pytorch/deepspeed/parse_results.py        | 391 ++++++++++++++++++
 .../deepspeed/pretrain_gpt_103b.sbatch        | 280 +++++++++++++
 .../pytorch/deepspeed/sweep_runner.sh         | 144 +++++++
 .../pytorch/deepspeed/sweep_runner_v2.sh      | 154 +++++++
 .../pytorch/deepspeed/upload_results.sh       | 260 ++++++++++++
 8 files changed, 1358 insertions(+), 74 deletions(-)
 create mode 100644 3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json
 create mode 100755 3.test_cases/pytorch/deepspeed/parse_results.py
 create mode 100755 3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch
 create mode 100755 3.test_cases/pytorch/deepspeed/sweep_runner.sh
 create mode 100644 3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
 create mode 100755 3.test_cases/pytorch/deepspeed/upload_results.sh

diff --git a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile
index 472edc476..ec7f99995 100644
--- a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile
+++ b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile
@@ -1,19 +1,20 @@
 # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: MIT-0
 
-FROM nvcr.io/nvidia/pytorch:25.03-py3
+# ============================================================
+# Base image: PyTorch 25.04 with CUDA 12.9.0 (required for NCCL 2.29.x)
+# Supports Blackwell (sm_100), Hopper, Ampere architectures
+# ============================================================
+FROM nvcr.io/nvidia/pytorch:25.04-py3
 
-ARG GDRCOPY_VERSION=v2.4.1
-ARG EFA_INSTALLER_VERSION=1.37.0
-ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws
 ARG TRANSFORMERS_VERSION=4.44.2
-ARG MEGATRON_LM_VERSION=core_r0.8.0
-
 ARG OPEN_MPI_PATH=/opt/amazon/openmpi
 
-######################
-# Update and remove the IB libverbs
-######################
+# Build-time only (ARG, not ENV): keeps apt non-interactive without leaking into the runtime env
+ARG DEBIAN_FRONTEND=noninteractive
+# ============================================================
+# 1. System packages and SSH setup (needed for multi-node training)
+# ============================================================
 RUN apt-get update -y && apt-get upgrade -y
 RUN apt-get remove -y --allow-change-held-packages \
     ibverbs-utils \
@@ -26,8 +27,7 @@ RUN rm -rf /opt/hpcx/ompi \
     && rm -rf /usr/local/ucx \
     && ldconfig
 
-RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \
-    apt-utils \
+RUN apt-get install -y --no-install-recommends \
     autoconf \
     automake \
     build-essential \
@@ -36,6 +36,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \
     gcc \
     gdb \
     git \
+    gnupg \
     kmod \
     libtool \
     openssh-client \
@@ -55,69 +56,99 @@ RUN rm -rf /root/.ssh/ \
  && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
  && printf "Host *\n  StrictHostKeyChecking no\n" >> /root/.ssh/config
 
-ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH
-ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
-
-#################################################
-## Install NVIDIA GDRCopy
-##
-## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure
-## that the cuda-compat-xx-x package is the latest.
-RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
-    && cd /tmp/gdrcopy \
-    && make prefix=/opt/gdrcopy install
-
-ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH
-ENV LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH
-ENV CPATH /opt/gdrcopy/include:$CPATH
-ENV PATH /opt/gdrcopy/bin:$PATH
-
-#################################################
-## Install EFA installer
-RUN cd $HOME \
-    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
-    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
+# ============================================================
+# 2. Install EFA Installer 1.47.0
+#    This bundles libfabric, rdma-core, and pre-built aws-ofi-nccl
+#    No source build of aws-ofi-nccl needed (unlike EFA < 1.40)
+# ============================================================
+ENV EFA_INSTALLER_VERSION=1.47.0
+WORKDIR /tmp
+RUN curl -fsSL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xz \
     && cd aws-efa-installer \
     && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
-    && rm -rf $HOME/aws-efa-installer
-
-
-###################################################
-## Install AWS-OFI-NCCL plugin
-RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
-#Switch from sh to bash to allow parameter expansion
-SHELL ["/bin/bash", "-c"]
-RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
-    && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
-    && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
-    && ./configure --prefix=/opt/aws-ofi-nccl/install \
-        --with-mpi=/opt/amazon/openmpi \
-        --with-libfabric=/opt/amazon/efa \
-        --with-cuda=/usr/local/cuda \
-        --enable-platform-aws \
-    && make -j $(nproc) \
-    && make install \
-    && cd .. \
-    && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
-    && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz
-
-SHELL ["/bin/sh", "-c"]
-
-###################################################
-RUN rm -rf /var/lib/apt/lists/*
-
-RUN echo "hwloc_base_binding_policy = none" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf \
- && echo "rmaps_base_mapping_policy = slot" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf
+    && cd / && rm -rf /tmp/aws-efa-installer
+
+# ============================================================
+# 3. Remove old aws-ofi-nccl and create NCCL plugin symlinks
+#    NCCL_NET_PLUGIN=aws-ofi looks for libnccl-net-aws-ofi.so
+#    EFA installer names it libnccl-net-ofi.so
+#    Without this symlink NCCL falls back to TCP sockets silently
+# ============================================================
+RUN rm -rf /opt/amazon/aws-ofi-nccl
+
+RUN ln -sf /opt/amazon/ofi-nccl/lib/libnccl-net-ofi.so \
+           /opt/amazon/ofi-nccl/lib/libnccl-net-aws-ofi.so && \
+    ln -sf /opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \
+           /opt/amazon/ofi-nccl/lib/libnccl-tuner-aws-ofi.so
+
+# ============================================================
+# 4. Upgrade NCCL to 2.29.3 (matches B200 host version)
+#    Requires CUDA >= 12.9 (which pytorch:25.04-py3 provides)
+#    Must add NVIDIA CUDA apt repo first since base image may not have it
+# ============================================================
+ENV NCCL_VERSION=2.29.3-1
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends wget && \
+    wget -qO /tmp/cuda-keyring.deb \
+      https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \
+    dpkg -i /tmp/cuda-keyring.deb && \
+    rm /tmp/cuda-keyring.deb && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
+      libnccl2=${NCCL_VERSION}+cuda12.9 \
+      libnccl-dev=${NCCL_VERSION}+cuda12.9 && \
+    rm -rf /var/lib/apt/lists/*
+
+# ============================================================
+# 5. Install GDRCopy v2.5.1 (lib-only, no binaries needed)
+# ============================================================
+RUN cd /tmp && \
+    git clone --branch v2.5.1 --depth 1 https://github.com/NVIDIA/gdrcopy.git && \
+    cd gdrcopy && \
+    make -j$(nproc) lib lib_install && \
+    cd / && rm -rf /tmp/gdrcopy
+
+# ============================================================
+# 6. Fix library path references
+#    Use ld.so.conf.d for system-wide discovery (more robust
+#    than relying solely on LD_LIBRARY_PATH)
+# ============================================================
+RUN echo "/opt/amazon/ofi-nccl/lib" > /etc/ld.so.conf.d/aws-ofi-nccl.conf && \
+    echo "/opt/amazon/efa/lib" > /etc/ld.so.conf.d/efa.conf
+
+RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/environment 2>/dev/null || true
+RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/shinit_v2 2>/dev/null || true
+
+# Rebuild ldconfig cache
+RUN rm -f /etc/ld.so.cache && ldconfig
+
+# ============================================================
+# 7. Environment variables
+# ============================================================
+ENV LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}"
+ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}"
+ENV FI_PROVIDER=efa
+
+# ============================================================
+# 8. OpenMPI tuning for EFA (needed for multi-node training)
+# ============================================================
+RUN echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf
+# Wrap mpirun; exec replaces the wrapper shell so signals reach the real launcher directly
+RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \
+ && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \
+ && echo "exec ${OPEN_MPI_PATH}/bin/mpirun.real \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \
+ && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun
+
+# ============================================================
+# 9. Python packages for DeepSpeed training
+# ============================================================
+RUN pip3 install --no-cache-dir \
+    awscli pynvml \
+    transformers==${TRANSFORMERS_VERSION} \
+    sentencepiece python-etcd \
+    deepspeed accelerate
 
-RUN pip3 install awscli pynvml
-
-RUN mv $OPEN_MPI_PATH/bin/mpirun $OPEN_MPI_PATH/bin/mpirun.real \
- && echo '#!/bin/bash' > $OPEN_MPI_PATH/bin/mpirun \
- && echo '/opt/amazon/openmpi/bin/mpirun.real "$@"' >> $OPEN_MPI_PATH/bin/mpirun \
- && chmod a+x $OPEN_MPI_PATH/bin/mpirun
-
-######################
-# DeepSpeed dependencies
-######################
-RUN pip install transformers==${TRANSFORMERS_VERSION} sentencepiece python-etcd deepspeed accelerate
+RUN rm -rf /var/lib/apt/lists/*
 
+WORKDIR /workspace
diff --git a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
index ebd7b0b04..f91222361 100644
--- a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
+++ b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: MIT-0
 
 #SBATCH -N 1 # number of nodes to use
-#SBATCH --job-name=build-neox-image # name of your job
+#SBATCH --job-name=build-deepspeed-image # name of your job
 #SBATCH --output=logs/%x_%j.out # logfile for stdout
 #SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs
 
@@ -14,6 +14,10 @@ set -euxo pipefail
 : "${APPS_PATH:=/fsx/apps}"
 : "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
 
+# Ensure output directories exist (quoted so an APPS_PATH with spaces stays one argument)
+mkdir -p "${APPS_PATH}"
+mkdir -p logs
+
 ENROOT_IMAGE=deepspeed
 docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile .
 # Remove old sqsh file if exists
diff --git a/3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json b/3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json
new file mode 100644
index 000000000..6197eaf78
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json
@@ -0,0 +1,20 @@
+{
+    "train_batch_size": 64,
+    "train_micro_batch_size_per_gpu": 1,
+    "steps_per_print": 10,
+    "zero_optimization": {
+        "stage": 1,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 500000000,
+        "overlap_comm": false,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 500000000,
+        "contiguous_gradients": true
+    },
+    "gradient_clipping": 1.0,
+    "prescale_gradients": false,
+    "bf16": {
+        "enabled": true
+    },
+    "wall_clock_breakdown": false
+}
diff --git a/3.test_cases/pytorch/deepspeed/parse_results.py b/3.test_cases/pytorch/deepspeed/parse_results.py
new file mode 100755
index 000000000..8eacea70c
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/parse_results.py
@@ -0,0 +1,391 @@
+#!/usr/bin/env python3
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+"""
+parse_results.py - Parse Megatron-DeepSpeed training logs into benchmark JSON.
+
+Reads Slurm log files, extracts per-step metrics, and produces JSON files
+matching the existing benchmark-results schema at:
+  s3://paragao-new-nemo-squash-container/benchmark-results/b200/
+
+Usage:
+    python parse_results.py [--logs-dir logs] [--output-dir sweep_results]
+    python parse_results.py --log-file logs/sweep_01_baseline_123.out --config-name 01_baseline
+"""
+
+import argparse
+import csv
+import json
+import os
+import re
+import statistics
+import sys
+from datetime import datetime, timezone
+
+
+# ============================================================
+# Megatron-DeepSpeed log line patterns
+# ============================================================
+# Example: " iteration       10/      50 | consumed samples: ..."
+# Example: "elapsed time per iteration (ms): 4725.7 | ..."
+# Example: "lm loss: 1.3389E+01 | ..."
+# Example: "learning rate: 3.000E-05 | ..."
+# Example: "global batch size:   128 | ..."
+# Example: "loss scale: 1.0 | ..."
+# Example: "grad norm: 74.776 | ..."
+# Example: "TFLOPs: 125.4 | ..."
+
+ITER_PATTERN = re.compile(r"iteration\s+(\d+)/\s*(\d+)")
+ELAPSED_PATTERN = re.compile(r"elapsed time per iteration \(ms\):\s*([\d.]+)")
+LOSS_PATTERN = re.compile(r"lm loss:\s*([\d.eE+\-]+)")
+LR_PATTERN = re.compile(r"learning rate:\s*([\d.eE+\-]+)")
+GBS_PATTERN = re.compile(r"global batch size:\s*(\d+)")
+LOSS_SCALE_PATTERN = re.compile(r"loss scale:\s*([\d.eE+\-]+)")
+GRAD_NORM_PATTERN = re.compile(r"grad norm:\s*([\d.eE+\-]+)")
+TFLOPS_PATTERN = re.compile(r"TFLOPs:\s*([\d.]+)")
+
+
+def parse_log_file(log_path):
+    """Parse one Megatron-DeepSpeed log into a list of per-step metric dicts."""
+    steps = []
+    current_step = {}
+    # errors="replace": stray non-UTF-8 bytes in a Slurm log must not abort parsing
+    with open(log_path, "r", errors="replace") as f:
+        for line in f:
+            # An iteration marker closes the step being built and opens a new one
+            m = ITER_PATTERN.search(line)
+            if m:
+                if current_step:
+                    steps.append(current_step)
+                current_step = {
+                    "step": int(m.group(1)),
+                    "total_steps": int(m.group(2)),
+                }
+
+            if not current_step:
+                continue
+
+            # Metrics on the marker line or on later lines attach to the open step
+            m = ELAPSED_PATTERN.search(line)
+            if m:
+                elapsed_ms = float(m.group(1))
+                current_step["elapsed_ms"] = elapsed_ms
+                current_step["step_time_s"] = round(elapsed_ms / 1000.0, 2)
+
+            m = LOSS_PATTERN.search(line)
+            if m:
+                current_step["lm_loss"] = float(m.group(1))
+
+            m = LR_PATTERN.search(line)
+            if m:
+                current_step["learning_rate"] = float(m.group(1))
+
+            m = GBS_PATTERN.search(line)
+            if m:
+                current_step["global_batch_size"] = int(m.group(1))
+
+            m = LOSS_SCALE_PATTERN.search(line)
+            if m:
+                current_step["loss_scale"] = float(m.group(1))
+
+            m = GRAD_NORM_PATTERN.search(line)
+            if m:
+                current_step["grad_norm"] = float(m.group(1))
+
+            m = TFLOPS_PATTERN.search(line)
+            if m:
+                current_step["tflops_per_gpu"] = float(m.group(1))
+
+    # Flush the step that was still open when the file ended
+    if current_step:
+        steps.append(current_step)
+
+    return steps
+
+
+def compute_tflops_from_step_time(
+    step_time_s,
+    global_batch_size,
+    seq_length=2048,
+    hidden_size=12288,
+    num_layers=80,
+    num_heads=96,
+    total_gpus=64,
+):
+    """
+    Estimate TFLOPS/GPU for a GPT model (forward + backward, no activation recompute):
+    FLOPs per iteration = 72 * B * s * h^2 * L * (1 + s/(6h) + V/(12*h*L))
+    where B = global_batch_size, s = seq_length, h = hidden_size, L = num_layers,
+    V = vocab size. num_heads is accepted for interface compatibility but unused.
+    """
+    vocab_size = 50257  # GPT-2 vocab
+    s = seq_length
+    h = hidden_size
+    L = num_layers
+    B = global_batch_size
+
+    # 24*B*s*h^2 per layer forward, x3 for forward + backward => coefficient 72
+    flops_per_iter = (
+        72 * B * s * h * h * L * (1 + s / (6 * h) + vocab_size / (12 * h * L))
+    )
+    tflops_per_gpu = flops_per_iter / (step_time_s * total_gpus * 1e12)
+    return round(tflops_per_gpu, 1)
+
+
+def build_result_json(
+    steps,
+    config_name,
+    job_id,
+    nodes=8,
+    gpus_per_node=8,
+    tp=8,
+    pp=2,
+    zero_stage=1,
+    mbs=1,
+    gbs=64,
+    seq_length=2048,
+    precision="bf16",
+):
+    """Build the benchmark JSON matching the existing schema."""
+    total_gpus = nodes * gpus_per_node
+    warmup_steps = 5
+    total_steps = len(steps)
+
+    # Ensure TFLOPS values exist (compute if not in logs)
+    for step in steps:
+        if "tflops_per_gpu" not in step and "step_time_s" in step:
+            step["tflops_per_gpu"] = compute_tflops_from_step_time(
+                step["step_time_s"],
+                step.get("global_batch_size", gbs),
+                seq_length=seq_length,
+                total_gpus=total_gpus,
+            )
+
+    # Steady-state metrics (skip warmup)
+    steady_steps = [s for s in steps if s.get("step", 0) > warmup_steps]
+
+    if not steady_steps:
+        print(f"Warning: No steady-state steps found for {config_name}")
+        steady_steps = steps
+
+    steady_tflops = [s["tflops_per_gpu"] for s in steady_steps if "tflops_per_gpu" in s]
+    steady_times = [s["step_time_s"] for s in steady_steps if "step_time_s" in s]
+
+    summary = {
+        "total_steps": total_steps,
+        "warmup_steps": warmup_steps,
+        "steady_state_steps": len(steady_steps),
+    }
+
+    if steady_tflops:
+        summary.update(
+            {
+                "steady_state_avg_tflops_per_gpu": round(
+                    statistics.mean(steady_tflops), 2
+                ),
+                "steady_state_median_tflops_per_gpu": round(
+                    statistics.median(steady_tflops), 1
+                ),
+                "steady_state_min_tflops_per_gpu": round(min(steady_tflops), 1),
+                "steady_state_max_tflops_per_gpu": round(max(steady_tflops), 1),
+                "steady_state_stdev_tflops_per_gpu": round(
+                    statistics.stdev(steady_tflops), 2
+                )
+                if len(steady_tflops) > 1
+                else 0.0,
+                "peak_tflops_per_gpu": round(max(steady_tflops), 1),
+            }
+        )
+
+    if steady_times:
+        summary.update(
+            {
+                "steady_state_avg_step_time_s": round(statistics.mean(steady_times), 4),
+                "steady_state_median_step_time_s": round(
+                    statistics.median(steady_times), 2
+                ),
+                "steady_state_min_step_time_s": round(min(steady_times), 2),
+                "steady_state_max_step_time_s": round(max(steady_times), 2),
+            }
+        )
+
+    if steps:
+        summary["final_loss"] = steps[-1].get("lm_loss", None)
+        summary["initial_loss"] = steps[0].get("lm_loss", None)
+
+    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    result = {
+        "metadata": {
+            "timestamp": timestamp,
+            "job_id": str(job_id),
+            "cluster": "b200-hyperpod",
+            "instance_type": "ml.p6-b200.48xlarge",
+            "nodes": nodes,
+            "gpus_per_node": gpus_per_node,
+            "total_gpus": total_gpus,
+            "model": "deepspeed-gpt-103b",
+            "precision": precision,
+            "framework": "megatron-deepspeed",
+            "sweep_config": {
+                "config_name": config_name,
+                "tp": tp,
+                "pp": pp,
+                "zero_stage": zero_stage,
+                "micro_batch_size": mbs,
+                "global_batch_size": gbs,
+                "seq_length": seq_length,
+            },
+        },
+        "summary": summary,
+        "steps": steps,
+    }
+
+    return result
+
+
+def parse_sweep_jobs(jobs_csv, logs_dir, output_dir):
+    """Parse every job in the sweep CSV; write per-job JSON plus sweep_summary.json."""
+    os.makedirs(output_dir, exist_ok=True)
+    results = []
+
+    with open(jobs_csv, "r") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            job_id = row["job_id"]
+            config_name = row["config_name"]
+
+            # Expected name first; else any .out whose name holds job_id as a whole number
+            log_pattern = f"sweep_{config_name}_{job_id}.out"
+            log_path = os.path.join(logs_dir, log_pattern)
+
+            if not os.path.exists(log_path):
+                job_re = re.compile(rf"(?<!\d){re.escape(job_id)}(?!\d)")
+                log_candidates = [
+                    name
+                    for name in sorted(os.listdir(logs_dir))
+                    if name.endswith(".out") and job_re.search(name)
+                ]
+                if log_candidates:
+                    log_path = os.path.join(logs_dir, log_candidates[0])
+                else:
+                    print(
+                        f"Warning: No log file found for job {job_id} ({config_name})"
+                    )
+                    continue
+
+            print(f"Parsing {config_name} (job {job_id}): {log_path}")
+            steps = parse_log_file(log_path)
+
+            if not steps:
+                print("  Warning: No steps found in log file")
+                continue
+
+            result = build_result_json(
+                steps=steps,
+                config_name=config_name,
+                job_id=job_id,
+                tp=int(row.get("tp", 8)),
+                pp=int(row.get("pp", 2)),
+                zero_stage=int(row.get("zero", 1)),
+                mbs=int(row.get("mbs", 1)),
+                gbs=int(row.get("gbs", 64)),
+                seq_length=int(row.get("seq_length", 2048)),
+            )
+
+            # Write individual JSON file
+            now = datetime.now(timezone.utc)
+            filename = (
+                f"training_bench_deepspeed-gpt-103b_bf16_"
+                f"{now.strftime('%Y-%m-%d_%H%M')}_job{job_id}.json"
+            )
+            filepath = os.path.join(output_dir, filename)
+            with open(filepath, "w") as jf:
+                json.dump(result, jf, indent=2)
+            print(f"  Wrote: {filepath}")
+
+            results.append(result)
+
+    # Write combined summary
+    summary_path = os.path.join(output_dir, "sweep_summary.json")
+    with open(summary_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"\nWrote combined summary: {summary_path}")
+
+    return results
+
+
+def parse_single_log(log_file, config_name, output_dir):
+    """Parse a single log file."""
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Extract job ID from filename
+    job_id_match = re.search(r"_(\d+)\.out", log_file)
+    job_id = job_id_match.group(1) if job_id_match else "unknown"
+
+    print(f"Parsing {config_name} (job {job_id}): {log_file}")
+    steps = parse_log_file(log_file)
+
+    if not steps:
+        print("Error: No steps found in log file")
+        sys.exit(1)
+
+    result = build_result_json(
+        steps=steps,
+        config_name=config_name,
+        job_id=job_id,
+    )
+
+    now = datetime.now(timezone.utc)
+    filename = (
+        f"training_bench_deepspeed-gpt-103b_bf16_"
+        f"{now.strftime('%Y-%m-%d_%H%M')}_job{job_id}.json"
+    )
+    filepath = os.path.join(output_dir, filename)
+    with open(filepath, "w") as f:
+        json.dump(result, f, indent=2)
+    print(f"Wrote: {filepath}")
+
+    return result
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Parse Megatron-DeepSpeed logs into benchmark JSON"
+    )
+    parser.add_argument(
+        "--logs-dir", default="logs", help="Directory containing Slurm log files"
+    )
+    parser.add_argument(
+        "--output-dir", default="sweep_results", help="Directory to write JSON results"
+    )
+    parser.add_argument(
+        "--jobs-csv",
+        default="sweep_results/sweep_jobs.csv",
+        help="CSV file tracking sweep job IDs",
+    )
+    parser.add_argument(
+        "--log-file", default=None, help="Parse a single log file instead of sweep CSV"
+    )
+    parser.add_argument(
+        "--config-name",
+        default="single_run",
+        help="Config name for single log file parsing",
+    )
+
+    args = parser.parse_args()
+
+    if args.log_file:
+        parse_single_log(args.log_file, args.config_name, args.output_dir)
+    else:
+        if not os.path.exists(args.jobs_csv):
+            print(f"Error: Jobs CSV not found: {args.jobs_csv}")
+            print(
+                "Run sweep_runner.sh first, or use --log-file for single file parsing"
+            )
+            sys.exit(1)
+        parse_sweep_jobs(args.jobs_csv, args.logs_dir, args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch b/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch
new file mode 100755
index 000000000..43957885f
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch
@@ -0,0 +1,280 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --exclusive
+#SBATCH --job-name=deepspeed-pretrain-103b
+#SBATCH --output=logs/%x_%j.out
+#SBATCH --error=logs/%x_%j.err
+#SBATCH --partition=b200
+
+set -euxo pipefail
+
+# ============================================================
+# Environment defaults
+# ============================================================
+: "${APPS_PATH:=/fsx/apps}"
+: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
+: "${FSX_PATH:=/fsx}"
+: "${DATA_DIR:=$FSX_PATH/deepspeed/data}"
+: "${MEGATRON_DS_PATH:=$FSX_PATH/deepspeed/Megatron-DeepSpeed}"
+
+# ============================================================
+# Parallelism config (overridable via env vars from sweep_runner.sh)
+# ============================================================
+: "${TP:=8}"
+: "${PP:=2}"
+: "${ZERO_STAGE:=1}"
+: "${MICRO_BATCH_SIZE:=1}"
+: "${GLOBAL_BATCH_SIZE:=64}"
+: "${TRAIN_ITERS:=50}"
+
+# ============================================================
+# ~103B GPT model architecture
+# Layers=80, Hidden=12288, Heads=96, FFN=49152
+# Estimated parameters: ~103B
+# ============================================================
+: "${NUM_LAYERS:=80}"
+: "${HIDDEN_SIZE:=12288}"
+: "${NUM_HEADS:=96}"
+: "${FFN_HIDDEN_SIZE:=49152}"
+: "${SEQ_LENGTH:=2048}"
+
+# ============================================================
+# Optional features (set to 1 to enable)
+# ============================================================
+: "${USE_ACTIVATION_CHECKPOINTING:=0}"
+: "${USE_SEQUENCE_PARALLEL:=0}"
+: "${USE_OVERLAP_COMM:=0}"
+: "${ENABLE_FUSIONS:=0}"
+: "${CONFIG_NAME:=baseline}"
+
+# ============================================================
+# PyTorch memory allocator optimisation
+# ============================================================
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
+# ============================================================
+# Cluster topology
+# ============================================================
+export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
+export NODES_ARRAY=($NODES)
+export HEAD_NODE=${NODES_ARRAY[0]}
+export MASTER_ADDR=$(hostname --ip-address)
+export MASTER_PORT=$((RANDOM + 10000))
+export NNODES=$SLURM_JOB_NUM_NODES
+export NUM_GPUS_PER_NODE=8
+
+# ============================================================
+# Network settings for EFA + NCCL
+# ============================================================
+export FI_LOG_LEVEL=1
+export FI_PROVIDER=efa
+export FI_EFA_USE_HUGE_PAGE=0
+export NCCL_SOCKET_IFNAME=^docker,lo,veth
+export NCCL_P2P_NET_CHUNKSIZE=2048576
+export NCCL_BUFFERSIZE=8388608
+export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so
+export NCCL_ASYNC_ERROR_HANDLING=1
+export OMPI_MCA_plm=^slurm
+
+# ============================================================
+# Generate DeepSpeed config dynamically
+# ============================================================
+mkdir -p configs logs
+
+PRESCALE_GRAD="false"
+if [ "${ZERO_STAGE}" -eq 0 ]; then
+    PRESCALE_GRAD="true"
+fi
+
+OVERLAP_COMM_BOOL="false"
+if [ "${USE_OVERLAP_COMM}" -eq 1 ]; then
+    OVERLAP_COMM_BOOL="true"
+fi
+
+# Build ZeRO optimisation block depending on stage
+if [ "${ZERO_STAGE}" -eq 3 ]; then
+    ZERO_BLOCK=$(cat <<ZEOF
+    "zero_optimization": {
+        "stage": 3,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 500000000,
+        "overlap_comm": ${OVERLAP_COMM_BOOL},
+        "reduce_scatter": true,
+        "reduce_bucket_size": 500000000,
+        "contiguous_gradients": true,
+        "stage3_prefetch_bucket_size": 500000000,
+        "stage3_param_persistence_threshold": 1000000,
+        "stage3_max_live_parameters": 1000000000,
+        "stage3_max_reuse_distance": 1000000000
+    }
+ZEOF
+    )
+else
+    ZERO_BLOCK=$(cat <<ZEOF
+    "zero_optimization": {
+        "stage": ${ZERO_STAGE},
+        "allgather_partitions": true,
+        "allgather_bucket_size": 500000000,
+        "overlap_comm": ${OVERLAP_COMM_BOOL},
+        "reduce_scatter": true,
+        "reduce_bucket_size": 500000000,
+        "contiguous_gradients": true
+    }
+ZEOF
+    )
+fi
+
+cat <<EOF > configs/ds_config_run.json
+{
+    "train_batch_size": ${GLOBAL_BATCH_SIZE},
+    "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE},
+    "steps_per_print": 10,
+${ZERO_BLOCK},
+    "gradient_clipping": 1.0,
+    "prescale_gradients": ${PRESCALE_GRAD},
+    "bf16": {
+        "enabled": true
+    },
+    "wall_clock_breakdown": false
+}
+EOF
+
+# ============================================================
+# Hostfile for DeepSpeed
+# ============================================================
+export HOSTFILE=/fsx/hostfile_${SLURM_JOB_ID}
+function makehostfile() {
+perl -e '$slots=split /,/, $ENV{"SLURM_STEP_GPUS"};
+$slots=8 if $slots==0;
+@nodes = split /\n/, qx[scontrol show hostnames $ENV{"SLURM_JOB_NODELIST"}];
+print map { "$b$_ slots=$slots\n" } @nodes'
+}
+makehostfile > ${HOSTFILE}
+
+# ============================================================
+# Container + distributed launch args
+# ============================================================
+declare -a SRUN_ARGS=(
+    --container-image ${IMAGE}
+    --container-mounts /fsx,/opt/slurm/bin
+)
+
+declare -a DIST_ARGS=(
+    --nnodes ${NNODES}
+    --nproc-per-node ${NUM_GPUS_PER_NODE}
+    --master_addr ${MASTER_ADDR}
+    --master_port ${MASTER_PORT}
+    --rdzv_id $RANDOM
+    --rdzv_backend c10d
+    --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT}
+)
+
+# ============================================================
+# Model + training args
+# ============================================================
+declare -a MODEL_ARGS=(
+    --num-layers ${NUM_LAYERS}
+    --hidden-size ${HIDDEN_SIZE}
+    --num-attention-heads ${NUM_HEADS}
+    --ffn-hidden-size ${FFN_HIDDEN_SIZE}
+    --seq-length ${SEQ_LENGTH}
+    --max-position-embeddings ${SEQ_LENGTH}
+    --micro-batch-size ${MICRO_BATCH_SIZE}
+    --global-batch-size ${GLOBAL_BATCH_SIZE}
+    --train-iters ${TRAIN_ITERS}
+    --lr 1.0e-4
+    --min-lr 1.0e-6
+    --lr-decay-style cosine
+    --lr-warmup-iters 5
+    --lr-decay-iters 50
+    --weight-decay 0.1
+    --clip-grad 1.0
+    --adam-beta1 0.9
+    --adam-beta2 0.95
+    --init-method-std 0.006
+    --log-interval 1
+    --eval-iters 0
+    --eval-interval 1000
+    --bf16
+    --data-path ${DATA_DIR}/BookCorpusDataset_text_document
+    --vocab-file ${DATA_DIR}/gpt2-vocab.json
+    --merge-file ${DATA_DIR}/gpt2-merges.txt
+    --split 100,0,0
+    --data-impl mmap
+    --num-workers 0
+)
+
+# By default disable fusions (matches sweep v1 behaviour).
+# Set ENABLE_FUSIONS=1 to enable them.
+if [ "${ENABLE_FUSIONS}" -eq 0 ]; then
+    MODEL_ARGS+=(
+        --no-masked-softmax-fusion
+        --no-bias-gelu-fusion
+        --no-bias-dropout-fusion
+        --no-gradient-accumulation-fusion
+    )
+fi
+
+declare -a DS_ARGS=(
+    --tensor-model-parallel-size ${TP}
+    --zero-stage ${ZERO_STAGE}
+    --deepspeed_config ${PWD}/configs/ds_config_run.json
+    --deepspeed
+    --distributed-backend nccl
+)
+
+# Megatron-DeepSpeed wraps the model in PipelineModule by default.
+# DeepSpeed's PipelineEngine asserts that ZeRO stage < 2.
+# When using ZeRO-2 or ZeRO-3 we must disable pipeline parallel entirely
+# via --no-pipeline-parallel (which sets ds_pipeline_enabled=False).
+if [ "${ZERO_STAGE}" -ge 2 ]; then
+    # Overwrite PP with the effective value so the launch banner further down
+    # reports the pipeline size that actually runs, not the requested one.
+    PP=1
+    DS_ARGS+=(--pipeline-model-parallel-size "${PP}" --no-pipeline-parallel)
+else
+    DS_ARGS+=(--pipeline-model-parallel-size ${PP})
+fi
+
+# ============================================================
+# Optional features
+# ============================================================
+if [ "${USE_ACTIVATION_CHECKPOINTING}" -eq 1 ]; then
+    DS_ARGS+=(
+        --checkpoint-activations
+        --deepspeed-activation-checkpointing
+    )
+fi
+
+if [ "${USE_SEQUENCE_PARALLEL}" -eq 1 ]; then
+    DS_ARGS+=(--sequence-parallel)
+fi
+
+# ============================================================
+# Launch training
+# Note: Using python3 -m torch.distributed.run instead of torchrun
+# because the container's Python version may differ from the host
+# ============================================================
+echo "=== DeepSpeed 103B GPT Pretraining ==="
+echo "Config: ${CONFIG_NAME}"
+echo "Nodes: ${NNODES}, GPUs/node: ${NUM_GPUS_PER_NODE}, Total GPUs: $((NNODES * NUM_GPUS_PER_NODE))"
+echo "TP=${TP}, PP=${PP}, ZeRO=${ZERO_STAGE}"
+echo "MBS=${MICRO_BATCH_SIZE}, GBS=${GLOBAL_BATCH_SIZE}"
+echo "Model: layers=${NUM_LAYERS}, hidden=${HIDDEN_SIZE}, heads=${NUM_HEADS}, ffn=${FFN_HIDDEN_SIZE}"
+echo "Seq length: ${SEQ_LENGTH}, Fusions: ${ENABLE_FUSIONS}"
+echo "Activation ckpt: ${USE_ACTIVATION_CHECKPOINTING}, Seq parallel: ${USE_SEQUENCE_PARALLEL}"
+echo "PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-}"
+echo "======================================="
+
+# Convert arrays to strings for bash -c invocation
+DIST_ARGS_STR="${DIST_ARGS[*]}"
+MODEL_ARGS_STR="${MODEL_ARGS[*]}"
+DS_ARGS_STR="${DS_ARGS[*]}"
+
+srun -l "${SRUN_ARGS[@]}" bash -c "export PYTHONPATH=${MEGATRON_DS_PATH} && cd ${MEGATRON_DS_PATH} && python3 -m torch.distributed.run ${DIST_ARGS_STR} pretrain_gpt.py ${MODEL_ARGS_STR} ${DS_ARGS_STR}"
+
+# Cleanup hostfile
+rm -f ${HOSTFILE}
diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner.sh b/3.test_cases/pytorch/deepspeed/sweep_runner.sh
new file mode 100755
index 000000000..9f7d5a944
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/sweep_runner.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# sweep_runner.sh - Automated parameter sweep for DeepSpeed 103B pretraining
+# Runs all parallelism and environment flag configurations, collects results.
+#
+# Usage: bash sweep_runner.sh [--dry-run]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch"
+RESULTS_DIR="${SCRIPT_DIR}/sweep_results"
+NODES=8
+PARTITION="b200"
+
+DRY_RUN=0
+if [ "${1:-}" = "--dry-run" ]; then
+    DRY_RUN=1
+    echo "[DRY RUN] Will print commands without submitting"
+fi
+
+mkdir -p "${RESULTS_DIR}" logs
+
+# ============================================================
+# Helper: submit a sweep configuration
+# ============================================================
+submit_config() {
+    # Submit one sweep configuration with sbatch and append it to sweep_jobs.csv.
+    # Args: name TP PP ZeRO MBS GBS [act_ckpt] [seq_par] [overlap] [VAR=value ...]
+    # Reads globals: PARTITION, NODES, SBATCH_SCRIPT, DRY_RUN, RESULTS_DIR.
+    local config_name="$1" tp="$2" pp="$3" zero="$4" mbs="$5" gbs="$6"
+    local act_ckpt="${7:-0}" seq_par="${8:-0}" overlap="${9:-0}"
+    # Drop the fixed arguments; the rest (VAR=value ...) is comma-joined for --export.
+    shift "$(( $# < 9 ? $# : 9 ))"
+    local extra_env
+    extra_env="$(IFS=,; printf '%s' "$*")"
+
+    echo "============================================"
+    echo "Submitting: ${config_name}"
+    echo "  TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}"
+    echo "  ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}"
+    [ -n "${extra_env}" ] && echo "  Extra env: ${extra_env}"
+    echo "============================================"
+
+    local env_exports=""
+    env_exports+="TP=${tp},"
+    env_exports+="PP=${pp},"
+    env_exports+="ZERO_STAGE=${zero},"
+    env_exports+="MICRO_BATCH_SIZE=${mbs},"
+    env_exports+="GLOBAL_BATCH_SIZE=${gbs},"
+    env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt},"
+    env_exports+="USE_SEQUENCE_PARALLEL=${seq_par},"
+    env_exports+="USE_OVERLAP_COMM=${overlap},"
+    env_exports+="CONFIG_NAME=${config_name}"
+    # Optional NCCL/CUDA tuning variables share the single --export list.
+    if [ -n "${extra_env}" ]; then
+        env_exports+=",${extra_env}"
+    fi
+
+    # Build argv as an array so it runs without eval or re-splitting of values.
+    local -a sbatch_cmd=(
+        sbatch
+        "--partition=${PARTITION}"
+        "--nodes=${NODES}"
+        "--export=ALL,${env_exports}"
+        "--job-name=sweep_${config_name}"
+        "${SBATCH_SCRIPT}"
+    )
+
+    if [ "${DRY_RUN}" -eq 1 ]; then
+        echo "[DRY RUN] ${sbatch_cmd[*]}"
+        echo ""
+        return
+    fi
+
+    local job_output
+    job_output=$("${sbatch_cmd[@]}")
+    local job_id
+    # The job id is taken as the last whitespace-separated field of sbatch's output.
+    job_id=$(echo "${job_output}" | awk '{print $NF}')
+    echo "Submitted job ${job_id} for config ${config_name}"
+    echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap}" >> "${RESULTS_DIR}/sweep_jobs.csv"
+}
+
+# ============================================================
+# Initialize tracking file
+# ============================================================
+echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap" > "${RESULTS_DIR}/sweep_jobs.csv"
+
+# ============================================================
+# PARALLELISM SWEEP (Configs 1-11)
+# ============================================================
+echo ""
+echo "========== PARALLELISM SWEEP =========="
+echo ""
+
+#            config_name         TP PP ZeRO MBS GBS ACT SEQ OVR
+submit_config "01_baseline"       8  2  0    1   64  0   0   0
+submit_config "02_more_pp"        8  4  0    1   64  0   0   0
+submit_config "03_zero1"          8  2  1    1   64  0   0   0
+submit_config "04_larger_mbs"     8  2  1    2  128  0   0   0
+submit_config "05_pp4_zero1"      8  4  1    1  128  0   0   0
+submit_config "06_zero2"          8  2  2    1   64  0   0   0
+submit_config "07_full_pp"        8  8  0    1   64  0   0   0
+submit_config "08_tp4_pp4"        4  4  1    1   64  0   0   0
+submit_config "09_act_ckpt"       8  2  1    1   64  1   0   0
+submit_config "10_seq_parallel"   8  2  1    1   64  0   1   0
+submit_config "11_overlap_comm"   8  2  1    1   64  0   0   1
+
+# ============================================================
+# Environment-flag sweep. This script does not wait for the parallelism sweep
+# to finish; it always uses config 03 (TP8/PP2/ZeRO1) as the fixed base.
+# ============================================================
+echo ""
+echo "========== ENVIRONMENT FLAGS SWEEP =========="
+echo "(Using TP=8 PP=2 ZeRO=1 as base for env flag sweep)"
+echo ""
+
+# Base parallelism for env sweep
+BASE_TP=8
+BASE_PP=2
+BASE_ZERO=1
+BASE_MBS=1
+BASE_GBS=64
+
+#            config_name              TP       PP       ZeRO     MBS      GBS      ACT SEQ OVR extra_env
+submit_config "12_nccl_ring"          ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Ring"
+submit_config "13_nccl_tree"          ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Tree"
+submit_config "14_nccl_no_tuner"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_TUNER_PLUGIN="
+submit_config "15_nccl_chunk_4mb"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_P2P_NET_CHUNKSIZE=4194304"
+submit_config "16_cuda_max_conn_1"    ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "CUDA_DEVICE_MAX_CONNECTIONS=1"
+submit_config "17_nccl_buf_16mb"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=16777216"
+submit_config "18_nccl_buf_32mb"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=33554432"
+submit_config "19_nccl_min_ch_16"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=16"
+submit_config "20_nccl_min_ch_32"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=32"
+
+echo ""
+echo "========== SWEEP SUBMITTED =========="
+echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs.csv"
+echo ""
+echo "To monitor: watch 'squeue -u \$USER'"
+echo "When all jobs finish, run: python parse_results.py"
diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh b/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
new file mode 100644
index 000000000..977b0149c
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# sweep_runner_v2.sh - Sweep v2: ZeRO-2 (no PP), ZeRO-3, memory push, fusion ops
+#
+# All configs use PYTORCH_CUDA_ALLOC_CONF=expandable_segments:true (set in sbatch).
+# Optimal NCCL flags are the defaults already in the sbatch script.
+#
+# Usage: bash sweep_runner_v2.sh [--dry-run]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch"
+RESULTS_DIR="${SCRIPT_DIR}/sweep_results"
+NODES=8
+PARTITION="b200"
+
+DRY_RUN=0
+if [ "${1:-}" = "--dry-run" ]; then
+    DRY_RUN=1
+    echo "[DRY RUN] Will print commands without submitting"
+fi
+
+mkdir -p "${RESULTS_DIR}" logs
+
+# ============================================================
+# Helper: submit a sweep configuration
+# Extends v1 helper with seq_length and enable_fusions params.
+# ============================================================
+submit_config() {
+    # Submit one v2 sweep configuration with sbatch and append it to sweep_jobs_v2.csv.
+    # Args: name TP PP ZeRO MBS GBS [act_ckpt] [seq_par] [overlap] [seq_length]
+    #       [enable_fusions] [VAR=value ...]
+    # Reads globals: PARTITION, NODES, SBATCH_SCRIPT, DRY_RUN, RESULTS_DIR.
+    local config_name="$1" tp="$2" pp="$3" zero="$4" mbs="$5" gbs="$6"
+    local act_ckpt="${7:-0}" seq_par="${8:-0}" overlap="${9:-0}"
+    local seq_length="${10:-2048}" enable_fusions="${11:-0}"
+    # Drop the fixed arguments (never more than were passed); the remaining
+    # VAR=value arguments are comma-joined, the separator --export expects.
+    shift "$(( $# < 11 ? $# : 11 ))"
+    local extra_env
+    extra_env="$(IFS=,; printf '%s' "$*")"
+
+    echo "============================================"
+    echo "Submitting: ${config_name}"
+    echo "  TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}"
+    echo "  SeqLen=${seq_length} Fusions=${enable_fusions}"
+    echo "  ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}"
+    [ -n "${extra_env}" ] && echo "  Extra env: ${extra_env}"
+    echo "============================================"
+
+    local env_exports=""
+    env_exports+="TP=${tp},"
+    env_exports+="PP=${pp},"
+    env_exports+="ZERO_STAGE=${zero},"
+    env_exports+="MICRO_BATCH_SIZE=${mbs},"
+    env_exports+="GLOBAL_BATCH_SIZE=${gbs},"
+    env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt},"
+    env_exports+="USE_SEQUENCE_PARALLEL=${seq_par},"
+    env_exports+="USE_OVERLAP_COMM=${overlap},"
+    env_exports+="SEQ_LENGTH=${seq_length},"
+    env_exports+="ENABLE_FUSIONS=${enable_fusions},"
+    env_exports+="CONFIG_NAME=${config_name}"
+    # Extra env vars (NCCL overrides etc.) share the single --export list.
+    if [ -n "${extra_env}" ]; then
+        env_exports+=",${extra_env}"
+    fi
+
+    # Build argv as an array so it runs without eval or re-splitting of values.
+    local -a sbatch_cmd=(
+        sbatch
+        "--partition=${PARTITION}"
+        "--nodes=${NODES}"
+        "--export=ALL,${env_exports}"
+        "${SBATCH_SCRIPT}"
+    )
+
+    if [ "${DRY_RUN}" -eq 1 ]; then
+        echo "[DRY RUN] ${sbatch_cmd[*]}"
+        echo ""
+        return
+    fi
+
+    local job_output
+    job_output=$("${sbatch_cmd[@]}")
+    local job_id
+    # The job id is taken as the last whitespace-separated field of sbatch's output.
+    job_id=$(echo "${job_output}" | awk '{print $NF}')
+    echo "Submitted job ${job_id} for config ${config_name}"
+    echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap},${seq_length},${enable_fusions}" >> "${RESULTS_DIR}/sweep_jobs_v2.csv"
+}
+
+# ============================================================
+# Initialize tracking file
+# ============================================================
+echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap,seq_length,enable_fusions" > "${RESULTS_DIR}/sweep_jobs_v2.csv"
+
+# ============================================================
+# ZeRO-2 WITHOUT PIPELINE PARALLELISM (PP=1)
+# ============================================================
+echo ""
+echo "========== ZeRO-2 SWEEP (PP=1) =========="
+echo ""
+
+#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "21_zero2_tp8_pp1"      8  1  2    1   64  0   0   0   2048    0
+submit_config "22_zero2_tp8_pp1_mbs2" 8  1  2    2   64  0   0   0   2048    0
+submit_config "23_zero2_tp4_pp1"      4  1  2    1   64  0   0   0   2048    0
+
+# ============================================================
+# ZeRO-3 (PP=1)
+# ============================================================
+echo ""
+echo "========== ZeRO-3 SWEEP (PP=1) =========="
+echo ""
+
+#            config_name                  TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "24_zero3_tp8_pp1"          8  1  3    1   64  0   0   0   2048    0
+submit_config "25_zero3_tp8_pp1_mbs2"     8  1  3    2   64  0   0   0   2048    0
+submit_config "26_zero3_tp4_pp1"          4  1  3    1   64  0   0   0   2048    0
+submit_config "27_zero3_tp8_pp1_overlap"  8  1  3    1   64  0   0   1   2048    0
+
+# ============================================================
+# MEMORY PUSH / SEQ LENGTH / FUSIONS
+# ============================================================
+echo ""
+echo "========== MEMORY PUSH SWEEP =========="
+echo ""
+
+#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "28_mem_seq4k_tp8_pp2"  8  2  0    1   64  0   0   0   4096    0
+submit_config "29_mem_fused_tp8_pp8"  8  8  0    1   64  0   0   0   2048    1
+
+# ============================================================
+# EXPANDABLE SEGMENTS IMPACT ON BEST CONFIG
+# Re-test best config (TP8/PP8/ZeRO0) — now with expandable_segments
+# enabled automatically via the updated sbatch.
+# ============================================================
+echo ""
+echo "========== EXPANDABLE SEGMENTS IMPACT =========="
+echo ""
+
+#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "30_best_expand_seg"    8  8  0    1   64  0   0   0   2048    0
+
+echo ""
+echo "========== SWEEP V2 SUBMITTED =========="
+echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs_v2.csv"
+echo ""
+echo "Total configs: 10"
+echo "To monitor: watch 'squeue -u \$USER'"
+echo "When all jobs finish, run: python parse_results.py --jobs-csv sweep_results/sweep_jobs_v2.csv"
diff --git a/3.test_cases/pytorch/deepspeed/upload_results.sh b/3.test_cases/pytorch/deepspeed/upload_results.sh
new file mode 100755
index 000000000..c88c79077
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/upload_results.sh
@@ -0,0 +1,260 @@
+#!/bin/bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# upload_results.sh - Upload benchmark results to S3 and CloudWatch
+#
+# Usage:
+#   bash upload_results.sh [--results-dir sweep_results] [--region us-east-1]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+RESULTS_DIR="${1:---results-dir}"
+CW_REGION="us-east-1"
+S3_REGION="us-west-2"
+S3_BUCKET="paragao-new-nemo-squash-container"
+CW_NAMESPACE="DeepSpeed/B200Benchmarks"
+CW_DASHBOARD_NAME="DeepSpeed-B200-Benchmarks"
+
+# Parse args
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --results-dir) RESULTS_DIR="$2"; shift 2 ;;
+        --region) CW_REGION="$2"; shift 2 ;;
+        *) shift ;;
+    esac
+done
+
+: "${RESULTS_DIR:=${SCRIPT_DIR}/sweep_results}"
+
+if [ ! -d "${RESULTS_DIR}" ]; then
+    echo "Error: Results directory not found: ${RESULTS_DIR}"
+    exit 1
+fi
+
+# ============================================================
+# 1. Upload JSON files to S3
+# ============================================================
+echo "=== Uploading results to S3 ==="
+
+# Determine S3 path: benchmark-results/b200/2026/March/04/
+YEAR=$(date -u +%Y)
+MONTH=$(date -u +%B)
+DAY=$(date -u +%d)
+S3_PREFIX="benchmark-results/b200/${YEAR}/${MONTH}/${DAY}"
+
+for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
+    if [ ! -f "${json_file}" ]; then
+        echo "No result JSON files found in ${RESULTS_DIR}"
+        break
+    fi
+    filename=$(basename "${json_file}")
+    echo "  Uploading ${filename} -> s3://${S3_BUCKET}/${S3_PREFIX}/${filename}"
+    aws s3 cp "${json_file}" "s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" \
+        --region "${S3_REGION}" \
+        --content-type "application/json"
+done
+
+echo "S3 upload complete: s3://${S3_BUCKET}/${S3_PREFIX}/"
+echo ""
+
+# ============================================================
+# 2. Publish metrics to CloudWatch
+# ============================================================
+echo "=== Publishing metrics to CloudWatch ==="
+
+for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
+    if [ ! -f "${json_file}" ]; then
+        break
+    fi
+
+    filename=$(basename "${json_file}")
+
+    # Extract metadata/summary with python; the path and fallback timestamp are passed as argv.
+    read -r config_name tp pp zero_stage precision avg_tflops avg_step_time timestamp < <(
+        python3 -c '
+import json, sys
+with open(sys.argv[1]) as f:
+    d = json.load(f)
+m = d["metadata"]
+s = d["summary"]
+sc = m.get("sweep_config", {})
+print(
+    sc.get("config_name", "unknown"),
+    sc.get("tp", 8),
+    sc.get("pp", 2),
+    sc.get("zero_stage", 1),
+    m.get("precision", "bf16"),
+    s.get("steady_state_avg_tflops_per_gpu", 0),
+    s.get("steady_state_avg_step_time_s", 0),
+    m.get("timestamp", sys.argv[2])
+)
+' "${json_file}" "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    )
+
+    echo "  Publishing: ${config_name} (TFLOPS=${avg_tflops}, StepTime=${avg_step_time}s)"
+
+    # Publish with full dimensions
+    aws cloudwatch put-metric-data \
+        --namespace "${CW_NAMESPACE}" \
+        --region "${CW_REGION}" \
+        --metric-data "[
+            {
+                \"MetricName\": \"TFLOPSPerGPU\",
+                \"Value\": ${avg_tflops},
+                \"Unit\": \"Count\",
+                \"Timestamp\": \"${timestamp}\",
+                \"Dimensions\": [
+                    {\"Name\": \"model_size\", \"Value\": \"103b\"},
+                    {\"Name\": \"tp\", \"Value\": \"${tp}\"},
+                    {\"Name\": \"pp\", \"Value\": \"${pp}\"},
+                    {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"},
+                    {\"Name\": \"precision\", \"Value\": \"${precision}\"},
+                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
+                ]
+            },
+            {
+                \"MetricName\": \"StepTimeSeconds\",
+                \"Value\": ${avg_step_time},
+                \"Unit\": \"Seconds\",
+                \"Timestamp\": \"${timestamp}\",
+                \"Dimensions\": [
+                    {\"Name\": \"model_size\", \"Value\": \"103b\"},
+                    {\"Name\": \"tp\", \"Value\": \"${tp}\"},
+                    {\"Name\": \"pp\", \"Value\": \"${pp}\"},
+                    {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"},
+                    {\"Name\": \"precision\", \"Value\": \"${precision}\"},
+                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
+                ]
+            }
+        ]"
+
+    # Also publish with just config_name dimension for easy dashboard queries
+    aws cloudwatch put-metric-data \
+        --namespace "${CW_NAMESPACE}" \
+        --region "${CW_REGION}" \
+        --metric-data "[
+            {
+                \"MetricName\": \"TFLOPSPerGPU\",
+                \"Value\": ${avg_tflops},
+                \"Unit\": \"Count\",
+                \"Timestamp\": \"${timestamp}\",
+                \"Dimensions\": [
+                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
+                ]
+            },
+            {
+                \"MetricName\": \"StepTimeSeconds\",
+                \"Value\": ${avg_step_time},
+                \"Unit\": \"Seconds\",
+                \"Timestamp\": \"${timestamp}\",
+                \"Dimensions\": [
+                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
+                ]
+            }
+        ]"
+
+done
+
+echo "CloudWatch metrics published to namespace: ${CW_NAMESPACE}"
+echo ""
+
+# ============================================================
+# 3. Create/Update CloudWatch Dashboard
+# ============================================================
+echo "=== Creating CloudWatch Dashboard ==="
+
+# Build metric entries dynamically from results
+TFLOPS_METRICS=""
+STEPTIME_METRICS=""
+TABLE_METRICS=""
+
+for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
+    if [ ! -f "${json_file}" ]; then
+        break
+    fi
+
+    config_name=$(python3 -c '
+import json, sys
+with open(sys.argv[1]) as f:
+    d = json.load(f)
+print(d["metadata"].get("sweep_config", {}).get("config_name", "unknown"))
+' "${json_file}")  # path is passed as argv instead of being spliced into the Python source
+
+    TFLOPS_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}],"
+    STEPTIME_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}],"
+    TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} TFLOPS\"}],"
+    TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} StepTime\"}],"
+done
+
+# Remove trailing commas
+TFLOPS_METRICS="${TFLOPS_METRICS%,}"
+STEPTIME_METRICS="${STEPTIME_METRICS%,}"
+TABLE_METRICS="${TABLE_METRICS%,}"
+
+DASHBOARD_BODY=$(cat <<DASH
+{
+  "widgets": [
+    {
+      "type": "text",
+      "x": 0, "y": 0, "width": 24, "height": 1,
+      "properties": {
+        "markdown": "# DeepSpeed B200 Benchmark Results - GPT 103B\\nCluster: b200-hyperpod | 8 nodes x 8 B200 GPUs | Namespace: \`${CW_NAMESPACE}\`"
+      }
+    },
+    {
+      "type": "metric",
+      "x": 0, "y": 1, "width": 24, "height": 8,
+      "properties": {
+        "title": "TFLOPS/GPU Across Sweep Configurations",
+        "view": "bar",
+        "region": "${CW_REGION}",
+        "stat": "Average",
+        "period": 86400,
+        "yAxis": {"left": {"label": "TFLOPS/GPU", "showUnits": false}},
+        "metrics": [${TFLOPS_METRICS}]
+      }
+    },
+    {
+      "type": "metric",
+      "x": 0, "y": 9, "width": 24, "height": 8,
+      "properties": {
+        "title": "Step Time Comparison (seconds)",
+        "view": "bar",
+        "region": "${CW_REGION}",
+        "stat": "Average",
+        "period": 86400,
+        "yAxis": {"left": {"label": "Step Time (s)", "showUnits": false}},
+        "metrics": [${STEPTIME_METRICS}]
+      }
+    },
+    {
+      "type": "metric",
+      "x": 0, "y": 17, "width": 24, "height": 8,
+      "properties": {
+        "title": "Summary Metrics Table",
+        "view": "table",
+        "region": "${CW_REGION}",
+        "stat": "Average",
+        "period": 86400,
+        "metrics": [${TABLE_METRICS}]
+      }
+    }
+  ]
+}
+DASH
+)
+
+aws cloudwatch put-dashboard \
+    --dashboard-name "${CW_DASHBOARD_NAME}" \
+    --region "${CW_REGION}" \
+    --dashboard-body "${DASHBOARD_BODY}"
+
+echo "Dashboard created: ${CW_DASHBOARD_NAME}"
+echo "URL: https://${CW_REGION}.console.aws.amazon.com/cloudwatch/home?region=${CW_REGION}#dashboards:name=${CW_DASHBOARD_NAME}"
+echo ""
+echo "=== Upload Complete ==="
+echo "S3: s3://${S3_BUCKET}/${S3_PREFIX}/"
+echo "CloudWatch: ${CW_NAMESPACE} in ${CW_REGION}"
+echo "Dashboard: ${CW_DASHBOARD_NAME}"

From 0f972867880e61a5a8c04a44b13bf8c058daf313 Mon Sep 17 00:00:00 2001
From: Paulo Aragao <aragao.paulo@gmail.com>
Date: Fri, 6 Mar 2026 11:31:01 +0000
Subject: [PATCH 2/5] Update README, Makefile, and QLoRA Dockerfile for PR
 readiness

- Rewrite README as use-case-focused guide: GPT-103B pretraining,
  QLoRA fine-tuning, and Llama2 fine-tuning with best practices
  and proper configuration docs (no benchmark numbers)
- Simplify Makefile: best-config train target, remove sweep/upload targets
- Standardize QLoRA Dockerfile: pytorch:25.04-py3 base, EFA 1.47,
  NCCL 2.29.3, GDRCopy 2.5.1, OFI-NCCL symlinks, proper NCCL/EFA env vars
- Remove sweep runners and upload script from tracked files (internal tooling)
---
 3.test_cases/pytorch/deepspeed/Makefile       |  52 +++-
 3.test_cases/pytorch/deepspeed/README.md      | 192 +++++++++----
 .../pytorch/deepspeed/qlora/Dockerfile        | 164 ++++++++---
 .../pytorch/deepspeed/qlora/requirements.txt  |   2 +-
 .../pytorch/deepspeed/sweep_runner.sh         | 144 ----------
 .../pytorch/deepspeed/sweep_runner_v2.sh      | 154 -----------
 .../pytorch/deepspeed/upload_results.sh       | 260 ------------------
 7 files changed, 322 insertions(+), 646 deletions(-)
 delete mode 100755 3.test_cases/pytorch/deepspeed/sweep_runner.sh
 delete mode 100644 3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
 delete mode 100755 3.test_cases/pytorch/deepspeed/upload_results.sh

diff --git a/3.test_cases/pytorch/deepspeed/Makefile b/3.test_cases/pytorch/deepspeed/Makefile
index e4615c60f..d2b46ee82 100644
--- a/3.test_cases/pytorch/deepspeed/Makefile
+++ b/3.test_cases/pytorch/deepspeed/Makefile
@@ -1,12 +1,52 @@
-ENROOT_IMAGE=deepspeed
+ENROOT_IMAGE   ?= deepspeed
+APPS_PATH      ?= /fsx/apps
+SQUASH_FILE    ?= $(APPS_PATH)/$(ENROOT_IMAGE).sqsh
+PARTITION      ?= b200
+NODES          ?= 8
+LOGS_DIR       ?= logs
+RESULTS_DIR    ?= sweep_results
 
-all: build clean import
+.PHONY: all build clean import build-remote train parse help
+
+all: build import
+
+help:
+	@echo "Container targets:"
+	@echo "  build          - Build Docker image locally"
+	@echo "  import         - Convert Docker image to Enroot squash file"
+	@echo "  build-remote   - Build image on a compute node via sbatch"
+	@echo "  clean          - Remove local squash file"
+	@echo ""
+	@echo "Training targets:"
+	@echo "  train          - Submit 103B GPT pretraining (best config: TP=8, PP=8, fusions)"
+	@echo ""
+	@echo "Results targets:"
+	@echo "  parse          - Parse training logs into benchmark JSON"
+
+# ---- Container ----
 
 build:
-	docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile .
+	docker build -t $(ENROOT_IMAGE) -f 0.deepspeed.dockerfile .
+
+import:
+	mkdir -p $(APPS_PATH)
+	enroot import -o $(SQUASH_FILE) dockerd://$(ENROOT_IMAGE):latest
+
+build-remote:
+	sbatch 1.build-image.sbatch
 
 clean:
-	-rm ${ENROOT_IMAGE}.sqsh
+	-rm -f $(SQUASH_FILE)
 
-import:
-	enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest
+# ---- Training (best config: TP=8, PP=8, ZeRO=0, fusions enabled) ----
+
+train:
+	sbatch --partition=$(PARTITION) --nodes=$(NODES) \
+		--export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \
+		pretrain_gpt_103b.sbatch
+
+# ---- Results ----
+
+parse:
+	python3 parse_results.py --jobs-csv $(RESULTS_DIR)/sweep_jobs.csv \
+		--logs-dir $(LOGS_DIR) --output-dir $(RESULTS_DIR)
diff --git a/3.test_cases/pytorch/deepspeed/README.md b/3.test_cases/pytorch/deepspeed/README.md
index fd2ef7524..e873d1af8 100644
--- a/3.test_cases/pytorch/deepspeed/README.md
+++ b/3.test_cases/pytorch/deepspeed/README.md
@@ -1,87 +1,179 @@
-# DeepSpeed Test Cases <!-- omit in toc -->
+# DeepSpeed on AWS <!-- omit in toc -->
 
-[DeepSpeed](https://github.com/microsoft/DeepSpeed) enables world's most powerful language models like MT-530B and BLOOM. It is an easy-to-use deep learning optimization software suite that powers unprecedented scale and speed for both training and inference. `deepspeed` illustrates several example test cases for DeepSpeed training on AWS. 
+[DeepSpeed](https://github.com/microsoft/DeepSpeed) is a deep learning optimization library that enables efficient distributed training at scale. This directory contains test cases for running DeepSpeed workloads on AWS GPU clusters, covering large-scale pretraining and parameter-efficient fine-tuning.
 
-## 1. Preparation
+## Use Cases
 
-This guide assumes that you have the following:
+| Use Case | Description | Location |
+|----------|-------------|----------|
+| GPT-103B Pretraining | Large-scale GPT pretraining benchmark using Megatron-DeepSpeed with 3D parallelism (TP/PP/DP) and ZeRO optimization | [`pretrain_gpt_103b.sbatch`](pretrain_gpt_103b.sbatch) |
+| QLoRA Fine-tuning | Qwen3-8B fine-tuning with QLoRA (4-bit) + DeepSpeed ZeRO-2/3, supports EKS and Slurm | [`qlora/`](qlora/) |
+| Llama2 Fine-tuning | Llama2 fine-tuning from HuggingFace weights using Megatron-DeepSpeed | [`examples_megatron_deepspeed/finetune_hf_llama/`](examples_megatron_deepspeed/finetune_hf_llama/) |
 
-* A functional Slurm cluster on AWS.
-* Docker, [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) installed.
-* An FSx for Lustre filesystem mounted on `/fsx`.
+## Prerequisites
 
-We recommend that you set up a Slurm cluster using the templates in the architectures [directory](../../1.architectures). You need to set the following environment variables to run these test cases:
+- A functional Slurm cluster on AWS. We recommend [SageMaker HyperPod](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html) or the templates in the [architectures directory](../../1.architectures).
+- [Docker](https://docs.docker.com/engine/install/), [Pyxis](https://github.com/NVIDIA/pyxis), and [Enroot](https://github.com/NVIDIA/enroot) installed on compute nodes.
+- An [FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html) filesystem mounted on `/fsx`.
+- NVIDIA GPU instances with [EFA networking](https://aws.amazon.com/hpc/efa/) (B200, H100, A100, etc.).
 
-```bash
-export APPS_PATH=/fsx/apps
-export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh
-export FSX_PATH=/fsx
-export MODEL_PATH=$FSX_PATH/deepspeed
-export TEST_CASE_PATH=${HOME}/18.deepspeed  # where you copy the test case or set to your test case path
-cd $TEST_CASE_PATH                          # Note that we assume that you are here during the following command executions
-```
+## 1. GPT-103B Pretraining Benchmark
+
+A ~103B-parameter GPT model (80 layers, hidden=12288, heads=96, FFN=49152) trained with [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) using 3D parallelism (tensor, pipeline, data) and DeepSpeed ZeRO optimization. Designed for benchmarking multi-node GPU clusters.
 
+### Container setup
 
+The container image (`0.deepspeed.dockerfile`) is built on `nvcr.io/nvidia/pytorch:25.04-py3` and includes:
 
-## 2. Build the container
+- **EFA 1.47.0** with the bundled aws-ofi-nccl plugin and NCCL tuner
+- **NCCL 2.29.3** (upgraded to match B200 host driver)
+- **GDRCopy v2.5.1** for GPU-direct RDMA
+- **DeepSpeed**, **Transformers 4.44.2**, and multi-node SSH configuration
 
-Before running training jobs, you need to use a build docker container image. [Enroot](https://github.com/NVIDIA/enroot) will be used to turn the image into unprivileged sandbox for Slurm but build step may exceed the storage available on the head node so we reccomend building it on a compute node following instructions below (option 2)
+Build the container on a compute node (recommended, avoids head node storage limits):
 
-### Option 1: build image on a head node
+```bash
+sbatch 1.build-image.sbatch
+```
+
+Or build locally and convert to a squash file:
+
+```bash
+make build    # docker build
+make import   # enroot import to /fsx/apps/deepspeed.sqsh
+```
 
-Below are the steps you need to follow:
+### Data preparation
 
+The benchmark uses preprocessed data in Megatron format with the GPT-2 tokenizer.
 
-1. Build the Docker image with the command below in this directory.
+1. Download the GPT-2 tokenizer:
 
    ```bash
-    docker build -t deepspeed -f 0.deepspeed.dockerfile .
+   mkdir -p /fsx/deepspeed/data && cd /fsx/deepspeed/data
+   wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
+   wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
    ```
 
-
-2. Once the Docker image is built, you can check if it is present with `docker images`. You should see an output similar to this one:
+2. Prepare training data (any text corpus works; for benchmarking, synthetic data is sufficient):
 
    ```bash
-    REPOSITORY   TAG       IMAGE ID       CREATED          SIZE
-    deepspeed     latest    b6c49033c424   9 minutes ago    23.3GB
-   ...
+   python3 -c "
+   import json
+   with open('synthetic_corpus.json', 'w') as f:
+       for i in range(50000):
+           json.dump({'text': 'The quick brown fox ' * 100}, f)
+           f.write('\n')
+   "
    ```
 
-3. Convert the Docker image to a squash file with the command below.
+3. Clone Megatron-DeepSpeed and preprocess:
 
    ```bash
-   enroot import -o ${ENROOT_IMAGE} dockerd://deepspeed:latest
+   git clone https://github.com/microsoft/Megatron-DeepSpeed /fsx/deepspeed/Megatron-DeepSpeed
+
+   python3 /fsx/deepspeed/Megatron-DeepSpeed/tools/preprocess_data.py \
+       --input synthetic_corpus.json \
+       --output-prefix BookCorpusDataset \
+       --vocab-file gpt2-vocab.json \
+       --merge-file gpt2-merges.txt \
+       --tokenizer-type GPT2BPETokenizer \
+       --workers 16 --append-eod
    ```
 
-   The file will be stored in the `/apps` directory (by default). The output should look as below.
+### Running
+
+Submit the best-performing configuration (TP=8, PP=8, ZeRO-0, fusions enabled):
+
+```bash
+make train
+# or equivalently:
+sbatch --partition=b200 --nodes=8 \
+    --export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \
+    pretrain_gpt_103b.sbatch
+```
+
+Override parallelism settings for custom configurations:
+
+```bash
+sbatch --nodes=8 \
+    --export=ALL,TP=8,PP=4,ZERO_STAGE=1,MICRO_BATCH_SIZE=2,CONFIG_NAME=my_config \
+    pretrain_gpt_103b.sbatch
+```
+
+#### Environment variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `TP` | 8 | Tensor parallel size |
+| `PP` | 2 | Pipeline parallel size |
+| `ZERO_STAGE` | 1 | DeepSpeed ZeRO stage (0, 1, 2, or 3) |
+| `MICRO_BATCH_SIZE` | 1 | Per-GPU micro batch size |
+| `GLOBAL_BATCH_SIZE` | 64 | Global batch size |
+| `SEQ_LENGTH` | 2048 | Sequence length |
+| `ENABLE_FUSIONS` | 0 | Set to 1 to enable kernel fusion ops |
+| `USE_ACTIVATION_CHECKPOINTING` | 0 | Set to 1 for activation checkpointing |
+| `USE_OVERLAP_COMM` | 0 | Set to 1 to overlap communication with compute |
+| `TRAIN_ITERS` | 50 | Number of training iterations |
+| `CONFIG_NAME` | baseline | Label for this configuration |
+
+### Best practices
+
+The following recommendations are based on extensive parameter sweeps across parallelism strategies, ZeRO stages, NCCL flags, and memory optimizations:
+
+**Parallelism strategy:**
+
+- **Maximize pipeline parallelism** (PP) alongside tensor parallelism (TP) for best throughput. For an 8-node cluster with 8 GPUs per node, TP=8 with PP=8 is optimal.
+- **Enable kernel fusion ops** (`ENABLE_FUSIONS=1`) for a significant throughput improvement over the non-fused baseline. This enables masked-softmax, bias-gelu, bias-dropout, and gradient-accumulation fusions.
+- **ZeRO-0 outperforms ZeRO-1** when the data-parallel group size is small (e.g., DP=1 with TP=8/PP=8). With so few data-parallel ranks there is little or no optimizer state for ZeRO-1 to shard, so its extra communication overhead is not amortized.
+
+**ZeRO-2 and ZeRO-3:**
+
+- ZeRO-2 and ZeRO-3 are **incompatible with pipeline parallelism** in Megatron-DeepSpeed. The sbatch script automatically sets `PP=1` and adds `--no-pipeline-parallel` when `ZERO_STAGE >= 2`.
+- ZeRO-3's parameter partitioning **enables lower TP values** that ZeRO-2 cannot fit in memory (e.g., TP=4 works with ZeRO-3 but OOMs with ZeRO-2).
+- **Increasing micro-batch size** (e.g., `MICRO_BATCH_SIZE=2`) substantially improves throughput for ZeRO-2 and ZeRO-3 configurations.
+- `overlap_comm` provides only marginal improvement (~2%) with ZeRO-3.
 
-    ```bash
-    [INFO] Fetching image
+**NCCL and networking:**
 
-    36a8c752c28a2db543d2a632a3fc1fcbd5789a6f3d45b9d3a24632420dedcfa8
+- NCCL environment flag variations (buffer sizes, chunk sizes, min channels) have **negligible impact** on throughput (~1% range). The defaults in the sbatch script are well-tuned.
+- **Do not set `NCCL_ALGO=Tree`** on EFA-based clusters -- it causes hangs. Let the NCCL tuner plugin (`libnccl-ofi-tuner.so`) choose the algorithm automatically.
+- **Do not set `NCCL_PROTO` or `FI_EFA_FORK_SAFE`** -- these are not needed and can cause issues.
+
+**Memory:**
+
+- `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` is set by default in the sbatch script. Note the **capital T** is required in pytorch:25.04 containers; lowercase `true` causes a `RuntimeError`.
+- A sequence length of 4096 runs out of GPU memory even with TP=8/PP=2 on B200 (178GB per GPU). Use `SEQ_LENGTH=2048` for this model size.
+
+### Parsing results
+
+After training completes, parse the Slurm logs into benchmark JSON using `parse_results.py`:
+
+```bash
+# Single log file
+python3 parse_results.py --log-file logs/deepspeed-pretrain-103b_123.out --config-name my_config
+
+# Multiple jobs tracked in a CSV
+python3 parse_results.py --jobs-csv sweep_results/sweep_jobs.csv --output-dir sweep_results
+```
 
-    [INFO] Extracting image content...
-    [INFO] Creating squashfs filesystem...
+### Known issues
 
-    Parallel mksquashfs: Using 32 processors
-    Creating 4.0 filesystem on /apps/deepspeed.sqsh, block size 131072.
-    [========================================================================================================================================================================================================================-] 291068/291068 100%
+- **torchrun shebang**: The container's `torchrun` may have a shebang pointing to the wrong Python version. The sbatch script uses `python3 -m torch.distributed.run` as a workaround.
+- **`expandable_segments` case sensitivity**: Must use `expandable_segments:True` (capital T) in pytorch:25.04-py3. Lowercase causes a `RuntimeError`.
+- **NCCL Tree algorithm**: Incompatible with EFA topology -- causes hangs. Do not set `NCCL_ALGO=Tree`.
+- **Sequence parallelism**: Incompatible with pipeline parallelism (PP>1) in this Megatron-DeepSpeed version.
 
-    Exportable Squashfs 4.0 filesystem, gzip compressed, data block size 131072
-            uncompressed data, uncompressed metadata, uncompressed fragments, uncompressed xattrs
-            duplicates are not removed
-    ...
-    ```
+## 2. QLoRA Fine-tuning (Qwen3-8B)
 
-Once done proceed to the next stage.
+Fine-tune [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) using QLoRA (4-bit quantization + LoRA adapters) with DeepSpeed ZeRO-2 or ZeRO-3. Supports deployment on SageMaker HyperPod with both EKS and Slurm orchestrators, including MIG GPU partitioning and automatic checkpoint resume.
 
-### Option 2: Build image on a compute node
+The QLoRA use case has its own container (`qlora/Dockerfile`) that follows the same infrastructure best practices (EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1).
 
-In this option, you will use a compute node to build the image. Submit the job as:
+See [`qlora/README.md`](qlora/README.md) for full instructions.
 
-    ```bash
-    sbatch 1.build-image.sbatch
-    ```
+## 3. Llama2 Fine-tuning (Megatron-DeepSpeed)
 
+Fine-tune Llama2 from HuggingFace weights using Megatron-DeepSpeed. Includes weight conversion from HuggingFace to Megatron format and fine-tuning on the Stanford Alpaca dataset. Uses the shared container image (`0.deepspeed.dockerfile`).
 
-Once the image is prepared, you can proceed to `examples_*` directory for various deepspeed test cases.
\ No newline at end of file
+See [`examples_megatron_deepspeed/finetune_hf_llama/README.md`](examples_megatron_deepspeed/finetune_hf_llama/README.md) for full instructions.
\ No newline at end of file
diff --git a/3.test_cases/pytorch/deepspeed/qlora/Dockerfile b/3.test_cases/pytorch/deepspeed/qlora/Dockerfile
index 27f9bcd8c..32f02d35e 100644
--- a/3.test_cases/pytorch/deepspeed/qlora/Dockerfile
+++ b/3.test_cases/pytorch/deepspeed/qlora/Dockerfile
@@ -1,61 +1,154 @@
 # Dockerfile for QLoRA Fine-tuning of Qwen3-8B
 # =============================================
-# Base Image: NVIDIA CUDA 12.8 with cuDNN 9
-# Python: 3.10
+# Base Image: PyTorch 25.04 with CUDA 12.9.0 (supports Blackwell, Hopper, Ampere)
+# Python: 3.12 (bundled with pytorch:25.04-py3)
 # Key Libraries: PyTorch, Transformers, PEFT, BitsAndBytes, DeepSpeed
+# Networking: EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1
 #
 # Build:
 #   docker build -t qwen3-qlora-training:latest .
 #
 # If you encounter CUBLAS errors at runtime (typically caused by CUDA
-# library conflicts on the host), switch the torch index URL below to
-# cu126 as a fallback — see docs/TROUBLESHOOTING.md.
+# library conflicts on the host), see docs/TROUBLESHOOTING.md.
 
-# Stage 1: Base image with CUDA
-FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 AS base
+# ============================================================
+# Base image: PyTorch 25.04 with CUDA 12.9.0
+# ============================================================
+FROM nvcr.io/nvidia/pytorch:25.04-py3
 
-# Prevent interactive prompts during build
 ENV DEBIAN_FRONTEND=noninteractive
 
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    python3.10 \
-    python3.10-dev \
-    python3-pip \
-    python3.10-venv \
+# ============================================================
+# 1. System packages and SSH setup (needed for multi-node training)
+# ============================================================
+RUN apt-get update -y && apt-get upgrade -y
+RUN apt-get remove -y --allow-change-held-packages \
+    ibverbs-utils \
+    libibverbs-dev \
+    libibverbs1 \
+    && rm -rf /opt/hpcx/ompi \
+    && rm -rf /usr/local/ucx \
+    && ldconfig
+# Refresh the apt index in this same layer so a cached earlier `apt-get update` layer cannot leave stale package lists.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    autoconf \
+    automake \
+    build-essential \
+    cmake \
+    curl \
+    gcc \
+    gdb \
     git \
     git-lfs \
+    gnupg \
+    kmod \
+    libtool \
+    openssh-client \
+    openssh-server \
     wget \
-    curl \
     && rm -rf /var/lib/apt/lists/*
 
-# Set Python 3.10 as default
-RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
-    ln -sf /usr/bin/pip3 /usr/bin/pip
+# SSH configuration for multi-node
+RUN rm -rf /root/.ssh/ \
+ && mkdir -p /root/.ssh/ \
+ && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+ && printf "Host *\n  StrictHostKeyChecking no\n" >> /root/.ssh/config
 
-# Upgrade pip
-RUN pip install --upgrade pip setuptools wheel
+# ============================================================
+# 2. Install EFA Installer 1.47.0
+# ============================================================
+ENV EFA_INSTALLER_VERSION=1.47.0
+WORKDIR /tmp
+RUN curl -fsSL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xz \
+    && cd aws-efa-installer \
+    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
+    && cd / && rm -rf /tmp/aws-efa-installer
 
-# Set working directory
-WORKDIR /app
+# ============================================================
+# 3. NCCL plugin symlinks
+#    EFA installer names the plugin libnccl-net-ofi.so but NCCL
+#    looks for libnccl-net-aws-ofi.so. Without this symlink NCCL
+#    falls back to TCP sockets silently.
+# ============================================================
+RUN rm -rf /opt/amazon/aws-ofi-nccl
+
+RUN ln -sf /opt/amazon/ofi-nccl/lib/libnccl-net-ofi.so \
+           /opt/amazon/ofi-nccl/lib/libnccl-net-aws-ofi.so && \
+    ln -sf /opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \
+           /opt/amazon/ofi-nccl/lib/libnccl-tuner-aws-ofi.so
+
+# ============================================================
+# 4. Upgrade NCCL to 2.29.3 (requires CUDA >= 12.9)
+# ============================================================
+ENV NCCL_VERSION=2.29.3-1
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends wget && \
+    wget -qO /tmp/cuda-keyring.deb \
+      https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \
+    dpkg -i /tmp/cuda-keyring.deb && \
+    rm /tmp/cuda-keyring.deb && \
+    apt-get update && \
+    apt-get install -y --allow-downgrades --allow-change-held-packages \
+      libnccl2=${NCCL_VERSION}+cuda12.9 \
+      libnccl-dev=${NCCL_VERSION}+cuda12.9 && \
+    rm -rf /var/lib/apt/lists/*
+
+# ============================================================
+# 5. Install GDRCopy v2.5.1 (lib-only)
+# ============================================================
+RUN cd /tmp && \
+    git clone --branch v2.5.1 --depth 1 https://github.com/NVIDIA/gdrcopy.git && \
+    cd gdrcopy && \
+    make -j$(nproc) lib lib_install && \
+    cd / && rm -rf /tmp/gdrcopy
+
+# ============================================================
+# 6. Library path configuration
+# ============================================================
+RUN echo "/opt/amazon/ofi-nccl/lib" > /etc/ld.so.conf.d/aws-ofi-nccl.conf && \
+    echo "/opt/amazon/efa/lib" > /etc/ld.so.conf.d/efa.conf
 
-# Stage 2: Install Python dependencies
-FROM base AS dependencies
+RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/environment 2>/dev/null || true
+RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/shinit_v2 2>/dev/null || true
+
+RUN rm -f /etc/ld.so.cache && ldconfig
+
+ENV LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}"
+ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}"
+
+# ============================================================
+# 7. OpenMPI tuning for EFA
+# ============================================================
+ARG OPEN_MPI_PATH=/opt/amazon/openmpi
+RUN echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf
+
+RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \
+ && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \
+ && echo "${OPEN_MPI_PATH}/bin/mpirun.real \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \
+ && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun
+
+# ============================================================
+# 8. Python dependencies for QLoRA training
+# ============================================================
+WORKDIR /app
 
 # Copy requirements first for better caching
 COPY requirements.txt .
 
-# Install PyTorch with CUDA 12.8 support
+# Install PyTorch with CUDA 12.9 support
 # Note: torch 2.10+ has a breaking LR scheduler change (strict zip) that is
 # incompatible with some DeepSpeed/transformers versions. Pin to <2.10 until
 # upstream libraries catch up.
-RUN pip install --no-cache-dir 'torch>=2.7.0,<2.10.0' --index-url https://download.pytorch.org/whl/cu128
+RUN pip install --no-cache-dir 'torch>=2.7.0,<2.10.0' --index-url https://download.pytorch.org/whl/cu129
 
 # Install other dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Stage 3: Final image with application code
-FROM dependencies AS final
+# ============================================================
+# 9. Application code
+# ============================================================
 
 # Copy source code and entrypoint
 COPY entrypoint.sh /app/entrypoint.sh
@@ -66,14 +159,23 @@ COPY configs/ /app/configs/
 # Create directories for outputs and cache
 RUN mkdir -p /workspace/outputs /workspace/hf_cache
 
-# Set environment variables
+# ============================================================
+# 10. Environment variables
+# ============================================================
 ENV PYTHONPATH=/app
 ENV HF_HOME=/workspace/hf_cache
 ENV PYTHONUNBUFFERED=1
-# Do NOT set CUDA_VISIBLE_DEVICES here — let torchrun / K8s manage GPU visibility
-# DeepSpeed / NCCL settings for multi-GPU communication
+# Do NOT set CUDA_VISIBLE_DEVICES here -- let torchrun / K8s manage GPU visibility
+
+# NCCL / EFA settings for multi-GPU and multi-node communication
 ENV NCCL_DEBUG=INFO
-ENV NCCL_SOCKET_IFNAME=^lo
+ENV NCCL_SOCKET_IFNAME=^docker,lo,veth
+ENV FI_PROVIDER=efa
+ENV NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so
+
+# PyTorch memory allocator -- expandable segments reduces fragmentation
+# Note: capital T is required in pytorch:25.04 containers
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 
 # Entrypoint reads PET_* env vars set by the Kubeflow Training Operator
 # and launches torchrun with the correct number of processes per node.
diff --git a/3.test_cases/pytorch/deepspeed/qlora/requirements.txt b/3.test_cases/pytorch/deepspeed/qlora/requirements.txt
index 893e443a7..9bdf0eb7a 100644
--- a/3.test_cases/pytorch/deepspeed/qlora/requirements.txt
+++ b/3.test_cases/pytorch/deepspeed/qlora/requirements.txt
@@ -1,6 +1,6 @@
 # Core ML Libraries
 # NOTE: torch is installed separately with the correct CUDA index URL.
-# - Docker (see Dockerfile):       torch>=2.7.0,<2.10.0 with cu128
+# - Docker (see Dockerfile):       torch>=2.7.0,<2.10.0 with cu129 (pytorch:25.04-py3 base)
 # - Slurm venv (see slurm/README): torch==2.6.0 with cu126
 # See docs/TROUBLESHOOTING.md if you encounter CUBLAS errors (typically caused
 # by environment-level CUDA library conflicts, not a library bug).
diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner.sh b/3.test_cases/pytorch/deepspeed/sweep_runner.sh
deleted file mode 100755
index 9f7d5a944..000000000
--- a/3.test_cases/pytorch/deepspeed/sweep_runner.sh
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/bin/bash
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# SPDX-License-Identifier: MIT-0
-#
-# sweep_runner.sh - Automated parameter sweep for DeepSpeed 103B pretraining
-# Runs all parallelism and environment flag configurations, collects results.
-#
-# Usage: bash sweep_runner.sh [--dry-run]
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch"
-RESULTS_DIR="${SCRIPT_DIR}/sweep_results"
-NODES=8
-PARTITION="b200"
-
-DRY_RUN=0
-if [ "${1:-}" = "--dry-run" ]; then
-    DRY_RUN=1
-    echo "[DRY RUN] Will print commands without submitting"
-fi
-
-mkdir -p "${RESULTS_DIR}" logs
-
-# ============================================================
-# Helper: submit a sweep configuration
-# ============================================================
-submit_config() {
-    local config_name="$1"
-    local tp="$2"
-    local pp="$3"
-    local zero="$4"
-    local mbs="$5"
-    local gbs="$6"
-    local act_ckpt="${7:-0}"
-    local seq_par="${8:-0}"
-    local overlap="${9:-0}"
-    shift 9 || true
-    local extra_env="${*:-}"
-
-    echo "============================================"
-    echo "Submitting: ${config_name}"
-    echo "  TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}"
-    echo "  ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}"
-    [ -n "${extra_env}" ] && echo "  Extra env: ${extra_env}"
-    echo "============================================"
-
-    local env_exports=""
-    env_exports+="TP=${tp},"
-    env_exports+="PP=${pp},"
-    env_exports+="ZERO_STAGE=${zero},"
-    env_exports+="MICRO_BATCH_SIZE=${mbs},"
-    env_exports+="GLOBAL_BATCH_SIZE=${gbs},"
-    env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt},"
-    env_exports+="USE_SEQUENCE_PARALLEL=${seq_par},"
-    env_exports+="USE_OVERLAP_COMM=${overlap},"
-    env_exports+="CONFIG_NAME=${config_name}"
-
-    local sbatch_cmd="sbatch"
-    sbatch_cmd+=" --partition=${PARTITION}"
-    sbatch_cmd+=" --nodes=${NODES}"
-    sbatch_cmd+=" --export=ALL,${env_exports}"
-    sbatch_cmd+=" --job-name=sweep_${config_name}"
-
-    # Add extra env vars for NCCL tuning
-    if [ -n "${extra_env}" ]; then
-        sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}"
-    fi
-
-    sbatch_cmd+=" ${SBATCH_SCRIPT}"
-
-    if [ "${DRY_RUN}" -eq 1 ]; then
-        echo "[DRY RUN] ${sbatch_cmd}"
-        echo ""
-        return
-    fi
-
-    local job_output
-    job_output=$(eval "${sbatch_cmd}")
-    local job_id
-    job_id=$(echo "${job_output}" | awk '{print $NF}')
-    echo "Submitted job ${job_id} for config ${config_name}"
-    echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap}" >> "${RESULTS_DIR}/sweep_jobs.csv"
-}
-
-# ============================================================
-# Initialize tracking file
-# ============================================================
-echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap" > "${RESULTS_DIR}/sweep_jobs.csv"
-
-# ============================================================
-# PARALLELISM SWEEP (Configs 1-11)
-# ============================================================
-echo ""
-echo "========== PARALLELISM SWEEP =========="
-echo ""
-
-#            config_name         TP PP ZeRO MBS GBS ACT SEQ OVR
-submit_config "01_baseline"       8  2  0    1   64  0   0   0
-submit_config "02_more_pp"        8  4  0    1   64  0   0   0
-submit_config "03_zero1"          8  2  1    1   64  0   0   0
-submit_config "04_larger_mbs"     8  2  1    2  128  0   0   0
-submit_config "05_pp4_zero1"      8  4  1    1  128  0   0   0
-submit_config "06_zero2"          8  2  2    1   64  0   0   0
-submit_config "07_full_pp"        8  8  0    1   64  0   0   0
-submit_config "08_tp4_pp4"        4  4  1    1   64  0   0   0
-submit_config "09_act_ckpt"       8  2  1    1   64  1   0   0
-submit_config "10_seq_parallel"   8  2  1    1   64  0   1   0
-submit_config "11_overlap_comm"   8  2  1    1   64  0   0   1
-
-# ============================================================
-# Wait for parallelism sweep to determine best config
-# If not waiting, env sweep uses config 03 (TP8/PP2/ZeRO1) as default
-# ============================================================
-echo ""
-echo "========== ENVIRONMENT FLAGS SWEEP =========="
-echo "(Using TP=8 PP=2 ZeRO=1 as base for env flag sweep)"
-echo ""
-
-# Base parallelism for env sweep
-BASE_TP=8
-BASE_PP=2
-BASE_ZERO=1
-BASE_MBS=1
-BASE_GBS=64
-
-#            config_name              TP       PP       ZeRO     MBS      GBS      ACT SEQ OVR extra_env
-submit_config "12_nccl_ring"          ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Ring"
-submit_config "13_nccl_tree"          ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Tree"
-submit_config "14_nccl_no_tuner"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_TUNER_PLUGIN="
-submit_config "15_nccl_chunk_4mb"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_P2P_NET_CHUNKSIZE=4194304"
-submit_config "16_cuda_max_conn_1"    ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "CUDA_DEVICE_MAX_CONNECTIONS=1"
-submit_config "17_nccl_buf_16mb"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=16777216"
-submit_config "18_nccl_buf_32mb"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=33554432"
-submit_config "19_nccl_min_ch_16"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=16"
-submit_config "20_nccl_min_ch_32"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=32"
-
-echo ""
-echo "========== SWEEP SUBMITTED =========="
-echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs.csv"
-echo ""
-echo "To monitor: watch 'squeue -u \$USER'"
-echo "When all jobs finish, run: python parse_results.py"
diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh b/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
deleted file mode 100644
index 977b0149c..000000000
--- a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/bin/bash
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# SPDX-License-Identifier: MIT-0
-#
-# sweep_runner_v2.sh - Sweep v2: ZeRO-2 (no PP), ZeRO-3, memory push, fusion ops
-#
-# All configs use PYTORCH_CUDA_ALLOC_CONF=expandable_segments:true (set in sbatch).
-# Optimal NCCL flags are the defaults already in the sbatch script.
-#
-# Usage: bash sweep_runner_v2.sh [--dry-run]
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch"
-RESULTS_DIR="${SCRIPT_DIR}/sweep_results"
-NODES=8
-PARTITION="b200"
-
-DRY_RUN=0
-if [ "${1:-}" = "--dry-run" ]; then
-    DRY_RUN=1
-    echo "[DRY RUN] Will print commands without submitting"
-fi
-
-mkdir -p "${RESULTS_DIR}" logs
-
-# ============================================================
-# Helper: submit a sweep configuration
-# Extends v1 helper with seq_length and enable_fusions params.
-# ============================================================
-submit_config() {
-    local config_name="$1"
-    local tp="$2"
-    local pp="$3"
-    local zero="$4"
-    local mbs="$5"
-    local gbs="$6"
-    local act_ckpt="${7:-0}"
-    local seq_par="${8:-0}"
-    local overlap="${9:-0}"
-    local seq_length="${10:-2048}"
-    local enable_fusions="${11:-0}"
-    shift 11 || true
-    local extra_env="${*:-}"
-
-    echo "============================================"
-    echo "Submitting: ${config_name}"
-    echo "  TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}"
-    echo "  SeqLen=${seq_length} Fusions=${enable_fusions}"
-    echo "  ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}"
-    [ -n "${extra_env}" ] && echo "  Extra env: ${extra_env}"
-    echo "============================================"
-
-    local env_exports=""
-    env_exports+="TP=${tp},"
-    env_exports+="PP=${pp},"
-    env_exports+="ZERO_STAGE=${zero},"
-    env_exports+="MICRO_BATCH_SIZE=${mbs},"
-    env_exports+="GLOBAL_BATCH_SIZE=${gbs},"
-    env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt},"
-    env_exports+="USE_SEQUENCE_PARALLEL=${seq_par},"
-    env_exports+="USE_OVERLAP_COMM=${overlap},"
-    env_exports+="SEQ_LENGTH=${seq_length},"
-    env_exports+="ENABLE_FUSIONS=${enable_fusions},"
-    env_exports+="CONFIG_NAME=${config_name}"
-
-    local sbatch_cmd="sbatch"
-    sbatch_cmd+=" --partition=${PARTITION}"
-    sbatch_cmd+=" --nodes=${NODES}"
-
-    # Add extra env vars (NCCL overrides etc.)
-    if [ -n "${extra_env}" ]; then
-        sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}"
-    else
-        sbatch_cmd+=" --export=ALL,${env_exports}"
-    fi
-
-    sbatch_cmd+=" ${SBATCH_SCRIPT}"
-
-    if [ "${DRY_RUN}" -eq 1 ]; then
-        echo "[DRY RUN] ${sbatch_cmd}"
-        echo ""
-        return
-    fi
-
-    local job_output
-    job_output=$(eval "${sbatch_cmd}")
-    local job_id
-    job_id=$(echo "${job_output}" | awk '{print $NF}')
-    echo "Submitted job ${job_id} for config ${config_name}"
-    echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap},${seq_length},${enable_fusions}" >> "${RESULTS_DIR}/sweep_jobs_v2.csv"
-}
-
-# ============================================================
-# Initialize tracking file
-# ============================================================
-echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap,seq_length,enable_fusions" > "${RESULTS_DIR}/sweep_jobs_v2.csv"
-
-# ============================================================
-# ZeRO-2 WITHOUT PIPELINE PARALLELISM (PP=1)
-# ============================================================
-echo ""
-echo "========== ZeRO-2 SWEEP (PP=1) =========="
-echo ""
-
-#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
-submit_config "21_zero2_tp8_pp1"      8  1  2    1   64  0   0   0   2048    0
-submit_config "22_zero2_tp8_pp1_mbs2" 8  1  2    2   64  0   0   0   2048    0
-submit_config "23_zero2_tp4_pp1"      4  1  2    1   64  0   0   0   2048    0
-
-# ============================================================
-# ZeRO-3 (PP=1)
-# ============================================================
-echo ""
-echo "========== ZeRO-3 SWEEP (PP=1) =========="
-echo ""
-
-#            config_name                  TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
-submit_config "24_zero3_tp8_pp1"          8  1  3    1   64  0   0   0   2048    0
-submit_config "25_zero3_tp8_pp1_mbs2"     8  1  3    2   64  0   0   0   2048    0
-submit_config "26_zero3_tp4_pp1"          4  1  3    1   64  0   0   0   2048    0
-submit_config "27_zero3_tp8_pp1_overlap"  8  1  3    1   64  0   0   1   2048    0
-
-# ============================================================
-# MEMORY PUSH / SEQ LENGTH / FUSIONS
-# ============================================================
-echo ""
-echo "========== MEMORY PUSH SWEEP =========="
-echo ""
-
-#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
-submit_config "28_mem_seq4k_tp8_pp2"  8  2  0    1   64  0   0   0   4096    0
-submit_config "29_mem_fused_tp8_pp8"  8  8  0    1   64  0   0   0   2048    1
-
-# ============================================================
-# EXPANDABLE SEGMENTS IMPACT ON BEST CONFIG
-# Re-test best config (TP8/PP8/ZeRO0) — now with expandable_segments
-# enabled automatically via the updated sbatch.
-# ============================================================
-echo ""
-echo "========== EXPANDABLE SEGMENTS IMPACT =========="
-echo ""
-
-#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
-submit_config "30_best_expand_seg"    8  8  0    1   64  0   0   0   2048    0
-
-echo ""
-echo "========== SWEEP V2 SUBMITTED =========="
-echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs_v2.csv"
-echo ""
-echo "Total configs: 10"
-echo "To monitor: watch 'squeue -u \$USER'"
-echo "When all jobs finish, run: python parse_results.py --jobs-csv sweep_results/sweep_jobs_v2.csv"
diff --git a/3.test_cases/pytorch/deepspeed/upload_results.sh b/3.test_cases/pytorch/deepspeed/upload_results.sh
deleted file mode 100755
index c88c79077..000000000
--- a/3.test_cases/pytorch/deepspeed/upload_results.sh
+++ /dev/null
@@ -1,260 +0,0 @@
-#!/bin/bash
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# SPDX-License-Identifier: MIT-0
-#
-# upload_results.sh - Upload benchmark results to S3 and CloudWatch
-#
-# Usage:
-#   bash upload_results.sh [--results-dir sweep_results] [--region us-east-1]
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-RESULTS_DIR="${1:---results-dir}"
-CW_REGION="us-east-1"
-S3_REGION="us-west-2"
-S3_BUCKET="paragao-new-nemo-squash-container"
-CW_NAMESPACE="DeepSpeed/B200Benchmarks"
-CW_DASHBOARD_NAME="DeepSpeed-B200-Benchmarks"
-
-# Parse args
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --results-dir) RESULTS_DIR="$2"; shift 2 ;;
-        --region) CW_REGION="$2"; shift 2 ;;
-        *) shift ;;
-    esac
-done
-
-: "${RESULTS_DIR:=${SCRIPT_DIR}/sweep_results}"
-
-if [ ! -d "${RESULTS_DIR}" ]; then
-    echo "Error: Results directory not found: ${RESULTS_DIR}"
-    exit 1
-fi
-
-# ============================================================
-# 1. Upload JSON files to S3
-# ============================================================
-echo "=== Uploading results to S3 ==="
-
-# Determine S3 path: benchmark-results/b200/2026/March/04/
-YEAR=$(date -u +%Y)
-MONTH=$(date -u +%B)
-DAY=$(date -u +%d)
-S3_PREFIX="benchmark-results/b200/${YEAR}/${MONTH}/${DAY}"
-
-for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
-    if [ ! -f "${json_file}" ]; then
-        echo "No result JSON files found in ${RESULTS_DIR}"
-        break
-    fi
-    filename=$(basename "${json_file}")
-    echo "  Uploading ${filename} -> s3://${S3_BUCKET}/${S3_PREFIX}/${filename}"
-    aws s3 cp "${json_file}" "s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" \
-        --region "${S3_REGION}" \
-        --content-type "application/json"
-done
-
-echo "S3 upload complete: s3://${S3_BUCKET}/${S3_PREFIX}/"
-echo ""
-
-# ============================================================
-# 2. Publish metrics to CloudWatch
-# ============================================================
-echo "=== Publishing metrics to CloudWatch ==="
-
-for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
-    if [ ! -f "${json_file}" ]; then
-        break
-    fi
-
-    filename=$(basename "${json_file}")
-
-    # Extract metadata and summary using python
-    read -r config_name tp pp zero_stage precision avg_tflops avg_step_time timestamp < <(
-        python3 -c "
-import json, sys
-with open('${json_file}') as f:
-    d = json.load(f)
-m = d['metadata']
-s = d['summary']
-sc = m.get('sweep_config', {})
-print(
-    sc.get('config_name', 'unknown'),
-    sc.get('tp', 8),
-    sc.get('pp', 2),
-    sc.get('zero_stage', 1),
-    m.get('precision', 'bf16'),
-    s.get('steady_state_avg_tflops_per_gpu', 0),
-    s.get('steady_state_avg_step_time_s', 0),
-    m.get('timestamp', '$(date -u +%Y-%m-%dT%H:%M:%SZ)')
-)
-"
-    )
-
-    echo "  Publishing: ${config_name} (TFLOPS=${avg_tflops}, StepTime=${avg_step_time}s)"
-
-    # Publish with full dimensions
-    aws cloudwatch put-metric-data \
-        --namespace "${CW_NAMESPACE}" \
-        --region "${CW_REGION}" \
-        --metric-data "[
-            {
-                \"MetricName\": \"TFLOPSPerGPU\",
-                \"Value\": ${avg_tflops},
-                \"Unit\": \"Count\",
-                \"Timestamp\": \"${timestamp}\",
-                \"Dimensions\": [
-                    {\"Name\": \"model_size\", \"Value\": \"103b\"},
-                    {\"Name\": \"tp\", \"Value\": \"${tp}\"},
-                    {\"Name\": \"pp\", \"Value\": \"${pp}\"},
-                    {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"},
-                    {\"Name\": \"precision\", \"Value\": \"${precision}\"},
-                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
-                ]
-            },
-            {
-                \"MetricName\": \"StepTimeSeconds\",
-                \"Value\": ${avg_step_time},
-                \"Unit\": \"Seconds\",
-                \"Timestamp\": \"${timestamp}\",
-                \"Dimensions\": [
-                    {\"Name\": \"model_size\", \"Value\": \"103b\"},
-                    {\"Name\": \"tp\", \"Value\": \"${tp}\"},
-                    {\"Name\": \"pp\", \"Value\": \"${pp}\"},
-                    {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"},
-                    {\"Name\": \"precision\", \"Value\": \"${precision}\"},
-                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
-                ]
-            }
-        ]"
-
-    # Also publish with just config_name dimension for easy dashboard queries
-    aws cloudwatch put-metric-data \
-        --namespace "${CW_NAMESPACE}" \
-        --region "${CW_REGION}" \
-        --metric-data "[
-            {
-                \"MetricName\": \"TFLOPSPerGPU\",
-                \"Value\": ${avg_tflops},
-                \"Unit\": \"Count\",
-                \"Timestamp\": \"${timestamp}\",
-                \"Dimensions\": [
-                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
-                ]
-            },
-            {
-                \"MetricName\": \"StepTimeSeconds\",
-                \"Value\": ${avg_step_time},
-                \"Unit\": \"Seconds\",
-                \"Timestamp\": \"${timestamp}\",
-                \"Dimensions\": [
-                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
-                ]
-            }
-        ]"
-
-done
-
-echo "CloudWatch metrics published to namespace: ${CW_NAMESPACE}"
-echo ""
-
-# ============================================================
-# 3. Create/Update CloudWatch Dashboard
-# ============================================================
-echo "=== Creating CloudWatch Dashboard ==="
-
-# Build metric entries dynamically from results
-TFLOPS_METRICS=""
-STEPTIME_METRICS=""
-TABLE_METRICS=""
-
-for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
-    if [ ! -f "${json_file}" ]; then
-        break
-    fi
-
-    config_name=$(python3 -c "
-import json
-with open('${json_file}') as f:
-    d = json.load(f)
-print(d['metadata'].get('sweep_config', {}).get('config_name', 'unknown'))
-")
-
-    TFLOPS_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}],"
-    STEPTIME_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}],"
-    TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} TFLOPS\"}],"
-    TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} StepTime\"}],"
-done
-
-# Remove trailing commas
-TFLOPS_METRICS="${TFLOPS_METRICS%,}"
-STEPTIME_METRICS="${STEPTIME_METRICS%,}"
-TABLE_METRICS="${TABLE_METRICS%,}"
-
-DASHBOARD_BODY=$(cat <<DASH
-{
-  "widgets": [
-    {
-      "type": "text",
-      "x": 0, "y": 0, "width": 24, "height": 1,
-      "properties": {
-        "markdown": "# DeepSpeed B200 Benchmark Results - GPT 103B\\nCluster: b200-hyperpod | 8 nodes x 8 B200 GPUs | Namespace: \`${CW_NAMESPACE}\`"
-      }
-    },
-    {
-      "type": "metric",
-      "x": 0, "y": 1, "width": 24, "height": 8,
-      "properties": {
-        "title": "TFLOPS/GPU Across Sweep Configurations",
-        "view": "bar",
-        "region": "${CW_REGION}",
-        "stat": "Average",
-        "period": 86400,
-        "yAxis": {"left": {"label": "TFLOPS/GPU", "showUnits": false}},
-        "metrics": [${TFLOPS_METRICS}]
-      }
-    },
-    {
-      "type": "metric",
-      "x": 0, "y": 9, "width": 24, "height": 8,
-      "properties": {
-        "title": "Step Time Comparison (seconds)",
-        "view": "bar",
-        "region": "${CW_REGION}",
-        "stat": "Average",
-        "period": 86400,
-        "yAxis": {"left": {"label": "Step Time (s)", "showUnits": false}},
-        "metrics": [${STEPTIME_METRICS}]
-      }
-    },
-    {
-      "type": "metric",
-      "x": 0, "y": 17, "width": 24, "height": 8,
-      "properties": {
-        "title": "Summary Metrics Table",
-        "view": "table",
-        "region": "${CW_REGION}",
-        "stat": "Average",
-        "period": 86400,
-        "metrics": [${TABLE_METRICS}]
-      }
-    }
-  ]
-}
-DASH
-)
-
-aws cloudwatch put-dashboard \
-    --dashboard-name "${CW_DASHBOARD_NAME}" \
-    --region "${CW_REGION}" \
-    --dashboard-body "${DASHBOARD_BODY}"
-
-echo "Dashboard created: ${CW_DASHBOARD_NAME}"
-echo "URL: https://${CW_REGION}.console.aws.amazon.com/cloudwatch/home?region=${CW_REGION}#dashboards:name=${CW_DASHBOARD_NAME}"
-echo ""
-echo "=== Upload Complete ==="
-echo "S3: s3://${S3_BUCKET}/${S3_PREFIX}/"
-echo "CloudWatch: ${CW_NAMESPACE} in ${CW_REGION}"
-echo "Dashboard: ${CW_DASHBOARD_NAME}"

From 27827c5b869427c4c31c6df96cc2bcc79508d3f0 Mon Sep 17 00:00:00 2001
From: Paulo Aragao <aragao.paulo@gmail.com>
Date: Tue, 10 Mar 2026 03:11:55 +0000
Subject: [PATCH 3/5] fix: remove personal data and parameterize
 environment-specific values

- Remove personal S3 bucket name from parse_results.py and upload_results.sh
- Parameterize cluster name and instance type in parse_results.py (via CLI
  args and env vars)
- Replace hardcoded S3 bucket/regions in upload_results.sh with required
  env vars (S3_BUCKET, S3_REGION, CW_REGION)
- Remove hardcoded --partition=b200 from sbatch script
- Make PARTITION overridable in sweep_runner.sh and sweep_runner_v2.sh
- Change default partition from 'b200' to 'dev' in Makefile and sweep scripts
- Add sweep_runner.sh, sweep_runner_v2.sh, and upload_results.sh to tracking
---
 3.test_cases/pytorch/deepspeed/Makefile       |   2 +-
 .../pytorch/deepspeed/parse_results.py        |  46 ++-
 .../deepspeed/pretrain_gpt_103b.sbatch        |   1 -
 .../pytorch/deepspeed/sweep_runner.sh         | 144 ++++++++++
 .../pytorch/deepspeed/sweep_runner_v2.sh      | 154 ++++++++++
 .../pytorch/deepspeed/upload_results.sh       | 263 ++++++++++++++++++
 6 files changed, 601 insertions(+), 9 deletions(-)
 create mode 100755 3.test_cases/pytorch/deepspeed/sweep_runner.sh
 create mode 100644 3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
 create mode 100755 3.test_cases/pytorch/deepspeed/upload_results.sh

diff --git a/3.test_cases/pytorch/deepspeed/Makefile b/3.test_cases/pytorch/deepspeed/Makefile
index d2b46ee82..a42ace894 100644
--- a/3.test_cases/pytorch/deepspeed/Makefile
+++ b/3.test_cases/pytorch/deepspeed/Makefile
@@ -1,7 +1,7 @@
 ENROOT_IMAGE   ?= deepspeed
 APPS_PATH      ?= /fsx/apps
 SQUASH_FILE    ?= $(APPS_PATH)/$(ENROOT_IMAGE).sqsh
-PARTITION      ?= b200
+PARTITION      ?= dev
 NODES          ?= 8
 LOGS_DIR       ?= logs
 RESULTS_DIR    ?= sweep_results
diff --git a/3.test_cases/pytorch/deepspeed/parse_results.py b/3.test_cases/pytorch/deepspeed/parse_results.py
index 8eacea70c..8b2815396 100755
--- a/3.test_cases/pytorch/deepspeed/parse_results.py
+++ b/3.test_cases/pytorch/deepspeed/parse_results.py
@@ -6,7 +6,7 @@
 
 Reads Slurm log files, extracts per-step metrics, and produces JSON files
 matching the existing benchmark-results schema at:
-  s3://paragao-new-nemo-squash-container/benchmark-results/b200/
+  s3://<YOUR_BUCKET>/benchmark-results/<instance_type>/
 
 Usage:
     python parse_results.py [--logs-dir logs] [--output-dir sweep_results]
@@ -145,6 +145,8 @@ def build_result_json(
     gbs=64,
     seq_length=2048,
     precision="bf16",
+    cluster="unknown",
+    instance_type="unknown",
 ):
     """Build the benchmark JSON matching the existing schema."""
     total_gpus = nodes * gpus_per_node
@@ -219,8 +221,8 @@ def build_result_json(
         "metadata": {
             "timestamp": timestamp,
             "job_id": str(job_id),
-            "cluster": "b200-hyperpod",
-            "instance_type": "ml.p6-b200.48xlarge",
+            "cluster": cluster,
+            "instance_type": instance_type,
             "nodes": nodes,
             "gpus_per_node": gpus_per_node,
             "total_gpus": total_gpus,
@@ -244,7 +246,9 @@ def build_result_json(
     return result
 
 
-def parse_sweep_jobs(jobs_csv, logs_dir, output_dir):
+def parse_sweep_jobs(
+    jobs_csv, logs_dir, output_dir, cluster="unknown", instance_type="unknown"
+):
     """Parse all jobs from the sweep tracking CSV."""
     os.makedirs(output_dir, exist_ok=True)
     results = []
@@ -291,6 +295,8 @@ def parse_sweep_jobs(jobs_csv, logs_dir, output_dir):
                 mbs=int(row.get("mbs", 1)),
                 gbs=int(row.get("gbs", 64)),
                 seq_length=int(row.get("seq_length", 2048)),
+                cluster=cluster,
+                instance_type=instance_type,
             )
 
             # Write individual JSON file
@@ -315,7 +321,9 @@ def parse_sweep_jobs(jobs_csv, logs_dir, output_dir):
     return results
 
 
-def parse_single_log(log_file, config_name, output_dir):
+def parse_single_log(
+    log_file, config_name, output_dir, cluster="unknown", instance_type="unknown"
+):
     """Parse a single log file."""
     os.makedirs(output_dir, exist_ok=True)
 
@@ -334,6 +342,8 @@ def parse_single_log(log_file, config_name, output_dir):
         steps=steps,
         config_name=config_name,
         job_id=job_id,
+        cluster=cluster,
+        instance_type=instance_type,
     )
 
     now = datetime.now(timezone.utc)
@@ -372,11 +382,27 @@ def main():
         default="single_run",
         help="Config name for single log file parsing",
     )
+    parser.add_argument(
+        "--cluster",
+        default=os.environ.get("CLUSTER_NAME", "unknown"),
+        help="Cluster name for metadata (default: $CLUSTER_NAME or 'unknown')",
+    )
+    parser.add_argument(
+        "--instance-type",
+        default=os.environ.get("INSTANCE_TYPE", "unknown"),
+        help="Instance type for metadata (default: $INSTANCE_TYPE or 'unknown')",
+    )
 
     args = parser.parse_args()
 
     if args.log_file:
-        parse_single_log(args.log_file, args.config_name, args.output_dir)
+        parse_single_log(
+            args.log_file,
+            args.config_name,
+            args.output_dir,
+            cluster=args.cluster,
+            instance_type=args.instance_type,
+        )
     else:
         if not os.path.exists(args.jobs_csv):
             print(f"Error: Jobs CSV not found: {args.jobs_csv}")
@@ -384,7 +410,13 @@ def main():
                 "Run sweep_runner.sh first, or use --log-file for single file parsing"
             )
             sys.exit(1)
-        parse_sweep_jobs(args.jobs_csv, args.logs_dir, args.output_dir)
+        parse_sweep_jobs(
+            args.jobs_csv,
+            args.logs_dir,
+            args.output_dir,
+            cluster=args.cluster,
+            instance_type=args.instance_type,
+        )
 
 
 if __name__ == "__main__":
diff --git a/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch b/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch
index 43957885f..d3b4f277a 100755
--- a/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch
+++ b/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch
@@ -7,7 +7,6 @@
 #SBATCH --job-name=deepspeed-pretrain-103b
 #SBATCH --output=logs/%x_%j.out
 #SBATCH --error=logs/%x_%j.err
-#SBATCH --partition=b200
 
 set -euxo pipefail
 
diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner.sh b/3.test_cases/pytorch/deepspeed/sweep_runner.sh
new file mode 100755
index 000000000..d7c15398f
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/sweep_runner.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# sweep_runner.sh - Automated parameter sweep for DeepSpeed 103B pretraining
+# Runs all parallelism and environment flag configurations, collects results.
+#
+# Usage: bash sweep_runner.sh [--dry-run]
+
+set -euo pipefail  # abort on errors, unset variables and failed pipeline stages
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute directory of this script
+SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch"  # job script submitted for every config
+RESULTS_DIR="${SCRIPT_DIR}/sweep_results"  # holds the sweep_jobs.csv tracking file
+NODES=8  # passed to sbatch as --nodes
+PARTITION="${PARTITION:-dev}"  # Slurm partition; $PARTITION overrides the 'dev' default
+
+DRY_RUN=0  # 1 = print the sbatch commands instead of submitting them
+if [ "${1:-}" = "--dry-run" ]; then
+    DRY_RUN=1
+    echo "[DRY RUN] Will print commands without submitting"
+fi
+
+mkdir -p "${RESULTS_DIR}" logs  # 'logs' is relative to the current working directory
+
+# ============================================================
+# Helper: submit a sweep configuration
+# ============================================================
+submit_config() {
+    # Submit one sweep configuration via sbatch and record it in sweep_jobs.csv.
+    # Args: config_name tp pp zero mbs gbs [act_ckpt] [seq_par] [overlap] [KEY=VALUE ...]
+    # Reads globals PARTITION, NODES, SBATCH_SCRIPT, RESULTS_DIR and DRY_RUN.
+    local config_name="$1"
+    local tp="$2"
+    local pp="$3"
+    local zero="$4"
+    local mbs="$5"
+    local gbs="$6"
+    local act_ckpt="${7:-0}"
+    local seq_par="${8:-0}"
+    local overlap="${9:-0}"
+    # `shift 9` with fewer than 9 args fails without shifting anything, so clamp
+    # the count; otherwise every positional arg would end up in extra_env.
+    shift "$(( $# < 9 ? $# : 9 ))"
+    # Comma-join the leftover KEY=VALUE args so they extend the --export list.
+    local extra_env; extra_env="$(IFS=,; printf '%s' "$*")"
+
+    echo "============================================"
+    echo "Submitting: ${config_name}"
+    echo "  TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}"
+    echo "  ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}"
+    if [ -n "${extra_env}" ]; then echo "  Extra env: ${extra_env}"; fi
+    echo "============================================"
+
+    # Single --export list: sweep parameters first, optional overrides last.
+    local env_exports="ALL,TP=${tp},PP=${pp},ZERO_STAGE=${zero}"
+    env_exports+=",MICRO_BATCH_SIZE=${mbs},GLOBAL_BATCH_SIZE=${gbs}"
+    env_exports+=",USE_ACTIVATION_CHECKPOINTING=${act_ckpt}"
+    env_exports+=",USE_SEQUENCE_PARALLEL=${seq_par}"
+    env_exports+=",USE_OVERLAP_COMM=${overlap}"
+    env_exports+=",CONFIG_NAME=${config_name}"
+    if [ -n "${extra_env}" ]; then env_exports+=",${extra_env}"; fi
+
+    # Keep argv in an array: no eval, so paths containing spaces stay intact.
+    local -a sbatch_cmd=(
+        sbatch
+        "--partition=${PARTITION}"
+        "--nodes=${NODES}"
+        "--job-name=sweep_${config_name}"
+        "--export=${env_exports}"
+        "${SBATCH_SCRIPT}"
+    )
+
+    if [ "${DRY_RUN}" -eq 1 ]; then
+        echo "[DRY RUN] ${sbatch_cmd[*]}"
+        echo ""
+        return
+    fi
+
+    local job_output job_id
+    job_output=$("${sbatch_cmd[@]}")
+    job_id=$(echo "${job_output}" | awk '{print $NF}')  # job id = last field of sbatch output
+    echo "Submitted job ${job_id} for config ${config_name}"
+    echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap}" >> "${RESULTS_DIR}/sweep_jobs.csv"
+}
+
+# ============================================================
+# Initialize tracking file (skipped on --dry-run so an earlier sweep's job IDs survive)
+# ============================================================
+[ "${DRY_RUN}" -eq 1 ] || echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap" > "${RESULTS_DIR}/sweep_jobs.csv"
+
+# ============================================================
+# PARALLELISM SWEEP (Configs 1-11)
+# ============================================================
+echo ""
+echo "========== PARALLELISM SWEEP =========="
+echo ""
+
+#            config_name         TP PP ZeRO MBS GBS ACT SEQ OVR
+submit_config "01_baseline"       8  2  0    1   64  0   0   0
+submit_config "02_more_pp"        8  4  0    1   64  0   0   0
+submit_config "03_zero1"          8  2  1    1   64  0   0   0
+submit_config "04_larger_mbs"     8  2  1    2  128  0   0   0
+submit_config "05_pp4_zero1"      8  4  1    1  128  0   0   0
+submit_config "06_zero2"          8  2  2    1   64  0   0   0
+submit_config "07_full_pp"        8  8  0    1   64  0   0   0
+submit_config "08_tp4_pp4"        4  4  1    1   64  0   0   0
+submit_config "09_act_ckpt"       8  2  1    1   64  1   0   0
+submit_config "10_seq_parallel"   8  2  1    1   64  0   1   0
+submit_config "11_overlap_comm"   8  2  1    1   64  0   0   1
+
+# ============================================================
+# The env-flag sweep is submitted immediately; it does not wait for the
+# parallelism sweep, so config 03 (TP8/PP2/ZeRO1) is used as the fixed base.
+# ============================================================
+echo ""
+echo "========== ENVIRONMENT FLAGS SWEEP =========="
+echo "(Using TP=8 PP=2 ZeRO=1 as base for env flag sweep)"
+echo ""
+
+# Base parallelism for env sweep
+BASE_TP=8
+BASE_PP=2
+BASE_ZERO=1
+BASE_MBS=1
+BASE_GBS=64
+
+#            config_name              TP       PP       ZeRO     MBS      GBS      ACT SEQ OVR extra_env
+submit_config "12_nccl_ring"          ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Ring"
+submit_config "13_nccl_tree"          ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Tree"
+submit_config "14_nccl_no_tuner"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_TUNER_PLUGIN="
+submit_config "15_nccl_chunk_4mb"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_P2P_NET_CHUNKSIZE=4194304"
+submit_config "16_cuda_max_conn_1"    ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "CUDA_DEVICE_MAX_CONNECTIONS=1"
+submit_config "17_nccl_buf_16mb"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=16777216"
+submit_config "18_nccl_buf_32mb"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=33554432"
+submit_config "19_nccl_min_ch_16"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=16"
+submit_config "20_nccl_min_ch_32"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=32"
+
+echo ""
+echo "========== SWEEP SUBMITTED =========="
+echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs.csv"
+echo ""
+echo "To monitor: watch 'squeue -u \$USER'"
+echo "When all jobs finish, run: python parse_results.py"
diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh b/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
new file mode 100644
index 000000000..297d85fec
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# sweep_runner_v2.sh - Sweep v2: ZeRO-2 (no PP), ZeRO-3, memory push, fusion ops
+#
+# All configs use PYTORCH_CUDA_ALLOC_CONF=expandable_segments:true (set in sbatch).
+# Optimal NCCL flags are the defaults already in the sbatch script.
+#
+# Usage: bash sweep_runner_v2.sh [--dry-run]
+
+set -euo pipefail  # abort on errors, unset variables and failed pipeline stages
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute directory of this script
+SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch"  # job script submitted for every config
+RESULTS_DIR="${SCRIPT_DIR}/sweep_results"  # holds the sweep_jobs_v2.csv tracking file
+NODES=8  # passed to sbatch as --nodes
+PARTITION="${PARTITION:-dev}"  # Slurm partition; $PARTITION overrides the 'dev' default
+
+DRY_RUN=0  # 1 = print the sbatch commands instead of submitting them
+if [ "${1:-}" = "--dry-run" ]; then
+    DRY_RUN=1
+    echo "[DRY RUN] Will print commands without submitting"
+fi
+
+mkdir -p "${RESULTS_DIR}" logs  # 'logs' is relative to the current working directory
+
+# ============================================================
+# Helper: submit a sweep configuration
+# Extends v1 helper with seq_length and enable_fusions params.
+# ============================================================
+submit_config() {
+    # Submit one v2 sweep configuration via sbatch and record it in sweep_jobs_v2.csv.
+    # Args: config_name tp pp zero mbs gbs [act_ckpt] [seq_par] [overlap]
+    #       [seq_length] [enable_fusions] [KEY=VALUE ...]
+    # Reads globals PARTITION, NODES, SBATCH_SCRIPT, RESULTS_DIR and DRY_RUN.
+    local config_name="$1"
+    local tp="$2"
+    local pp="$3"
+    local zero="$4"
+    local mbs="$5"
+    local gbs="$6"
+    local act_ckpt="${7:-0}"
+    local seq_par="${8:-0}"
+    local overlap="${9:-0}"
+    local seq_length="${10:-2048}"
+    local enable_fusions="${11:-0}"
+    # `shift 11` with fewer than 11 args fails without shifting anything, so clamp
+    # the count; otherwise every positional arg would end up in extra_env.
+    shift "$(( $# < 11 ? $# : 11 ))"
+    # Comma-join the leftover KEY=VALUE args so they extend the --export list.
+    local extra_env; extra_env="$(IFS=,; printf '%s' "$*")"
+
+    echo "============================================"
+    echo "Submitting: ${config_name}"
+    echo "  TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}"
+    echo "  SeqLen=${seq_length} Fusions=${enable_fusions}"
+    echo "  ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}"
+    if [ -n "${extra_env}" ]; then echo "  Extra env: ${extra_env}"; fi
+    echo "============================================"
+
+    # Single --export list: sweep parameters first, optional overrides last.
+    local env_exports="ALL,TP=${tp},PP=${pp},ZERO_STAGE=${zero}"
+    env_exports+=",MICRO_BATCH_SIZE=${mbs},GLOBAL_BATCH_SIZE=${gbs}"
+    env_exports+=",USE_ACTIVATION_CHECKPOINTING=${act_ckpt}"
+    env_exports+=",USE_SEQUENCE_PARALLEL=${seq_par}"
+    env_exports+=",USE_OVERLAP_COMM=${overlap}"
+    env_exports+=",SEQ_LENGTH=${seq_length}"
+    env_exports+=",ENABLE_FUSIONS=${enable_fusions}"
+    env_exports+=",CONFIG_NAME=${config_name}"
+    if [ -n "${extra_env}" ]; then env_exports+=",${extra_env}"; fi
+
+    # Keep argv in an array: no eval, so paths containing spaces stay intact.
+    local -a sbatch_cmd=(
+        sbatch
+        "--partition=${PARTITION}"
+        "--nodes=${NODES}"
+        "--export=${env_exports}"
+        "${SBATCH_SCRIPT}"
+    )
+
+    if [ "${DRY_RUN}" -eq 1 ]; then
+        echo "[DRY RUN] ${sbatch_cmd[*]}"
+        echo ""
+        return
+    fi
+
+    local job_output job_id
+    job_output=$("${sbatch_cmd[@]}")
+    job_id=$(echo "${job_output}" | awk '{print $NF}')  # job id = last field of sbatch output
+    echo "Submitted job ${job_id} for config ${config_name}"
+    echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap},${seq_length},${enable_fusions}" >> "${RESULTS_DIR}/sweep_jobs_v2.csv"
+}
+
+# ============================================================
+# Initialize tracking file (skipped on --dry-run so an earlier sweep's job IDs survive)
+# ============================================================
+[ "${DRY_RUN}" -eq 1 ] || echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap,seq_length,enable_fusions" > "${RESULTS_DIR}/sweep_jobs_v2.csv"
+
+# ============================================================
+# ZeRO-2 WITHOUT PIPELINE PARALLELISM (PP=1)
+# ============================================================
+echo ""
+echo "========== ZeRO-2 SWEEP (PP=1) =========="
+echo ""
+
+#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "21_zero2_tp8_pp1"      8  1  2    1   64  0   0   0   2048    0
+submit_config "22_zero2_tp8_pp1_mbs2" 8  1  2    2   64  0   0   0   2048    0
+submit_config "23_zero2_tp4_pp1"      4  1  2    1   64  0   0   0   2048    0
+
+# ============================================================
+# ZeRO-3 (PP=1)
+# ============================================================
+echo ""
+echo "========== ZeRO-3 SWEEP (PP=1) =========="
+echo ""
+
+#            config_name                  TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "24_zero3_tp8_pp1"          8  1  3    1   64  0   0   0   2048    0
+submit_config "25_zero3_tp8_pp1_mbs2"     8  1  3    2   64  0   0   0   2048    0
+submit_config "26_zero3_tp4_pp1"          4  1  3    1   64  0   0   0   2048    0
+submit_config "27_zero3_tp8_pp1_overlap"  8  1  3    1   64  0   0   1   2048    0
+
+# ============================================================
+# MEMORY PUSH / SEQ LENGTH / FUSIONS
+# ============================================================
+echo ""
+echo "========== MEMORY PUSH SWEEP =========="
+echo ""
+
+#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "28_mem_seq4k_tp8_pp2"  8  2  0    1   64  0   0   0   4096    0
+submit_config "29_mem_fused_tp8_pp8"  8  8  0    1   64  0   0   0   2048    1
+
+# ============================================================
+# EXPANDABLE SEGMENTS IMPACT ON BEST CONFIG
+# Re-test best config (TP8/PP8/ZeRO0) — now with expandable_segments
+# enabled automatically via the updated sbatch.
+# ============================================================
+echo ""
+echo "========== EXPANDABLE SEGMENTS IMPACT =========="
+echo ""
+
+#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "30_best_expand_seg"    8  8  0    1   64  0   0   0   2048    0
+
+echo ""
+echo "========== SWEEP V2 SUBMITTED =========="
+echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs_v2.csv"
+echo ""
+echo "Total configs: 10"
+echo "To monitor: watch 'squeue -u \$USER'"
+echo "When all jobs finish, run: python parse_results.py --jobs-csv sweep_results/sweep_jobs_v2.csv"
diff --git a/3.test_cases/pytorch/deepspeed/upload_results.sh b/3.test_cases/pytorch/deepspeed/upload_results.sh
new file mode 100755
index 000000000..6df3cab99
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/upload_results.sh
@@ -0,0 +1,263 @@
+#!/bin/bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# upload_results.sh - Upload benchmark results to S3 and CloudWatch
+#
+# Usage:
+#   export S3_BUCKET=my-benchmark-bucket
+#   export S3_REGION=us-west-2
+#   export CW_REGION=us-east-1
+#   bash upload_results.sh [--results-dir sweep_results]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+RESULTS_DIR="${1:---results-dir}"
+CW_REGION="${CW_REGION:?Error: CW_REGION must be set (e.g. export CW_REGION=us-east-1)}"
+S3_REGION="${S3_REGION:?Error: S3_REGION must be set (e.g. export S3_REGION=us-west-2)}"
+S3_BUCKET="${S3_BUCKET:?Error: S3_BUCKET must be set (e.g. export S3_BUCKET=my-benchmark-bucket)}"
+CW_NAMESPACE="DeepSpeed/B200Benchmarks"
+CW_DASHBOARD_NAME="DeepSpeed-B200-Benchmarks"
+
+# Parse args
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --results-dir) RESULTS_DIR="$2"; shift 2 ;;
+        --region) CW_REGION="$2"; shift 2 ;;
+        *) shift ;;
+    esac
+done
+
+: "${RESULTS_DIR:=${SCRIPT_DIR}/sweep_results}"
+
+if [ ! -d "${RESULTS_DIR}" ]; then
+    echo "Error: Results directory not found: ${RESULTS_DIR}"
+    exit 1
+fi
+
+# ============================================================
+# 1. Upload JSON files to S3
+# ============================================================
+echo "=== Uploading results to S3 ==="
+
+# Determine S3 path: benchmark-results/b200/2026/March/04/
+YEAR=$(date -u +%Y)
+MONTH=$(date -u +%B)
+DAY=$(date -u +%d)
+S3_PREFIX="benchmark-results/b200/${YEAR}/${MONTH}/${DAY}"
+
+for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
+    if [ ! -f "${json_file}" ]; then
+        echo "No result JSON files found in ${RESULTS_DIR}"
+        break
+    fi
+    filename=$(basename "${json_file}")
+    echo "  Uploading ${filename} -> s3://${S3_BUCKET}/${S3_PREFIX}/${filename}"
+    aws s3 cp "${json_file}" "s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" \
+        --region "${S3_REGION}" \
+        --content-type "application/json"
+done
+
+echo "S3 upload complete: s3://${S3_BUCKET}/${S3_PREFIX}/"
+echo ""
+
+# ============================================================
+# 2. Publish metrics to CloudWatch
+# ============================================================
+echo "=== Publishing metrics to CloudWatch ==="
+
+for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
+    if [ ! -f "${json_file}" ]; then
+        break
+    fi
+
+    filename=$(basename "${json_file}")
+
+    # Extract metadata and summary using python
+    read -r config_name tp pp zero_stage precision avg_tflops avg_step_time timestamp < <(
+        python3 -c "
+import json, sys
+with open('${json_file}') as f:
+    d = json.load(f)
+m = d['metadata']
+s = d['summary']
+sc = m.get('sweep_config', {})
+print(
+    sc.get('config_name', 'unknown'),
+    sc.get('tp', 8),
+    sc.get('pp', 2),
+    sc.get('zero_stage', 1),
+    m.get('precision', 'bf16'),
+    s.get('steady_state_avg_tflops_per_gpu', 0),
+    s.get('steady_state_avg_step_time_s', 0),
+    m.get('timestamp', '$(date -u +%Y-%m-%dT%H:%M:%SZ)')
+)
+"
+    )
+
+    echo "  Publishing: ${config_name} (TFLOPS=${avg_tflops}, StepTime=${avg_step_time}s)"
+
+    # Publish with full dimensions
+    aws cloudwatch put-metric-data \
+        --namespace "${CW_NAMESPACE}" \
+        --region "${CW_REGION}" \
+        --metric-data "[
+            {
+                \"MetricName\": \"TFLOPSPerGPU\",
+                \"Value\": ${avg_tflops},
+                \"Unit\": \"Count\",
+                \"Timestamp\": \"${timestamp}\",
+                \"Dimensions\": [
+                    {\"Name\": \"model_size\", \"Value\": \"103b\"},
+                    {\"Name\": \"tp\", \"Value\": \"${tp}\"},
+                    {\"Name\": \"pp\", \"Value\": \"${pp}\"},
+                    {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"},
+                    {\"Name\": \"precision\", \"Value\": \"${precision}\"},
+                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
+                ]
+            },
+            {
+                \"MetricName\": \"StepTimeSeconds\",
+                \"Value\": ${avg_step_time},
+                \"Unit\": \"Seconds\",
+                \"Timestamp\": \"${timestamp}\",
+                \"Dimensions\": [
+                    {\"Name\": \"model_size\", \"Value\": \"103b\"},
+                    {\"Name\": \"tp\", \"Value\": \"${tp}\"},
+                    {\"Name\": \"pp\", \"Value\": \"${pp}\"},
+                    {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"},
+                    {\"Name\": \"precision\", \"Value\": \"${precision}\"},
+                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
+                ]
+            }
+        ]"
+
+    # Also publish with just config_name dimension for easy dashboard queries
+    aws cloudwatch put-metric-data \
+        --namespace "${CW_NAMESPACE}" \
+        --region "${CW_REGION}" \
+        --metric-data "[
+            {
+                \"MetricName\": \"TFLOPSPerGPU\",
+                \"Value\": ${avg_tflops},
+                \"Unit\": \"Count\",
+                \"Timestamp\": \"${timestamp}\",
+                \"Dimensions\": [
+                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
+                ]
+            },
+            {
+                \"MetricName\": \"StepTimeSeconds\",
+                \"Value\": ${avg_step_time},
+                \"Unit\": \"Seconds\",
+                \"Timestamp\": \"${timestamp}\",
+                \"Dimensions\": [
+                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
+                ]
+            }
+        ]"
+
+done
+
+echo "CloudWatch metrics published to namespace: ${CW_NAMESPACE}"
+echo ""
+
+# ============================================================
+# 3. Create/Update CloudWatch Dashboard
+# ============================================================
+echo "=== Creating CloudWatch Dashboard ==="
+
+# Build metric entries dynamically from results
+TFLOPS_METRICS=""
+STEPTIME_METRICS=""
+TABLE_METRICS=""
+
+for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
+    if [ ! -f "${json_file}" ]; then
+        break
+    fi
+
+    config_name=$(python3 -c "
+import json
+with open('${json_file}') as f:
+    d = json.load(f)
+print(d['metadata'].get('sweep_config', {}).get('config_name', 'unknown'))
+")
+
+    TFLOPS_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}],"
+    STEPTIME_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}],"
+    TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} TFLOPS\"}],"
+    TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} StepTime\"}],"
+done
+
+# Remove trailing commas
+TFLOPS_METRICS="${TFLOPS_METRICS%,}"
+STEPTIME_METRICS="${STEPTIME_METRICS%,}"
+TABLE_METRICS="${TABLE_METRICS%,}"
+
+DASHBOARD_BODY=$(cat <<DASH
+{
+  "widgets": [
+    {
+      "type": "text",
+      "x": 0, "y": 0, "width": 24, "height": 1,
+      "properties": {
+        "markdown": "# DeepSpeed B200 Benchmark Results - GPT 103B\\nCluster: b200-hyperpod | 8 nodes x 8 B200 GPUs | Namespace: \`${CW_NAMESPACE}\`"
+      }
+    },
+    {
+      "type": "metric",
+      "x": 0, "y": 1, "width": 24, "height": 8,
+      "properties": {
+        "title": "TFLOPS/GPU Across Sweep Configurations",
+        "view": "bar",
+        "region": "${CW_REGION}",
+        "stat": "Average",
+        "period": 86400,
+        "yAxis": {"left": {"label": "TFLOPS/GPU", "showUnits": false}},
+        "metrics": [${TFLOPS_METRICS}]
+      }
+    },
+    {
+      "type": "metric",
+      "x": 0, "y": 9, "width": 24, "height": 8,
+      "properties": {
+        "title": "Step Time Comparison (seconds)",
+        "view": "bar",
+        "region": "${CW_REGION}",
+        "stat": "Average",
+        "period": 86400,
+        "yAxis": {"left": {"label": "Step Time (s)", "showUnits": false}},
+        "metrics": [${STEPTIME_METRICS}]
+      }
+    },
+    {
+      "type": "metric",
+      "x": 0, "y": 17, "width": 24, "height": 8,
+      "properties": {
+        "title": "Summary Metrics Table",
+        "view": "table",
+        "region": "${CW_REGION}",
+        "stat": "Average",
+        "period": 86400,
+        "metrics": [${TABLE_METRICS}]
+      }
+    }
+  ]
+}
+DASH
+)
+
+aws cloudwatch put-dashboard \
+    --dashboard-name "${CW_DASHBOARD_NAME}" \
+    --region "${CW_REGION}" \
+    --dashboard-body "${DASHBOARD_BODY}"
+
+echo "Dashboard created: ${CW_DASHBOARD_NAME}"
+echo "URL: https://${CW_REGION}.console.aws.amazon.com/cloudwatch/home?region=${CW_REGION}#dashboards:name=${CW_DASHBOARD_NAME}"
+echo ""
+echo "=== Upload Complete ==="
+echo "S3: s3://${S3_BUCKET}/${S3_PREFIX}/"
+echo "CloudWatch: ${CW_NAMESPACE} in ${CW_REGION}"
+echo "Dashboard: ${CW_DASHBOARD_NAME}"

From 0d646f3c3daf8d40615515db33c56979f94acd72 Mon Sep 17 00:00:00 2001
From: Paulo Aragao <aragao.paulo@gmail.com>
Date: Tue, 10 Mar 2026 03:28:41 +0000
Subject: [PATCH 4/5] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?=
 =?UTF-8?q?restructure=20into=20gpt/=20subdir,=20pin=20deps,=20quote=20var?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Move GPT-103B pretraining files into gpt/ subdirectory with slurm/
  and configs/ sub-dirs to match repo conventions
- Pin deepspeed>=0.16,<1.0 and accelerate>=1.0,<2.0 in Dockerfile
- Quote all variable expansions in build and training scripts
- Fix NCCL_P2P_NET_CHUNKSIZE from 2048576 to 2097152 (2MB power-of-two)
- Add PP note to README env vars table clarifying best config uses PP=8
- Add trailing newline to README.md
- Update all path references for new directory structure
---
 .../pytorch/deepspeed/0.deepspeed.dockerfile   |  2 +-
 .../pytorch/deepspeed/1.build-image.sbatch     | 12 ++++++------
 3.test_cases/pytorch/deepspeed/Makefile        |  4 ++--
 3.test_cases/pytorch/deepspeed/README.md       | 18 +++++++++---------
 .../configs/ds_config_103b_template.json       |  0
 .../deepspeed/{ => gpt}/parse_results.py       |  0
 .../{ => gpt/slurm}/pretrain_gpt_103b.sbatch   | 15 +++++++++------
 .../deepspeed/{ => gpt}/sweep_runner.sh        |  2 +-
 .../deepspeed/{ => gpt}/sweep_runner_v2.sh     |  2 +-
 .../deepspeed/{ => gpt}/upload_results.sh      |  0
 10 files changed, 29 insertions(+), 26 deletions(-)
 rename 3.test_cases/pytorch/deepspeed/{ => gpt}/configs/ds_config_103b_template.json (100%)
 rename 3.test_cases/pytorch/deepspeed/{ => gpt}/parse_results.py (100%)
 rename 3.test_cases/pytorch/deepspeed/{ => gpt/slurm}/pretrain_gpt_103b.sbatch (95%)
 rename 3.test_cases/pytorch/deepspeed/{ => gpt}/sweep_runner.sh (98%)
 rename 3.test_cases/pytorch/deepspeed/{ => gpt}/sweep_runner_v2.sh (98%)
 rename 3.test_cases/pytorch/deepspeed/{ => gpt}/upload_results.sh (100%)

diff --git a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile
index ec7f99995..1874ca3d5 100644
--- a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile
+++ b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile
@@ -147,7 +147,7 @@ RUN pip3 install --no-cache-dir \
     awscli pynvml \
     transformers==${TRANSFORMERS_VERSION} \
     sentencepiece python-etcd \
-    deepspeed accelerate
+    "deepspeed>=0.16,<1.0" "accelerate>=1.0,<2.0"
 
 RUN rm -rf /var/lib/apt/lists/*
 
diff --git a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
index f91222361..f999b926b 100644
--- a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
+++ b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
@@ -15,14 +15,14 @@ set -euxo pipefail
 : "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
 
 # Ensure output directory exists
-mkdir -p ${APPS_PATH}
+mkdir -p "${APPS_PATH}"
 mkdir -p logs
 
 ENROOT_IMAGE=deepspeed
-docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile .
+docker build -t "${ENROOT_IMAGE}" -f 0.deepspeed.dockerfile .
 # Remove old sqsh file if exists
-if [ -f ${ENROOT_IMAGE}.sqsh ] ; then
-    rm ${ENROOT_IMAGE}.sqsh
+if [ -f "${ENROOT_IMAGE}.sqsh" ] ; then
+    rm "${ENROOT_IMAGE}.sqsh"
 fi
-enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest
-mv ${ENROOT_IMAGE}.sqsh ${IMAGE}
\ No newline at end of file
+enroot import -o "${ENROOT_IMAGE}.sqsh" "dockerd://${ENROOT_IMAGE}:latest"
+mv "${ENROOT_IMAGE}.sqsh" "${IMAGE}"
\ No newline at end of file
diff --git a/3.test_cases/pytorch/deepspeed/Makefile b/3.test_cases/pytorch/deepspeed/Makefile
index a42ace894..88b3d2262 100644
--- a/3.test_cases/pytorch/deepspeed/Makefile
+++ b/3.test_cases/pytorch/deepspeed/Makefile
@@ -43,10 +43,10 @@ clean:
 train:
 	sbatch --partition=$(PARTITION) --nodes=$(NODES) \
 		--export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \
-		pretrain_gpt_103b.sbatch
+		gpt/slurm/pretrain_gpt_103b.sbatch
 
 # ---- Results ----
 
 parse:
-	python3 parse_results.py --jobs-csv $(RESULTS_DIR)/sweep_jobs.csv \
+	python3 gpt/parse_results.py --jobs-csv $(RESULTS_DIR)/sweep_jobs.csv \
 		--logs-dir $(LOGS_DIR) --output-dir $(RESULTS_DIR)
diff --git a/3.test_cases/pytorch/deepspeed/README.md b/3.test_cases/pytorch/deepspeed/README.md
index e873d1af8..82e34d46a 100644
--- a/3.test_cases/pytorch/deepspeed/README.md
+++ b/3.test_cases/pytorch/deepspeed/README.md
@@ -6,7 +6,7 @@
 
 | Use Case | Description | Location |
 |----------|-------------|----------|
-| GPT-103B Pretraining | Large-scale GPT pretraining benchmark using Megatron-DeepSpeed with 3D parallelism (TP/PP/DP) and ZeRO optimization | [`pretrain_gpt_103b.sbatch`](pretrain_gpt_103b.sbatch) |
+| GPT-103B Pretraining | Large-scale GPT pretraining benchmark using Megatron-DeepSpeed with 3D parallelism (TP/PP/DP) and ZeRO optimization | [`gpt/`](gpt/) |
 | QLoRA Fine-tuning | Qwen3-8B fine-tuning with QLoRA (4-bit) + DeepSpeed ZeRO-2/3, supports EKS and Slurm | [`qlora/`](qlora/) |
 | Llama2 Fine-tuning | Llama2 fine-tuning from HuggingFace weights using Megatron-DeepSpeed | [`examples_megatron_deepspeed/finetune_hf_llama/`](examples_megatron_deepspeed/finetune_hf_llama/) |
 
@@ -88,9 +88,9 @@ Submit the best-performing configuration (TP=8, PP=8, ZeRO-0, fusions enabled):
 ```bash
 make train
 # or equivalently:
-sbatch --partition=b200 --nodes=8 \
+sbatch --partition=dev --nodes=8 \
     --export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \
-    pretrain_gpt_103b.sbatch
+    gpt/slurm/pretrain_gpt_103b.sbatch
 ```
 
 Override parallelism settings for custom configurations:
@@ -98,7 +98,7 @@ Override parallelism settings for custom configurations:
 ```bash
 sbatch --nodes=8 \
     --export=ALL,TP=8,PP=4,ZERO_STAGE=1,MICRO_BATCH_SIZE=2,CONFIG_NAME=my_config \
-    pretrain_gpt_103b.sbatch
+    gpt/slurm/pretrain_gpt_103b.sbatch
 ```
 
 #### Environment variables
@@ -106,7 +106,7 @@ sbatch --nodes=8 \
 | Variable | Default | Description |
 |----------|---------|-------------|
 | `TP` | 8 | Tensor parallel size |
-| `PP` | 2 | Pipeline parallel size |
+| `PP` | 2 | Pipeline parallel size (best throughput with PP=8, see `make train`) |
 | `ZERO_STAGE` | 1 | DeepSpeed ZeRO stage (0, 1, 2, or 3) |
 | `MICRO_BATCH_SIZE` | 1 | Per-GPU micro batch size |
 | `GLOBAL_BATCH_SIZE` | 64 | Global batch size |
@@ -147,14 +147,14 @@ The following recommendations are based on extensive parameter sweeps across par
 
 ### Parsing results
 
-After training completes, parse the Slurm logs into benchmark JSON using `parse_results.py`:
+After training completes, parse the Slurm logs into benchmark JSON using `gpt/parse_results.py`:
 
 ```bash
 # Single log file
-python3 parse_results.py --log-file logs/deepspeed-pretrain-103b_123.out --config-name my_config
+python3 gpt/parse_results.py --log-file logs/deepspeed-pretrain-103b_123.out --config-name my_config
 
 # Multiple jobs tracked in a CSV
-python3 parse_results.py --jobs-csv sweep_results/sweep_jobs.csv --output-dir sweep_results
+python3 gpt/parse_results.py --jobs-csv sweep_results/sweep_jobs.csv --output-dir sweep_results
 ```
 
 ### Known issues
@@ -176,4 +176,4 @@ See [`qlora/README.md`](qlora/README.md) for full instructions.
 
 Fine-tune Llama2 from HuggingFace weights using Megatron-DeepSpeed. Includes weight conversion from HuggingFace to Megatron format and fine-tuning on the Stanford Alpaca dataset. Uses the shared container image (`0.deepspeed.dockerfile`).
 
-See [`examples_megatron_deepspeed/finetune_hf_llama/README.md`](examples_megatron_deepspeed/finetune_hf_llama/README.md) for full instructions.
\ No newline at end of file
+See [`examples_megatron_deepspeed/finetune_hf_llama/README.md`](examples_megatron_deepspeed/finetune_hf_llama/README.md) for full instructions.
diff --git a/3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json b/3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json
similarity index 100%
rename from 3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json
rename to 3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json
diff --git a/3.test_cases/pytorch/deepspeed/parse_results.py b/3.test_cases/pytorch/deepspeed/gpt/parse_results.py
similarity index 100%
rename from 3.test_cases/pytorch/deepspeed/parse_results.py
rename to 3.test_cases/pytorch/deepspeed/gpt/parse_results.py
diff --git a/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch b/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch
similarity index 95%
rename from 3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch
rename to 3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch
index d3b4f277a..a68585528 100755
--- a/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch
+++ b/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch
@@ -57,7 +57,7 @@ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 # ============================================================
 # Cluster topology
 # ============================================================
-export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
+export NODES=( $( scontrol show hostnames "$SLURM_JOB_NODELIST" ) )
 export NODES_ARRAY=($NODES)
 export HEAD_NODE=${NODES_ARRAY[0]}
 export MASTER_ADDR=$(hostname --ip-address)
@@ -72,7 +72,7 @@ export FI_LOG_LEVEL=1
 export FI_PROVIDER=efa
 export FI_EFA_USE_HUGE_PAGE=0
 export NCCL_SOCKET_IFNAME=^docker,lo,veth
-export NCCL_P2P_NET_CHUNKSIZE=2048576
+export NCCL_P2P_NET_CHUNKSIZE=2097152
 export NCCL_BUFFERSIZE=8388608
 export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so
 export NCCL_ASYNC_ERROR_HANDLING=1
@@ -144,20 +144,20 @@ EOF
 # ============================================================
 # Hostfile for DeepSpeed
 # ============================================================
-export HOSTFILE=/fsx/hostfile_${SLURM_JOB_ID}
+export HOSTFILE="/fsx/hostfile_${SLURM_JOB_ID}"
 function makehostfile() {
 perl -e '$slots=split /,/, $ENV{"SLURM_STEP_GPUS"};
 $slots=8 if $slots==0;
 @nodes = split /\n/, qx[scontrol show hostnames $ENV{"SLURM_JOB_NODELIST"}];
 print map { "$b$_ slots=$slots\n" } @nodes'
 }
-makehostfile > ${HOSTFILE}
+makehostfile > "${HOSTFILE}"
 
 # ============================================================
 # Container + distributed launch args
 # ============================================================
 declare -a SRUN_ARGS=(
-    --container-image ${IMAGE}
+    --container-image "${IMAGE}"
     --container-mounts /fsx,/opt/slurm/bin
 )
 
@@ -273,7 +273,10 @@ DIST_ARGS_STR="${DIST_ARGS[*]}"
 MODEL_ARGS_STR="${MODEL_ARGS[*]}"
 DS_ARGS_STR="${DS_ARGS[*]}"
 
+# Note: Variables inside the bash -c string are expanded on the host side before
+# being passed to the container. This is intentional — the container does not have
+# access to these env vars at shell expansion time.
 srun -l "${SRUN_ARGS[@]}" bash -c "export PYTHONPATH=${MEGATRON_DS_PATH} && cd ${MEGATRON_DS_PATH} && python3 -m torch.distributed.run ${DIST_ARGS_STR} pretrain_gpt.py ${MODEL_ARGS_STR} ${DS_ARGS_STR}"
 
 # Cleanup hostfile
-rm -f ${HOSTFILE}
+rm -f "${HOSTFILE}"
diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner.sh b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh
similarity index 98%
rename from 3.test_cases/pytorch/deepspeed/sweep_runner.sh
rename to 3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh
index d7c15398f..28a74e7df 100755
--- a/3.test_cases/pytorch/deepspeed/sweep_runner.sh
+++ b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh
@@ -10,7 +10,7 @@
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch"
+SBATCH_SCRIPT="${SCRIPT_DIR}/slurm/pretrain_gpt_103b.sbatch"
 RESULTS_DIR="${SCRIPT_DIR}/sweep_results"
 NODES=8
 PARTITION="${PARTITION:-dev}"
diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh
similarity index 98%
rename from 3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
rename to 3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh
index 297d85fec..b49c37954 100644
--- a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
+++ b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh
@@ -12,7 +12,7 @@
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch"
+SBATCH_SCRIPT="${SCRIPT_DIR}/slurm/pretrain_gpt_103b.sbatch"
 RESULTS_DIR="${SCRIPT_DIR}/sweep_results"
 NODES=8
 PARTITION="${PARTITION:-dev}"
diff --git a/3.test_cases/pytorch/deepspeed/upload_results.sh b/3.test_cases/pytorch/deepspeed/gpt/upload_results.sh
similarity index 100%
rename from 3.test_cases/pytorch/deepspeed/upload_results.sh
rename to 3.test_cases/pytorch/deepspeed/gpt/upload_results.sh

From febd787cb10f1b1a049fe5ce9ba4e3a6b50cbfd9 Mon Sep 17 00:00:00 2001
From: Paulo Aragao <aragao.paulo@gmail.com>
Date: Tue, 10 Mar 2026 03:44:14 +0000
Subject: [PATCH 5/5] chore: remove internal sweep and upload scripts from PR

---
 .../pytorch/deepspeed/gpt/sweep_runner.sh     | 144 ----------
 .../pytorch/deepspeed/gpt/sweep_runner_v2.sh  | 154 ----------
 .../pytorch/deepspeed/gpt/upload_results.sh   | 263 ------------------
 3 files changed, 561 deletions(-)
 delete mode 100755 3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh
 delete mode 100644 3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh
 delete mode 100755 3.test_cases/pytorch/deepspeed/gpt/upload_results.sh

diff --git a/3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh
deleted file mode 100755
index 28a74e7df..000000000
--- a/3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/bin/bash
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# SPDX-License-Identifier: MIT-0
-#
-# sweep_runner.sh - Automated parameter sweep for DeepSpeed 103B pretraining
-# Runs all parallelism and environment flag configurations, collects results.
-#
-# Usage: bash sweep_runner.sh [--dry-run]
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-SBATCH_SCRIPT="${SCRIPT_DIR}/slurm/pretrain_gpt_103b.sbatch"
-RESULTS_DIR="${SCRIPT_DIR}/sweep_results"
-NODES=8
-PARTITION="${PARTITION:-dev}"
-
-DRY_RUN=0
-if [ "${1:-}" = "--dry-run" ]; then
-    DRY_RUN=1
-    echo "[DRY RUN] Will print commands without submitting"
-fi
-
-mkdir -p "${RESULTS_DIR}" logs
-
-# ============================================================
-# Helper: submit a sweep configuration
-# ============================================================
-submit_config() {
-    local config_name="$1"
-    local tp="$2"
-    local pp="$3"
-    local zero="$4"
-    local mbs="$5"
-    local gbs="$6"
-    local act_ckpt="${7:-0}"
-    local seq_par="${8:-0}"
-    local overlap="${9:-0}"
-    shift 9 || true
-    local extra_env="${*:-}"
-
-    echo "============================================"
-    echo "Submitting: ${config_name}"
-    echo "  TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}"
-    echo "  ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}"
-    [ -n "${extra_env}" ] && echo "  Extra env: ${extra_env}"
-    echo "============================================"
-
-    local env_exports=""
-    env_exports+="TP=${tp},"
-    env_exports+="PP=${pp},"
-    env_exports+="ZERO_STAGE=${zero},"
-    env_exports+="MICRO_BATCH_SIZE=${mbs},"
-    env_exports+="GLOBAL_BATCH_SIZE=${gbs},"
-    env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt},"
-    env_exports+="USE_SEQUENCE_PARALLEL=${seq_par},"
-    env_exports+="USE_OVERLAP_COMM=${overlap},"
-    env_exports+="CONFIG_NAME=${config_name}"
-
-    local sbatch_cmd="sbatch"
-    sbatch_cmd+=" --partition=${PARTITION}"
-    sbatch_cmd+=" --nodes=${NODES}"
-    sbatch_cmd+=" --export=ALL,${env_exports}"
-    sbatch_cmd+=" --job-name=sweep_${config_name}"
-
-    # Add extra env vars for NCCL tuning
-    if [ -n "${extra_env}" ]; then
-        sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}"
-    fi
-
-    sbatch_cmd+=" ${SBATCH_SCRIPT}"
-
-    if [ "${DRY_RUN}" -eq 1 ]; then
-        echo "[DRY RUN] ${sbatch_cmd}"
-        echo ""
-        return
-    fi
-
-    local job_output
-    job_output=$(eval "${sbatch_cmd}")
-    local job_id
-    job_id=$(echo "${job_output}" | awk '{print $NF}')
-    echo "Submitted job ${job_id} for config ${config_name}"
-    echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap}" >> "${RESULTS_DIR}/sweep_jobs.csv"
-}
-
-# ============================================================
-# Initialize tracking file
-# ============================================================
-echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap" > "${RESULTS_DIR}/sweep_jobs.csv"
-
-# ============================================================
-# PARALLELISM SWEEP (Configs 1-11)
-# ============================================================
-echo ""
-echo "========== PARALLELISM SWEEP =========="
-echo ""
-
-#            config_name         TP PP ZeRO MBS GBS ACT SEQ OVR
-submit_config "01_baseline"       8  2  0    1   64  0   0   0
-submit_config "02_more_pp"        8  4  0    1   64  0   0   0
-submit_config "03_zero1"          8  2  1    1   64  0   0   0
-submit_config "04_larger_mbs"     8  2  1    2  128  0   0   0
-submit_config "05_pp4_zero1"      8  4  1    1  128  0   0   0
-submit_config "06_zero2"          8  2  2    1   64  0   0   0
-submit_config "07_full_pp"        8  8  0    1   64  0   0   0
-submit_config "08_tp4_pp4"        4  4  1    1   64  0   0   0
-submit_config "09_act_ckpt"       8  2  1    1   64  1   0   0
-submit_config "10_seq_parallel"   8  2  1    1   64  0   1   0
-submit_config "11_overlap_comm"   8  2  1    1   64  0   0   1
-
-# ============================================================
-# Wait for parallelism sweep to determine best config
-# If not waiting, env sweep uses config 03 (TP8/PP2/ZeRO1) as default
-# ============================================================
-echo ""
-echo "========== ENVIRONMENT FLAGS SWEEP =========="
-echo "(Using TP=8 PP=2 ZeRO=1 as base for env flag sweep)"
-echo ""
-
-# Base parallelism for env sweep
-BASE_TP=8
-BASE_PP=2
-BASE_ZERO=1
-BASE_MBS=1
-BASE_GBS=64
-
-#            config_name              TP       PP       ZeRO     MBS      GBS      ACT SEQ OVR extra_env
-submit_config "12_nccl_ring"          ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Ring"
-submit_config "13_nccl_tree"          ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Tree"
-submit_config "14_nccl_no_tuner"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_TUNER_PLUGIN="
-submit_config "15_nccl_chunk_4mb"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_P2P_NET_CHUNKSIZE=4194304"
-submit_config "16_cuda_max_conn_1"    ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "CUDA_DEVICE_MAX_CONNECTIONS=1"
-submit_config "17_nccl_buf_16mb"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=16777216"
-submit_config "18_nccl_buf_32mb"      ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=33554432"
-submit_config "19_nccl_min_ch_16"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=16"
-submit_config "20_nccl_min_ch_32"     ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=32"
-
-echo ""
-echo "========== SWEEP SUBMITTED =========="
-echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs.csv"
-echo ""
-echo "To monitor: watch 'squeue -u \$USER'"
-echo "When all jobs finish, run: python parse_results.py"
diff --git a/3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh
deleted file mode 100644
index b49c37954..000000000
--- a/3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/bin/bash
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# SPDX-License-Identifier: MIT-0
-#
-# sweep_runner_v2.sh - Sweep v2: ZeRO-2 (no PP), ZeRO-3, memory push, fusion ops
-#
-# All configs use PYTORCH_CUDA_ALLOC_CONF=expandable_segments:true (set in sbatch).
-# Optimal NCCL flags are the defaults already in the sbatch script.
-#
-# Usage: bash sweep_runner_v2.sh [--dry-run]
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-SBATCH_SCRIPT="${SCRIPT_DIR}/slurm/pretrain_gpt_103b.sbatch"
-RESULTS_DIR="${SCRIPT_DIR}/sweep_results"
-NODES=8
-PARTITION="${PARTITION:-dev}"
-
-DRY_RUN=0
-if [ "${1:-}" = "--dry-run" ]; then
-    DRY_RUN=1
-    echo "[DRY RUN] Will print commands without submitting"
-fi
-
-mkdir -p "${RESULTS_DIR}" logs
-
-# ============================================================
-# Helper: submit a sweep configuration
-# Extends v1 helper with seq_length and enable_fusions params.
-# ============================================================
-submit_config() {
-    local config_name="$1"
-    local tp="$2"
-    local pp="$3"
-    local zero="$4"
-    local mbs="$5"
-    local gbs="$6"
-    local act_ckpt="${7:-0}"
-    local seq_par="${8:-0}"
-    local overlap="${9:-0}"
-    local seq_length="${10:-2048}"
-    local enable_fusions="${11:-0}"
-    shift 11 || true
-    local extra_env="${*:-}"
-
-    echo "============================================"
-    echo "Submitting: ${config_name}"
-    echo "  TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}"
-    echo "  SeqLen=${seq_length} Fusions=${enable_fusions}"
-    echo "  ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}"
-    [ -n "${extra_env}" ] && echo "  Extra env: ${extra_env}"
-    echo "============================================"
-
-    local env_exports=""
-    env_exports+="TP=${tp},"
-    env_exports+="PP=${pp},"
-    env_exports+="ZERO_STAGE=${zero},"
-    env_exports+="MICRO_BATCH_SIZE=${mbs},"
-    env_exports+="GLOBAL_BATCH_SIZE=${gbs},"
-    env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt},"
-    env_exports+="USE_SEQUENCE_PARALLEL=${seq_par},"
-    env_exports+="USE_OVERLAP_COMM=${overlap},"
-    env_exports+="SEQ_LENGTH=${seq_length},"
-    env_exports+="ENABLE_FUSIONS=${enable_fusions},"
-    env_exports+="CONFIG_NAME=${config_name}"
-
-    local sbatch_cmd="sbatch"
-    sbatch_cmd+=" --partition=${PARTITION}"
-    sbatch_cmd+=" --nodes=${NODES}"
-
-    # Add extra env vars (NCCL overrides etc.)
-    if [ -n "${extra_env}" ]; then
-        sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}"
-    else
-        sbatch_cmd+=" --export=ALL,${env_exports}"
-    fi
-
-    sbatch_cmd+=" ${SBATCH_SCRIPT}"
-
-    if [ "${DRY_RUN}" -eq 1 ]; then
-        echo "[DRY RUN] ${sbatch_cmd}"
-        echo ""
-        return
-    fi
-
-    local job_output
-    job_output=$(eval "${sbatch_cmd}")
-    local job_id
-    job_id=$(echo "${job_output}" | awk '{print $NF}')
-    echo "Submitted job ${job_id} for config ${config_name}"
-    echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap},${seq_length},${enable_fusions}" >> "${RESULTS_DIR}/sweep_jobs_v2.csv"
-}
-
-# ============================================================
-# Initialize tracking file
-# ============================================================
-echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap,seq_length,enable_fusions" > "${RESULTS_DIR}/sweep_jobs_v2.csv"
-
-# ============================================================
-# ZeRO-2 WITHOUT PIPELINE PARALLELISM (PP=1)
-# ============================================================
-echo ""
-echo "========== ZeRO-2 SWEEP (PP=1) =========="
-echo ""
-
-#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
-submit_config "21_zero2_tp8_pp1"      8  1  2    1   64  0   0   0   2048    0
-submit_config "22_zero2_tp8_pp1_mbs2" 8  1  2    2   64  0   0   0   2048    0
-submit_config "23_zero2_tp4_pp1"      4  1  2    1   64  0   0   0   2048    0
-
-# ============================================================
-# ZeRO-3 (PP=1)
-# ============================================================
-echo ""
-echo "========== ZeRO-3 SWEEP (PP=1) =========="
-echo ""
-
-#            config_name                  TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
-submit_config "24_zero3_tp8_pp1"          8  1  3    1   64  0   0   0   2048    0
-submit_config "25_zero3_tp8_pp1_mbs2"     8  1  3    2   64  0   0   0   2048    0
-submit_config "26_zero3_tp4_pp1"          4  1  3    1   64  0   0   0   2048    0
-submit_config "27_zero3_tp8_pp1_overlap"  8  1  3    1   64  0   0   1   2048    0
-
-# ============================================================
-# MEMORY PUSH / SEQ LENGTH / FUSIONS
-# ============================================================
-echo ""
-echo "========== MEMORY PUSH SWEEP =========="
-echo ""
-
-#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
-submit_config "28_mem_seq4k_tp8_pp2"  8  2  0    1   64  0   0   0   4096    0
-submit_config "29_mem_fused_tp8_pp8"  8  8  0    1   64  0   0   0   2048    1
-
-# ============================================================
-# EXPANDABLE SEGMENTS IMPACT ON BEST CONFIG
-# Re-test best config (TP8/PP8/ZeRO0) — now with expandable_segments
-# enabled automatically via the updated sbatch.
-# ============================================================
-echo ""
-echo "========== EXPANDABLE SEGMENTS IMPACT =========="
-echo ""
-
-#            config_name              TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
-submit_config "30_best_expand_seg"    8  8  0    1   64  0   0   0   2048    0
-
-echo ""
-echo "========== SWEEP V2 SUBMITTED =========="
-echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs_v2.csv"
-echo ""
-echo "Total configs: 10"
-echo "To monitor: watch 'squeue -u \$USER'"
-echo "When all jobs finish, run: python parse_results.py --jobs-csv sweep_results/sweep_jobs_v2.csv"
diff --git a/3.test_cases/pytorch/deepspeed/gpt/upload_results.sh b/3.test_cases/pytorch/deepspeed/gpt/upload_results.sh
deleted file mode 100755
index 6df3cab99..000000000
--- a/3.test_cases/pytorch/deepspeed/gpt/upload_results.sh
+++ /dev/null
@@ -1,263 +0,0 @@
-#!/bin/bash
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# SPDX-License-Identifier: MIT-0
-#
-# upload_results.sh - Upload benchmark results to S3 and CloudWatch
-#
-# Usage:
-#   export S3_BUCKET=my-benchmark-bucket
-#   export S3_REGION=us-west-2
-#   export CW_REGION=us-east-1
-#   bash upload_results.sh [--results-dir sweep_results]
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-RESULTS_DIR="${1:---results-dir}"
-CW_REGION="${CW_REGION:?Error: CW_REGION must be set (e.g. export CW_REGION=us-east-1)}"
-S3_REGION="${S3_REGION:?Error: S3_REGION must be set (e.g. export S3_REGION=us-west-2)}"
-S3_BUCKET="${S3_BUCKET:?Error: S3_BUCKET must be set (e.g. export S3_BUCKET=my-benchmark-bucket)}"
-CW_NAMESPACE="DeepSpeed/B200Benchmarks"
-CW_DASHBOARD_NAME="DeepSpeed-B200-Benchmarks"
-
-# Parse args
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --results-dir) RESULTS_DIR="$2"; shift 2 ;;
-        --region) CW_REGION="$2"; shift 2 ;;
-        *) shift ;;
-    esac
-done
-
-: "${RESULTS_DIR:=${SCRIPT_DIR}/sweep_results}"
-
-if [ ! -d "${RESULTS_DIR}" ]; then
-    echo "Error: Results directory not found: ${RESULTS_DIR}"
-    exit 1
-fi
-
-# ============================================================
-# 1. Upload JSON files to S3
-# ============================================================
-echo "=== Uploading results to S3 ==="
-
-# Determine S3 path: benchmark-results/b200/2026/March/04/
-YEAR=$(date -u +%Y)
-MONTH=$(date -u +%B)
-DAY=$(date -u +%d)
-S3_PREFIX="benchmark-results/b200/${YEAR}/${MONTH}/${DAY}"
-
-for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
-    if [ ! -f "${json_file}" ]; then
-        echo "No result JSON files found in ${RESULTS_DIR}"
-        break
-    fi
-    filename=$(basename "${json_file}")
-    echo "  Uploading ${filename} -> s3://${S3_BUCKET}/${S3_PREFIX}/${filename}"
-    aws s3 cp "${json_file}" "s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" \
-        --region "${S3_REGION}" \
-        --content-type "application/json"
-done
-
-echo "S3 upload complete: s3://${S3_BUCKET}/${S3_PREFIX}/"
-echo ""
-
-# ============================================================
-# 2. Publish metrics to CloudWatch
-# ============================================================
-echo "=== Publishing metrics to CloudWatch ==="
-
-for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
-    if [ ! -f "${json_file}" ]; then
-        break
-    fi
-
-    filename=$(basename "${json_file}")
-
-    # Extract metadata and summary using python
-    read -r config_name tp pp zero_stage precision avg_tflops avg_step_time timestamp < <(
-        python3 -c "
-import json, sys
-with open('${json_file}') as f:
-    d = json.load(f)
-m = d['metadata']
-s = d['summary']
-sc = m.get('sweep_config', {})
-print(
-    sc.get('config_name', 'unknown'),
-    sc.get('tp', 8),
-    sc.get('pp', 2),
-    sc.get('zero_stage', 1),
-    m.get('precision', 'bf16'),
-    s.get('steady_state_avg_tflops_per_gpu', 0),
-    s.get('steady_state_avg_step_time_s', 0),
-    m.get('timestamp', '$(date -u +%Y-%m-%dT%H:%M:%SZ)')
-)
-"
-    )
-
-    echo "  Publishing: ${config_name} (TFLOPS=${avg_tflops}, StepTime=${avg_step_time}s)"
-
-    # Publish with full dimensions
-    aws cloudwatch put-metric-data \
-        --namespace "${CW_NAMESPACE}" \
-        --region "${CW_REGION}" \
-        --metric-data "[
-            {
-                \"MetricName\": \"TFLOPSPerGPU\",
-                \"Value\": ${avg_tflops},
-                \"Unit\": \"Count\",
-                \"Timestamp\": \"${timestamp}\",
-                \"Dimensions\": [
-                    {\"Name\": \"model_size\", \"Value\": \"103b\"},
-                    {\"Name\": \"tp\", \"Value\": \"${tp}\"},
-                    {\"Name\": \"pp\", \"Value\": \"${pp}\"},
-                    {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"},
-                    {\"Name\": \"precision\", \"Value\": \"${precision}\"},
-                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
-                ]
-            },
-            {
-                \"MetricName\": \"StepTimeSeconds\",
-                \"Value\": ${avg_step_time},
-                \"Unit\": \"Seconds\",
-                \"Timestamp\": \"${timestamp}\",
-                \"Dimensions\": [
-                    {\"Name\": \"model_size\", \"Value\": \"103b\"},
-                    {\"Name\": \"tp\", \"Value\": \"${tp}\"},
-                    {\"Name\": \"pp\", \"Value\": \"${pp}\"},
-                    {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"},
-                    {\"Name\": \"precision\", \"Value\": \"${precision}\"},
-                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
-                ]
-            }
-        ]"
-
-    # Also publish with just config_name dimension for easy dashboard queries
-    aws cloudwatch put-metric-data \
-        --namespace "${CW_NAMESPACE}" \
-        --region "${CW_REGION}" \
-        --metric-data "[
-            {
-                \"MetricName\": \"TFLOPSPerGPU\",
-                \"Value\": ${avg_tflops},
-                \"Unit\": \"Count\",
-                \"Timestamp\": \"${timestamp}\",
-                \"Dimensions\": [
-                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
-                ]
-            },
-            {
-                \"MetricName\": \"StepTimeSeconds\",
-                \"Value\": ${avg_step_time},
-                \"Unit\": \"Seconds\",
-                \"Timestamp\": \"${timestamp}\",
-                \"Dimensions\": [
-                    {\"Name\": \"config_name\", \"Value\": \"${config_name}\"}
-                ]
-            }
-        ]"
-
-done
-
-echo "CloudWatch metrics published to namespace: ${CW_NAMESPACE}"
-echo ""
-
-# ============================================================
-# 3. Create/Update CloudWatch Dashboard
-# ============================================================
-echo "=== Creating CloudWatch Dashboard ==="
-
-# Build metric entries dynamically from results
-TFLOPS_METRICS=""
-STEPTIME_METRICS=""
-TABLE_METRICS=""
-
-for json_file in "${RESULTS_DIR}"/training_bench_*.json; do
-    if [ ! -f "${json_file}" ]; then
-        break
-    fi
-
-    config_name=$(python3 -c "
-import json
-with open('${json_file}') as f:
-    d = json.load(f)
-print(d['metadata'].get('sweep_config', {}).get('config_name', 'unknown'))
-")
-
-    TFLOPS_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}],"
-    STEPTIME_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}],"
-    TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} TFLOPS\"}],"
-    TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} StepTime\"}],"
-done
-
-# Remove trailing commas
-TFLOPS_METRICS="${TFLOPS_METRICS%,}"
-STEPTIME_METRICS="${STEPTIME_METRICS%,}"
-TABLE_METRICS="${TABLE_METRICS%,}"
-
-DASHBOARD_BODY=$(cat <<DASH
-{
-  "widgets": [
-    {
-      "type": "text",
-      "x": 0, "y": 0, "width": 24, "height": 1,
-      "properties": {
-        "markdown": "# DeepSpeed B200 Benchmark Results - GPT 103B\\nCluster: b200-hyperpod | 8 nodes x 8 B200 GPUs | Namespace: \`${CW_NAMESPACE}\`"
-      }
-    },
-    {
-      "type": "metric",
-      "x": 0, "y": 1, "width": 24, "height": 8,
-      "properties": {
-        "title": "TFLOPS/GPU Across Sweep Configurations",
-        "view": "bar",
-        "region": "${CW_REGION}",
-        "stat": "Average",
-        "period": 86400,
-        "yAxis": {"left": {"label": "TFLOPS/GPU", "showUnits": false}},
-        "metrics": [${TFLOPS_METRICS}]
-      }
-    },
-    {
-      "type": "metric",
-      "x": 0, "y": 9, "width": 24, "height": 8,
-      "properties": {
-        "title": "Step Time Comparison (seconds)",
-        "view": "bar",
-        "region": "${CW_REGION}",
-        "stat": "Average",
-        "period": 86400,
-        "yAxis": {"left": {"label": "Step Time (s)", "showUnits": false}},
-        "metrics": [${STEPTIME_METRICS}]
-      }
-    },
-    {
-      "type": "metric",
-      "x": 0, "y": 17, "width": 24, "height": 8,
-      "properties": {
-        "title": "Summary Metrics Table",
-        "view": "table",
-        "region": "${CW_REGION}",
-        "stat": "Average",
-        "period": 86400,
-        "metrics": [${TABLE_METRICS}]
-      }
-    }
-  ]
-}
-DASH
-)
-
-aws cloudwatch put-dashboard \
-    --dashboard-name "${CW_DASHBOARD_NAME}" \
-    --region "${CW_REGION}" \
-    --dashboard-body "${DASHBOARD_BODY}"
-
-echo "Dashboard created: ${CW_DASHBOARD_NAME}"
-echo "URL: https://${CW_REGION}.console.aws.amazon.com/cloudwatch/home?region=${CW_REGION}#dashboards:name=${CW_DASHBOARD_NAME}"
-echo ""
-echo "=== Upload Complete ==="
-echo "S3: s3://${S3_BUCKET}/${S3_PREFIX}/"
-echo "CloudWatch: ${CW_NAMESPACE} in ${CW_REGION}"
-echo "Dashboard: ${CW_DASHBOARD_NAME}"