From c87bc69bf4efd9153f504e078932acc3114daa42 Mon Sep 17 00:00:00 2001 From: Paulo Aragao Date: Thu, 5 Mar 2026 15:04:02 +0000 Subject: [PATCH 1/5] Add DeepSpeed 103B GPT pretraining benchmark for B200 cluster - Update Dockerfile: pytorch:25.04-py3 base, EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1 - Add 103B GPT pretraining sbatch script with parameterized parallelism, ZeRO stages, fusion ops, and correct NCCL/EFA flags - Add sweep runners (v1: 20 configs, v2: 10 configs) covering TP/PP/ZeRO/fusion/memory variations - Add results parser and S3 upload script with CloudWatch metric publishing - Best result: 476.6 TFLOPS/GPU (TP=8, PP=8, ZeRO=0, fusions enabled) on 8x B200 nodes --- .../pytorch/deepspeed/0.deepspeed.dockerfile | 177 ++++---- .../pytorch/deepspeed/1.build-image.sbatch | 6 +- .../configs/ds_config_103b_template.json | 20 + .../pytorch/deepspeed/parse_results.py | 391 ++++++++++++++++++ .../deepspeed/pretrain_gpt_103b.sbatch | 280 +++++++++++++ .../pytorch/deepspeed/sweep_runner.sh | 144 +++++++ .../pytorch/deepspeed/sweep_runner_v2.sh | 154 +++++++ .../pytorch/deepspeed/upload_results.sh | 260 ++++++++++++ 8 files changed, 1358 insertions(+), 74 deletions(-) create mode 100644 3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json create mode 100755 3.test_cases/pytorch/deepspeed/parse_results.py create mode 100755 3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch create mode 100755 3.test_cases/pytorch/deepspeed/sweep_runner.sh create mode 100644 3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh create mode 100755 3.test_cases/pytorch/deepspeed/upload_results.sh diff --git a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile index 472edc476..ec7f99995 100644 --- a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile +++ b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile @@ -1,19 +1,20 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: MIT-0 -FROM nvcr.io/nvidia/pytorch:25.03-py3 +# ============================================================ +# Base image: PyTorch 25.04 with CUDA 12.9.0 (required for NCCL 2.29.x) +# Supports Blackwell (sm_100), Hopper, Ampere architectures +# ============================================================ +FROM nvcr.io/nvidia/pytorch:25.04-py3 -ARG GDRCOPY_VERSION=v2.4.1 -ARG EFA_INSTALLER_VERSION=1.37.0 -ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws ARG TRANSFORMERS_VERSION=4.44.2 -ARG MEGATRON_LM_VERSION=core_r0.8.0 - ARG OPEN_MPI_PATH=/opt/amazon/openmpi -###################### -# Update and remove the IB libverbs -###################### +ENV DEBIAN_FRONTEND=noninteractive + +# ============================================================ +# 1. System packages and SSH setup (needed for multi-node training) +# ============================================================ RUN apt-get update -y && apt-get upgrade -y RUN apt-get remove -y --allow-change-held-packages \ ibverbs-utils \ @@ -26,8 +27,7 @@ RUN rm -rf /opt/hpcx/ompi \ && rm -rf /usr/local/ucx \ && ldconfig -RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \ - apt-utils \ +RUN apt-get install -y --no-install-recommends \ autoconf \ automake \ build-essential \ @@ -36,6 +36,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \ gcc \ gdb \ git \ + gnupg \ kmod \ libtool \ openssh-client \ @@ -55,69 +56,99 @@ RUN rm -rf /root/.ssh/ \ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config -ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH -ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH - -################################################# -## Install NVIDIA GDRCopy -## -## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity 
-v` crashes with incompatible version, ensure -## that the cuda-compat-xx-x package is the latest. -RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ - && cd /tmp/gdrcopy \ - && make prefix=/opt/gdrcopy install - -ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH -ENV CPATH /opt/gdrcopy/include:$CPATH -ENV PATH /opt/gdrcopy/bin:$PATH - -################################################# -## Install EFA installer -RUN cd $HOME \ - && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ - && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ +# ============================================================ +# 2. Install EFA Installer 1.47.0 +# This bundles libfabric, rdma-core, and pre-built aws-ofi-nccl +# No source build of aws-ofi-nccl needed (unlike EFA < 1.40) +# ============================================================ +ENV EFA_INSTALLER_VERSION=1.47.0 +WORKDIR /tmp +RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xz \ && cd aws-efa-installer \ && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ - && rm -rf $HOME/aws-efa-installer - - -################################################### -## Install AWS-OFI-NCCL plugin -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev -#Switch from sh to bash to allow parameter expansion -SHELL ["/bin/bash", "-c"] -RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ - && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ - && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ - && ./configure --prefix=/opt/aws-ofi-nccl/install \ - --with-mpi=/opt/amazon/openmpi \ - --with-libfabric=/opt/amazon/efa \ - --with-cuda=/usr/local/cuda \ - 
--enable-platform-aws \ - && make -j $(nproc) \ - && make install \ - && cd .. \ - && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ - && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz - -SHELL ["/bin/sh", "-c"] - -################################################### -RUN rm -rf /var/lib/apt/lists/* - -RUN echo "hwloc_base_binding_policy = none" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf \ - && echo "rmaps_base_mapping_policy = slot" >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf + && cd / && rm -rf /tmp/aws-efa-installer + +# ============================================================ +# 3. Remove old aws-ofi-nccl and create NCCL plugin symlinks +# NCCL_NET_PLUGIN=aws-ofi looks for libnccl-net-aws-ofi.so +# EFA installer names it libnccl-net-ofi.so +# Without this symlink NCCL falls back to TCP sockets silently +# ============================================================ +RUN rm -rf /opt/amazon/aws-ofi-nccl + +RUN ln -sf /opt/amazon/ofi-nccl/lib/libnccl-net-ofi.so \ + /opt/amazon/ofi-nccl/lib/libnccl-net-aws-ofi.so && \ + ln -sf /opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \ + /opt/amazon/ofi-nccl/lib/libnccl-tuner-aws-ofi.so + +# ============================================================ +# 4. 
Upgrade NCCL to 2.29.3 (matches B200 host version) +# Requires CUDA >= 12.9 (which pytorch:25.04-py3 provides) +# Must add NVIDIA CUDA apt repo first since base image may not have it +# ============================================================ +ENV NCCL_VERSION=2.29.3-1 +RUN apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget -qO /tmp/cuda-keyring.deb \ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i /tmp/cuda-keyring.deb && \ + rm /tmp/cuda-keyring.deb && \ + apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages \ + libnccl2=${NCCL_VERSION}+cuda12.9 \ + libnccl-dev=${NCCL_VERSION}+cuda12.9 && \ + rm -rf /var/lib/apt/lists/* + +# ============================================================ +# 5. Install GDRCopy v2.5.1 (lib-only, no binaries needed) +# ============================================================ +RUN cd /tmp && \ + git clone --branch v2.5.1 --depth 1 https://github.com/NVIDIA/gdrcopy.git && \ + cd gdrcopy && \ + make -j$(nproc) lib lib_install && \ + cd / && rm -rf /tmp/gdrcopy + +# ============================================================ +# 6. Fix library path references +# Use ld.so.conf.d for system-wide discovery (more robust +# than relying solely on LD_LIBRARY_PATH) +# ============================================================ +RUN echo "/opt/amazon/ofi-nccl/lib" > /etc/ld.so.conf.d/aws-ofi-nccl.conf && \ + echo "/opt/amazon/efa/lib" > /etc/ld.so.conf.d/efa.conf + +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/environment 2>/dev/null || true +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/shinit_v2 2>/dev/null || true + +# Rebuild ldconfig cache +RUN rm -f /etc/ld.so.cache && ldconfig + +# ============================================================ +# 7. 
Environment variables +# ============================================================ +ENV LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}" +ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}" +ENV FI_PROVIDER=efa + +# ============================================================ +# 8. OpenMPI tuning for EFA (needed for multi-node training) +# ============================================================ +RUN echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf + +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun + +# ============================================================ +# 9. 
Python packages for DeepSpeed training +# ============================================================ +RUN pip3 install --no-cache-dir \ + awscli pynvml \ + transformers==${TRANSFORMERS_VERSION} \ + sentencepiece python-etcd \ + deepspeed accelerate -RUN pip3 install awscli pynvml - -RUN mv $OPEN_MPI_PATH/bin/mpirun $OPEN_MPI_PATH/bin/mpirun.real \ - && echo '#!/bin/bash' > $OPEN_MPI_PATH/bin/mpirun \ - && echo '/opt/amazon/openmpi/bin/mpirun.real "$@"' >> $OPEN_MPI_PATH/bin/mpirun \ - && chmod a+x $OPEN_MPI_PATH/bin/mpirun - -###################### -# DeepSpeed dependencies -###################### -RUN pip install transformers==${TRANSFORMERS_VERSION} sentencepiece python-etcd deepspeed accelerate +RUN rm -rf /var/lib/apt/lists/* +WORKDIR /workspace diff --git a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch index ebd7b0b04..f91222361 100644 --- a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch +++ b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MIT-0 #SBATCH -N 1 # number of nodes to use -#SBATCH --job-name=build-neox-image # name of your job +#SBATCH --job-name=build-deepspeed-image # name of your job #SBATCH --output=logs/%x_%j.out # logfile for stdout #SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs @@ -14,6 +14,10 @@ set -euxo pipefail : "${APPS_PATH:=/fsx/apps}" : "${IMAGE:=$APPS_PATH/deepspeed.sqsh}" +# Ensure output directory exists +mkdir -p ${APPS_PATH} +mkdir -p logs + ENROOT_IMAGE=deepspeed docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile . 
# Remove old sqsh file if exists diff --git a/3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json b/3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json new file mode 100644 index 000000000..6197eaf78 --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json @@ -0,0 +1,20 @@ +{ + "train_batch_size": 64, + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 10, + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true + }, + "wall_clock_breakdown": false +} diff --git a/3.test_cases/pytorch/deepspeed/parse_results.py b/3.test_cases/pytorch/deepspeed/parse_results.py new file mode 100755 index 000000000..8eacea70c --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/parse_results.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +""" +parse_results.py - Parse Megatron-DeepSpeed training logs into benchmark JSON. + +Reads Slurm log files, extracts per-step metrics, and produces JSON files +matching the existing benchmark-results schema at: + s3://paragao-new-nemo-squash-container/benchmark-results/b200/ + +Usage: + python parse_results.py [--logs-dir logs] [--output-dir sweep_results] + python parse_results.py --log-file logs/sweep_01_baseline_123.out --config-name 01_baseline +""" + +import argparse +import csv +import json +import os +import re +import statistics +import sys +from datetime import datetime, timezone + + +# ============================================================ +# Megatron-DeepSpeed log line patterns +# ============================================================ +# Example: " iteration 10/ 50 | consumed samples: ..." 
+# Example: "elapsed time per iteration (ms): 4725.7 | ..." +# Example: "lm loss: 1.3389E+01 | ..." +# Example: "learning rate: 3.000E-05 | ..." +# Example: "global batch size: 128 | ..." +# Example: "loss scale: 1.0 | ..." +# Example: "grad norm: 74.776 | ..." +# Example: "TFLOPs: 125.4 | ..." + +ITER_PATTERN = re.compile(r"iteration\s+(\d+)/\s*(\d+)") +ELAPSED_PATTERN = re.compile(r"elapsed time per iteration \(ms\):\s*([\d.]+)") +LOSS_PATTERN = re.compile(r"lm loss:\s*([\d.eE+\-]+)") +LR_PATTERN = re.compile(r"learning rate:\s*([\d.eE+\-]+)") +GBS_PATTERN = re.compile(r"global batch size:\s*(\d+)") +LOSS_SCALE_PATTERN = re.compile(r"loss scale:\s*([\d.eE+\-]+)") +GRAD_NORM_PATTERN = re.compile(r"grad norm:\s*([\d.eE+\-]+)") +TFLOPS_PATTERN = re.compile(r"TFLOPs:\s*([\d.]+)") + + +def parse_log_file(log_path): + """Parse a single Megatron-DeepSpeed log file and extract per-step metrics.""" + steps = [] + current_step = {} + + with open(log_path, "r") as f: + for line in f: + # Check for iteration marker + m = ITER_PATTERN.search(line) + if m: + if current_step: + steps.append(current_step) + current_step = { + "step": int(m.group(1)), + "total_steps": int(m.group(2)), + } + + if not current_step: + continue + + # Extract metrics from the same log block + m = ELAPSED_PATTERN.search(line) + if m: + elapsed_ms = float(m.group(1)) + current_step["elapsed_ms"] = elapsed_ms + current_step["step_time_s"] = round(elapsed_ms / 1000.0, 2) + + m = LOSS_PATTERN.search(line) + if m: + current_step["lm_loss"] = float(m.group(1)) + + m = LR_PATTERN.search(line) + if m: + current_step["learning_rate"] = float(m.group(1)) + + m = GBS_PATTERN.search(line) + if m: + current_step["global_batch_size"] = int(m.group(1)) + + m = LOSS_SCALE_PATTERN.search(line) + if m: + current_step["loss_scale"] = float(m.group(1)) + + m = GRAD_NORM_PATTERN.search(line) + if m: + current_step["grad_norm"] = float(m.group(1)) + + m = TFLOPS_PATTERN.search(line) + if m: + current_step["tflops_per_gpu"] 
= float(m.group(1)) + + # Don't forget the last step + if current_step: + steps.append(current_step) + + return steps + + +def compute_tflops_from_step_time( + step_time_s, + global_batch_size, + seq_length=2048, + hidden_size=12288, + num_layers=80, + num_heads=96, + total_gpus=64, +): + """ + Estimate TFLOPS/GPU for a GPT model using the standard formula: + FLOPs per iteration = 8 * seq * hidden^2 * layers * (1 + seq/(6*hidden) + vocab/(12*hidden*layers)) + Simplified: ~= 8 * B * s * h^2 * L * (1 + s/(6h)) + where B = global_batch_size + """ + vocab_size = 50257 # GPT-2 vocab + s = seq_length + h = hidden_size + L = num_layers + B = global_batch_size + + # Standard approximation for GPT FLOP count + flops_per_iter = ( + 8 * B * s * h * h * L * (1 + s / (6 * h) + vocab_size / (12 * h * L)) + ) + tflops_per_gpu = flops_per_iter / (step_time_s * total_gpus * 1e12) + return round(tflops_per_gpu, 1) + + +def build_result_json( + steps, + config_name, + job_id, + nodes=8, + gpus_per_node=8, + tp=8, + pp=2, + zero_stage=1, + mbs=1, + gbs=64, + seq_length=2048, + precision="bf16", +): + """Build the benchmark JSON matching the existing schema.""" + total_gpus = nodes * gpus_per_node + warmup_steps = 5 + total_steps = len(steps) + + # Ensure TFLOPS values exist (compute if not in logs) + for step in steps: + if "tflops_per_gpu" not in step and "step_time_s" in step: + step["tflops_per_gpu"] = compute_tflops_from_step_time( + step["step_time_s"], + step.get("global_batch_size", gbs), + seq_length=seq_length, + total_gpus=total_gpus, + ) + + # Steady-state metrics (skip warmup) + steady_steps = [s for s in steps if s.get("step", 0) > warmup_steps] + + if not steady_steps: + print(f"Warning: No steady-state steps found for {config_name}") + steady_steps = steps + + steady_tflops = [s["tflops_per_gpu"] for s in steady_steps if "tflops_per_gpu" in s] + steady_times = [s["step_time_s"] for s in steady_steps if "step_time_s" in s] + + summary = { + "total_steps": total_steps, + 
"warmup_steps": warmup_steps, + "steady_state_steps": len(steady_steps), + } + + if steady_tflops: + summary.update( + { + "steady_state_avg_tflops_per_gpu": round( + statistics.mean(steady_tflops), 2 + ), + "steady_state_median_tflops_per_gpu": round( + statistics.median(steady_tflops), 1 + ), + "steady_state_min_tflops_per_gpu": round(min(steady_tflops), 1), + "steady_state_max_tflops_per_gpu": round(max(steady_tflops), 1), + "steady_state_stdev_tflops_per_gpu": round( + statistics.stdev(steady_tflops), 2 + ) + if len(steady_tflops) > 1 + else 0.0, + "peak_tflops_per_gpu": round(max(steady_tflops), 1), + } + ) + + if steady_times: + summary.update( + { + "steady_state_avg_step_time_s": round(statistics.mean(steady_times), 4), + "steady_state_median_step_time_s": round( + statistics.median(steady_times), 2 + ), + "steady_state_min_step_time_s": round(min(steady_times), 2), + "steady_state_max_step_time_s": round(max(steady_times), 2), + } + ) + + if steps: + summary["final_loss"] = steps[-1].get("lm_loss", None) + summary["initial_loss"] = steps[0].get("lm_loss", None) + + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + result = { + "metadata": { + "timestamp": timestamp, + "job_id": str(job_id), + "cluster": "b200-hyperpod", + "instance_type": "ml.p6-b200.48xlarge", + "nodes": nodes, + "gpus_per_node": gpus_per_node, + "total_gpus": total_gpus, + "model": "deepspeed-gpt-103b", + "precision": precision, + "framework": "megatron-deepspeed", + "sweep_config": { + "config_name": config_name, + "tp": tp, + "pp": pp, + "zero_stage": zero_stage, + "micro_batch_size": mbs, + "global_batch_size": gbs, + "seq_length": seq_length, + }, + }, + "summary": summary, + "steps": steps, + } + + return result + + +def parse_sweep_jobs(jobs_csv, logs_dir, output_dir): + """Parse all jobs from the sweep tracking CSV.""" + os.makedirs(output_dir, exist_ok=True) + results = [] + + with open(jobs_csv, "r") as f: + reader = csv.DictReader(f) + for row in reader: 
+ job_id = row["job_id"] + config_name = row["config_name"] + + # Find the log file for this job + log_pattern = f"sweep_{config_name}_{job_id}.out" + log_path = os.path.join(logs_dir, log_pattern) + + if not os.path.exists(log_path): + # Try alternate pattern + log_candidates = [ + f + for f in os.listdir(logs_dir) + if job_id in f and f.endswith(".out") + ] + if log_candidates: + log_path = os.path.join(logs_dir, log_candidates[0]) + else: + print( + f"Warning: No log file found for job {job_id} ({config_name})" + ) + continue + + print(f"Parsing {config_name} (job {job_id}): {log_path}") + steps = parse_log_file(log_path) + + if not steps: + print(f" Warning: No steps found in log file") + continue + + result = build_result_json( + steps=steps, + config_name=config_name, + job_id=job_id, + tp=int(row.get("tp", 8)), + pp=int(row.get("pp", 2)), + zero_stage=int(row.get("zero", 1)), + mbs=int(row.get("mbs", 1)), + gbs=int(row.get("gbs", 64)), + seq_length=int(row.get("seq_length", 2048)), + ) + + # Write individual JSON file + now = datetime.now(timezone.utc) + filename = ( + f"training_bench_deepspeed-gpt-103b_bf16_" + f"{now.strftime('%Y-%m-%d_%H%M')}_job{job_id}.json" + ) + filepath = os.path.join(output_dir, filename) + with open(filepath, "w") as jf: + json.dump(result, jf, indent=2) + print(f" Wrote: {filepath}") + + results.append(result) + + # Write combined summary + summary_path = os.path.join(output_dir, "sweep_summary.json") + with open(summary_path, "w") as f: + json.dump(results, f, indent=2) + print(f"\nWrote combined summary: {summary_path}") + + return results + + +def parse_single_log(log_file, config_name, output_dir): + """Parse a single log file.""" + os.makedirs(output_dir, exist_ok=True) + + # Extract job ID from filename + job_id_match = re.search(r"_(\d+)\.out", log_file) + job_id = job_id_match.group(1) if job_id_match else "unknown" + + print(f"Parsing {config_name} (job {job_id}): {log_file}") + steps = parse_log_file(log_file) + + if 
not steps: + print("Error: No steps found in log file") + sys.exit(1) + + result = build_result_json( + steps=steps, + config_name=config_name, + job_id=job_id, + ) + + now = datetime.now(timezone.utc) + filename = ( + f"training_bench_deepspeed-gpt-103b_bf16_" + f"{now.strftime('%Y-%m-%d_%H%M')}_job{job_id}.json" + ) + filepath = os.path.join(output_dir, filename) + with open(filepath, "w") as f: + json.dump(result, f, indent=2) + print(f"Wrote: {filepath}") + + return result + + +def main(): + parser = argparse.ArgumentParser( + description="Parse Megatron-DeepSpeed logs into benchmark JSON" + ) + parser.add_argument( + "--logs-dir", default="logs", help="Directory containing Slurm log files" + ) + parser.add_argument( + "--output-dir", default="sweep_results", help="Directory to write JSON results" + ) + parser.add_argument( + "--jobs-csv", + default="sweep_results/sweep_jobs.csv", + help="CSV file tracking sweep job IDs", + ) + parser.add_argument( + "--log-file", default=None, help="Parse a single log file instead of sweep CSV" + ) + parser.add_argument( + "--config-name", + default="single_run", + help="Config name for single log file parsing", + ) + + args = parser.parse_args() + + if args.log_file: + parse_single_log(args.log_file, args.config_name, args.output_dir) + else: + if not os.path.exists(args.jobs_csv): + print(f"Error: Jobs CSV not found: {args.jobs_csv}") + print( + "Run sweep_runner.sh first, or use --log-file for single file parsing" + ) + sys.exit(1) + parse_sweep_jobs(args.jobs_csv, args.logs_dir, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch b/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch new file mode 100755 index 000000000..43957885f --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch @@ -0,0 +1,280 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#SBATCH --exclusive +#SBATCH --job-name=deepspeed-pretrain-103b +#SBATCH --output=logs/%x_%j.out +#SBATCH --error=logs/%x_%j.err +#SBATCH --partition=b200 + +set -euxo pipefail + +# ============================================================ +# Environment defaults +# ============================================================ +: "${APPS_PATH:=/fsx/apps}" +: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}" +: "${FSX_PATH:=/fsx}" +: "${DATA_DIR:=$FSX_PATH/deepspeed/data}" +: "${MEGATRON_DS_PATH:=$FSX_PATH/deepspeed/Megatron-DeepSpeed}" + +# ============================================================ +# Parallelism config (overridable via env vars from sweep_runner.sh) +# ============================================================ +: "${TP:=8}" +: "${PP:=2}" +: "${ZERO_STAGE:=1}" +: "${MICRO_BATCH_SIZE:=1}" +: "${GLOBAL_BATCH_SIZE:=64}" +: "${TRAIN_ITERS:=50}" + +# ============================================================ +# ~103B GPT model architecture +# Layers=80, Hidden=12288, Heads=96, FFN=49152 +# Estimated parameters: ~103B +# ============================================================ +: "${NUM_LAYERS:=80}" +: "${HIDDEN_SIZE:=12288}" +: "${NUM_HEADS:=96}" +: "${FFN_HIDDEN_SIZE:=49152}" +: "${SEQ_LENGTH:=2048}" + +# ============================================================ +# Optional features (set to 1 to enable) +# ============================================================ +: "${USE_ACTIVATION_CHECKPOINTING:=0}" +: "${USE_SEQUENCE_PARALLEL:=0}" +: "${USE_OVERLAP_COMM:=0}" +: "${ENABLE_FUSIONS:=0}" +: "${CONFIG_NAME:=baseline}" + +# ============================================================ +# PyTorch memory allocator optimisation +# ============================================================ +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +# ============================================================ +# Cluster topology +# ============================================================ +export NODES=( $( 
scontrol show hostnames $SLURM_JOB_NODELIST ) ) +export NODES_ARRAY=($NODES) +export HEAD_NODE=${NODES_ARRAY[0]} +export MASTER_ADDR=$(hostname --ip-address) +export MASTER_PORT=$((RANDOM + 10000)) +export NNODES=$SLURM_JOB_NUM_NODES +export NUM_GPUS_PER_NODE=8 + +# ============================================================ +# Network settings for EFA + NCCL +# ============================================================ +export FI_LOG_LEVEL=1 +export FI_PROVIDER=efa +export FI_EFA_USE_HUGE_PAGE=0 +export NCCL_SOCKET_IFNAME=^docker,lo,veth +export NCCL_P2P_NET_CHUNKSIZE=2048576 +export NCCL_BUFFERSIZE=8388608 +export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so +export NCCL_ASYNC_ERROR_HANDLING=1 +export OMPI_MCA_plm=^slurm + +# ============================================================ +# Generate DeepSpeed config dynamically +# ============================================================ +mkdir -p configs logs + +PRESCALE_GRAD="false" +if [ "${ZERO_STAGE}" -eq 0 ]; then + PRESCALE_GRAD="true" +fi + +OVERLAP_COMM_BOOL="false" +if [ "${USE_OVERLAP_COMM}" -eq 1 ]; then + OVERLAP_COMM_BOOL="true" +fi + +# Build ZeRO optimisation block depending on stage +if [ "${ZERO_STAGE}" -eq 3 ]; then + ZERO_BLOCK=$(cat < configs/ds_config_run.json +{ + "train_batch_size": ${GLOBAL_BATCH_SIZE}, + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 10, +${ZERO_BLOCK}, + "gradient_clipping": 1.0, + "prescale_gradients": ${PRESCALE_GRAD}, + "bf16": { + "enabled": true + }, + "wall_clock_breakdown": false +} +EOF + +# ============================================================ +# Hostfile for DeepSpeed +# ============================================================ +export HOSTFILE=/fsx/hostfile_${SLURM_JOB_ID} +function makehostfile() { +perl -e '$slots=split /,/, $ENV{"SLURM_STEP_GPUS"}; +$slots=8 if $slots==0; +@nodes = split /\n/, qx[scontrol show hostnames $ENV{"SLURM_JOB_NODELIST"}]; +print map { "$b$_ slots=$slots\n" } @nodes' +} 
+makehostfile > ${HOSTFILE} + +# ============================================================ +# Container + distributed launch args +# ============================================================ +declare -a SRUN_ARGS=( + --container-image ${IMAGE} + --container-mounts /fsx,/opt/slurm/bin +) + +declare -a DIST_ARGS=( + --nnodes ${NNODES} + --nproc-per-node ${NUM_GPUS_PER_NODE} + --master_addr ${MASTER_ADDR} + --master_port ${MASTER_PORT} + --rdzv_id $RANDOM + --rdzv_backend c10d + --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} +) + +# ============================================================ +# Model + training args +# ============================================================ +declare -a MODEL_ARGS=( + --num-layers ${NUM_LAYERS} + --hidden-size ${HIDDEN_SIZE} + --num-attention-heads ${NUM_HEADS} + --ffn-hidden-size ${FFN_HIDDEN_SIZE} + --seq-length ${SEQ_LENGTH} + --max-position-embeddings ${SEQ_LENGTH} + --micro-batch-size ${MICRO_BATCH_SIZE} + --global-batch-size ${GLOBAL_BATCH_SIZE} + --train-iters ${TRAIN_ITERS} + --lr 1.0e-4 + --min-lr 1.0e-6 + --lr-decay-style cosine + --lr-warmup-iters 5 + --lr-decay-iters 50 + --weight-decay 0.1 + --clip-grad 1.0 + --adam-beta1 0.9 + --adam-beta2 0.95 + --init-method-std 0.006 + --log-interval 1 + --eval-iters 0 + --eval-interval 1000 + --bf16 + --data-path ${DATA_DIR}/BookCorpusDataset_text_document + --vocab-file ${DATA_DIR}/gpt2-vocab.json + --merge-file ${DATA_DIR}/gpt2-merges.txt + --split 100,0,0 + --data-impl mmap + --num-workers 0 +) + +# By default disable fusions (matches sweep v1 behaviour). +# Set ENABLE_FUSIONS=1 to enable them. 
+if [ "${ENABLE_FUSIONS}" -eq 0 ]; then + MODEL_ARGS+=( + --no-masked-softmax-fusion + --no-bias-gelu-fusion + --no-bias-dropout-fusion + --no-gradient-accumulation-fusion + ) +fi + +declare -a DS_ARGS=( + --tensor-model-parallel-size ${TP} + --zero-stage ${ZERO_STAGE} + --deepspeed_config ${PWD}/configs/ds_config_run.json + --deepspeed + --distributed-backend nccl +) + +# Megatron-DeepSpeed wraps the model in PipelineModule by default. +# DeepSpeed's PipelineEngine asserts that ZeRO stage < 2. +# When using ZeRO-2 or ZeRO-3 we must disable pipeline parallel entirely +# via --no-pipeline-parallel (which sets ds_pipeline_enabled=False). +if [ "${ZERO_STAGE}" -ge 2 ]; then + DS_ARGS+=( + --pipeline-model-parallel-size 1 + --no-pipeline-parallel + ) +else + DS_ARGS+=(--pipeline-model-parallel-size ${PP}) +fi + +# ============================================================ +# Optional features +# ============================================================ +if [ "${USE_ACTIVATION_CHECKPOINTING}" -eq 1 ]; then + DS_ARGS+=( + --checkpoint-activations + --deepspeed-activation-checkpointing + ) +fi + +if [ "${USE_SEQUENCE_PARALLEL}" -eq 1 ]; then + DS_ARGS+=(--sequence-parallel) +fi + +# ============================================================ +# Launch training +# Note: Using python3 -m torch.distributed.run instead of torchrun +# because the container's Python version may differ from the host +# ============================================================ +echo "=== DeepSpeed 103B GPT Pretraining ===" +echo "Config: ${CONFIG_NAME}" +echo "Nodes: ${NNODES}, GPUs/node: ${NUM_GPUS_PER_NODE}, Total GPUs: $((NNODES * NUM_GPUS_PER_NODE))" +echo "TP=${TP}, PP=${PP}, ZeRO=${ZERO_STAGE}" +echo "MBS=${MICRO_BATCH_SIZE}, GBS=${GLOBAL_BATCH_SIZE}" +echo "Model: layers=${NUM_LAYERS}, hidden=${HIDDEN_SIZE}, heads=${NUM_HEADS}, ffn=${FFN_HIDDEN_SIZE}" +echo "Seq length: ${SEQ_LENGTH}, Fusions: ${ENABLE_FUSIONS}" +echo "Activation ckpt: ${USE_ACTIVATION_CHECKPOINTING}, Seq 
parallel: ${USE_SEQUENCE_PARALLEL}" +echo "PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-}" +echo "=======================================" + +# Convert arrays to strings for bash -c invocation +DIST_ARGS_STR="${DIST_ARGS[*]}" +MODEL_ARGS_STR="${MODEL_ARGS[*]}" +DS_ARGS_STR="${DS_ARGS[*]}" + +srun -l "${SRUN_ARGS[@]}" bash -c "export PYTHONPATH=${MEGATRON_DS_PATH} && cd ${MEGATRON_DS_PATH} && python3 -m torch.distributed.run ${DIST_ARGS_STR} pretrain_gpt.py ${MODEL_ARGS_STR} ${DS_ARGS_STR}" + +# Cleanup hostfile +rm -f ${HOSTFILE} diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner.sh b/3.test_cases/pytorch/deepspeed/sweep_runner.sh new file mode 100755 index 000000000..9f7d5a944 --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/sweep_runner.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# sweep_runner.sh - Automated parameter sweep for DeepSpeed 103B pretraining +# Runs all parallelism and environment flag configurations, collects results. 
+#
+# Usage: bash sweep_runner.sh [--dry-run]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch"
+RESULTS_DIR="${SCRIPT_DIR}/sweep_results"
+NODES=8
+PARTITION="b200"
+
+DRY_RUN=0
+if [ "${1:-}" = "--dry-run" ]; then
+    DRY_RUN=1
+    echo "[DRY RUN] Will print commands without submitting"
+fi
+
+mkdir -p "${RESULTS_DIR}" logs
+
+# ============================================================
+# submit_config NAME TP PP ZERO MBS GBS [ACT_CKPT] [SEQ_PAR] [OVERLAP] [EXTRA_ENV...]
+# ============================================================
+submit_config() {
+    local config_name="$1"
+    local tp="$2"
+    local pp="$3"
+    local zero="$4"
+    local mbs="$5"
+    local gbs="$6"
+    local act_ckpt="${7:-0}"
+    local seq_par="${8:-0}"
+    local overlap="${9:-0}"
+    shift 9 || true   # fewer than 9 args is fine; whatever remains is extra env
+    local extra_env="${*:-}"
+
+    echo "============================================"
+    echo "Submitting: ${config_name}"
+    echo " TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}"
+    echo " ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}"
+    [ -n "${extra_env}" ] && echo " Extra env: ${extra_env}"
+    echo "============================================"
+
+    local env_exports=""
+    env_exports+="TP=${tp},"
+    env_exports+="PP=${pp},"
+    env_exports+="ZERO_STAGE=${zero},"
+    env_exports+="MICRO_BATCH_SIZE=${mbs},"
+    env_exports+="GLOBAL_BATCH_SIZE=${gbs},"
+    env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt},"
+    env_exports+="USE_SEQUENCE_PARALLEL=${seq_par},"
+    env_exports+="USE_OVERLAP_COMM=${overlap},"
+    env_exports+="CONFIG_NAME=${config_name}"
+
+    local sbatch_cmd="sbatch"
+    sbatch_cmd+=" --partition=${PARTITION}"
+    sbatch_cmd+=" --nodes=${NODES}"
+    sbatch_cmd+=" --job-name=sweep_${config_name}"
+
+    # Merge extra env vars (NCCL tuning) so exactly one --export flag is emitted
+    if [ -n "${extra_env}" ]; then
+        env_exports+=",${extra_env}"
+    fi
+    sbatch_cmd+=" --export=ALL,${env_exports}"
+
+    sbatch_cmd+=" ${SBATCH_SCRIPT}"
+
+    if [ "${DRY_RUN}" -eq 1 ]; then
+        echo "[DRY RUN] ${sbatch_cmd}"
+        echo ""
+        return
+    fi
+
+    local job_output
+    job_output=$(eval "${sbatch_cmd}")
+    local job_id
+    job_id=$(echo "${job_output}" | awk '{print $NF}')   # sbatch prints "Submitted batch job <id>"
+    echo "Submitted job ${job_id} for config ${config_name}"
+    echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap}" >> "${RESULTS_DIR}/sweep_jobs.csv"
+}
+
+# ============================================================
+# Initialize tracking file (skipped on --dry-run so an earlier sweep's job list is not wiped)
+# ============================================================
+[ "${DRY_RUN}" -eq 1 ] || echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap" > "${RESULTS_DIR}/sweep_jobs.csv"
+
+# ============================================================
+# PARALLELISM SWEEP (Configs 1-11)
+# ============================================================
+echo ""
+echo "========== PARALLELISM SWEEP =========="
+echo ""
+
+#             config_name        TP PP ZeRO MBS GBS ACT SEQ OVR
+submit_config "01_baseline"      8  2  0    1   64  0   0   0
+submit_config "02_more_pp"       8  4  0    1   64  0   0   0
+submit_config "03_zero1"         8  2  1    1   64  0   0   0
+submit_config "04_larger_mbs"    8  2  1    2   128 0   0   0
+submit_config "05_pp4_zero1"     8  4  1    1   128 0   0   0
+submit_config "06_zero2"         8  2  2    1   64  0   0   0
+submit_config "07_full_pp"       8  8  0    1   64  0   0   0
+submit_config "08_tp4_pp4"       4  4  1    1   64  0   0   0
+submit_config "09_act_ckpt"      8  2  1    1   64  1   0   0
+submit_config "10_seq_parallel"  8  2  1    1   64  0   1   0
+submit_config "11_overlap_comm"  8  2  1    1   64  0   0   1
+
+# ============================================================
+# Wait for parallelism sweep to determine best config
+# If not waiting, env sweep uses config 03 (TP8/PP2/ZeRO1) as default
+# ============================================================
+echo ""
+echo "========== ENVIRONMENT FLAGS SWEEP =========="
+echo "(Using TP=8 PP=2 ZeRO=1 as base for env flag sweep)"
+echo ""
+
+# Base parallelism for env sweep
+BASE_TP=8
+BASE_PP=2
+BASE_ZERO=1
+BASE_MBS=1
+BASE_GBS=64
+
+#             config_name           TP         PP         ZeRO         MBS         GBS         ACT SEQ OVR extra_env
+submit_config "12_nccl_ring"        ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Ring"
+submit_config "13_nccl_tree"        ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Tree"
+submit_config "14_nccl_no_tuner"    ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_TUNER_PLUGIN="
+submit_config "15_nccl_chunk_4mb"   ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_P2P_NET_CHUNKSIZE=4194304"
+submit_config "16_cuda_max_conn_1"  ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "CUDA_DEVICE_MAX_CONNECTIONS=1"
+submit_config "17_nccl_buf_16mb"    ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=16777216"
+submit_config "18_nccl_buf_32mb"    ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=33554432"
+submit_config "19_nccl_min_ch_16"   ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=16"
+submit_config "20_nccl_min_ch_32"   ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=32"
+
+echo ""
+echo "========== SWEEP SUBMITTED =========="
+echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs.csv"
+echo ""
+echo "To monitor: watch 'squeue -u \$USER'"
+echo "When all jobs finish, run: python parse_results.py"
diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh b/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
new file mode 100644
index 000000000..977b0149c
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# sweep_runner_v2.sh - Sweep v2: ZeRO-2 (no PP), ZeRO-3, memory push, fusion ops
+#
+# All configs use PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True (set in sbatch).
+# Optimal NCCL flags are the defaults already in the sbatch script.
+#
+# Usage: bash sweep_runner_v2.sh [--dry-run]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch"
+RESULTS_DIR="${SCRIPT_DIR}/sweep_results"
+NODES=8
+PARTITION="b200"
+
+DRY_RUN=0
+if [ "${1:-}" = "--dry-run" ]; then
+    DRY_RUN=1
+    echo "[DRY RUN] Will print commands without submitting"
+fi
+
+mkdir -p "${RESULTS_DIR}" logs
+
+# ============================================================
+# submit_config NAME TP PP ZERO MBS GBS [ACT_CKPT] [SEQ_PAR] [OVERLAP] [SEQ_LEN] [FUSIONS] [EXTRA_ENV...]
+# Extends v1 helper with seq_length and enable_fusions params.
+# ============================================================
+submit_config() {
+    local config_name="$1"
+    local tp="$2"
+    local pp="$3"
+    local zero="$4"
+    local mbs="$5"
+    local gbs="$6"
+    local act_ckpt="${7:-0}"
+    local seq_par="${8:-0}"
+    local overlap="${9:-0}"
+    local seq_length="${10:-2048}"
+    local enable_fusions="${11:-0}"
+    shift 11 || true   # fewer than 11 args is fine; whatever remains is extra env
+    local extra_env="${*:-}"
+
+    echo "============================================"
+    echo "Submitting: ${config_name}"
+    echo " TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}"
+    echo " SeqLen=${seq_length} Fusions=${enable_fusions}"
+    echo " ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}"
+    [ -n "${extra_env}" ] && echo " Extra env: ${extra_env}"
+    echo "============================================"
+
+    local env_exports=""
+    env_exports+="TP=${tp},"
+    env_exports+="PP=${pp},"
+    env_exports+="ZERO_STAGE=${zero},"
+    env_exports+="MICRO_BATCH_SIZE=${mbs},"
+    env_exports+="GLOBAL_BATCH_SIZE=${gbs},"
+    env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt},"
+    env_exports+="USE_SEQUENCE_PARALLEL=${seq_par},"
+    env_exports+="USE_OVERLAP_COMM=${overlap},"
+    env_exports+="SEQ_LENGTH=${seq_length},"
+    env_exports+="ENABLE_FUSIONS=${enable_fusions},"
+    env_exports+="CONFIG_NAME=${config_name}"
+
+    local sbatch_cmd="sbatch"
+    sbatch_cmd+=" --partition=${PARTITION}"
+    sbatch_cmd+=" --nodes=${NODES}"
+
+    # Add extra env vars (NCCL overrides etc.)
+    if [ -n "${extra_env}" ]; then
+        sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}"
+    else
+        sbatch_cmd+=" --export=ALL,${env_exports}"
+    fi
+
+    sbatch_cmd+=" ${SBATCH_SCRIPT}"
+
+    if [ "${DRY_RUN}" -eq 1 ]; then
+        echo "[DRY RUN] ${sbatch_cmd}"
+        echo ""
+        return
+    fi
+
+    local job_output
+    job_output=$(eval "${sbatch_cmd}")
+    local job_id
+    job_id=$(echo "${job_output}" | awk '{print $NF}')   # sbatch prints "Submitted batch job <id>"
+    echo "Submitted job ${job_id} for config ${config_name}"
+    echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap},${seq_length},${enable_fusions}" >> "${RESULTS_DIR}/sweep_jobs_v2.csv"
+}
+
+# ============================================================
+# Initialize tracking file (skipped on --dry-run so an earlier sweep's job list is not wiped)
+# ============================================================
+[ "${DRY_RUN}" -eq 1 ] || echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap,seq_length,enable_fusions" > "${RESULTS_DIR}/sweep_jobs_v2.csv"
+
+# ============================================================
+# ZeRO-2 WITHOUT PIPELINE PARALLELISM (PP=1)
+# ============================================================
+echo ""
+echo "========== ZeRO-2 SWEEP (PP=1) =========="
+echo ""
+
+#             config_name                 TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "21_zero2_tp8_pp1"          8  1  2    1   64  0   0   0   2048    0
+submit_config "22_zero2_tp8_pp1_mbs2"     8  1  2    2   64  0   0   0   2048    0
+submit_config "23_zero2_tp4_pp1"          4  1  2    1   64  0   0   0   2048    0
+
+# ============================================================
+# ZeRO-3 (PP=1)
+# ============================================================
+echo ""
+echo "========== ZeRO-3 SWEEP (PP=1) =========="
+echo ""
+
+#             config_name                 TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "24_zero3_tp8_pp1"          8  1  3    1   64  0   0   0   2048    0
+submit_config "25_zero3_tp8_pp1_mbs2"     8  1  3    2   64  0   0   0   2048    0
+submit_config "26_zero3_tp4_pp1"          4  1  3    1   64  0   0   0   2048    0
+submit_config "27_zero3_tp8_pp1_overlap"  8  1  3    1   64  0   0   1   2048    0
+
+# ============================================================
+# MEMORY PUSH / SEQ LENGTH / FUSIONS
+# ============================================================
+echo ""
+echo "========== MEMORY PUSH SWEEP =========="
+echo ""
+
+#             config_name                 TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "28_mem_seq4k_tp8_pp2"      8  2  0    1   64  0   0   0   4096    0
+submit_config "29_mem_fused_tp8_pp8"      8  8  0    1   64  0   0   0   2048    1
+
+# ============================================================
+# EXPANDABLE SEGMENTS IMPACT ON BEST CONFIG
+# Re-test best config (TP8/PP8/ZeRO0) — now with expandable_segments
+# enabled automatically via the updated sbatch.
+# ============================================================
+echo ""
+echo "========== EXPANDABLE SEGMENTS IMPACT =========="
+echo ""
+
+#             config_name                 TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE
+submit_config "30_best_expand_seg"        8  8  0    1   64  0   0   0   2048    0
+
+echo ""
+echo "========== SWEEP V2 SUBMITTED =========="
+echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs_v2.csv"
+echo ""
+echo "Total configs: 10"
+echo "To monitor: watch 'squeue -u \$USER'"
+echo "When all jobs finish, run: python parse_results.py --jobs-csv sweep_results/sweep_jobs_v2.csv"
diff --git a/3.test_cases/pytorch/deepspeed/upload_results.sh b/3.test_cases/pytorch/deepspeed/upload_results.sh
new file mode 100755
index 000000000..c88c79077
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/upload_results.sh
@@ -0,0 +1,260 @@
+#!/bin/bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0 +# +# upload_results.sh - Upload benchmark results to S3 and CloudWatch +# +# Usage: +# bash upload_results.sh [--results-dir sweep_results] [--region us-east-1] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RESULTS_DIR="${1:---results-dir}" +CW_REGION="us-east-1" +S3_REGION="us-west-2" +S3_BUCKET="paragao-new-nemo-squash-container" +CW_NAMESPACE="DeepSpeed/B200Benchmarks" +CW_DASHBOARD_NAME="DeepSpeed-B200-Benchmarks" + +# Parse args +while [[ $# -gt 0 ]]; do + case $1 in + --results-dir) RESULTS_DIR="$2"; shift 2 ;; + --region) CW_REGION="$2"; shift 2 ;; + *) shift ;; + esac +done + +: "${RESULTS_DIR:=${SCRIPT_DIR}/sweep_results}" + +if [ ! -d "${RESULTS_DIR}" ]; then + echo "Error: Results directory not found: ${RESULTS_DIR}" + exit 1 +fi + +# ============================================================ +# 1. Upload JSON files to S3 +# ============================================================ +echo "=== Uploading results to S3 ===" + +# Determine S3 path: benchmark-results/b200/2026/March/04/ +YEAR=$(date -u +%Y) +MONTH=$(date -u +%B) +DAY=$(date -u +%d) +S3_PREFIX="benchmark-results/b200/${YEAR}/${MONTH}/${DAY}" + +for json_file in "${RESULTS_DIR}"/training_bench_*.json; do + if [ ! -f "${json_file}" ]; then + echo "No result JSON files found in ${RESULTS_DIR}" + break + fi + filename=$(basename "${json_file}") + echo " Uploading ${filename} -> s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" + aws s3 cp "${json_file}" "s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" \ + --region "${S3_REGION}" \ + --content-type "application/json" +done + +echo "S3 upload complete: s3://${S3_BUCKET}/${S3_PREFIX}/" +echo "" + +# ============================================================ +# 2. Publish metrics to CloudWatch +# ============================================================ +echo "=== Publishing metrics to CloudWatch ===" + +for json_file in "${RESULTS_DIR}"/training_bench_*.json; do + if [ ! 
-f "${json_file}" ]; then + break + fi + + filename=$(basename "${json_file}") + + # Extract metadata and summary using python + read -r config_name tp pp zero_stage precision avg_tflops avg_step_time timestamp < <( + python3 -c " +import json, sys +with open('${json_file}') as f: + d = json.load(f) +m = d['metadata'] +s = d['summary'] +sc = m.get('sweep_config', {}) +print( + sc.get('config_name', 'unknown'), + sc.get('tp', 8), + sc.get('pp', 2), + sc.get('zero_stage', 1), + m.get('precision', 'bf16'), + s.get('steady_state_avg_tflops_per_gpu', 0), + s.get('steady_state_avg_step_time_s', 0), + m.get('timestamp', '$(date -u +%Y-%m-%dT%H:%M:%SZ)') +) +" + ) + + echo " Publishing: ${config_name} (TFLOPS=${avg_tflops}, StepTime=${avg_step_time}s)" + + # Publish with full dimensions + aws cloudwatch put-metric-data \ + --namespace "${CW_NAMESPACE}" \ + --region "${CW_REGION}" \ + --metric-data "[ + { + \"MetricName\": \"TFLOPSPerGPU\", + \"Value\": ${avg_tflops}, + \"Unit\": \"Count\", + \"Timestamp\": \"${timestamp}\", + \"Dimensions\": [ + {\"Name\": \"model_size\", \"Value\": \"103b\"}, + {\"Name\": \"tp\", \"Value\": \"${tp}\"}, + {\"Name\": \"pp\", \"Value\": \"${pp}\"}, + {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"}, + {\"Name\": \"precision\", \"Value\": \"${precision}\"}, + {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} + ] + }, + { + \"MetricName\": \"StepTimeSeconds\", + \"Value\": ${avg_step_time}, + \"Unit\": \"Seconds\", + \"Timestamp\": \"${timestamp}\", + \"Dimensions\": [ + {\"Name\": \"model_size\", \"Value\": \"103b\"}, + {\"Name\": \"tp\", \"Value\": \"${tp}\"}, + {\"Name\": \"pp\", \"Value\": \"${pp}\"}, + {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"}, + {\"Name\": \"precision\", \"Value\": \"${precision}\"}, + {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} + ] + } + ]" + + # Also publish with just config_name dimension for easy dashboard queries + aws cloudwatch put-metric-data \ + --namespace 
"${CW_NAMESPACE}" \ + --region "${CW_REGION}" \ + --metric-data "[ + { + \"MetricName\": \"TFLOPSPerGPU\", + \"Value\": ${avg_tflops}, + \"Unit\": \"Count\", + \"Timestamp\": \"${timestamp}\", + \"Dimensions\": [ + {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} + ] + }, + { + \"MetricName\": \"StepTimeSeconds\", + \"Value\": ${avg_step_time}, + \"Unit\": \"Seconds\", + \"Timestamp\": \"${timestamp}\", + \"Dimensions\": [ + {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} + ] + } + ]" + +done + +echo "CloudWatch metrics published to namespace: ${CW_NAMESPACE}" +echo "" + +# ============================================================ +# 3. Create/Update CloudWatch Dashboard +# ============================================================ +echo "=== Creating CloudWatch Dashboard ===" + +# Build metric entries dynamically from results +TFLOPS_METRICS="" +STEPTIME_METRICS="" +TABLE_METRICS="" + +for json_file in "${RESULTS_DIR}"/training_bench_*.json; do + if [ ! -f "${json_file}" ]; then + break + fi + + config_name=$(python3 -c " +import json +with open('${json_file}') as f: + d = json.load(f) +print(d['metadata'].get('sweep_config', {}).get('config_name', 'unknown')) +") + + TFLOPS_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}]," + STEPTIME_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}]," + TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} TFLOPS\"}]," + TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} StepTime\"}]," +done + +# Remove trailing commas +TFLOPS_METRICS="${TFLOPS_METRICS%,}" +STEPTIME_METRICS="${STEPTIME_METRICS%,}" +TABLE_METRICS="${TABLE_METRICS%,}" + +DASHBOARD_BODY=$(cat < Date: Fri, 6 Mar 2026 11:31:01 +0000 Subject: [PATCH 2/5] Update README, Makefile, 
and QLoRA Dockerfile for PR readiness - Rewrite README as use-case-focused guide: GPT-103B pretraining, QLoRA fine-tuning, and Llama2 fine-tuning with best practices and proper configuration docs (no benchmark numbers) - Simplify Makefile: best-config train target, remove sweep/upload targets - Standardize QLoRA Dockerfile: pytorch:25.04-py3 base, EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1, OFI-NCCL symlinks, proper NCCL/EFA env vars - Remove sweep runners and upload script from tracked files (internal tooling) --- 3.test_cases/pytorch/deepspeed/Makefile | 52 +++- 3.test_cases/pytorch/deepspeed/README.md | 192 +++++++++---- .../pytorch/deepspeed/qlora/Dockerfile | 164 ++++++++--- .../pytorch/deepspeed/qlora/requirements.txt | 2 +- .../pytorch/deepspeed/sweep_runner.sh | 144 ---------- .../pytorch/deepspeed/sweep_runner_v2.sh | 154 ----------- .../pytorch/deepspeed/upload_results.sh | 260 ------------------ 7 files changed, 322 insertions(+), 646 deletions(-) delete mode 100755 3.test_cases/pytorch/deepspeed/sweep_runner.sh delete mode 100644 3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh delete mode 100755 3.test_cases/pytorch/deepspeed/upload_results.sh diff --git a/3.test_cases/pytorch/deepspeed/Makefile b/3.test_cases/pytorch/deepspeed/Makefile index e4615c60f..d2b46ee82 100644 --- a/3.test_cases/pytorch/deepspeed/Makefile +++ b/3.test_cases/pytorch/deepspeed/Makefile @@ -1,12 +1,52 @@ -ENROOT_IMAGE=deepspeed +ENROOT_IMAGE ?= deepspeed +APPS_PATH ?= /fsx/apps +SQUASH_FILE ?= $(APPS_PATH)/$(ENROOT_IMAGE).sqsh +PARTITION ?= b200 +NODES ?= 8 +LOGS_DIR ?= logs +RESULTS_DIR ?= sweep_results -all: build clean import +.PHONY: all build clean import build-remote train parse help + +all: build import + +help: + @echo "Container targets:" + @echo " build - Build Docker image locally" + @echo " import - Convert Docker image to Enroot squash file" + @echo " build-remote - Build image on a compute node via sbatch" + @echo " clean - Remove local squash file" + @echo "" + @echo 
"Training targets:" + @echo " train - Submit 103B GPT pretraining (best config: TP=8, PP=8, fusions)" + @echo "" + @echo "Results targets:" + @echo " parse - Parse training logs into benchmark JSON" + +# ---- Container ---- build: - docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile . + docker build -t $(ENROOT_IMAGE) -f 0.deepspeed.dockerfile . + +import: + mkdir -p $(APPS_PATH) + enroot import -o $(SQUASH_FILE) dockerd://$(ENROOT_IMAGE):latest + +build-remote: + sbatch 1.build-image.sbatch clean: - -rm ${ENROOT_IMAGE}.sqsh + -rm -f $(SQUASH_FILE) -import: - enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest +# ---- Training (best config: TP=8, PP=8, ZeRO=0, fusions enabled) ---- + +train: + sbatch --partition=$(PARTITION) --nodes=$(NODES) \ + --export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \ + pretrain_gpt_103b.sbatch + +# ---- Results ---- + +parse: + python3 parse_results.py --jobs-csv $(RESULTS_DIR)/sweep_jobs.csv \ + --logs-dir $(LOGS_DIR) --output-dir $(RESULTS_DIR) diff --git a/3.test_cases/pytorch/deepspeed/README.md b/3.test_cases/pytorch/deepspeed/README.md index fd2ef7524..e873d1af8 100644 --- a/3.test_cases/pytorch/deepspeed/README.md +++ b/3.test_cases/pytorch/deepspeed/README.md @@ -1,87 +1,179 @@ -# DeepSpeed Test Cases +# DeepSpeed on AWS -[DeepSpeed](https://github.com/microsoft/DeepSpeed) enables world's most powerful language models like MT-530B and BLOOM. It is an easy-to-use deep learning optimization software suite that powers unprecedented scale and speed for both training and inference. `deepspeed` illustrates several example test cases for DeepSpeed training on AWS. +[DeepSpeed](https://github.com/microsoft/DeepSpeed) is a deep learning optimization library that enables efficient distributed training at scale. This directory contains test cases for running DeepSpeed workloads on AWS GPU clusters, covering large-scale pretraining and parameter-efficient fine-tuning. -## 1. 
Preparation +## Use Cases -This guide assumes that you have the following: +| Use Case | Description | Location | +|----------|-------------|----------| +| GPT-103B Pretraining | Large-scale GPT pretraining benchmark using Megatron-DeepSpeed with 3D parallelism (TP/PP/DP) and ZeRO optimization | [`pretrain_gpt_103b.sbatch`](pretrain_gpt_103b.sbatch) | +| QLoRA Fine-tuning | Qwen3-8B fine-tuning with QLoRA (4-bit) + DeepSpeed ZeRO-2/3, supports EKS and Slurm | [`qlora/`](qlora/) | +| Llama2 Fine-tuning | Llama2 fine-tuning from HuggingFace weights using Megatron-DeepSpeed | [`examples_megatron_deepspeed/finetune_hf_llama/`](examples_megatron_deepspeed/finetune_hf_llama/) | -* A functional Slurm cluster on AWS. -* Docker, [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) installed. -* An FSx for Lustre filesystem mounted on `/fsx`. +## Prerequisites -We recommend that you set up a Slurm cluster using the templates in the architectures [directory](../../1.architectures). You need to set the following environment variables to run these test cases: +- A functional Slurm cluster on AWS. We recommend [SageMaker HyperPod](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html) or the templates in the [architectures directory](../../1.architectures). +- [Docker](https://docs.docker.com/engine/install/), [Pyxis](https://github.com/NVIDIA/pyxis), and [Enroot](https://github.com/NVIDIA/enroot) installed on compute nodes. +- An [FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html) filesystem mounted on `/fsx`. +- NVIDIA GPU instances with [EFA networking](https://aws.amazon.com/hpc/efa/) (B200, H100, A100, etc.). 
-```bash -export APPS_PATH=/fsx/apps -export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh -export FSX_PATH=/fsx -export MODEL_PATH=$FSX_PATH/deepspeed -export TEST_CASE_PATH=${HOME}/18.deepspeed # where you copy the test case or set to your test case path -cd $TEST_CASE_PATH # Note that we assume that you are here during the following command executions -``` +## 1. GPT-103B Pretraining Benchmark + +A ~103B-parameter GPT model (80 layers, hidden=12288, heads=96, FFN=49152) trained with [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) using 3D parallelism (tensor, pipeline, data) and DeepSpeed ZeRO optimization. Designed for benchmarking multi-node GPU clusters. +### Container setup +The container image (`0.deepspeed.dockerfile`) is built on `nvcr.io/nvidia/pytorch:25.04-py3` and includes: -## 2. Build the container +- **EFA 1.47.0** with the bundled aws-ofi-nccl plugin and NCCL tuner +- **NCCL 2.29.3** (upgraded to match B200 host driver) +- **GDRCopy v2.5.1** for GPU-direct RDMA +- **DeepSpeed**, **Transformers 4.44.2**, and multi-node SSH configuration -Before running training jobs, you need to use a build docker container image. [Enroot](https://github.com/NVIDIA/enroot) will be used to turn the image into unprivileged sandbox for Slurm but build step may exceed the storage available on the head node so we reccomend building it on a compute node following instructions below (option 2) +Build the container on a compute node (recommended, avoids head node storage limits): -### Option 1: build image on a head node +```bash +sbatch 1.build-image.sbatch +``` + +Or build locally and convert to a squash file: + +```bash +make build # docker build +make import # enroot import to /fsx/apps/deepspeed.sqsh +``` -Below are the steps you need to follow: +### Data preparation +The benchmark uses preprocessed data in Megatron format with the GPT-2 tokenizer. -1. Build the Docker image with the command below in this directory. +1. 
Download the GPT-2 tokenizer: ```bash - docker build -t deepspeed -f 0.deepspeed.dockerfile . + mkdir -p /fsx/deepspeed/data && cd /fsx/deepspeed/data + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt ``` - -2. Once the Docker image is built, you can check if it is present with `docker images`. You should see an output similar to this one: +2. Prepare training data (any text corpus works; for benchmarking, synthetic data is sufficient): ```bash - REPOSITORY TAG IMAGE ID CREATED SIZE - deepspeed latest b6c49033c424 9 minutes ago 23.3GB - ... + python3 -c " + import json + with open('synthetic_corpus.json', 'w') as f: + for i in range(50000): + json.dump({'text': 'The quick brown fox ' * 100}, f) + f.write('\n') + " ``` -3. Convert the Docker image to a squash file with the command below. +3. Clone Megatron-DeepSpeed and preprocess: ```bash - enroot import -o ${ENROOT_IMAGE} dockerd://deepspeed:latest + git clone https://github.com/microsoft/Megatron-DeepSpeed /fsx/deepspeed/Megatron-DeepSpeed + + python3 /fsx/deepspeed/Megatron-DeepSpeed/tools/preprocess_data.py \ + --input synthetic_corpus.json \ + --output-prefix BookCorpusDataset_text_document \ + --vocab-file gpt2-vocab.json \ + --merge-file gpt2-merges.txt \ + --tokenizer-type GPT2BPETokenizer \ + --workers 16 --append-eod ``` - The file will be stored in the `/apps` directory (by default). The output should look as below. 
+### Running + +Submit the best-performing configuration (TP=8, PP=8, ZeRO-0, fusions enabled): + +```bash +make train +# or equivalently: +sbatch --partition=b200 --nodes=8 \ + --export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \ + pretrain_gpt_103b.sbatch +``` + +Override parallelism settings for custom configurations: + +```bash +sbatch --nodes=8 \ + --export=ALL,TP=8,PP=4,ZERO_STAGE=1,MICRO_BATCH_SIZE=2,CONFIG_NAME=my_config \ + pretrain_gpt_103b.sbatch +``` + +#### Environment variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `TP` | 8 | Tensor parallel size | +| `PP` | 2 | Pipeline parallel size | +| `ZERO_STAGE` | 1 | DeepSpeed ZeRO stage (0, 1, 2, or 3) | +| `MICRO_BATCH_SIZE` | 1 | Per-GPU micro batch size | +| `GLOBAL_BATCH_SIZE` | 64 | Global batch size | +| `SEQ_LENGTH` | 2048 | Sequence length | +| `ENABLE_FUSIONS` | 0 | Set to 1 to enable kernel fusion ops | +| `USE_ACTIVATION_CHECKPOINTING` | 0 | Set to 1 for activation checkpointing | +| `USE_OVERLAP_COMM` | 0 | Set to 1 to overlap communication with compute | +| `TRAIN_ITERS` | 50 | Number of training iterations | +| `CONFIG_NAME` | baseline | Label for this configuration | + +### Best practices + +The following recommendations are based on extensive parameter sweeps across parallelism strategies, ZeRO stages, NCCL flags, and memory optimizations: + +**Parallelism strategy:** + +- **Maximize pipeline parallelism** (PP) alongside tensor parallelism (TP) for best throughput. For an 8-node cluster with 8 GPUs per node, TP=8 with PP=8 is optimal. +- **Enable kernel fusion ops** (`ENABLE_FUSIONS=1`) for a significant throughput improvement over the non-fused baseline. This enables masked-softmax, bias-gelu, bias-dropout, and gradient-accumulation fusions. +- **ZeRO-0 outperforms ZeRO-1** when the data-parallel group size is small (e.g., DP=1 with TP=8/PP=8). ZeRO-1's allreduce overhead is not amortized. 
+ +**ZeRO-2 and ZeRO-3:** + +- ZeRO-2 and ZeRO-3 are **incompatible with pipeline parallelism** in Megatron-DeepSpeed. The sbatch script automatically sets `PP=1` and adds `--no-pipeline-parallel` when `ZERO_STAGE >= 2`. +- ZeRO-3's parameter partitioning **enables lower TP values** that ZeRO-2 cannot fit in memory (e.g., TP=4 works with ZeRO-3 but OOMs with ZeRO-2). +- **Increasing micro-batch size** (e.g., `MICRO_BATCH_SIZE=2`) substantially improves throughput for ZeRO-2 and ZeRO-3 configurations. +- `overlap_comm` provides only marginal improvement (~2%) with ZeRO-3. - ```bash - [INFO] Fetching image +**NCCL and networking:** - 36a8c752c28a2db543d2a632a3fc1fcbd5789a6f3d45b9d3a24632420dedcfa8 +- NCCL environment flag variations (buffer sizes, chunk sizes, min channels) have **negligible impact** on throughput (~1% range). The defaults in the sbatch script are well-tuned. +- **Do not set `NCCL_ALGO=Tree`** on EFA-based clusters -- it causes hangs. Let the NCCL tuner plugin (`libnccl-ofi-tuner.so`) choose the algorithm automatically. +- **Do not set `NCCL_PROTO` or `FI_EFA_FORK_SAFE`** -- these are not needed and can cause issues. + +**Memory:** + +- `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` is set by default in the sbatch script. Note the **capital T** is required in pytorch:25.04 containers; lowercase `true` causes a `RuntimeError`. +- Sequence length 4096 exceeds available HBM even with TP=8/PP=2 on B200 (178GB per GPU). Use seq=2048 for this model size. + +### Parsing results + +After training completes, parse the Slurm logs into benchmark JSON using `parse_results.py`: + +```bash +# Single log file +python3 parse_results.py --log-file logs/deepspeed-pretrain-103b_123.out --config-name my_config + +# Multiple jobs tracked in a CSV +python3 parse_results.py --jobs-csv sweep_results/sweep_jobs.csv --output-dir sweep_results +``` - [INFO] Extracting image content... - [INFO] Creating squashfs filesystem... 
+### Known issues - Parallel mksquashfs: Using 32 processors - Creating 4.0 filesystem on /apps/deepspeed.sqsh, block size 131072. - [========================================================================================================================================================================================================================-] 291068/291068 100% +- **torchrun shebang**: The container's `torchrun` may have a shebang pointing to the wrong Python version. The sbatch script uses `python3 -m torch.distributed.run` as a workaround. +- **`expandable_segments` case sensitivity**: Must use `expandable_segments:True` (capital T) in pytorch:25.04-py3. Lowercase causes a `RuntimeError`. +- **NCCL Tree algorithm**: Incompatible with EFA topology -- causes hangs. Do not set `NCCL_ALGO=Tree`. +- **Sequence parallelism**: Incompatible with pipeline parallelism (PP>1) in this Megatron-DeepSpeed version. - Exportable Squashfs 4.0 filesystem, gzip compressed, data block size 131072 - uncompressed data, uncompressed metadata, uncompressed fragments, uncompressed xattrs - duplicates are not removed - ... - ``` +## 2. QLoRA Fine-tuning (Qwen3-8B) -Once done proceed to the next stage. +Fine-tune [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) using QLoRA (4-bit quantization + LoRA adapters) with DeepSpeed ZeRO-2 or ZeRO-3. Supports deployment on SageMaker HyperPod with both EKS and Slurm orchestrators, including MIG GPU partitioning and automatic checkpoint resume. -### Option 2: Build image on a compute node +The QLoRA use case has its own container (`qlora/Dockerfile`) optimized for the same infrastructure best practices (EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1). -In this option, you will use a compute node to build the image. Submit the job as: +See [`qlora/README.md`](qlora/README.md) for full instructions. - ```bash - sbatch 1.build-image.sbatch - ``` +## 3. 
Llama2 Fine-tuning (Megatron-DeepSpeed) +Fine-tune Llama2 from HuggingFace weights using Megatron-DeepSpeed. Includes weight conversion from HuggingFace to Megatron format and fine-tuning on the Stanford Alpaca dataset. Uses the shared container image (`0.deepspeed.dockerfile`). -Once the image is prepared, you can proceed to `examples_*` directory for various deepspeed test cases. \ No newline at end of file +See [`examples_megatron_deepspeed/finetune_hf_llama/README.md`](examples_megatron_deepspeed/finetune_hf_llama/README.md) for full instructions. \ No newline at end of file diff --git a/3.test_cases/pytorch/deepspeed/qlora/Dockerfile b/3.test_cases/pytorch/deepspeed/qlora/Dockerfile index 27f9bcd8c..32f02d35e 100644 --- a/3.test_cases/pytorch/deepspeed/qlora/Dockerfile +++ b/3.test_cases/pytorch/deepspeed/qlora/Dockerfile @@ -1,61 +1,154 @@ # Dockerfile for QLoRA Fine-tuning of Qwen3-8B # ============================================= -# Base Image: NVIDIA CUDA 12.8 with cuDNN 9 -# Python: 3.10 +# Base Image: PyTorch 25.04 with CUDA 12.9.0 (supports Blackwell, Hopper, Ampere) +# Python: 3.12 (bundled with pytorch:25.04-py3) # Key Libraries: PyTorch, Transformers, PEFT, BitsAndBytes, DeepSpeed +# Networking: EFA 1.47, NCCL 2.29.3, GDRCopy 2.5.1 # # Build: # docker build -t qwen3-qlora-training:latest . # # If you encounter CUBLAS errors at runtime (typically caused by CUDA -# library conflicts on the host), switch the torch index URL below to -# cu126 as a fallback — see docs/TROUBLESHOOTING.md. +# library conflicts on the host), see docs/TROUBLESHOOTING.md. 
-# Stage 1: Base image with CUDA -FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 AS base +# ============================================================ +# Base image: PyTorch 25.04 with CUDA 12.9.0 +# ============================================================ +FROM nvcr.io/nvidia/pytorch:25.04-py3 -# Prevent interactive prompts during build ENV DEBIAN_FRONTEND=noninteractive -# Install system dependencies -RUN apt-get update && apt-get install -y \ - python3.10 \ - python3.10-dev \ - python3-pip \ - python3.10-venv \ +# ============================================================ +# 1. System packages and SSH setup (needed for multi-node training) +# ============================================================ +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + && rm -rf /opt/hpcx/ompi \ + && rm -rf /usr/local/ucx \ + && ldconfig + +RUN apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + build-essential \ + cmake \ + curl \ + gcc \ + gdb \ git \ git-lfs \ + gnupg \ + kmod \ + libtool \ + openssh-client \ + openssh-server \ wget \ - curl \ && rm -rf /var/lib/apt/lists/* -# Set Python 3.10 as default -RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \ - ln -sf /usr/bin/pip3 /usr/bin/pip +# SSH configuration for multi-node +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config -# Upgrade pip -RUN pip install --upgrade pip setuptools wheel +# ============================================================ +# 2. 
Install EFA Installer 1.47.0 +# ============================================================ +ENV EFA_INSTALLER_VERSION=1.47.0 +WORKDIR /tmp +RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && cd / && rm -rf /tmp/aws-efa-installer -# Set working directory -WORKDIR /app +# ============================================================ +# 3. NCCL plugin symlinks +# EFA installer names the plugin libnccl-net-ofi.so but NCCL +# looks for libnccl-net-aws-ofi.so. Without this symlink NCCL +# falls back to TCP sockets silently. +# ============================================================ +RUN rm -rf /opt/amazon/aws-ofi-nccl + +RUN ln -sf /opt/amazon/ofi-nccl/lib/libnccl-net-ofi.so \ + /opt/amazon/ofi-nccl/lib/libnccl-net-aws-ofi.so && \ + ln -sf /opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \ + /opt/amazon/ofi-nccl/lib/libnccl-tuner-aws-ofi.so + +# ============================================================ +# 4. Upgrade NCCL to 2.29.3 (requires CUDA >= 12.9) +# ============================================================ +ENV NCCL_VERSION=2.29.3-1 +RUN apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget -qO /tmp/cuda-keyring.deb \ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i /tmp/cuda-keyring.deb && \ + rm /tmp/cuda-keyring.deb && \ + apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages \ + libnccl2=${NCCL_VERSION}+cuda12.9 \ + libnccl-dev=${NCCL_VERSION}+cuda12.9 && \ + rm -rf /var/lib/apt/lists/* + +# ============================================================ +# 5. 
Install GDRCopy v2.5.1 (lib-only) +# ============================================================ +RUN cd /tmp && \ + git clone --branch v2.5.1 --depth 1 https://github.com/NVIDIA/gdrcopy.git && \ + cd gdrcopy && \ + make -j$(nproc) lib lib_install && \ + cd / && rm -rf /tmp/gdrcopy + +# ============================================================ +# 6. Library path configuration +# ============================================================ +RUN echo "/opt/amazon/ofi-nccl/lib" > /etc/ld.so.conf.d/aws-ofi-nccl.conf && \ + echo "/opt/amazon/efa/lib" > /etc/ld.so.conf.d/efa.conf -# Stage 2: Install Python dependencies -FROM base AS dependencies +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/environment 2>/dev/null || true +RUN sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/amazon/ofi-nccl/lib|g' /etc/shinit_v2 2>/dev/null || true + +RUN rm -f /etc/ld.so.cache && ldconfig + +ENV LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}" +ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}" + +# ============================================================ +# 7. OpenMPI tuning for EFA +# ============================================================ +ARG OPEN_MPI_PATH=/opt/amazon/openmpi +RUN echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf + +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun + +# ============================================================ +# 8. 
Python dependencies for QLoRA training +# ============================================================ +WORKDIR /app # Copy requirements first for better caching COPY requirements.txt . -# Install PyTorch with CUDA 12.8 support +# Install PyTorch with CUDA 12.9 support # Note: torch 2.10+ has a breaking LR scheduler change (strict zip) that is # incompatible with some DeepSpeed/transformers versions. Pin to <2.10 until # upstream libraries catch up. -RUN pip install --no-cache-dir 'torch>=2.7.0,<2.10.0' --index-url https://download.pytorch.org/whl/cu128 +RUN pip install --no-cache-dir 'torch>=2.7.0,<2.10.0' --index-url https://download.pytorch.org/whl/cu129 # Install other dependencies RUN pip install --no-cache-dir -r requirements.txt -# Stage 3: Final image with application code -FROM dependencies AS final +# ============================================================ +# 9. Application code +# ============================================================ # Copy source code and entrypoint COPY entrypoint.sh /app/entrypoint.sh @@ -66,14 +159,23 @@ COPY configs/ /app/configs/ # Create directories for outputs and cache RUN mkdir -p /workspace/outputs /workspace/hf_cache -# Set environment variables +# ============================================================ +# 10. 
Environment variables +# ============================================================ ENV PYTHONPATH=/app ENV HF_HOME=/workspace/hf_cache ENV PYTHONUNBUFFERED=1 -# Do NOT set CUDA_VISIBLE_DEVICES here — let torchrun / K8s manage GPU visibility -# DeepSpeed / NCCL settings for multi-GPU communication +# Do NOT set CUDA_VISIBLE_DEVICES here -- let torchrun / K8s manage GPU visibility + +# NCCL / EFA settings for multi-GPU and multi-node communication ENV NCCL_DEBUG=INFO -ENV NCCL_SOCKET_IFNAME=^lo +ENV NCCL_SOCKET_IFNAME=^docker,lo,veth +ENV FI_PROVIDER=efa +ENV NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so + +# PyTorch memory allocator -- expandable segments reduces fragmentation +# Note: capital T is required in pytorch:25.04 containers +ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # Entrypoint reads PET_* env vars set by the Kubeflow Training Operator # and launches torchrun with the correct number of processes per node. diff --git a/3.test_cases/pytorch/deepspeed/qlora/requirements.txt b/3.test_cases/pytorch/deepspeed/qlora/requirements.txt index 893e443a7..9bdf0eb7a 100644 --- a/3.test_cases/pytorch/deepspeed/qlora/requirements.txt +++ b/3.test_cases/pytorch/deepspeed/qlora/requirements.txt @@ -1,6 +1,6 @@ # Core ML Libraries # NOTE: torch is installed separately with the correct CUDA index URL. -# - Docker (see Dockerfile): torch>=2.7.0,<2.10.0 with cu128 +# - Docker (see Dockerfile): torch>=2.7.0,<2.10.0 with cu129 (pytorch:25.04-py3 base) # - Slurm venv (see slurm/README): torch==2.6.0 with cu126 # See docs/TROUBLESHOOTING.md if you encounter CUBLAS errors (typically caused # by environment-level CUDA library conflicts, not a library bug). diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner.sh b/3.test_cases/pytorch/deepspeed/sweep_runner.sh deleted file mode 100755 index 9f7d5a944..000000000 --- a/3.test_cases/pytorch/deepspeed/sweep_runner.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/bin/bash -# Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: MIT-0 -# -# sweep_runner.sh - Automated parameter sweep for DeepSpeed 103B pretraining -# Runs all parallelism and environment flag configurations, collects results. -# -# Usage: bash sweep_runner.sh [--dry-run] - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch" -RESULTS_DIR="${SCRIPT_DIR}/sweep_results" -NODES=8 -PARTITION="b200" - -DRY_RUN=0 -if [ "${1:-}" = "--dry-run" ]; then - DRY_RUN=1 - echo "[DRY RUN] Will print commands without submitting" -fi - -mkdir -p "${RESULTS_DIR}" logs - -# ============================================================ -# Helper: submit a sweep configuration -# ============================================================ -submit_config() { - local config_name="$1" - local tp="$2" - local pp="$3" - local zero="$4" - local mbs="$5" - local gbs="$6" - local act_ckpt="${7:-0}" - local seq_par="${8:-0}" - local overlap="${9:-0}" - shift 9 || true - local extra_env="${*:-}" - - echo "============================================" - echo "Submitting: ${config_name}" - echo " TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}" - echo " ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}" - [ -n "${extra_env}" ] && echo " Extra env: ${extra_env}" - echo "============================================" - - local env_exports="" - env_exports+="TP=${tp}," - env_exports+="PP=${pp}," - env_exports+="ZERO_STAGE=${zero}," - env_exports+="MICRO_BATCH_SIZE=${mbs}," - env_exports+="GLOBAL_BATCH_SIZE=${gbs}," - env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt}," - env_exports+="USE_SEQUENCE_PARALLEL=${seq_par}," - env_exports+="USE_OVERLAP_COMM=${overlap}," - env_exports+="CONFIG_NAME=${config_name}" - - local sbatch_cmd="sbatch" - sbatch_cmd+=" --partition=${PARTITION}" - sbatch_cmd+=" --nodes=${NODES}" - sbatch_cmd+=" --export=ALL,${env_exports}" - sbatch_cmd+=" 
--job-name=sweep_${config_name}" - - # Add extra env vars for NCCL tuning - if [ -n "${extra_env}" ]; then - sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}" - fi - - sbatch_cmd+=" ${SBATCH_SCRIPT}" - - if [ "${DRY_RUN}" -eq 1 ]; then - echo "[DRY RUN] ${sbatch_cmd}" - echo "" - return - fi - - local job_output - job_output=$(eval "${sbatch_cmd}") - local job_id - job_id=$(echo "${job_output}" | awk '{print $NF}') - echo "Submitted job ${job_id} for config ${config_name}" - echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap}" >> "${RESULTS_DIR}/sweep_jobs.csv" -} - -# ============================================================ -# Initialize tracking file -# ============================================================ -echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap" > "${RESULTS_DIR}/sweep_jobs.csv" - -# ============================================================ -# PARALLELISM SWEEP (Configs 1-11) -# ============================================================ -echo "" -echo "========== PARALLELISM SWEEP ==========" -echo "" - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR -submit_config "01_baseline" 8 2 0 1 64 0 0 0 -submit_config "02_more_pp" 8 4 0 1 64 0 0 0 -submit_config "03_zero1" 8 2 1 1 64 0 0 0 -submit_config "04_larger_mbs" 8 2 1 2 128 0 0 0 -submit_config "05_pp4_zero1" 8 4 1 1 128 0 0 0 -submit_config "06_zero2" 8 2 2 1 64 0 0 0 -submit_config "07_full_pp" 8 8 0 1 64 0 0 0 -submit_config "08_tp4_pp4" 4 4 1 1 64 0 0 0 -submit_config "09_act_ckpt" 8 2 1 1 64 1 0 0 -submit_config "10_seq_parallel" 8 2 1 1 64 0 1 0 -submit_config "11_overlap_comm" 8 2 1 1 64 0 0 1 - -# ============================================================ -# Wait for parallelism sweep to determine best config -# If not waiting, env sweep uses config 03 (TP8/PP2/ZeRO1) as default -# ============================================================ -echo "" -echo "========== ENVIRONMENT FLAGS SWEEP ==========" 
-echo "(Using TP=8 PP=2 ZeRO=1 as base for env flag sweep)" -echo "" - -# Base parallelism for env sweep -BASE_TP=8 -BASE_PP=2 -BASE_ZERO=1 -BASE_MBS=1 -BASE_GBS=64 - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR extra_env -submit_config "12_nccl_ring" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Ring" -submit_config "13_nccl_tree" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Tree" -submit_config "14_nccl_no_tuner" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_TUNER_PLUGIN=" -submit_config "15_nccl_chunk_4mb" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_P2P_NET_CHUNKSIZE=4194304" -submit_config "16_cuda_max_conn_1" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "CUDA_DEVICE_MAX_CONNECTIONS=1" -submit_config "17_nccl_buf_16mb" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=16777216" -submit_config "18_nccl_buf_32mb" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=33554432" -submit_config "19_nccl_min_ch_16" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=16" -submit_config "20_nccl_min_ch_32" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=32" - -echo "" -echo "========== SWEEP SUBMITTED ==========" -echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs.csv" -echo "" -echo "To monitor: watch 'squeue -u \$USER'" -echo "When all jobs finish, run: python parse_results.py" diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh b/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh deleted file mode 100644 index 977b0149c..000000000 --- a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh +++ /dev/null @@ -1,154 +0,0 @@ -#!/bin/bash -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: MIT-0 -# -# sweep_runner_v2.sh - Sweep v2: ZeRO-2 (no PP), ZeRO-3, memory push, fusion ops -# -# All configs use PYTORCH_CUDA_ALLOC_CONF=expandable_segments:true (set in sbatch). -# Optimal NCCL flags are the defaults already in the sbatch script. -# -# Usage: bash sweep_runner_v2.sh [--dry-run] - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch" -RESULTS_DIR="${SCRIPT_DIR}/sweep_results" -NODES=8 -PARTITION="b200" - -DRY_RUN=0 -if [ "${1:-}" = "--dry-run" ]; then - DRY_RUN=1 - echo "[DRY RUN] Will print commands without submitting" -fi - -mkdir -p "${RESULTS_DIR}" logs - -# ============================================================ -# Helper: submit a sweep configuration -# Extends v1 helper with seq_length and enable_fusions params. -# ============================================================ -submit_config() { - local config_name="$1" - local tp="$2" - local pp="$3" - local zero="$4" - local mbs="$5" - local gbs="$6" - local act_ckpt="${7:-0}" - local seq_par="${8:-0}" - local overlap="${9:-0}" - local seq_length="${10:-2048}" - local enable_fusions="${11:-0}" - shift 11 || true - local extra_env="${*:-}" - - echo "============================================" - echo "Submitting: ${config_name}" - echo " TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}" - echo " SeqLen=${seq_length} Fusions=${enable_fusions}" - echo " ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}" - [ -n "${extra_env}" ] && echo " Extra env: ${extra_env}" - echo "============================================" - - local env_exports="" - env_exports+="TP=${tp}," - env_exports+="PP=${pp}," - env_exports+="ZERO_STAGE=${zero}," - env_exports+="MICRO_BATCH_SIZE=${mbs}," - env_exports+="GLOBAL_BATCH_SIZE=${gbs}," - env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt}," - env_exports+="USE_SEQUENCE_PARALLEL=${seq_par}," - env_exports+="USE_OVERLAP_COMM=${overlap}," - 
env_exports+="SEQ_LENGTH=${seq_length}," - env_exports+="ENABLE_FUSIONS=${enable_fusions}," - env_exports+="CONFIG_NAME=${config_name}" - - local sbatch_cmd="sbatch" - sbatch_cmd+=" --partition=${PARTITION}" - sbatch_cmd+=" --nodes=${NODES}" - - # Add extra env vars (NCCL overrides etc.) - if [ -n "${extra_env}" ]; then - sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}" - else - sbatch_cmd+=" --export=ALL,${env_exports}" - fi - - sbatch_cmd+=" ${SBATCH_SCRIPT}" - - if [ "${DRY_RUN}" -eq 1 ]; then - echo "[DRY RUN] ${sbatch_cmd}" - echo "" - return - fi - - local job_output - job_output=$(eval "${sbatch_cmd}") - local job_id - job_id=$(echo "${job_output}" | awk '{print $NF}') - echo "Submitted job ${job_id} for config ${config_name}" - echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap},${seq_length},${enable_fusions}" >> "${RESULTS_DIR}/sweep_jobs_v2.csv" -} - -# ============================================================ -# Initialize tracking file -# ============================================================ -echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap,seq_length,enable_fusions" > "${RESULTS_DIR}/sweep_jobs_v2.csv" - -# ============================================================ -# ZeRO-2 WITHOUT PIPELINE PARALLELISM (PP=1) -# ============================================================ -echo "" -echo "========== ZeRO-2 SWEEP (PP=1) ==========" -echo "" - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE -submit_config "21_zero2_tp8_pp1" 8 1 2 1 64 0 0 0 2048 0 -submit_config "22_zero2_tp8_pp1_mbs2" 8 1 2 2 64 0 0 0 2048 0 -submit_config "23_zero2_tp4_pp1" 4 1 2 1 64 0 0 0 2048 0 - -# ============================================================ -# ZeRO-3 (PP=1) -# ============================================================ -echo "" -echo "========== ZeRO-3 SWEEP (PP=1) ==========" -echo "" - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE -submit_config 
"24_zero3_tp8_pp1" 8 1 3 1 64 0 0 0 2048 0 -submit_config "25_zero3_tp8_pp1_mbs2" 8 1 3 2 64 0 0 0 2048 0 -submit_config "26_zero3_tp4_pp1" 4 1 3 1 64 0 0 0 2048 0 -submit_config "27_zero3_tp8_pp1_overlap" 8 1 3 1 64 0 0 1 2048 0 - -# ============================================================ -# MEMORY PUSH / SEQ LENGTH / FUSIONS -# ============================================================ -echo "" -echo "========== MEMORY PUSH SWEEP ==========" -echo "" - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE -submit_config "28_mem_seq4k_tp8_pp2" 8 2 0 1 64 0 0 0 4096 0 -submit_config "29_mem_fused_tp8_pp8" 8 8 0 1 64 0 0 0 2048 1 - -# ============================================================ -# EXPANDABLE SEGMENTS IMPACT ON BEST CONFIG -# Re-test best config (TP8/PP8/ZeRO0) — now with expandable_segments -# enabled automatically via the updated sbatch. -# ============================================================ -echo "" -echo "========== EXPANDABLE SEGMENTS IMPACT ==========" -echo "" - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE -submit_config "30_best_expand_seg" 8 8 0 1 64 0 0 0 2048 0 - -echo "" -echo "========== SWEEP V2 SUBMITTED ==========" -echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs_v2.csv" -echo "" -echo "Total configs: 10" -echo "To monitor: watch 'squeue -u \$USER'" -echo "When all jobs finish, run: python parse_results.py --jobs-csv sweep_results/sweep_jobs_v2.csv" diff --git a/3.test_cases/pytorch/deepspeed/upload_results.sh b/3.test_cases/pytorch/deepspeed/upload_results.sh deleted file mode 100755 index c88c79077..000000000 --- a/3.test_cases/pytorch/deepspeed/upload_results.sh +++ /dev/null @@ -1,260 +0,0 @@ -#!/bin/bash -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: MIT-0 -# -# upload_results.sh - Upload benchmark results to S3 and CloudWatch -# -# Usage: -# bash upload_results.sh [--results-dir sweep_results] [--region us-east-1] - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -RESULTS_DIR="${1:---results-dir}" -CW_REGION="us-east-1" -S3_REGION="us-west-2" -S3_BUCKET="paragao-new-nemo-squash-container" -CW_NAMESPACE="DeepSpeed/B200Benchmarks" -CW_DASHBOARD_NAME="DeepSpeed-B200-Benchmarks" - -# Parse args -while [[ $# -gt 0 ]]; do - case $1 in - --results-dir) RESULTS_DIR="$2"; shift 2 ;; - --region) CW_REGION="$2"; shift 2 ;; - *) shift ;; - esac -done - -: "${RESULTS_DIR:=${SCRIPT_DIR}/sweep_results}" - -if [ ! -d "${RESULTS_DIR}" ]; then - echo "Error: Results directory not found: ${RESULTS_DIR}" - exit 1 -fi - -# ============================================================ -# 1. Upload JSON files to S3 -# ============================================================ -echo "=== Uploading results to S3 ===" - -# Determine S3 path: benchmark-results/b200/2026/March/04/ -YEAR=$(date -u +%Y) -MONTH=$(date -u +%B) -DAY=$(date -u +%d) -S3_PREFIX="benchmark-results/b200/${YEAR}/${MONTH}/${DAY}" - -for json_file in "${RESULTS_DIR}"/training_bench_*.json; do - if [ ! -f "${json_file}" ]; then - echo "No result JSON files found in ${RESULTS_DIR}" - break - fi - filename=$(basename "${json_file}") - echo " Uploading ${filename} -> s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" - aws s3 cp "${json_file}" "s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" \ - --region "${S3_REGION}" \ - --content-type "application/json" -done - -echo "S3 upload complete: s3://${S3_BUCKET}/${S3_PREFIX}/" -echo "" - -# ============================================================ -# 2. Publish metrics to CloudWatch -# ============================================================ -echo "=== Publishing metrics to CloudWatch ===" - -for json_file in "${RESULTS_DIR}"/training_bench_*.json; do - if [ ! 
-f "${json_file}" ]; then - break - fi - - filename=$(basename "${json_file}") - - # Extract metadata and summary using python - read -r config_name tp pp zero_stage precision avg_tflops avg_step_time timestamp < <( - python3 -c " -import json, sys -with open('${json_file}') as f: - d = json.load(f) -m = d['metadata'] -s = d['summary'] -sc = m.get('sweep_config', {}) -print( - sc.get('config_name', 'unknown'), - sc.get('tp', 8), - sc.get('pp', 2), - sc.get('zero_stage', 1), - m.get('precision', 'bf16'), - s.get('steady_state_avg_tflops_per_gpu', 0), - s.get('steady_state_avg_step_time_s', 0), - m.get('timestamp', '$(date -u +%Y-%m-%dT%H:%M:%SZ)') -) -" - ) - - echo " Publishing: ${config_name} (TFLOPS=${avg_tflops}, StepTime=${avg_step_time}s)" - - # Publish with full dimensions - aws cloudwatch put-metric-data \ - --namespace "${CW_NAMESPACE}" \ - --region "${CW_REGION}" \ - --metric-data "[ - { - \"MetricName\": \"TFLOPSPerGPU\", - \"Value\": ${avg_tflops}, - \"Unit\": \"Count\", - \"Timestamp\": \"${timestamp}\", - \"Dimensions\": [ - {\"Name\": \"model_size\", \"Value\": \"103b\"}, - {\"Name\": \"tp\", \"Value\": \"${tp}\"}, - {\"Name\": \"pp\", \"Value\": \"${pp}\"}, - {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"}, - {\"Name\": \"precision\", \"Value\": \"${precision}\"}, - {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} - ] - }, - { - \"MetricName\": \"StepTimeSeconds\", - \"Value\": ${avg_step_time}, - \"Unit\": \"Seconds\", - \"Timestamp\": \"${timestamp}\", - \"Dimensions\": [ - {\"Name\": \"model_size\", \"Value\": \"103b\"}, - {\"Name\": \"tp\", \"Value\": \"${tp}\"}, - {\"Name\": \"pp\", \"Value\": \"${pp}\"}, - {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"}, - {\"Name\": \"precision\", \"Value\": \"${precision}\"}, - {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} - ] - } - ]" - - # Also publish with just config_name dimension for easy dashboard queries - aws cloudwatch put-metric-data \ - --namespace 
"${CW_NAMESPACE}" \ - --region "${CW_REGION}" \ - --metric-data "[ - { - \"MetricName\": \"TFLOPSPerGPU\", - \"Value\": ${avg_tflops}, - \"Unit\": \"Count\", - \"Timestamp\": \"${timestamp}\", - \"Dimensions\": [ - {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} - ] - }, - { - \"MetricName\": \"StepTimeSeconds\", - \"Value\": ${avg_step_time}, - \"Unit\": \"Seconds\", - \"Timestamp\": \"${timestamp}\", - \"Dimensions\": [ - {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} - ] - } - ]" - -done - -echo "CloudWatch metrics published to namespace: ${CW_NAMESPACE}" -echo "" - -# ============================================================ -# 3. Create/Update CloudWatch Dashboard -# ============================================================ -echo "=== Creating CloudWatch Dashboard ===" - -# Build metric entries dynamically from results -TFLOPS_METRICS="" -STEPTIME_METRICS="" -TABLE_METRICS="" - -for json_file in "${RESULTS_DIR}"/training_bench_*.json; do - if [ ! -f "${json_file}" ]; then - break - fi - - config_name=$(python3 -c " -import json -with open('${json_file}') as f: - d = json.load(f) -print(d['metadata'].get('sweep_config', {}).get('config_name', 'unknown')) -") - - TFLOPS_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}]," - STEPTIME_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}]," - TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} TFLOPS\"}]," - TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} StepTime\"}]," -done - -# Remove trailing commas -TFLOPS_METRICS="${TFLOPS_METRICS%,}" -STEPTIME_METRICS="${STEPTIME_METRICS%,}" -TABLE_METRICS="${TABLE_METRICS%,}" - -DASHBOARD_BODY=$(cat < Date: Tue, 10 Mar 2026 03:11:55 +0000 Subject: [PATCH 3/5] fix: remove personal data 
and parameterize environment-specific values - Remove personal S3 bucket name from parse_results.py and upload_results.sh - Parameterize cluster name and instance type in parse_results.py (via CLI args and env vars) - Replace hardcoded S3 bucket/regions in upload_results.sh with required env vars (S3_BUCKET, S3_REGION, CW_REGION) - Remove hardcoded --partition=b200 from sbatch script - Make PARTITION overridable in sweep_runner.sh and sweep_runner_v2.sh - Change default partition from 'b200' to 'dev' in Makefile and sweep scripts - Add sweep_runner.sh, sweep_runner_v2.sh, and upload_results.sh to tracking --- 3.test_cases/pytorch/deepspeed/Makefile | 2 +- .../pytorch/deepspeed/parse_results.py | 46 ++- .../deepspeed/pretrain_gpt_103b.sbatch | 1 - .../pytorch/deepspeed/sweep_runner.sh | 144 ++++++++++ .../pytorch/deepspeed/sweep_runner_v2.sh | 154 ++++++++++ .../pytorch/deepspeed/upload_results.sh | 263 ++++++++++++++++++ 6 files changed, 601 insertions(+), 9 deletions(-) create mode 100755 3.test_cases/pytorch/deepspeed/sweep_runner.sh create mode 100644 3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh create mode 100755 3.test_cases/pytorch/deepspeed/upload_results.sh diff --git a/3.test_cases/pytorch/deepspeed/Makefile b/3.test_cases/pytorch/deepspeed/Makefile index d2b46ee82..a42ace894 100644 --- a/3.test_cases/pytorch/deepspeed/Makefile +++ b/3.test_cases/pytorch/deepspeed/Makefile @@ -1,7 +1,7 @@ ENROOT_IMAGE ?= deepspeed APPS_PATH ?= /fsx/apps SQUASH_FILE ?= $(APPS_PATH)/$(ENROOT_IMAGE).sqsh -PARTITION ?= b200 +PARTITION ?= dev NODES ?= 8 LOGS_DIR ?= logs RESULTS_DIR ?= sweep_results diff --git a/3.test_cases/pytorch/deepspeed/parse_results.py b/3.test_cases/pytorch/deepspeed/parse_results.py index 8eacea70c..8b2815396 100755 --- a/3.test_cases/pytorch/deepspeed/parse_results.py +++ b/3.test_cases/pytorch/deepspeed/parse_results.py @@ -6,7 +6,7 @@ Reads Slurm log files, extracts per-step metrics, and produces JSON files matching the existing 
benchmark-results schema at: - s3://paragao-new-nemo-squash-container/benchmark-results/b200/ + s3:///benchmark-results// Usage: python parse_results.py [--logs-dir logs] [--output-dir sweep_results] @@ -145,6 +145,8 @@ def build_result_json( gbs=64, seq_length=2048, precision="bf16", + cluster="unknown", + instance_type="unknown", ): """Build the benchmark JSON matching the existing schema.""" total_gpus = nodes * gpus_per_node @@ -219,8 +221,8 @@ def build_result_json( "metadata": { "timestamp": timestamp, "job_id": str(job_id), - "cluster": "b200-hyperpod", - "instance_type": "ml.p6-b200.48xlarge", + "cluster": cluster, + "instance_type": instance_type, "nodes": nodes, "gpus_per_node": gpus_per_node, "total_gpus": total_gpus, @@ -244,7 +246,9 @@ def build_result_json( return result -def parse_sweep_jobs(jobs_csv, logs_dir, output_dir): +def parse_sweep_jobs( + jobs_csv, logs_dir, output_dir, cluster="unknown", instance_type="unknown" +): """Parse all jobs from the sweep tracking CSV.""" os.makedirs(output_dir, exist_ok=True) results = [] @@ -291,6 +295,8 @@ def parse_sweep_jobs(jobs_csv, logs_dir, output_dir): mbs=int(row.get("mbs", 1)), gbs=int(row.get("gbs", 64)), seq_length=int(row.get("seq_length", 2048)), + cluster=cluster, + instance_type=instance_type, ) # Write individual JSON file @@ -315,7 +321,9 @@ def parse_sweep_jobs(jobs_csv, logs_dir, output_dir): return results -def parse_single_log(log_file, config_name, output_dir): +def parse_single_log( + log_file, config_name, output_dir, cluster="unknown", instance_type="unknown" +): """Parse a single log file.""" os.makedirs(output_dir, exist_ok=True) @@ -334,6 +342,8 @@ def parse_single_log(log_file, config_name, output_dir): steps=steps, config_name=config_name, job_id=job_id, + cluster=cluster, + instance_type=instance_type, ) now = datetime.now(timezone.utc) @@ -372,11 +382,27 @@ def main(): default="single_run", help="Config name for single log file parsing", ) + parser.add_argument( + "--cluster", + 
default=os.environ.get("CLUSTER_NAME", "unknown"), + help="Cluster name for metadata (default: $CLUSTER_NAME or 'unknown')", + ) + parser.add_argument( + "--instance-type", + default=os.environ.get("INSTANCE_TYPE", "unknown"), + help="Instance type for metadata (default: $INSTANCE_TYPE or 'unknown')", + ) args = parser.parse_args() if args.log_file: - parse_single_log(args.log_file, args.config_name, args.output_dir) + parse_single_log( + args.log_file, + args.config_name, + args.output_dir, + cluster=args.cluster, + instance_type=args.instance_type, + ) else: if not os.path.exists(args.jobs_csv): print(f"Error: Jobs CSV not found: {args.jobs_csv}") @@ -384,7 +410,13 @@ def main(): "Run sweep_runner.sh first, or use --log-file for single file parsing" ) sys.exit(1) - parse_sweep_jobs(args.jobs_csv, args.logs_dir, args.output_dir) + parse_sweep_jobs( + args.jobs_csv, + args.logs_dir, + args.output_dir, + cluster=args.cluster, + instance_type=args.instance_type, + ) if __name__ == "__main__": diff --git a/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch b/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch index 43957885f..d3b4f277a 100755 --- a/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch +++ b/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch @@ -7,7 +7,6 @@ #SBATCH --job-name=deepspeed-pretrain-103b #SBATCH --output=logs/%x_%j.out #SBATCH --error=logs/%x_%j.err -#SBATCH --partition=b200 set -euxo pipefail diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner.sh b/3.test_cases/pytorch/deepspeed/sweep_runner.sh new file mode 100755 index 000000000..d7c15398f --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/sweep_runner.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# sweep_runner.sh - Automated parameter sweep for DeepSpeed 103B pretraining +# Runs all parallelism and environment flag configurations, collects results. 
+# +# Usage: bash sweep_runner.sh [--dry-run] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch" +RESULTS_DIR="${SCRIPT_DIR}/sweep_results" +NODES=8 +PARTITION="${PARTITION:-dev}" + +DRY_RUN=0 +if [ "${1:-}" = "--dry-run" ]; then + DRY_RUN=1 + echo "[DRY RUN] Will print commands without submitting" +fi + +mkdir -p "${RESULTS_DIR}" logs + +# ============================================================ +# Helper: submit a sweep configuration +# ============================================================ +submit_config() { + local config_name="$1" + local tp="$2" + local pp="$3" + local zero="$4" + local mbs="$5" + local gbs="$6" + local act_ckpt="${7:-0}" + local seq_par="${8:-0}" + local overlap="${9:-0}" + shift 9 || true + local extra_env="${*:-}" + + echo "============================================" + echo "Submitting: ${config_name}" + echo " TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}" + echo " ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}" + [ -n "${extra_env}" ] && echo " Extra env: ${extra_env}" + echo "============================================" + + local env_exports="" + env_exports+="TP=${tp}," + env_exports+="PP=${pp}," + env_exports+="ZERO_STAGE=${zero}," + env_exports+="MICRO_BATCH_SIZE=${mbs}," + env_exports+="GLOBAL_BATCH_SIZE=${gbs}," + env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt}," + env_exports+="USE_SEQUENCE_PARALLEL=${seq_par}," + env_exports+="USE_OVERLAP_COMM=${overlap}," + env_exports+="CONFIG_NAME=${config_name}" + + local sbatch_cmd="sbatch" + sbatch_cmd+=" --partition=${PARTITION}" + sbatch_cmd+=" --nodes=${NODES}" + sbatch_cmd+=" --export=ALL,${env_exports}" + sbatch_cmd+=" --job-name=sweep_${config_name}" + + # Add extra env vars for NCCL tuning + if [ -n "${extra_env}" ]; then + sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}" + fi + + sbatch_cmd+=" ${SBATCH_SCRIPT}" + + if [ "${DRY_RUN}" -eq 1 ]; then + echo 
"[DRY RUN] ${sbatch_cmd}" + echo "" + return + fi + + local job_output + job_output=$(eval "${sbatch_cmd}") + local job_id + job_id=$(echo "${job_output}" | awk '{print $NF}') + echo "Submitted job ${job_id} for config ${config_name}" + echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap}" >> "${RESULTS_DIR}/sweep_jobs.csv" +} + +# ============================================================ +# Initialize tracking file +# ============================================================ +echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap" > "${RESULTS_DIR}/sweep_jobs.csv" + +# ============================================================ +# PARALLELISM SWEEP (Configs 1-11) +# ============================================================ +echo "" +echo "========== PARALLELISM SWEEP ==========" +echo "" + +# config_name TP PP ZeRO MBS GBS ACT SEQ OVR +submit_config "01_baseline" 8 2 0 1 64 0 0 0 +submit_config "02_more_pp" 8 4 0 1 64 0 0 0 +submit_config "03_zero1" 8 2 1 1 64 0 0 0 +submit_config "04_larger_mbs" 8 2 1 2 128 0 0 0 +submit_config "05_pp4_zero1" 8 4 1 1 128 0 0 0 +submit_config "06_zero2" 8 2 2 1 64 0 0 0 +submit_config "07_full_pp" 8 8 0 1 64 0 0 0 +submit_config "08_tp4_pp4" 4 4 1 1 64 0 0 0 +submit_config "09_act_ckpt" 8 2 1 1 64 1 0 0 +submit_config "10_seq_parallel" 8 2 1 1 64 0 1 0 +submit_config "11_overlap_comm" 8 2 1 1 64 0 0 1 + +# ============================================================ +# Wait for parallelism sweep to determine best config +# If not waiting, env sweep uses config 03 (TP8/PP2/ZeRO1) as default +# ============================================================ +echo "" +echo "========== ENVIRONMENT FLAGS SWEEP ==========" +echo "(Using TP=8 PP=2 ZeRO=1 as base for env flag sweep)" +echo "" + +# Base parallelism for env sweep +BASE_TP=8 +BASE_PP=2 +BASE_ZERO=1 +BASE_MBS=1 +BASE_GBS=64 + +# config_name TP PP ZeRO MBS GBS ACT SEQ OVR extra_env +submit_config "12_nccl_ring" 
${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Ring" +submit_config "13_nccl_tree" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Tree" +submit_config "14_nccl_no_tuner" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_TUNER_PLUGIN=" +submit_config "15_nccl_chunk_4mb" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_P2P_NET_CHUNKSIZE=4194304" +submit_config "16_cuda_max_conn_1" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "CUDA_DEVICE_MAX_CONNECTIONS=1" +submit_config "17_nccl_buf_16mb" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=16777216" +submit_config "18_nccl_buf_32mb" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=33554432" +submit_config "19_nccl_min_ch_16" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=16" +submit_config "20_nccl_min_ch_32" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=32" + +echo "" +echo "========== SWEEP SUBMITTED ==========" +echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs.csv" +echo "" +echo "To monitor: watch 'squeue -u \$USER'" +echo "When all jobs finish, run: python parse_results.py" diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh b/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh new file mode 100644 index 000000000..297d85fec --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh @@ -0,0 +1,154 @@ +#!/bin/bash +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# sweep_runner_v2.sh - Sweep v2: ZeRO-2 (no PP), ZeRO-3, memory push, fusion ops +# +# All configs use PYTORCH_CUDA_ALLOC_CONF=expandable_segments:true (set in sbatch). +# Optimal NCCL flags are the defaults already in the sbatch script. 
+# +# Usage: bash sweep_runner_v2.sh [--dry-run] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch" +RESULTS_DIR="${SCRIPT_DIR}/sweep_results" +NODES=8 +PARTITION="${PARTITION:-dev}" + +DRY_RUN=0 +if [ "${1:-}" = "--dry-run" ]; then + DRY_RUN=1 + echo "[DRY RUN] Will print commands without submitting" +fi + +mkdir -p "${RESULTS_DIR}" logs + +# ============================================================ +# Helper: submit a sweep configuration +# Extends v1 helper with seq_length and enable_fusions params. +# ============================================================ +submit_config() { + local config_name="$1" + local tp="$2" + local pp="$3" + local zero="$4" + local mbs="$5" + local gbs="$6" + local act_ckpt="${7:-0}" + local seq_par="${8:-0}" + local overlap="${9:-0}" + local seq_length="${10:-2048}" + local enable_fusions="${11:-0}" + shift 11 || true + local extra_env="${*:-}" + + echo "============================================" + echo "Submitting: ${config_name}" + echo " TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}" + echo " SeqLen=${seq_length} Fusions=${enable_fusions}" + echo " ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}" + [ -n "${extra_env}" ] && echo " Extra env: ${extra_env}" + echo "============================================" + + local env_exports="" + env_exports+="TP=${tp}," + env_exports+="PP=${pp}," + env_exports+="ZERO_STAGE=${zero}," + env_exports+="MICRO_BATCH_SIZE=${mbs}," + env_exports+="GLOBAL_BATCH_SIZE=${gbs}," + env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt}," + env_exports+="USE_SEQUENCE_PARALLEL=${seq_par}," + env_exports+="USE_OVERLAP_COMM=${overlap}," + env_exports+="SEQ_LENGTH=${seq_length}," + env_exports+="ENABLE_FUSIONS=${enable_fusions}," + env_exports+="CONFIG_NAME=${config_name}" + + local sbatch_cmd="sbatch" + sbatch_cmd+=" --partition=${PARTITION}" + sbatch_cmd+=" --nodes=${NODES}" + + # Add extra env 
vars (NCCL overrides etc.) + if [ -n "${extra_env}" ]; then + sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}" + else + sbatch_cmd+=" --export=ALL,${env_exports}" + fi + + sbatch_cmd+=" ${SBATCH_SCRIPT}" + + if [ "${DRY_RUN}" -eq 1 ]; then + echo "[DRY RUN] ${sbatch_cmd}" + echo "" + return + fi + + local job_output + job_output=$(eval "${sbatch_cmd}") + local job_id + job_id=$(echo "${job_output}" | awk '{print $NF}') + echo "Submitted job ${job_id} for config ${config_name}" + echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap},${seq_length},${enable_fusions}" >> "${RESULTS_DIR}/sweep_jobs_v2.csv" +} + +# ============================================================ +# Initialize tracking file +# ============================================================ +echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap,seq_length,enable_fusions" > "${RESULTS_DIR}/sweep_jobs_v2.csv" + +# ============================================================ +# ZeRO-2 WITHOUT PIPELINE PARALLELISM (PP=1) +# ============================================================ +echo "" +echo "========== ZeRO-2 SWEEP (PP=1) ==========" +echo "" + +# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE +submit_config "21_zero2_tp8_pp1" 8 1 2 1 64 0 0 0 2048 0 +submit_config "22_zero2_tp8_pp1_mbs2" 8 1 2 2 64 0 0 0 2048 0 +submit_config "23_zero2_tp4_pp1" 4 1 2 1 64 0 0 0 2048 0 + +# ============================================================ +# ZeRO-3 (PP=1) +# ============================================================ +echo "" +echo "========== ZeRO-3 SWEEP (PP=1) ==========" +echo "" + +# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE +submit_config "24_zero3_tp8_pp1" 8 1 3 1 64 0 0 0 2048 0 +submit_config "25_zero3_tp8_pp1_mbs2" 8 1 3 2 64 0 0 0 2048 0 +submit_config "26_zero3_tp4_pp1" 4 1 3 1 64 0 0 0 2048 0 +submit_config "27_zero3_tp8_pp1_overlap" 8 1 3 1 64 0 0 1 2048 0 + +# 
============================================================ +# MEMORY PUSH / SEQ LENGTH / FUSIONS +# ============================================================ +echo "" +echo "========== MEMORY PUSH SWEEP ==========" +echo "" + +# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE +submit_config "28_mem_seq4k_tp8_pp2" 8 2 0 1 64 0 0 0 4096 0 +submit_config "29_mem_fused_tp8_pp8" 8 8 0 1 64 0 0 0 2048 1 + +# ============================================================ +# EXPANDABLE SEGMENTS IMPACT ON BEST CONFIG +# Re-test best config (TP8/PP8/ZeRO0) — now with expandable_segments +# enabled automatically via the updated sbatch. +# ============================================================ +echo "" +echo "========== EXPANDABLE SEGMENTS IMPACT ==========" +echo "" + +# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE +submit_config "30_best_expand_seg" 8 8 0 1 64 0 0 0 2048 0 + +echo "" +echo "========== SWEEP V2 SUBMITTED ==========" +echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs_v2.csv" +echo "" +echo "Total configs: 10" +echo "To monitor: watch 'squeue -u \$USER'" +echo "When all jobs finish, run: python parse_results.py --jobs-csv sweep_results/sweep_jobs_v2.csv" diff --git a/3.test_cases/pytorch/deepspeed/upload_results.sh b/3.test_cases/pytorch/deepspeed/upload_results.sh new file mode 100755 index 000000000..6df3cab99 --- /dev/null +++ b/3.test_cases/pytorch/deepspeed/upload_results.sh @@ -0,0 +1,263 @@ +#!/bin/bash +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# upload_results.sh - Upload benchmark results to S3 and CloudWatch +# +# Usage: +# export S3_BUCKET=my-benchmark-bucket +# export S3_REGION=us-west-2 +# export CW_REGION=us-east-1 +# bash upload_results.sh [--results-dir sweep_results] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RESULTS_DIR="${1:---results-dir}" +CW_REGION="${CW_REGION:?Error: CW_REGION must be set (e.g. 
export CW_REGION=us-east-1)}" +S3_REGION="${S3_REGION:?Error: S3_REGION must be set (e.g. export S3_REGION=us-west-2)}" +S3_BUCKET="${S3_BUCKET:?Error: S3_BUCKET must be set (e.g. export S3_BUCKET=my-benchmark-bucket)}" +CW_NAMESPACE="DeepSpeed/B200Benchmarks" +CW_DASHBOARD_NAME="DeepSpeed-B200-Benchmarks" + +# Parse args +while [[ $# -gt 0 ]]; do + case $1 in + --results-dir) RESULTS_DIR="$2"; shift 2 ;; + --region) CW_REGION="$2"; shift 2 ;; + *) shift ;; + esac +done + +: "${RESULTS_DIR:=${SCRIPT_DIR}/sweep_results}" + +if [ ! -d "${RESULTS_DIR}" ]; then + echo "Error: Results directory not found: ${RESULTS_DIR}" + exit 1 +fi + +# ============================================================ +# 1. Upload JSON files to S3 +# ============================================================ +echo "=== Uploading results to S3 ===" + +# Determine S3 path: benchmark-results/b200/2026/March/04/ +YEAR=$(date -u +%Y) +MONTH=$(date -u +%B) +DAY=$(date -u +%d) +S3_PREFIX="benchmark-results/b200/${YEAR}/${MONTH}/${DAY}" + +for json_file in "${RESULTS_DIR}"/training_bench_*.json; do + if [ ! -f "${json_file}" ]; then + echo "No result JSON files found in ${RESULTS_DIR}" + break + fi + filename=$(basename "${json_file}") + echo " Uploading ${filename} -> s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" + aws s3 cp "${json_file}" "s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" \ + --region "${S3_REGION}" \ + --content-type "application/json" +done + +echo "S3 upload complete: s3://${S3_BUCKET}/${S3_PREFIX}/" +echo "" + +# ============================================================ +# 2. Publish metrics to CloudWatch +# ============================================================ +echo "=== Publishing metrics to CloudWatch ===" + +for json_file in "${RESULTS_DIR}"/training_bench_*.json; do + if [ ! 
-f "${json_file}" ]; then + break + fi + + filename=$(basename "${json_file}") + + # Extract metadata and summary using python + read -r config_name tp pp zero_stage precision avg_tflops avg_step_time timestamp < <( + python3 -c " +import json, sys +with open('${json_file}') as f: + d = json.load(f) +m = d['metadata'] +s = d['summary'] +sc = m.get('sweep_config', {}) +print( + sc.get('config_name', 'unknown'), + sc.get('tp', 8), + sc.get('pp', 2), + sc.get('zero_stage', 1), + m.get('precision', 'bf16'), + s.get('steady_state_avg_tflops_per_gpu', 0), + s.get('steady_state_avg_step_time_s', 0), + m.get('timestamp', '$(date -u +%Y-%m-%dT%H:%M:%SZ)') +) +" + ) + + echo " Publishing: ${config_name} (TFLOPS=${avg_tflops}, StepTime=${avg_step_time}s)" + + # Publish with full dimensions + aws cloudwatch put-metric-data \ + --namespace "${CW_NAMESPACE}" \ + --region "${CW_REGION}" \ + --metric-data "[ + { + \"MetricName\": \"TFLOPSPerGPU\", + \"Value\": ${avg_tflops}, + \"Unit\": \"Count\", + \"Timestamp\": \"${timestamp}\", + \"Dimensions\": [ + {\"Name\": \"model_size\", \"Value\": \"103b\"}, + {\"Name\": \"tp\", \"Value\": \"${tp}\"}, + {\"Name\": \"pp\", \"Value\": \"${pp}\"}, + {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"}, + {\"Name\": \"precision\", \"Value\": \"${precision}\"}, + {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} + ] + }, + { + \"MetricName\": \"StepTimeSeconds\", + \"Value\": ${avg_step_time}, + \"Unit\": \"Seconds\", + \"Timestamp\": \"${timestamp}\", + \"Dimensions\": [ + {\"Name\": \"model_size\", \"Value\": \"103b\"}, + {\"Name\": \"tp\", \"Value\": \"${tp}\"}, + {\"Name\": \"pp\", \"Value\": \"${pp}\"}, + {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"}, + {\"Name\": \"precision\", \"Value\": \"${precision}\"}, + {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} + ] + } + ]" + + # Also publish with just config_name dimension for easy dashboard queries + aws cloudwatch put-metric-data \ + --namespace 
"${CW_NAMESPACE}" \ + --region "${CW_REGION}" \ + --metric-data "[ + { + \"MetricName\": \"TFLOPSPerGPU\", + \"Value\": ${avg_tflops}, + \"Unit\": \"Count\", + \"Timestamp\": \"${timestamp}\", + \"Dimensions\": [ + {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} + ] + }, + { + \"MetricName\": \"StepTimeSeconds\", + \"Value\": ${avg_step_time}, + \"Unit\": \"Seconds\", + \"Timestamp\": \"${timestamp}\", + \"Dimensions\": [ + {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} + ] + } + ]" + +done + +echo "CloudWatch metrics published to namespace: ${CW_NAMESPACE}" +echo "" + +# ============================================================ +# 3. Create/Update CloudWatch Dashboard +# ============================================================ +echo "=== Creating CloudWatch Dashboard ===" + +# Build metric entries dynamically from results +TFLOPS_METRICS="" +STEPTIME_METRICS="" +TABLE_METRICS="" + +for json_file in "${RESULTS_DIR}"/training_bench_*.json; do + if [ ! -f "${json_file}" ]; then + break + fi + + config_name=$(python3 -c " +import json +with open('${json_file}') as f: + d = json.load(f) +print(d['metadata'].get('sweep_config', {}).get('config_name', 'unknown')) +") + + TFLOPS_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}]," + STEPTIME_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}]," + TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} TFLOPS\"}]," + TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} StepTime\"}]," +done + +# Remove trailing commas +TFLOPS_METRICS="${TFLOPS_METRICS%,}" +STEPTIME_METRICS="${STEPTIME_METRICS%,}" +TABLE_METRICS="${TABLE_METRICS%,}" + +DASHBOARD_BODY=$(cat < Date: Tue, 10 Mar 2026 03:28:41 +0000 Subject: [PATCH 4/5] 
=?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?= =?UTF-8?q?restructure=20into=20gpt/=20subdir,=20pin=20deps,=20quote=20var?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move GPT-103B pretraining files into gpt/ subdirectory with slurm/ and configs/ sub-dirs to match repo conventions - Pin deepspeed>=0.16,<1.0 and accelerate>=1.0,<2.0 in Dockerfile - Quote all variable expansions in build and training scripts - Fix NCCL_P2P_NET_CHUNKSIZE from 2048576 to 2097152 (2MB power-of-two) - Add PP note to README env vars table clarifying best config uses PP=8 - Add trailing newline to README.md - Update all path references for new directory structure --- .../pytorch/deepspeed/0.deepspeed.dockerfile | 2 +- .../pytorch/deepspeed/1.build-image.sbatch | 12 ++++++------ 3.test_cases/pytorch/deepspeed/Makefile | 4 ++-- 3.test_cases/pytorch/deepspeed/README.md | 18 +++++++++--------- .../configs/ds_config_103b_template.json | 0 .../deepspeed/{ => gpt}/parse_results.py | 0 .../{ => gpt/slurm}/pretrain_gpt_103b.sbatch | 15 +++++++++------ .../deepspeed/{ => gpt}/sweep_runner.sh | 2 +- .../deepspeed/{ => gpt}/sweep_runner_v2.sh | 2 +- .../deepspeed/{ => gpt}/upload_results.sh | 0 10 files changed, 29 insertions(+), 26 deletions(-) rename 3.test_cases/pytorch/deepspeed/{ => gpt}/configs/ds_config_103b_template.json (100%) rename 3.test_cases/pytorch/deepspeed/{ => gpt}/parse_results.py (100%) rename 3.test_cases/pytorch/deepspeed/{ => gpt/slurm}/pretrain_gpt_103b.sbatch (95%) rename 3.test_cases/pytorch/deepspeed/{ => gpt}/sweep_runner.sh (98%) rename 3.test_cases/pytorch/deepspeed/{ => gpt}/sweep_runner_v2.sh (98%) rename 3.test_cases/pytorch/deepspeed/{ => gpt}/upload_results.sh (100%) diff --git a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile index ec7f99995..1874ca3d5 100644 --- a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile +++ 
b/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile @@ -147,7 +147,7 @@ RUN pip3 install --no-cache-dir \ awscli pynvml \ transformers==${TRANSFORMERS_VERSION} \ sentencepiece python-etcd \ - deepspeed accelerate + "deepspeed>=0.16,<1.0" "accelerate>=1.0,<2.0" RUN rm -rf /var/lib/apt/lists/* diff --git a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch index f91222361..f999b926b 100644 --- a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch +++ b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch @@ -15,14 +15,14 @@ set -euxo pipefail : "${IMAGE:=$APPS_PATH/deepspeed.sqsh}" # Ensure output directory exists -mkdir -p ${APPS_PATH} +mkdir -p "${APPS_PATH}" mkdir -p logs ENROOT_IMAGE=deepspeed -docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile . +docker build -t "${ENROOT_IMAGE}" -f 0.deepspeed.dockerfile . # Remove old sqsh file if exists -if [ -f ${ENROOT_IMAGE}.sqsh ] ; then - rm ${ENROOT_IMAGE}.sqsh +if [ -f "${ENROOT_IMAGE}.sqsh" ] ; then + rm "${ENROOT_IMAGE}.sqsh" fi -enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest -mv ${ENROOT_IMAGE}.sqsh ${IMAGE} \ No newline at end of file +enroot import -o "${ENROOT_IMAGE}.sqsh" "dockerd://${ENROOT_IMAGE}:latest" +mv "${ENROOT_IMAGE}.sqsh" "${IMAGE}" \ No newline at end of file diff --git a/3.test_cases/pytorch/deepspeed/Makefile b/3.test_cases/pytorch/deepspeed/Makefile index a42ace894..88b3d2262 100644 --- a/3.test_cases/pytorch/deepspeed/Makefile +++ b/3.test_cases/pytorch/deepspeed/Makefile @@ -43,10 +43,10 @@ clean: train: sbatch --partition=$(PARTITION) --nodes=$(NODES) \ --export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \ - pretrain_gpt_103b.sbatch + gpt/slurm/pretrain_gpt_103b.sbatch # ---- Results ---- parse: - python3 parse_results.py --jobs-csv $(RESULTS_DIR)/sweep_jobs.csv \ + python3 gpt/parse_results.py --jobs-csv $(RESULTS_DIR)/sweep_jobs.csv \ --logs-dir $(LOGS_DIR) --output-dir 
$(RESULTS_DIR) diff --git a/3.test_cases/pytorch/deepspeed/README.md b/3.test_cases/pytorch/deepspeed/README.md index e873d1af8..82e34d46a 100644 --- a/3.test_cases/pytorch/deepspeed/README.md +++ b/3.test_cases/pytorch/deepspeed/README.md @@ -6,7 +6,7 @@ | Use Case | Description | Location | |----------|-------------|----------| -| GPT-103B Pretraining | Large-scale GPT pretraining benchmark using Megatron-DeepSpeed with 3D parallelism (TP/PP/DP) and ZeRO optimization | [`pretrain_gpt_103b.sbatch`](pretrain_gpt_103b.sbatch) | +| GPT-103B Pretraining | Large-scale GPT pretraining benchmark using Megatron-DeepSpeed with 3D parallelism (TP/PP/DP) and ZeRO optimization | [`gpt/`](gpt/) | | QLoRA Fine-tuning | Qwen3-8B fine-tuning with QLoRA (4-bit) + DeepSpeed ZeRO-2/3, supports EKS and Slurm | [`qlora/`](qlora/) | | Llama2 Fine-tuning | Llama2 fine-tuning from HuggingFace weights using Megatron-DeepSpeed | [`examples_megatron_deepspeed/finetune_hf_llama/`](examples_megatron_deepspeed/finetune_hf_llama/) | @@ -88,9 +88,9 @@ Submit the best-performing configuration (TP=8, PP=8, ZeRO-0, fusions enabled): ```bash make train # or equivalently: -sbatch --partition=b200 --nodes=8 \ +sbatch --partition=dev --nodes=8 \ --export=ALL,TP=8,PP=8,ZERO_STAGE=0,ENABLE_FUSIONS=1,CONFIG_NAME=best_fused_tp8_pp8 \ - pretrain_gpt_103b.sbatch + gpt/slurm/pretrain_gpt_103b.sbatch ``` Override parallelism settings for custom configurations: @@ -98,7 +98,7 @@ Override parallelism settings for custom configurations: ```bash sbatch --nodes=8 \ --export=ALL,TP=8,PP=4,ZERO_STAGE=1,MICRO_BATCH_SIZE=2,CONFIG_NAME=my_config \ - pretrain_gpt_103b.sbatch + gpt/slurm/pretrain_gpt_103b.sbatch ``` #### Environment variables @@ -106,7 +106,7 @@ sbatch --nodes=8 \ | Variable | Default | Description | |----------|---------|-------------| | `TP` | 8 | Tensor parallel size | -| `PP` | 2 | Pipeline parallel size | +| `PP` | 2 | Pipeline parallel size (best throughput with PP=8, see `make train`) | | 
`ZERO_STAGE` | 1 | DeepSpeed ZeRO stage (0, 1, 2, or 3) | | `MICRO_BATCH_SIZE` | 1 | Per-GPU micro batch size | | `GLOBAL_BATCH_SIZE` | 64 | Global batch size | @@ -147,14 +147,14 @@ The following recommendations are based on extensive parameter sweeps across par ### Parsing results -After training completes, parse the Slurm logs into benchmark JSON using `parse_results.py`: +After training completes, parse the Slurm logs into benchmark JSON using `gpt/parse_results.py`: ```bash # Single log file -python3 parse_results.py --log-file logs/deepspeed-pretrain-103b_123.out --config-name my_config +python3 gpt/parse_results.py --log-file logs/deepspeed-pretrain-103b_123.out --config-name my_config # Multiple jobs tracked in a CSV -python3 parse_results.py --jobs-csv sweep_results/sweep_jobs.csv --output-dir sweep_results +python3 gpt/parse_results.py --jobs-csv sweep_results/sweep_jobs.csv --output-dir sweep_results ``` ### Known issues @@ -176,4 +176,4 @@ See [`qlora/README.md`](qlora/README.md) for full instructions. Fine-tune Llama2 from HuggingFace weights using Megatron-DeepSpeed. Includes weight conversion from HuggingFace to Megatron format and fine-tuning on the Stanford Alpaca dataset. Uses the shared container image (`0.deepspeed.dockerfile`). -See [`examples_megatron_deepspeed/finetune_hf_llama/README.md`](examples_megatron_deepspeed/finetune_hf_llama/README.md) for full instructions. \ No newline at end of file +See [`examples_megatron_deepspeed/finetune_hf_llama/README.md`](examples_megatron_deepspeed/finetune_hf_llama/README.md) for full instructions. 
diff --git a/3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json b/3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json similarity index 100% rename from 3.test_cases/pytorch/deepspeed/configs/ds_config_103b_template.json rename to 3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json diff --git a/3.test_cases/pytorch/deepspeed/parse_results.py b/3.test_cases/pytorch/deepspeed/gpt/parse_results.py similarity index 100% rename from 3.test_cases/pytorch/deepspeed/parse_results.py rename to 3.test_cases/pytorch/deepspeed/gpt/parse_results.py diff --git a/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch b/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch similarity index 95% rename from 3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch rename to 3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch index d3b4f277a..a68585528 100755 --- a/3.test_cases/pytorch/deepspeed/pretrain_gpt_103b.sbatch +++ b/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch @@ -57,7 +57,7 @@ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # ============================================================ # Cluster topology # ============================================================ -export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +export NODES=( $( scontrol show hostnames "$SLURM_JOB_NODELIST" ) ) export NODES_ARRAY=($NODES) export HEAD_NODE=${NODES_ARRAY[0]} export MASTER_ADDR=$(hostname --ip-address) @@ -72,7 +72,7 @@ export FI_LOG_LEVEL=1 export FI_PROVIDER=efa export FI_EFA_USE_HUGE_PAGE=0 export NCCL_SOCKET_IFNAME=^docker,lo,veth -export NCCL_P2P_NET_CHUNKSIZE=2048576 +export NCCL_P2P_NET_CHUNKSIZE=2097152 export NCCL_BUFFERSIZE=8388608 export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so export NCCL_ASYNC_ERROR_HANDLING=1 @@ -144,20 +144,20 @@ EOF # ============================================================ # Hostfile for DeepSpeed # 
============================================================ -export HOSTFILE=/fsx/hostfile_${SLURM_JOB_ID} +export HOSTFILE="/fsx/hostfile_${SLURM_JOB_ID}" function makehostfile() { perl -e '$slots=split /,/, $ENV{"SLURM_STEP_GPUS"}; $slots=8 if $slots==0; @nodes = split /\n/, qx[scontrol show hostnames $ENV{"SLURM_JOB_NODELIST"}]; print map { "$b$_ slots=$slots\n" } @nodes' } -makehostfile > ${HOSTFILE} +makehostfile > "${HOSTFILE}" # ============================================================ # Container + distributed launch args # ============================================================ declare -a SRUN_ARGS=( - --container-image ${IMAGE} + --container-image "${IMAGE}" --container-mounts /fsx,/opt/slurm/bin ) @@ -273,7 +273,10 @@ DIST_ARGS_STR="${DIST_ARGS[*]}" MODEL_ARGS_STR="${MODEL_ARGS[*]}" DS_ARGS_STR="${DS_ARGS[*]}" +# Note: Variables inside the bash -c string are expanded on the host side before +# being passed to the container. This is intentional — the container does not have +# access to these env vars at shell expansion time. 
srun -l "${SRUN_ARGS[@]}" bash -c "export PYTHONPATH=${MEGATRON_DS_PATH} && cd ${MEGATRON_DS_PATH} && python3 -m torch.distributed.run ${DIST_ARGS_STR} pretrain_gpt.py ${MODEL_ARGS_STR} ${DS_ARGS_STR}" # Cleanup hostfile -rm -f ${HOSTFILE} +rm -f "${HOSTFILE}" diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner.sh b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh similarity index 98% rename from 3.test_cases/pytorch/deepspeed/sweep_runner.sh rename to 3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh index d7c15398f..28a74e7df 100755 --- a/3.test_cases/pytorch/deepspeed/sweep_runner.sh +++ b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh @@ -10,7 +10,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch" +SBATCH_SCRIPT="${SCRIPT_DIR}/slurm/pretrain_gpt_103b.sbatch" RESULTS_DIR="${SCRIPT_DIR}/sweep_results" NODES=8 PARTITION="${PARTITION:-dev}" diff --git a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh similarity index 98% rename from 3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh rename to 3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh index 297d85fec..b49c37954 100644 --- a/3.test_cases/pytorch/deepspeed/sweep_runner_v2.sh +++ b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh @@ -12,7 +12,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SBATCH_SCRIPT="${SCRIPT_DIR}/pretrain_gpt_103b.sbatch" +SBATCH_SCRIPT="${SCRIPT_DIR}/slurm/pretrain_gpt_103b.sbatch" RESULTS_DIR="${SCRIPT_DIR}/sweep_results" NODES=8 PARTITION="${PARTITION:-dev}" diff --git a/3.test_cases/pytorch/deepspeed/upload_results.sh b/3.test_cases/pytorch/deepspeed/gpt/upload_results.sh similarity index 100% rename from 3.test_cases/pytorch/deepspeed/upload_results.sh rename to 3.test_cases/pytorch/deepspeed/gpt/upload_results.sh From febd787cb10f1b1a049fe5ce9ba4e3a6b50cbfd9 Mon Sep 17 00:00:00 2001 
From: Paulo Aragao Date: Tue, 10 Mar 2026 03:44:14 +0000 Subject: [PATCH 5/5] chore: remove internal sweep and upload scripts from PR --- .../pytorch/deepspeed/gpt/sweep_runner.sh | 144 ---------- .../pytorch/deepspeed/gpt/sweep_runner_v2.sh | 154 ---------- .../pytorch/deepspeed/gpt/upload_results.sh | 263 ------------------ 3 files changed, 561 deletions(-) delete mode 100755 3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh delete mode 100644 3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh delete mode 100755 3.test_cases/pytorch/deepspeed/gpt/upload_results.sh diff --git a/3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh deleted file mode 100755 index 28a74e7df..000000000 --- a/3.test_cases/pytorch/deepspeed/gpt/sweep_runner.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/bin/bash -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: MIT-0 -# -# sweep_runner.sh - Automated parameter sweep for DeepSpeed 103B pretraining -# Runs all parallelism and environment flag configurations, collects results. 
-# -# Usage: bash sweep_runner.sh [--dry-run] - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SBATCH_SCRIPT="${SCRIPT_DIR}/slurm/pretrain_gpt_103b.sbatch" -RESULTS_DIR="${SCRIPT_DIR}/sweep_results" -NODES=8 -PARTITION="${PARTITION:-dev}" - -DRY_RUN=0 -if [ "${1:-}" = "--dry-run" ]; then - DRY_RUN=1 - echo "[DRY RUN] Will print commands without submitting" -fi - -mkdir -p "${RESULTS_DIR}" logs - -# ============================================================ -# Helper: submit a sweep configuration -# ============================================================ -submit_config() { - local config_name="$1" - local tp="$2" - local pp="$3" - local zero="$4" - local mbs="$5" - local gbs="$6" - local act_ckpt="${7:-0}" - local seq_par="${8:-0}" - local overlap="${9:-0}" - shift 9 || true - local extra_env="${*:-}" - - echo "============================================" - echo "Submitting: ${config_name}" - echo " TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}" - echo " ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}" - [ -n "${extra_env}" ] && echo " Extra env: ${extra_env}" - echo "============================================" - - local env_exports="" - env_exports+="TP=${tp}," - env_exports+="PP=${pp}," - env_exports+="ZERO_STAGE=${zero}," - env_exports+="MICRO_BATCH_SIZE=${mbs}," - env_exports+="GLOBAL_BATCH_SIZE=${gbs}," - env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt}," - env_exports+="USE_SEQUENCE_PARALLEL=${seq_par}," - env_exports+="USE_OVERLAP_COMM=${overlap}," - env_exports+="CONFIG_NAME=${config_name}" - - local sbatch_cmd="sbatch" - sbatch_cmd+=" --partition=${PARTITION}" - sbatch_cmd+=" --nodes=${NODES}" - sbatch_cmd+=" --export=ALL,${env_exports}" - sbatch_cmd+=" --job-name=sweep_${config_name}" - - # Add extra env vars for NCCL tuning - if [ -n "${extra_env}" ]; then - sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}" - fi - - sbatch_cmd+=" ${SBATCH_SCRIPT}" - - if [ "${DRY_RUN}" -eq 1 ]; then 
- echo "[DRY RUN] ${sbatch_cmd}" - echo "" - return - fi - - local job_output - job_output=$(eval "${sbatch_cmd}") - local job_id - job_id=$(echo "${job_output}" | awk '{print $NF}') - echo "Submitted job ${job_id} for config ${config_name}" - echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap}" >> "${RESULTS_DIR}/sweep_jobs.csv" -} - -# ============================================================ -# Initialize tracking file -# ============================================================ -echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap" > "${RESULTS_DIR}/sweep_jobs.csv" - -# ============================================================ -# PARALLELISM SWEEP (Configs 1-11) -# ============================================================ -echo "" -echo "========== PARALLELISM SWEEP ==========" -echo "" - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR -submit_config "01_baseline" 8 2 0 1 64 0 0 0 -submit_config "02_more_pp" 8 4 0 1 64 0 0 0 -submit_config "03_zero1" 8 2 1 1 64 0 0 0 -submit_config "04_larger_mbs" 8 2 1 2 128 0 0 0 -submit_config "05_pp4_zero1" 8 4 1 1 128 0 0 0 -submit_config "06_zero2" 8 2 2 1 64 0 0 0 -submit_config "07_full_pp" 8 8 0 1 64 0 0 0 -submit_config "08_tp4_pp4" 4 4 1 1 64 0 0 0 -submit_config "09_act_ckpt" 8 2 1 1 64 1 0 0 -submit_config "10_seq_parallel" 8 2 1 1 64 0 1 0 -submit_config "11_overlap_comm" 8 2 1 1 64 0 0 1 - -# ============================================================ -# Wait for parallelism sweep to determine best config -# If not waiting, env sweep uses config 03 (TP8/PP2/ZeRO1) as default -# ============================================================ -echo "" -echo "========== ENVIRONMENT FLAGS SWEEP ==========" -echo "(Using TP=8 PP=2 ZeRO=1 as base for env flag sweep)" -echo "" - -# Base parallelism for env sweep -BASE_TP=8 -BASE_PP=2 -BASE_ZERO=1 -BASE_MBS=1 -BASE_GBS=64 - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR extra_env -submit_config 
"12_nccl_ring" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Ring" -submit_config "13_nccl_tree" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_ALGO=Tree" -submit_config "14_nccl_no_tuner" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_TUNER_PLUGIN=" -submit_config "15_nccl_chunk_4mb" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_P2P_NET_CHUNKSIZE=4194304" -submit_config "16_cuda_max_conn_1" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "CUDA_DEVICE_MAX_CONNECTIONS=1" -submit_config "17_nccl_buf_16mb" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=16777216" -submit_config "18_nccl_buf_32mb" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_BUFFERSIZE=33554432" -submit_config "19_nccl_min_ch_16" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=16" -submit_config "20_nccl_min_ch_32" ${BASE_TP} ${BASE_PP} ${BASE_ZERO} ${BASE_MBS} ${BASE_GBS} 0 0 0 "NCCL_MIN_NCHANNELS=32" - -echo "" -echo "========== SWEEP SUBMITTED ==========" -echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs.csv" -echo "" -echo "To monitor: watch 'squeue -u \$USER'" -echo "When all jobs finish, run: python parse_results.py" diff --git a/3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh b/3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh deleted file mode 100644 index b49c37954..000000000 --- a/3.test_cases/pytorch/deepspeed/gpt/sweep_runner_v2.sh +++ /dev/null @@ -1,154 +0,0 @@ -#!/bin/bash -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: MIT-0 -# -# sweep_runner_v2.sh - Sweep v2: ZeRO-2 (no PP), ZeRO-3, memory push, fusion ops -# -# All configs use PYTORCH_CUDA_ALLOC_CONF=expandable_segments:true (set in sbatch). -# Optimal NCCL flags are the defaults already in the sbatch script. 
-# -# Usage: bash sweep_runner_v2.sh [--dry-run] - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SBATCH_SCRIPT="${SCRIPT_DIR}/slurm/pretrain_gpt_103b.sbatch" -RESULTS_DIR="${SCRIPT_DIR}/sweep_results" -NODES=8 -PARTITION="${PARTITION:-dev}" - -DRY_RUN=0 -if [ "${1:-}" = "--dry-run" ]; then - DRY_RUN=1 - echo "[DRY RUN] Will print commands without submitting" -fi - -mkdir -p "${RESULTS_DIR}" logs - -# ============================================================ -# Helper: submit a sweep configuration -# Extends v1 helper with seq_length and enable_fusions params. -# ============================================================ -submit_config() { - local config_name="$1" - local tp="$2" - local pp="$3" - local zero="$4" - local mbs="$5" - local gbs="$6" - local act_ckpt="${7:-0}" - local seq_par="${8:-0}" - local overlap="${9:-0}" - local seq_length="${10:-2048}" - local enable_fusions="${11:-0}" - shift 11 || true - local extra_env="${*:-}" - - echo "============================================" - echo "Submitting: ${config_name}" - echo " TP=${tp} PP=${pp} ZeRO=${zero} MBS=${mbs} GBS=${gbs}" - echo " SeqLen=${seq_length} Fusions=${enable_fusions}" - echo " ActCkpt=${act_ckpt} SeqPar=${seq_par} Overlap=${overlap}" - [ -n "${extra_env}" ] && echo " Extra env: ${extra_env}" - echo "============================================" - - local env_exports="" - env_exports+="TP=${tp}," - env_exports+="PP=${pp}," - env_exports+="ZERO_STAGE=${zero}," - env_exports+="MICRO_BATCH_SIZE=${mbs}," - env_exports+="GLOBAL_BATCH_SIZE=${gbs}," - env_exports+="USE_ACTIVATION_CHECKPOINTING=${act_ckpt}," - env_exports+="USE_SEQUENCE_PARALLEL=${seq_par}," - env_exports+="USE_OVERLAP_COMM=${overlap}," - env_exports+="SEQ_LENGTH=${seq_length}," - env_exports+="ENABLE_FUSIONS=${enable_fusions}," - env_exports+="CONFIG_NAME=${config_name}" - - local sbatch_cmd="sbatch" - sbatch_cmd+=" --partition=${PARTITION}" - sbatch_cmd+=" --nodes=${NODES}" - - # Add extra 
env vars (NCCL overrides etc.) - if [ -n "${extra_env}" ]; then - sbatch_cmd+=" --export=ALL,${env_exports},${extra_env}" - else - sbatch_cmd+=" --export=ALL,${env_exports}" - fi - - sbatch_cmd+=" ${SBATCH_SCRIPT}" - - if [ "${DRY_RUN}" -eq 1 ]; then - echo "[DRY RUN] ${sbatch_cmd}" - echo "" - return - fi - - local job_output - job_output=$(eval "${sbatch_cmd}") - local job_id - job_id=$(echo "${job_output}" | awk '{print $NF}') - echo "Submitted job ${job_id} for config ${config_name}" - echo "${job_id},${config_name},${tp},${pp},${zero},${mbs},${gbs},${act_ckpt},${seq_par},${overlap},${seq_length},${enable_fusions}" >> "${RESULTS_DIR}/sweep_jobs_v2.csv" -} - -# ============================================================ -# Initialize tracking file -# ============================================================ -echo "job_id,config_name,tp,pp,zero,mbs,gbs,act_ckpt,seq_par,overlap,seq_length,enable_fusions" > "${RESULTS_DIR}/sweep_jobs_v2.csv" - -# ============================================================ -# ZeRO-2 WITHOUT PIPELINE PARALLELISM (PP=1) -# ============================================================ -echo "" -echo "========== ZeRO-2 SWEEP (PP=1) ==========" -echo "" - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE -submit_config "21_zero2_tp8_pp1" 8 1 2 1 64 0 0 0 2048 0 -submit_config "22_zero2_tp8_pp1_mbs2" 8 1 2 2 64 0 0 0 2048 0 -submit_config "23_zero2_tp4_pp1" 4 1 2 1 64 0 0 0 2048 0 - -# ============================================================ -# ZeRO-3 (PP=1) -# ============================================================ -echo "" -echo "========== ZeRO-3 SWEEP (PP=1) ==========" -echo "" - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE -submit_config "24_zero3_tp8_pp1" 8 1 3 1 64 0 0 0 2048 0 -submit_config "25_zero3_tp8_pp1_mbs2" 8 1 3 2 64 0 0 0 2048 0 -submit_config "26_zero3_tp4_pp1" 4 1 3 1 64 0 0 0 2048 0 -submit_config "27_zero3_tp8_pp1_overlap" 8 1 3 1 64 0 0 1 2048 0 - -# 
============================================================ -# MEMORY PUSH / SEQ LENGTH / FUSIONS -# ============================================================ -echo "" -echo "========== MEMORY PUSH SWEEP ==========" -echo "" - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE -submit_config "28_mem_seq4k_tp8_pp2" 8 2 0 1 64 0 0 0 4096 0 -submit_config "29_mem_fused_tp8_pp8" 8 8 0 1 64 0 0 0 2048 1 - -# ============================================================ -# EXPANDABLE SEGMENTS IMPACT ON BEST CONFIG -# Re-test best config (TP8/PP8/ZeRO0) — now with expandable_segments -# enabled automatically via the updated sbatch. -# ============================================================ -echo "" -echo "========== EXPANDABLE SEGMENTS IMPACT ==========" -echo "" - -# config_name TP PP ZeRO MBS GBS ACT SEQ OVR SEQ_LEN FUSE -submit_config "30_best_expand_seg" 8 8 0 1 64 0 0 0 2048 0 - -echo "" -echo "========== SWEEP V2 SUBMITTED ==========" -echo "Job tracking file: ${RESULTS_DIR}/sweep_jobs_v2.csv" -echo "" -echo "Total configs: 10" -echo "To monitor: watch 'squeue -u \$USER'" -echo "When all jobs finish, run: python parse_results.py --jobs-csv sweep_results/sweep_jobs_v2.csv" diff --git a/3.test_cases/pytorch/deepspeed/gpt/upload_results.sh b/3.test_cases/pytorch/deepspeed/gpt/upload_results.sh deleted file mode 100755 index 6df3cab99..000000000 --- a/3.test_cases/pytorch/deepspeed/gpt/upload_results.sh +++ /dev/null @@ -1,263 +0,0 @@ -#!/bin/bash -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: MIT-0 -# -# upload_results.sh - Upload benchmark results to S3 and CloudWatch -# -# Usage: -# export S3_BUCKET=my-benchmark-bucket -# export S3_REGION=us-west-2 -# export CW_REGION=us-east-1 -# bash upload_results.sh [--results-dir sweep_results] - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -RESULTS_DIR="${1:---results-dir}" -CW_REGION="${CW_REGION:?Error: CW_REGION must be set (e.g. export CW_REGION=us-east-1)}" -S3_REGION="${S3_REGION:?Error: S3_REGION must be set (e.g. export S3_REGION=us-west-2)}" -S3_BUCKET="${S3_BUCKET:?Error: S3_BUCKET must be set (e.g. export S3_BUCKET=my-benchmark-bucket)}" -CW_NAMESPACE="DeepSpeed/B200Benchmarks" -CW_DASHBOARD_NAME="DeepSpeed-B200-Benchmarks" - -# Parse args -while [[ $# -gt 0 ]]; do - case $1 in - --results-dir) RESULTS_DIR="$2"; shift 2 ;; - --region) CW_REGION="$2"; shift 2 ;; - *) shift ;; - esac -done - -: "${RESULTS_DIR:=${SCRIPT_DIR}/sweep_results}" - -if [ ! -d "${RESULTS_DIR}" ]; then - echo "Error: Results directory not found: ${RESULTS_DIR}" - exit 1 -fi - -# ============================================================ -# 1. Upload JSON files to S3 -# ============================================================ -echo "=== Uploading results to S3 ===" - -# Determine S3 path: benchmark-results/b200/2026/March/04/ -YEAR=$(date -u +%Y) -MONTH=$(date -u +%B) -DAY=$(date -u +%d) -S3_PREFIX="benchmark-results/b200/${YEAR}/${MONTH}/${DAY}" - -for json_file in "${RESULTS_DIR}"/training_bench_*.json; do - if [ ! 
-f "${json_file}" ]; then - echo "No result JSON files found in ${RESULTS_DIR}" - break - fi - filename=$(basename "${json_file}") - echo " Uploading ${filename} -> s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" - aws s3 cp "${json_file}" "s3://${S3_BUCKET}/${S3_PREFIX}/${filename}" \ - --region "${S3_REGION}" \ - --content-type "application/json" -done - -echo "S3 upload complete: s3://${S3_BUCKET}/${S3_PREFIX}/" -echo "" - -# ============================================================ -# 2. Publish metrics to CloudWatch -# ============================================================ -echo "=== Publishing metrics to CloudWatch ===" - -for json_file in "${RESULTS_DIR}"/training_bench_*.json; do - if [ ! -f "${json_file}" ]; then - break - fi - - filename=$(basename "${json_file}") - - # Extract metadata and summary using python - read -r config_name tp pp zero_stage precision avg_tflops avg_step_time timestamp < <( - python3 -c " -import json, sys -with open('${json_file}') as f: - d = json.load(f) -m = d['metadata'] -s = d['summary'] -sc = m.get('sweep_config', {}) -print( - sc.get('config_name', 'unknown'), - sc.get('tp', 8), - sc.get('pp', 2), - sc.get('zero_stage', 1), - m.get('precision', 'bf16'), - s.get('steady_state_avg_tflops_per_gpu', 0), - s.get('steady_state_avg_step_time_s', 0), - m.get('timestamp', '$(date -u +%Y-%m-%dT%H:%M:%SZ)') -) -" - ) - - echo " Publishing: ${config_name} (TFLOPS=${avg_tflops}, StepTime=${avg_step_time}s)" - - # Publish with full dimensions - aws cloudwatch put-metric-data \ - --namespace "${CW_NAMESPACE}" \ - --region "${CW_REGION}" \ - --metric-data "[ - { - \"MetricName\": \"TFLOPSPerGPU\", - \"Value\": ${avg_tflops}, - \"Unit\": \"Count\", - \"Timestamp\": \"${timestamp}\", - \"Dimensions\": [ - {\"Name\": \"model_size\", \"Value\": \"103b\"}, - {\"Name\": \"tp\", \"Value\": \"${tp}\"}, - {\"Name\": \"pp\", \"Value\": \"${pp}\"}, - {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"}, - {\"Name\": \"precision\", 
\"Value\": \"${precision}\"}, - {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} - ] - }, - { - \"MetricName\": \"StepTimeSeconds\", - \"Value\": ${avg_step_time}, - \"Unit\": \"Seconds\", - \"Timestamp\": \"${timestamp}\", - \"Dimensions\": [ - {\"Name\": \"model_size\", \"Value\": \"103b\"}, - {\"Name\": \"tp\", \"Value\": \"${tp}\"}, - {\"Name\": \"pp\", \"Value\": \"${pp}\"}, - {\"Name\": \"zero_stage\", \"Value\": \"${zero_stage}\"}, - {\"Name\": \"precision\", \"Value\": \"${precision}\"}, - {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} - ] - } - ]" - - # Also publish with just config_name dimension for easy dashboard queries - aws cloudwatch put-metric-data \ - --namespace "${CW_NAMESPACE}" \ - --region "${CW_REGION}" \ - --metric-data "[ - { - \"MetricName\": \"TFLOPSPerGPU\", - \"Value\": ${avg_tflops}, - \"Unit\": \"Count\", - \"Timestamp\": \"${timestamp}\", - \"Dimensions\": [ - {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} - ] - }, - { - \"MetricName\": \"StepTimeSeconds\", - \"Value\": ${avg_step_time}, - \"Unit\": \"Seconds\", - \"Timestamp\": \"${timestamp}\", - \"Dimensions\": [ - {\"Name\": \"config_name\", \"Value\": \"${config_name}\"} - ] - } - ]" - -done - -echo "CloudWatch metrics published to namespace: ${CW_NAMESPACE}" -echo "" - -# ============================================================ -# 3. Create/Update CloudWatch Dashboard -# ============================================================ -echo "=== Creating CloudWatch Dashboard ===" - -# Build metric entries dynamically from results -TFLOPS_METRICS="" -STEPTIME_METRICS="" -TABLE_METRICS="" - -for json_file in "${RESULTS_DIR}"/training_bench_*.json; do - if [ ! 
-f "${json_file}" ]; then - break - fi - - config_name=$(python3 -c " -import json -with open('${json_file}') as f: - d = json.load(f) -print(d['metadata'].get('sweep_config', {}).get('config_name', 'unknown')) -") - - TFLOPS_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}]," - STEPTIME_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name}\"}]," - TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"TFLOPSPerGPU\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} TFLOPS\"}]," - TABLE_METRICS+="[\"${CW_NAMESPACE}\",\"StepTimeSeconds\",\"config_name\",\"${config_name}\",{\"label\":\"${config_name} StepTime\"}]," -done - -# Remove trailing commas -TFLOPS_METRICS="${TFLOPS_METRICS%,}" -STEPTIME_METRICS="${STEPTIME_METRICS%,}" -TABLE_METRICS="${TABLE_METRICS%,}" - -DASHBOARD_BODY=$(cat <