diff --git a/3.test_cases/pytorch/lerobot/pi0-fast-droid/Dockerfile b/3.test_cases/pytorch/lerobot/pi0-fast-droid/Dockerfile new file mode 100644 index 000000000..ee1a606b0 --- /dev/null +++ b/3.test_cases/pytorch/lerobot/pi0-fast-droid/Dockerfile @@ -0,0 +1,120 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +FROM nvcr.io/nvidia/pytorch:25.08-py3 +ENV DEBIAN_FRONTEND=noninteractive + +########################### +# Component Versions +########################### +ARG EFA_INSTALLER_VERSION=1.47.0 +ARG NCCL_VERSION=2.28.4-1 +ARG AWS_OFI_NCCL_VERSION=1.14.1-aws +ARG LEROBOT_VERSION=0.4.3 + +########################### +# Remove conflicting libs +########################### +RUN apt-get update -y +RUN apt-get remove -y --allow-change-held-packages \ + libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 + +RUN rm -rf /opt/hpcx/ompi \ + && rm -rf /usr/local/mpi \ + && rm -rf /opt/hpcx/nccl_rdma_sharp_plugin \ + && ldconfig +ENV OPAL_PREFIX= + +########################### +# System dependencies +########################### +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + git \ + gcc \ + vim \ + kmod \ + openssh-client \ + openssh-server \ + build-essential \ + curl \ + autoconf \ + libtool \ + automake \ + cmake \ + apt-utils \ + libhwloc-dev \ + # Video decode dependencies for LeRobot dataset pipeline + ffmpeg \ + libavcodec-dev \ + libavformat-dev \ + libswscale-dev && \ + DEBIAN_FRONTEND=noninteractive apt autoremove -y + +########################### +# Install EFA +########################### +RUN cd /tmp && \ + curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + cd aws-efa-installer && \ + ./efa_installer.sh -y -g -d --skip-kmod --no-verify --skip-limit-conf && \ + ldconfig && \ + rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* +ENV 
LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
+ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH
+
+###########################
+# Install NCCL from source
+###########################
+RUN apt-get update && apt-get remove -y libnccl2 libnccl-dev \
+    && cd /tmp \
+    && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
+    && cd nccl \
+    && make -j src.build BUILDDIR=/usr \
+    NVCC_GENCODE="-gencode=arch=compute_100,code=sm_100 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80" \
+    && rm -rf /tmp/nccl \
+    && echo NCCL_SOCKET_IFNAME=^docker,lo >> /etc/nccl.conf
+
+###########################
+# Install aws-ofi-nccl
+###########################
+RUN cd /tmp && \
+    curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
+    tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
+    rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
+    mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
+    cd /tmp/aws-ofi-nccl && \
+    ./autogen.sh && \
+    ./configure --prefix=/opt/amazon/efa \
+        --with-libfabric=/opt/amazon/efa \
+        --with-cuda=/usr/local/cuda \
+        --enable-platform-aws \
+        --with-mpi=/opt/amazon/openmpi && \
+    make -j$(nproc) install && \
+    rm -rf /tmp/aws-ofi-nccl
+
+RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
+    echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
+    ldconfig
+
+###########################
+# Environment variables
+###########################
+ENV OMPI_MCA_pml=^cm,ucx \
+    OMPI_MCA_btl=tcp,self \
+    OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \
+    OPAL_PREFIX=/opt/amazon/openmpi \
+    NCCL_SOCKET_IFNAME=^docker,lo
+
+ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
+
+###########################
+# Install LeRobot + deps
+###########################
+COPY requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+# Install LeRobot with pi0 extra (includes custom transformers branch for pi0-fast support)
+RUN pip install 
--no-cache-dir "lerobot[pi]==${LEROBOT_VERSION}" + +WORKDIR /workspace diff --git a/3.test_cases/pytorch/lerobot/pi0-fast-droid/Makefile b/3.test_cases/pytorch/lerobot/pi0-fast-droid/Makefile new file mode 100644 index 000000000..b836d9697 --- /dev/null +++ b/3.test_cases/pytorch/lerobot/pi0-fast-droid/Makefile @@ -0,0 +1,15 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +ENROOT_IMAGE := lerobot-pi0-fast-droid + +all: build clean import + +build: + docker build -t ${ENROOT_IMAGE} -f Dockerfile . + +clean: + -rm ${ENROOT_IMAGE}.sqsh + +import: + enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest diff --git a/3.test_cases/pytorch/lerobot/pi0-fast-droid/README.md b/3.test_cases/pytorch/lerobot/pi0-fast-droid/README.md new file mode 100644 index 000000000..82d91eeb0 --- /dev/null +++ b/3.test_cases/pytorch/lerobot/pi0-fast-droid/README.md @@ -0,0 +1,172 @@ +# LeRobot pi0-FAST — DROID VLA Training + +Multi-node distributed training of a Vision-Language-Action (VLA) policy using +[LeRobot](https://github.com/huggingface/lerobot) with the +[pi0-FAST](https://huggingface.co/docs/lerobot/en/pi0fast) architecture +(SigLIP vision encoder + Gemma 2B language backbone) on the +[DROID 1.0.1](https://huggingface.co/datasets/lerobot/droid_1.0.1) dataset +(76k+ real-robot trajectories, multi-camera, 1.7TB). + +This test case exercises multi-node data-parallel scaling with a realistic +Physical AI workload: multi-stream video decode, language conditioning, +proprioceptive state, and action chunking over EFA-connected GPU instances. 
+ +## Prerequisites + +- AWS infrastructure set up per [1.architectures/](../../../../1.architectures/) +- Slurm cluster with Enroot/Pyxis (SageMaker HyperPod or ParallelCluster) +- P5 (H100) or P6 (B200) instances with EFA networking +- HuggingFace Hub token with access to gated models (Gemma 2B used by pi0-FAST) +- Shared filesystem (FSx for Lustre) for dataset caching and checkpoints + +## Quick Start + +### 1. Configure Environment + +```bash +cp env_vars.example env_vars +# Edit env_vars with your HF_TOKEN, output directory, etc. +source env_vars +``` + +### 2. Build the Container + +```bash +make all +``` + +This builds the Docker image, converts it to an Enroot `.sqsh` file for Pyxis. + +### 3. Run on Slurm (Multi-Node) + +```bash +# 2-node training (default) +sbatch slurm/run.sh + +# Scale to more nodes +sbatch --nodes=4 slurm/run.sh + +# Override dataset or model +DATASET_REPO_ID=lerobot/aloha_sim_transfer_cube_human \ +BATCH_SIZE=8 STEPS=50000 \ +sbatch slurm/run.sh +``` + +### 4. 
Run on Slurm (Single-Node) + +```bash +sbatch --nodes=1 slurm/run.sh +``` + +## Architecture + +``` +┌─────────────────────────────────────────────────┐ +│ Slurm Job (N nodes) │ +│ │ +│ Node 0 (rank 0) Node 1 (rank 1) │ +│ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ accelerate │ │ accelerate │ │ +│ │ launch │◄────►│ launch │ │ +│ │ └─ 8 GPU workers│ EFA │ └─ 8 GPU workers│ │ +│ │ lerobot-train│ │ lerobot-train│ │ +│ └─────────────────┘ └─────────────────┘ │ +│ │ │ │ +│ └────────┬────────────────┘ │ +│ │ │ +│ ┌────────▼────────┐ │ +│ │ DROID Dataset │ │ +│ │ (HF Hub / FSx) │ │ +│ └─────────────────┘ │ +└─────────────────────────────────────────────────┘ +``` + +- **Launcher**: HuggingFace Accelerate (wraps torchrun for distributed setup) +- **Distribution**: DDP via `accelerate launch` — one process per node, each spawning 8 GPU workers +- **Rendezvous**: c10d backend with head node IP from Slurm +- **Dataset**: Streamed from HuggingFace Hub or pre-cached on shared filesystem + +## Configuration + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `NUM_NODES` | `2` | Number of Slurm nodes (set via `--nodes`) | +| `GPUS_PER_NODE` | `8` | GPUs per node (8 for P5/P6) | +| `DATASET_REPO_ID` | `lerobot/droid_1.0.1` | HuggingFace dataset repo ID | +| `PRETRAINED_PATH` | `lerobot/pi0fast_base` | Pretrained model to fine-tune | +| `BATCH_SIZE` | `4` | Per-GPU batch size | +| `STEPS` | `200000` | Total training steps | +| `OUTPUT_DIR` | `/fsx/lerobot-output` | Checkpoint and log output directory | +| `HF_HOME` | (system default) | HuggingFace cache directory | +| `HF_TOKEN` | (required) | HuggingFace Hub token for gated models | + +### pi0-FAST Policy Parameters + +These are set in `slurm/run.sh` and can be adjusted: + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `policy.dtype` | `bfloat16` | Mixed precision dtype | +| `policy.gradient_checkpointing` | `true` | Reduce memory via activation checkpointing | 
+| `policy.chunk_size` | `10` | Action prediction horizon |
+| `policy.n_action_steps` | `10` | Number of action steps to execute |
+| `policy.max_action_tokens` | `256` | Max FAST tokenizer output tokens |
+
+## Scaling Variants
+
+### Variant A: Compute-Heavy (DDP Scaling Test)
+
+Increase model compute to stress gradient synchronization:
+
+```bash
+# Use larger action horizons and higher resolution
+TRAIN_ARGS="... --policy.chunk_size=20 --policy.n_action_steps=20"
+```
+
+### Variant B: Data-Heavy (I/O Scaling Test)
+
+Stress the data pipeline with DROID's multi-camera streams:
+
+```bash
+# Increase dataloader workers and prefetch
+TRAIN_ARGS="... --dataloader.num_workers=8"
+```
+
+## Metrics to Measure
+
+- **Samples/sec** and scaling efficiency from 1 to N nodes
+- **GPU utilization** and step time breakdown (forward/backward vs allreduce vs data loading)
+- **Data throughput**: video decode frames/sec per node
+- **Checkpoint throughput**: time to save/load checkpoints at scale
+- **Loss curve consistency**: verify same loss trajectory across different node counts
+
+## Troubleshooting
+
+### DROID Dataset Download
+
+DROID 1.0.1 is ~1.7TB. Pre-cache it on shared filesystem before training:
+
+```python
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+ds = LeRobotDataset("lerobot/droid_1.0.1")
+```
+
+### HuggingFace Token for Gated Models
+
+pi0-FAST uses Gemma 2B (gated). Log in before training:
+
+```bash
+huggingface-cli login --token $HF_TOKEN
+```
+
+### EFA Verification
+
+Verify EFA is working on each node:
+
+```bash
+fi_info -p efa
+```
+
+## License
+
+This project is licensed under MIT-0. See [LICENSE](../../../../LICENSE).
diff --git a/3.test_cases/pytorch/lerobot/pi0-fast-droid/env_vars.example b/3.test_cases/pytorch/lerobot/pi0-fast-droid/env_vars.example new file mode 100644 index 000000000..37ba0dbed --- /dev/null +++ b/3.test_cases/pytorch/lerobot/pi0-fast-droid/env_vars.example @@ -0,0 +1,24 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +# Copy this file to env_vars and fill in your values: +# cp env_vars.example env_vars +# source env_vars + +# HuggingFace Hub token (required for downloading gated models like Gemma 2B) +export HF_TOKEN="hf_xxxxxxxxxxxxxxxxxxxx" + +# HuggingFace cache directory (should be on shared filesystem) +export HF_HOME="/fsx/.cache/huggingface" + +# Weights & Biases API key (optional, for experiment tracking) +export WANDB_API_KEY="your-wandb-key" + +# Training output directory (on shared filesystem) +export OUTPUT_DIR="/fsx/lerobot-output" + +# Number of nodes for multi-node training +export NUM_NODES=2 + +# GPUs per node (8 for p5/p6, 4 for g5.12xlarge) +export GPUS_PER_NODE=8 diff --git a/3.test_cases/pytorch/lerobot/pi0-fast-droid/requirements.txt b/3.test_cases/pytorch/lerobot/pi0-fast-droid/requirements.txt new file mode 100644 index 000000000..eddf4c99d --- /dev/null +++ b/3.test_cases/pytorch/lerobot/pi0-fast-droid/requirements.txt @@ -0,0 +1,6 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +# Core dependencies (LeRobot and its [pi] extra are installed separately in the Dockerfile) +accelerate>=1.10.0,<2.0.0 +wandb>=0.24.0,<0.25.0 diff --git a/3.test_cases/pytorch/lerobot/pi0-fast-droid/slurm/run.sh b/3.test_cases/pytorch/lerobot/pi0-fast-droid/slurm/run.sh new file mode 100755 index 000000000..8a14c8d94 --- /dev/null +++ b/3.test_cases/pytorch/lerobot/pi0-fast-droid/slurm/run.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=lerobot-pi0fast-droid +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-node=8 +#SBATCH --exclusive +#SBATCH --wait-all-nodes=1 +#SBATCH --output=logs/%x_%j.out +#SBATCH --error=logs/%x_%j.err + +set -ex + +########################### +###### User Variables ##### +########################### + +GPUS_PER_NODE=8 +CONTAINER_IMAGE="${CONTAINER_IMAGE:-$(pwd)/lerobot-pi0-fast-droid.sqsh}" +OUTPUT_DIR="${OUTPUT_DIR:-/fsx/lerobot-output}" + +# Training hyperparameters +DATASET_REPO_ID="${DATASET_REPO_ID:-lerobot/droid_1.0.1}" +PRETRAINED_PATH="${PRETRAINED_PATH:-lerobot/pi0fast_base}" +BATCH_SIZE="${BATCH_SIZE:-4}" +STEPS="${STEPS:-200000}" + +########################### +## Environment Variables ## +########################### + +## EFA / libfabric settings +export FI_PROVIDER=efa +export FI_EFA_FORK_SAFE=1 +export FI_EFA_USE_HUGE_PAGE=0 + +## NCCL settings +export NCCL_DEBUG=INFO +export NCCL_SOCKET_IFNAME=^docker,lo,veth + +## Performance tuning +export NCCL_BUFFSIZE=8388608 +export NCCL_P2P_NET_CHUNKSIZE=524288 + +## HuggingFace timeouts (important for large datasets like DROID 1.7TB) +export HF_HUB_ETAG_TIMEOUT=120 +export HF_HUB_DOWNLOAD_TIMEOUT=120 + +########################### +# Network Configuration +########################### + +head_node_ip=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) + +########################### +# Accelerate Launcher +########################### + +# accelerate does NOT auto-detect SLURM variables — every parameter must be explicit. +# --machine_rank uses escaped $SLURM_PROCID so it expands per-node inside bash -c. 
+LAUNCHER="accelerate launch \ + --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \ + --num_machines ${SLURM_NNODES} \ + --machine_rank \$SLURM_PROCID \ + --rdzv_backend c10d \ + --main_process_ip ${head_node_ip} \ + --main_process_port 29500 \ + --mixed_precision bf16" + +########################### +# LeRobot Training Args +########################### + +TRAIN_ARGS="\ + --dataset.repo_id=${DATASET_REPO_ID} \ + --policy.path=${PRETRAINED_PATH} \ + --policy.dtype=bfloat16 \ + --policy.gradient_checkpointing=true \ + --policy.chunk_size=10 \ + --policy.n_action_steps=10 \ + --policy.max_action_tokens=256 \ + --steps=${STEPS} \ + --batch_size=${BATCH_SIZE} \ + --save_freq=5000 \ + --output_dir=${OUTPUT_DIR} \ + --job_name=pi0fast_droid_${SLURM_JOB_ID}" + +# Compose into single string (accelerate launch does not handle multiline args correctly) +CMD="${LAUNCHER} \$(which lerobot-train) ${TRAIN_ARGS}" + +########################### +# Container Mounts +########################### + +declare -a SRUN_ARGS=( + --container-image "${CONTAINER_IMAGE}" + --container-mounts "${OUTPUT_DIR}:${OUTPUT_DIR}" +) + +# Mount HF cache if set +if [ -n "${HF_HOME}" ]; then + SRUN_ARGS+=(--container-mounts "${HF_HOME}:${HF_HOME}") +fi + +########################### +# HyperPod Auto-Resume +########################### + +if [ -d "/opt/sagemaker_cluster" ]; then + echo "Detected HyperPod cluster — enabling auto-resume" + SRUN_ARGS+=(--auto-resume=1) +fi + +########################### +# Create log directory +########################### + +mkdir -p logs + +########################### +# Launch Training +########################### + +srun -l "${SRUN_ARGS[@]}" bash -c "${CMD}" diff --git a/3.test_cases/pytorch/lerobot/pi0-fast-droid/src/__init__.py b/3.test_cases/pytorch/lerobot/pi0-fast-droid/src/__init__.py new file mode 100644 index 000000000..cd4ae9de3 --- /dev/null +++ b/3.test_cases/pytorch/lerobot/pi0-fast-droid/src/__init__.py @@ -0,0 +1,2 @@ +# Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 diff --git a/3.test_cases/pytorch/lerobot/pi0-fast-droid/test_pi0_fast_droid.py b/3.test_cases/pytorch/lerobot/pi0-fast-droid/test_pi0_fast_droid.py new file mode 100644 index 000000000..d3a13d3fb --- /dev/null +++ b/3.test_cases/pytorch/lerobot/pi0-fast-droid/test_pi0_fast_droid.py @@ -0,0 +1,34 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +import pytest + + +class TestPi0FastDroid: + """Tests for the LeRobot pi0-fast-droid test case.""" + + def test_docker_build(self, docker_build): + """Verify the Docker image builds successfully.""" + image = docker_build(".", dockerfile="Dockerfile", tag="lerobot-pi0-fast-droid.test") + assert image is not None + + def test_docker_run_smoke(self, docker_build, docker_run): + """Verify the container starts and key imports work.""" + image = docker_build(".", dockerfile="Dockerfile", tag="lerobot-pi0-fast-droid.test") + result = docker_run( + image, + command="python -c 'import torch; import lerobot; print(f\"torch={torch.__version__}, lerobot={lerobot.__version__}\")'", + ) + assert result.exit_code == 0 + + def test_lerobot_train_entrypoint(self, docker_build, docker_run): + """Verify the lerobot-train CLI entrypoint is available.""" + image = docker_build(".", dockerfile="Dockerfile", tag="lerobot-pi0-fast-droid.test") + result = docker_run(image, command="which lerobot-train") + assert result.exit_code == 0 + + def test_efa_installed(self, docker_build, docker_run): + """Verify EFA libraries are installed.""" + image = docker_build(".", dockerfile="Dockerfile", tag="lerobot-pi0-fast-droid.test") + result = docker_run(image, command="fi_info -p efa") + assert result.exit_code == 0