Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions 3.test_cases/pytorch/lerobot/pi0-fast-droid/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

FROM nvcr.io/nvidia/pytorch:25.08-py3

# Build-time only: suppress interactive apt prompts. Declared as ARG rather
# than ENV so it does not leak into the runtime environment of containers
# derived from this image (Docker-documented best practice).
ARG DEBIAN_FRONTEND=noninteractive

###########################
# Component Versions
###########################
# EFA installer, NCCL, and the aws-ofi-nccl plugin must be kept mutually
# compatible; bump them together when upgrading.
ARG EFA_INSTALLER_VERSION=1.47.0
ARG NCCL_VERSION=2.28.4-1
ARG AWS_OFI_NCCL_VERSION=1.14.1-aws
# LeRobot release installed later in this file together with its [pi] extra.
ARG LEROBOT_VERSION=0.4.3

###########################
# Remove conflicting libs
###########################
# Refresh the apt index in the same layer that consumes it (splitting
# `apt-get update` into its own layer is the classic stale-cache anti-pattern:
# a cached update layer can feed a later install outdated package lists).
# The stock InfiniBand/verbs packages conflict with the libraries shipped by
# the AWS EFA installer below, so they are removed first.
RUN apt-get update -y \
    && apt-get remove -y --allow-change-held-packages \
       libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1

# Drop the HPC-X OpenMPI / NCCL RDMA plugin bundled with the NGC base image so
# that the EFA-aware OpenMPI and the aws-ofi-nccl plugin built below are the
# only ones resolvable at runtime.
RUN rm -rf /opt/hpcx/ompi \
    && rm -rf /usr/local/mpi \
    && rm -rf /opt/hpcx/nccl_rdma_sharp_plugin \
    && ldconfig
# Clear HPC-X's OPAL_PREFIX; it is re-pointed at the EFA OpenMPI further down.
ENV OPAL_PREFIX=

###########################
# System dependencies
###########################
# `apt-get update` runs in the same layer as the install so this layer can
# never see a stale index from an earlier cached layer. Packages are sorted
# alphabetically for diffability, and `--no-install-recommends` keeps the
# image lean. `apt-get` is used throughout (plain `apt` is not intended for
# scripted use and its CLI is not stable).
# NOTE(review): --allow-unauthenticated is kept from the original; confirm it
# is actually required for this base image's repos before relying on it.
RUN apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --allow-unauthenticated \
    apt-utils \
    autoconf \
    automake \
    build-essential \
    cmake \
    curl \
    gcc \
    git \
    kmod \
    libhwloc-dev \
    libtool \
    openssh-client \
    openssh-server \
    vim \
    # Video decode dependencies for LeRobot dataset pipeline
    ffmpeg \
    libavcodec-dev \
    libavformat-dev \
    libswscale-dev && \
    DEBIAN_FRONTEND=noninteractive apt-get autoremove -y

###########################
# Install EFA
###########################
# AWS Elastic Fabric Adapter userspace stack (libfabric + EFA-aware OpenMPI,
# landed under /opt/amazon). --skip-kmod because the kernel module comes from
# the host, --skip-limit-conf because resource limits are managed outside the
# container. The installer tarball and apt lists are removed in the same
# layer so they are not baked into the image.
# NOTE(review): -g/-d flags mirror the AWS reference Dockerfiles — see the
# EFA installer documentation for their exact meaning before changing them.
RUN cd /tmp && \
curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
cd aws-efa-installer && \
./efa_installer.sh -y -g -d --skip-kmod --no-verify --skip-limit-conf && \
ldconfig && \
rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
# Make the EFA libraries and the EFA-bundled OpenMPI binaries resolvable.
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH

###########################
# Install NCCL from source
###########################
# Replace the NGC-bundled NCCL with a pinned source build so the version is
# controlled by NCCL_VERSION. `-j$(nproc)` bounds build parallelism (a bare
# `-j` spawns unlimited jobs and can exhaust memory); this also matches the
# aws-ofi-nccl build below. Apt lists are purged in the same layer.
# NVCC_GENCODE targets: sm_80 (A100), sm_90 (H100/P5), sm_100 (B200/P6 —
# added because the README advertises P6 support; requires the CUDA 12.8+
# toolchain shipped in this base image).
RUN apt-get update && apt-get remove -y libnccl2 libnccl-dev \
    && cd /tmp \
    && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
    && cd nccl \
    && make -j$(nproc) src.build BUILDDIR=/usr \
       NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_100,code=sm_100" \
    && rm -rf /tmp/nccl \
    && rm -rf /var/lib/apt/lists/* \
    && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf

###########################
# Install aws-ofi-nccl
###########################
# NCCL <-> libfabric shim so NCCL collectives can run over EFA. Built against
# the EFA libfabric, CUDA, and the EFA-bundled OpenMPI installed above, and
# installed into the same /opt/amazon/efa prefix. Sources are fetched from a
# pinned release tag and removed in the same layer.
RUN cd /tmp && \
curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
cd /tmp/aws-ofi-nccl && \
./autogen.sh && \
./configure --prefix=/opt/amazon/efa \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-platform-aws \
--with-mpi=/opt/amazon/openmpi && \
make -j$(nproc) install && \
rm -rf /tmp/aws-ofi-nccl

# Register the NCCL (/usr/local/lib) and OpenMPI library directories with the
# dynamic linker so they resolve without relying solely on LD_LIBRARY_PATH.
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
ldconfig

###########################
# Environment variables
###########################
# OpenMPI: exclude the cm and ucx PMLs (HPC-X/UCX was removed above), use the
# plain TCP BTL, and keep MPI traffic off loopback and the docker bridge.
# OPAL_PREFIX points OpenMPI at the EFA-bundled install.
# NOTE(review): NCCL_SOCKET_IFNAME here (^docker,lo) overrides the ^docker0,lo
# written to /etc/nccl.conf earlier — environment variables take precedence
# over the conf file. Confirm the broader `docker` prefix exclusion is intended.
ENV OMPI_MCA_pml=^cm,ucx \
OMPI_MCA_btl=tcp,self \
OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \
OPAL_PREFIX=/opt/amazon/openmpi \
NCCL_SOCKET_IFNAME=^docker,lo

# Prepend the source-built NCCL (/usr/local/lib) and CUDA runtime so they win
# over any copies remaining from the base image.
ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"

###########################
# Install LeRobot + deps
###########################
# requirements.txt pins the training-stack dependencies (accelerate, wandb);
# LeRobot itself is installed separately below so its version is controlled
# by the LEROBOT_VERSION build arg.
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Install LeRobot with pi0 extra (includes custom transformers branch for pi0-fast support)
# NOTE(review): the extra is spelled "pi" here but described as "pi0" above —
# confirm the published extra name for lerobot ${LEROBOT_VERSION}.
RUN pip install --no-cache-dir "lerobot[pi]==${LEROBOT_VERSION}"

# Default working directory for interactive use and job scripts.
WORKDIR /workspace
15 changes: 15 additions & 0 deletions 3.test_cases/pytorch/lerobot/pi0-fast-droid/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Name shared by the Docker image and the Enroot squashfs artifact.
ENROOT_IMAGE := lerobot-pi0-fast-droid

# None of these targets produce a file named after the target, so mark them
# phony — otherwise a stray file called `build`, `clean`, or `import` in this
# directory would make the corresponding step silently a no-op.
.PHONY: all build clean import

# Build the Docker image, remove any stale squashfs, then re-import for Pyxis.
all: build clean import

build:
	docker build -t ${ENROOT_IMAGE} -f Dockerfile .

# Leading `-` ignores the rm error when no previous .sqsh exists.
clean:
	-rm ${ENROOT_IMAGE}.sqsh

import:
	enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest
172 changes: 172 additions & 0 deletions 3.test_cases/pytorch/lerobot/pi0-fast-droid/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# LeRobot pi0-FAST — DROID VLA Training

Multi-node distributed training of a Vision-Language-Action (VLA) policy using
[LeRobot](https://github.com/huggingface/lerobot) with the
[pi0-FAST](https://huggingface.co/docs/lerobot/en/pi0fast) architecture
(SigLIP vision encoder + Gemma 2B language backbone) on the
[DROID 1.0.1](https://huggingface.co/datasets/lerobot/droid_1.0.1) dataset
(76k+ real-robot trajectories, multi-camera, 1.7TB).

This test case exercises multi-node data-parallel scaling with a realistic
Physical AI workload: multi-stream video decode, language conditioning,
proprioceptive state, and action chunking over EFA-connected GPU instances.

## Prerequisites

- AWS infrastructure set up per [1.architectures/](../../../../1.architectures/)
- Slurm cluster with Enroot/Pyxis (SageMaker HyperPod or ParallelCluster)
- P5 (H100) or P6 (B200) instances with EFA networking
- HuggingFace Hub token with access to gated models (Gemma 2B used by pi0-FAST)
- Shared filesystem (FSx for Lustre) for dataset caching and checkpoints

## Quick Start

### 1. Configure Environment

```bash
cp env_vars.example env_vars
# Edit env_vars with your HF_TOKEN, output directory, etc.
source env_vars
```

### 2. Build the Container

```bash
make all
```

This builds the Docker image and converts it to an Enroot `.sqsh` file for use with Pyxis.

### 3. Run on Slurm (Multi-Node)

```bash
# 2-node training (default)
sbatch slurm/run.sh

# Scale to more nodes
sbatch --nodes=4 slurm/run.sh

# Override dataset or model
DATASET_REPO_ID=lerobot/aloha_sim_transfer_cube_human \
BATCH_SIZE=8 STEPS=50000 \
sbatch slurm/run.sh
```

### 4. Run on Slurm (Single-Node)

```bash
sbatch --nodes=1 slurm/run.sh
```

## Architecture

```
┌─────────────────────────────────────────────────┐
│ Slurm Job (N nodes) │
│ │
│ Node 0 (rank 0) Node 1 (rank 1) │
│ ┌─────────────────┐ ┌─────────────────┐ │
│ │ accelerate │ │ accelerate │ │
│ │ launch │◄────►│ launch │ │
│ │ └─ 8 GPU workers│ EFA │ └─ 8 GPU workers│ │
│ │ lerobot-train│ │ lerobot-train│ │
│ └─────────────────┘ └─────────────────┘ │
│ │ │ │
│ └────────┬────────────────┘ │
│ │ │
│ ┌────────▼────────┐ │
│ │ DROID Dataset │ │
│ │ (HF Hub / FSx) │ │
│ └─────────────────┘ │
└─────────────────────────────────────────────────┘
```

- **Launcher**: HuggingFace Accelerate (wraps torchrun for distributed setup)
- **Distribution**: DDP via `accelerate launch` — one process per node, each spawning 8 GPU workers
- **Rendezvous**: c10d backend with head node IP from Slurm
- **Dataset**: Streamed from HuggingFace Hub or pre-cached on shared filesystem

## Configuration

| Parameter | Default | Description |
|-----------|---------|-------------|
| `NUM_NODES` | `2` | Number of Slurm nodes (set via `--nodes`) |
| `GPUS_PER_NODE` | `8` | GPUs per node (8 for P5/P6) |
| `DATASET_REPO_ID` | `lerobot/droid_1.0.1` | HuggingFace dataset repo ID |
| `PRETRAINED_PATH` | `lerobot/pi0fast_base` | Pretrained model to fine-tune |
| `BATCH_SIZE` | `4` | Per-GPU batch size |
| `STEPS` | `200000` | Total training steps |
| `OUTPUT_DIR` | `/fsx/lerobot-output` | Checkpoint and log output directory |
| `HF_HOME` | (system default) | HuggingFace cache directory |
| `HF_TOKEN` | (required) | HuggingFace Hub token for gated models |

### pi0-FAST Policy Parameters

These are set in `slurm/run.sh` and can be adjusted:

| Parameter | Default | Description |
|-----------|---------|-------------|
| `policy.dtype` | `bfloat16` | Mixed precision dtype |
| `policy.gradient_checkpointing` | `true` | Reduce memory via activation checkpointing |
| `policy.chunk_size` | `10` | Action prediction horizon |
| `policy.n_action_steps` | `10` | Number of action steps to execute |
| `policy.max_action_tokens` | `256` | Max FAST tokenizer output tokens |

## Scaling Variants

### Variant A: Compute-Heavy (DDP Scaling Test)

Increase model compute to stress gradient synchronization:

```bash
# Use larger action horizons and higher resolution
TRAIN_ARGS="... --policy.chunk_size=20 --policy.n_action_steps=20"
```

### Variant B: Data-Heavy (I/O Scaling Test)

Stress the data pipeline with DROID's multi-camera streams:

```bash
# Increase dataloader workers and prefetch
TRAIN_ARGS="... --dataloader.num_workers=8"
```

## Metrics to Measure

- **Samples/sec** and scaling efficiency from 1 to N nodes
- **GPU utilization** and step time breakdown (forward/backward vs allreduce vs data loading)
- **Data throughput**: video decode frames/sec per node
- **Checkpoint throughput**: time to save/load checkpoints at scale
- **Loss curve consistency**: verify same loss trajectory across different node counts

## Troubleshooting

### DROID Dataset Download

DROID 1.0.1 is ~1.7TB. Pre-cache it on shared filesystem before training:

```python
from lerobot.datasets.lerobot_dataset import LeRobotDataset
ds = LeRobotDataset("lerobot/droid_1.0.1")
```

### HuggingFace Token for Gated Models

pi0-FAST uses Gemma 2B (gated). Log in before training:

```bash
huggingface-cli login --token $HF_TOKEN
```

### EFA Verification

Verify EFA is working on each node:

```bash
fi_info -p efa
```

## License

This project is licensed under MIT-0. See [LICENSE](../../../../LICENSE).
24 changes: 24 additions & 0 deletions 3.test_cases/pytorch/lerobot/pi0-fast-droid/env_vars.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Copy this file to env_vars and fill in your values:
#   cp env_vars.example env_vars
#   source env_vars
# Do NOT commit the populated env_vars file — it contains credentials.

# HuggingFace Hub token (required for downloading gated models like Gemma 2B)
export HF_TOKEN="hf_xxxxxxxxxxxxxxxxxxxx"

# HuggingFace cache directory (should be on shared filesystem so all nodes
# reuse the same downloaded datasets/models)
export HF_HOME="/fsx/.cache/huggingface"

# Weights & Biases API key (optional, for experiment tracking)
export WANDB_API_KEY="your-wandb-key"

# Training output directory (on shared filesystem)
export OUTPUT_DIR="/fsx/lerobot-output"

# Number of nodes for multi-node training
export NUM_NODES=2

# GPUs per node (8 for p5/p6, 4 for g5.12xlarge)
export GPUS_PER_NODE=8
6 changes: 6 additions & 0 deletions 3.test_cases/pytorch/lerobot/pi0-fast-droid/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Core dependencies (LeRobot and its [pi] extra are installed separately in the Dockerfile)
# Upper bounds avoid pulling in untested future releases.
accelerate>=1.10.0,<2.0.0
wandb>=0.24.0,<0.25.0
Loading