Project website: https://mos-vla.github.io/
Paper: https://arxiv.org/abs/2510.16617

# Setup

```bash
# Create and activate conda environment
conda create -n mos-vla python=3.10.16
conda activate mos-vla
# Install cuda 12.4
conda install -c nvidia/label/cuda-12.4.0 cuda
# Install PyTorch
# Use a command specific to your machine: https://pytorch.org/get-started/locally/
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
# Use pip install to download dependencies
pip install -e .
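
# (Optional sanity check, not in the original instructions) Verify that PyTorch
# sees CUDA before compiling Flash Attention; should print the torch version
# followed by "True"
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
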
# Install Flash Attention 2 for training (https://github.com/Dao-AILab/flash-attention)
# =>> If you run into difficulty, try `pip cache remove flash_attn` first
pip install packaging ninja
ninja --version; echo $? # Verify Ninja --> should return exit code "0"
pip install "flash-attn==2.5.5" --no-build-isolation# Pre-train MoS-VLA

# Pre-train MoS-VLA

```bash
export NUM_GPUS=1
torchrun --standalone --nnodes 1 --nproc-per-node $NUM_GPUS \
vla-scripts/pretrain_mos_vla.py \
--vla_path openvla/openvla-7b \
--data_root_dir ../openvla_rlds \
--data_mix oxe_magic_soup_plus \
--run_root_dir runs/mos_vla_pretrain \
--shuffle_buffer_size 10000 \
--fe_basis_functions 16 \
--n_continuous_actions 6 \
--calibration_buffer_size 128 \
--calibrate_interval 16 \
--prefill_samples_per_dataset 64 \
--batch_size 10 \
--learning_rate 1e-4 \
--lr_warmup_steps 1000 \
--num_steps_before_decay 100000 \
--max_steps 100000 \
--save_freq 5000 \
--save_latest_checkpoint_only False \
--use_lora True \
--seed 7 \
--wandb_project mos-vla
```
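
To train on multiple GPUs, increase `NUM_GPUS`; `torchrun` launches one process per GPU, so if `--batch_size` is per device (as is typical for distributed training), the global batch size becomes `batch_size × NUM_GPUS`. A minimal sketch, assuming the remaining flags are kept exactly as in the single-GPU command above:

```bash
# Hypothetical single-node run on 8 GPUs (adjust the count to your hardware)
export NUM_GPUS=8
torchrun --standalone --nnodes 1 --nproc-per-node $NUM_GPUS \
vla-scripts/pretrain_mos_vla.py \
--vla_path openvla/openvla-7b \
--data_root_dir ../openvla_rlds \
--run_root_dir runs/mos_vla_pretrain
# ...plus the remaining flags from the single-GPU command above
```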

# Evaluate MoS-VLA

```bash
python eval-scripts/val_mos_vla.py \
--fe_checkpoint_path runs/mos_vla_pretrain/checkpoint-100000 \
--data_root_dir ../openvla_rlds
```
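
Since `--save_freq 5000` writes intermediate checkpoints during pre-training, you can also evaluate an earlier one (assuming the `checkpoint-<step>` naming used above; step 50000 here is only illustrative):

```bash
# Hypothetical: evaluate the checkpoint saved at step 50000
python eval-scripts/val_mos_vla.py \
--fe_checkpoint_path runs/mos_vla_pretrain/checkpoint-50000 \
--data_root_dir ../openvla_rlds
```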

If you use our code in your work, please cite our paper:

```bibtex
@article{Mos-VLA2025,
  title={MoS-VLA: A Vision-Language-Action Model with One-Shot Skill Adaptation},
  author={Ruihan Zhao and Tyler Ingebrand and Sandeep Chinchali and Ufuk Topcu},
  journal={Under Review},
  year={2025}
}
```

This codebase builds on top of OpenVLA-OFT.