Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions examples/diffusion/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Megatron Examples

Recipes and configuration overrides for Megatron training.
3 changes: 3 additions & 0 deletions examples/diffusion/override_configs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Override Configs

Parallelism configuration overrides for different CP/TP/SP sizes.
44 changes: 44 additions & 0 deletions examples/diffusion/override_configs/wan_pretrain_sample_data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# WAN Pretrain Mock Data Test Configuration
# Converted from L2_Function_Tests_GPU_Wan_Mock_Data.sh

# Model parallelism and architecture (small 3-layer config for mock-data testing)
model:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  context_parallel_size: 1
  crossattn_emb_size: 1536
  hidden_size: 1536
  ffn_hidden_size: 8960
  num_attention_heads: 12
  num_layers: 3
  qkv_format: thd  # packed (total, heads, dim) layout required for sequence packing
  seq_length: 2048

# Short training run: 10 iterations, no evaluation
train:
  eval_iters: 0
  train_iters: 10
  global_batch_size: 2
  micro_batch_size: 1

optimizer:
  lr: 5.0e-6
  min_lr: 5.0e-6  # equal to lr since decay style is constant

scheduler:
  lr_decay_style: constant
  lr_warmup_iters: 0

# Checkpoint paths come from the CHECKPOINT_DIR env var (null = disabled)
checkpoint:
  save: ${oc.env:CHECKPOINT_DIR,null}
  load: ${oc.env:CHECKPOINT_DIR,null}
  load_optim: false
  save_interval: 200

# Dataset path comes from the DATASET_PATH env var (null = mock data)
dataset:
  path: ${oc.env:DATASET_PATH,null}
  seq_length: 2048  # keep in sync with model.seq_length
  global_batch_size: 2
  micro_batch_size: 1
  packing_buffer_size: 50

logger:
  log_interval: 1
3 changes: 3 additions & 0 deletions examples/diffusion/recipes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Recipes

Training recipes for Wan2.1 pretraining, finetuning, and weight verification.
Empty file.
Empty file.
41 changes: 41 additions & 0 deletions examples/diffusion/recipes/dit/conf/dit_pretrain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# DiT Pretraining Configuration
# This file contains all the configuration parameters for DiT pretraining
# You can override any of these values via command line using Hydra-style syntax

# Model configuration
model:
  tensor_model_parallel_size: 1
  sequence_parallel: false
  context_parallel_size: 1
  qkv_format: thd  # Must be 'thd' for sequence packing
  num_attention_heads: 16
  vae_cache_folder: null  # Set to your VAE cache folder path

# Dataset configuration
dataset:
  path: DATASET_FOLDER  # Set to your dataset folder path
  task_encoder_seq_length: 15360
  packing_buffer_size: 100
  num_workers: 20

# Checkpoint configuration
checkpoint:
  save: "dfm_experiment"  # Set to your checkpoint folder path
  load: "dfm_experiment"  # Set to your checkpoint folder path (same as save for resuming)
  load_optim: true
  save_interval: 1000

# Training configuration
train:
  eval_interval: 1000
  train_iters: 10000
  eval_iters: 32
  global_batch_size: 8  # Set this to match NUM_GPUS or your desired batch size
  micro_batch_size: 1  # Must be 1 for sequence packing

# Logger configuration
logger:
  log_interval: 10
  # remove wandb_project and wandb_exp_name to disable wandb logging
  wandb_project: "DiT"
  wandb_exp_name: "dfm_experiment"  # Set to your experiment name
Loading
Loading