Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions examples/diffusion/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Megatron Examples

Recipes and configuration overrides for Megatron training.
3 changes: 3 additions & 0 deletions examples/diffusion/override_configs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Override Configs

Parallelism configuration overrides for different CP/TP/SP sizes.
44 changes: 44 additions & 0 deletions examples/diffusion/override_configs/wan_pretrain_sample_data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# WAN Pretrain Mock Data Test Configuration
# Converted from L2_Function_Tests_GPU_Wan_Mock_Data.sh

# Model parallelism and architecture (small 3-layer config for mock-data testing)
model:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  context_parallel_size: 1
  crossattn_emb_size: 1536
  hidden_size: 1536
  ffn_hidden_size: 8960
  num_attention_heads: 12
  num_layers: 3
  qkv_format: thd  # packed (total, heads, dim) layout required for sequence packing
  seq_length: 2048

# Short training run: 10 iterations, no evaluation
train:
  eval_iters: 0
  train_iters: 10
  global_batch_size: 2
  micro_batch_size: 1

optimizer:
  lr: 5.0e-6
  min_lr: 5.0e-6  # equal to lr since decay style is constant

scheduler:
  lr_decay_style: constant
  lr_warmup_iters: 0

# Checkpoint paths come from the CHECKPOINT_DIR env var (null = disabled)
checkpoint:
  save: ${oc.env:CHECKPOINT_DIR,null}
  load: ${oc.env:CHECKPOINT_DIR,null}
  load_optim: false
  save_interval: 200

# Dataset path comes from the DATASET_PATH env var (null = mock data)
dataset:
  path: ${oc.env:DATASET_PATH,null}
  seq_length: 2048  # keep in sync with model.seq_length
  global_batch_size: 2
  micro_batch_size: 1
  packing_buffer_size: 50

logger:
  log_interval: 1
3 changes: 3 additions & 0 deletions examples/diffusion/recipes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Recipes

Training recipes for Wan2.1 pretraining, finetuning, and weight verification.
Empty file.
Empty file.
41 changes: 41 additions & 0 deletions examples/diffusion/recipes/dit/conf/dit_pretrain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# DiT Pretraining Configuration
# This file contains all the configuration parameters for DiT pretraining
# You can override any of these values via command line using Hydra-style syntax

# Model configuration
model:
  tensor_model_parallel_size: 1
  sequence_parallel: false
  context_parallel_size: 1
  qkv_format: thd  # Must be 'thd' for sequence packing
  num_attention_heads: 16
  vae_cache_folder: null  # Set to your VAE cache folder path

# Dataset configuration
dataset:
  path: DATASET_FOLDER  # Set to your dataset folder path
  task_encoder_seq_length: 15360
  packing_buffer_size: 100
  num_workers: 20

# Checkpoint configuration
checkpoint:
  save: "dfm_experiment"  # Set to your checkpoint folder path
  load: "dfm_experiment"  # Set to your checkpoint folder path (same as save for resuming)
  load_optim: true
  save_interval: 1000

# Training configuration
train:
  eval_interval: 1000
  train_iters: 10000
  eval_iters: 32
  global_batch_size: 8  # Set this to match NUM_GPUS or your desired batch size
  micro_batch_size: 1  # Must be 1 for sequence packing

# Logger configuration
logger:
  log_interval: 10
  # remove wandb_project and wandb_exp_name to disable wandb logging
  wandb_project: "DiT"
  wandb_exp_name: "dfm_experiment"  # Set to your experiment name
Loading
Loading